[yt-dlp.git] / test / test_download.py

#!/usr/bin/env python3

# Allow direct execution
import os
import sys
import unittest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import collections
import hashlib
import http.client
import json
import socket
import urllib.error

from test.helper import (
    assertGreaterEqual,
    expect_info_dict,
    expect_warnings,
    get_params,
    gettestcases,
    getwebpagetestcases,
    is_download_test,
    report_warning,
    try_rm,
)

import yt_dlp.YoutubeDL  # isort: split
from yt_dlp.extractor import get_info_extractor
from yt_dlp.utils import (
    DownloadError,
    ExtractorError,
    UnavailableVideoError,
    format_bytes,
    join_nonempty,
)

RETRIES = 3


class YoutubeDL(yt_dlp.YoutubeDL):
    """Strict ``yt_dlp.YoutubeDL`` variant used by the download tests.

    Warnings are turned into ``ExtractorError`` and a copy of every info dict
    passed to ``process_info`` is collected in ``processed_info_dicts``.
    """

    def __init__(self, *args, **kwargs):
        # Both attributes are set up before the base initializer runs
        self.processed_info_dicts = []
        self.to_stderr = self.to_screen
        super().__init__(*args, **kwargs)

    def report_warning(self, message, *args, **kwargs):
        """Raise instead of warning: warnings are not accepted during tests."""
        raise ExtractorError(message)

    def process_info(self, info_dict):
        """Remember a copy of *info_dict*, then hand it to the base class."""
        snapshot = info_dict.copy()
        self.processed_info_dicts.append(snapshot)
        return super().process_info(info_dict)


def _file_md5(fn):
    """Return the hexadecimal MD5 digest of the file at path *fn*.

    The file is hashed in fixed-size chunks so that large downloads are not
    loaded into memory all at once. Errors from ``open`` propagate unchanged.
    """
    digest = hashlib.md5()
    with open(fn, 'rb') as f:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF)
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Test case definitions provided by the test.helper functions; the second
# collection is injected further down under the 'webpage' label.
normal_test_cases = gettestcases()
webpage_test_cases = getwebpagetestcases()
# tests_counter[name][label] -> number of test methods generated so far for
# that extractor name and label; also used to build unique test method names.
tests_counter = collections.defaultdict(collections.Counter)


@is_download_test
class TestDownload(unittest.TestCase):
    """Host class for the test methods that this module attaches dynamically."""

    # Parallel testing in nosetests. See
    # http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html
    _multiprocess_shared_ = True

    maxDiff = None

    # Test name -> True for every generated test that has already been started
    COMPLETED_TESTS = {}

    def __str__(self):
        """Identify each test with the `add_ie` attribute, if available."""
        owner = type(self)
        extra_ies = getattr(self, self._testMethodName).add_ie
        suffix = f' [{extra_ies}]' if extra_ies else ''
        return f'{self._testMethodName} ({owner.__module__}.{owner.__name__}){suffix}:'


# Dynamically generate tests

def generator(test_case, tname):
    """Build the unittest method that runs a single extractor test case.

    test_case -- test definition dict; keys read here include 'name', 'url',
                 'params', 'info_dict', 'add_ie', 'skip', 'expected_warnings',
                 'md5', 'file_minsize' and the 'playlist*' keys
    tname     -- unique test name; used as the COMPLETED_TESTS key, as prefix
                 of the output template and in error messages

    Returns the `test_template` function (it takes the TestCase instance as
    its only argument).
    """
    def test_template(self):
        # Run each generated test at most once per process; the same method
        # may be invoked again through the per-extractor `_all` batch test
        if self.COMPLETED_TESTS.get(tname):
            return
        self.COMPLETED_TESTS[tname] = True
        ie = yt_dlp.extractor.get_info_extractor(test_case['name'])()
        other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])]
        # Any key starting with 'playlist' marks the test as a playlist test
        is_playlist = any(k.startswith('playlist') for k in test_case)
        # Sub-cases to verify individually: the explicit 'playlist' entries,
        # nothing for a playlist test without them, otherwise the test itself
        test_cases = test_case.get(
            'playlist', [] if is_playlist else [test_case])

        def print_skipping(reason):
            # Announce the skip on stdout, then skip the test via unittest
            print('Skipping %s: %s' % (test_case['name'], reason))
            self.skipTest(reason)

        if not ie.working():
            print_skipping('IE marked as not _WORKING')

        # Validate the test definitions before doing any network access
        for tc in test_cases:
            info_dict = tc.get('info_dict', {})
            params = tc.get('params', {})
            if not info_dict.get('id'):
                raise Exception(f'Test {tname} definition incorrect - "id" key is not present')
            elif not info_dict.get('ext'):
                # 'ext' may only be omitted when nothing is downloaded and
                # missing formats are tolerated
                if params.get('skip_download') and params.get('ignore_no_formats_error'):
                    continue
                raise Exception(f'Test {tname} definition incorrect - "ext" key must be present to define the output file')

        if 'skip' in test_case:
            print_skipping(test_case['skip'])

        for other_ie in other_ies:
            if not other_ie.working():
                print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())

        params = get_params(test_case.get('params', {}))
        # Prefix the output template with the test name
        params['outtmpl'] = tname + '_' + params['outtmpl']
        if is_playlist and 'playlist' not in test_case:
            # Playlist test without explicit entries: only defaults are set
            # here, values given in the test's own params take precedence
            params.setdefault('extract_flat', 'in_playlist')
            params.setdefault('playlistend', test_case.get('playlist_mincount'))
            params.setdefault('skip_download', True)

        ydl = YoutubeDL(params, auto_init=False)
        ydl.add_default_info_extractors()
        # Filenames for which a progress hook reported status 'finished'
        finished_hook_called = set()

        def _hook(status):
            if status['status'] == 'finished':
                finished_hook_called.add(status['filename'])
        ydl.add_progress_hook(_hook)
        expect_warnings(ydl, test_case.get('expected_warnings', []))

        def get_tc_filename(tc):
            # Output filename for the info_dict of a (sub) test case; a copy
            # of the dict is passed to prepare_filename
            return ydl.prepare_filename(dict(tc.get('info_dict', {})))

        res_dict = None

        def try_rm_tcs_files(tcs=None):
            # Remove the media file, its '.part' file and the '.info.json'
            # file of every given test case (default: this test's cases)
            if tcs is None:
                tcs = test_cases
            for tc in tcs:
                tc_filename = get_tc_filename(tc)
                try_rm(tc_filename)
                try_rm(tc_filename + '.part')
                try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
        # Start from a clean state in case an earlier run left files behind
        try_rm_tcs_files()
        try:
            # Extraction is attempted up to RETRIES times on network errors
            try_num = 1
            while True:
                try:
                    # We're not using .download here since that is just a shim
                    # for outside error handling, and returns the exit code
                    # instead of the result dict.
                    res_dict = ydl.extract_info(
                        test_case['url'],
                        force_generic_extractor=params.get('force_generic_extractor', False))
                except (DownloadError, ExtractorError) as err:
                    # Check if the exception is not a network related one
                    # NOTE(review): the membership test compares exception
                    # classes exactly, so HTTPError is never in the tuple and
                    # the second clause can only be true when the first one
                    # already is - confirm the intended handling of HTTP 503
                    if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine)
                            or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)):
                        # Append the test name to the message and re-raise
                        err.msg = f'{getattr(err, "msg", err)} ({tname})'
                        raise

                    if try_num == RETRIES:
                        # Out of attempts: warn and return without failing
                        report_warning('%s failed due to network errors, skipping...' % tname)
                        return

                    print(f'Retrying: {try_num} failed tries\n\n##########\n\n')

                    try_num += 1
                else:
                    break

            if is_playlist:
                self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
                self.assertTrue('entries' in res_dict)
                expect_info_dict(self, res_dict, test_case.get('info_dict', {}))

            if 'playlist_mincount' in test_case:
                assertGreaterEqual(
                    self,
                    len(res_dict['entries']),
                    test_case['playlist_mincount'],
                    'Expected at least %d in playlist %s, but got only %d' % (
                        test_case['playlist_mincount'], test_case['url'],
                        len(res_dict['entries'])))
            if 'playlist_count' in test_case:
                self.assertEqual(
                    len(res_dict['entries']),
                    test_case['playlist_count'],
                    'Expected %d entries in playlist %s, but got %d.' % (
                        test_case['playlist_count'],
                        test_case['url'],
                        len(res_dict['entries']),
                    ))
            if 'playlist_duration_sum' in test_case:
                got_duration = sum(e['duration'] for e in res_dict['entries'])
                self.assertEqual(
                    test_case['playlist_duration_sum'], got_duration)

            # Generalize both playlists and single videos to unified format for
            # simplicity
            if 'entries' not in res_dict:
                res_dict['entries'] = [res_dict]

            # Sub-cases are matched to the extracted entries by position
            for tc_num, tc in enumerate(test_cases):
                tc_res_dict = res_dict['entries'][tc_num]
                # First, check test cases' data against extracted data alone
                expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
                # Now, check downloaded file consistency
                tc_filename = get_tc_filename(tc)
                # The decision is based on the top-level test's own 'params',
                # not on the processed `params` dict or the sub-case's params
                if not test_case.get('params', {}).get('skip_download', False):
                    self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
                    self.assertTrue(tc_filename in finished_hook_called)
                    # An explicit 'file_minsize' of None disables the size check
                    expected_minsize = tc.get('file_minsize', 10000)
                    if expected_minsize is not None:
                        if params.get('test'):
                            # With the 'test' param set, require at least 10000 bytes
                            expected_minsize = max(expected_minsize, 10000)
                        got_fsize = os.path.getsize(tc_filename)
                        assertGreaterEqual(
                            self, got_fsize, expected_minsize,
                            'Expected %s to be at least %s, but it\'s only %s ' %
                            (tc_filename, format_bytes(expected_minsize),
                                format_bytes(got_fsize)))
                    if 'md5' in tc:
                        md5_for_file = _file_md5(tc_filename)
                        self.assertEqual(tc['md5'], md5_for_file)
                # Finally, check test cases' data again but this time against
                # extracted data from info JSON file written during processing
                info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
                self.assertTrue(
                    os.path.exists(info_json_fn),
                    'Missing info file %s' % info_json_fn)
                with open(info_json_fn, encoding='utf-8') as infof:
                    info_dict = json.load(infof)
                expect_info_dict(self, info_dict, tc.get('info_dict', {}))
        finally:
            # Always clean up, whether the test passed, failed or was skipped
            # after the files were created
            try_rm_tcs_files()
            if is_playlist and res_dict is not None and res_dict.get('entries'):
                # Remove all other files that may have been extracted if the
                # extractor returns full results even with extract_flat
                res_tcs = [{'info_dict': e} for e in res_dict['entries']]
                try_rm_tcs_files(res_tcs)

    return test_template


# And add them to TestDownload
def inject_tests(test_cases, label=''):
    """Generate one test method per test case and attach it to TestDownload.

    test_cases -- iterable of test definition dicts (each needs a 'name' key)
    label      -- optional tag that becomes part of the generated method names

    The running number taken from `tests_counter` keeps the method names of
    several test cases with the same name and label distinct.
    """
    for case in test_cases:
        ie_name = case['name']
        per_label = tests_counter[ie_name]
        tname = join_nonempty('test', ie_name, label, per_label[label], delim='_')
        per_label[label] += 1

        method = generator(case, tname)
        method.__name__ = tname
        # Comma-separated additional extractors; read by TestDownload.__str__
        method.add_ie = ','.join(case.get('add_ie', []))
        setattr(TestDownload, tname, method)


# Regular test cases are registered without a label
inject_tests(normal_test_cases)

# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
# Webpage test cases get the 'webpage' label in their method names
inject_tests(webpage_test_cases, 'webpage')


def batch_generator(name):
    """Build a test method that runs every generated test registered for *name*.

    The returned function looks the individual tests up by their generated
    names on the TestCase instance and calls them in turn; a skipped test is
    reported on stdout and does not stop the batch.
    """
    def test_template(self):
        # Names are rebuilt from the counters recorded in tests_counter
        test_names = (
            join_nonempty('test', name, label, index, delim='_')
            for label, num_tests in tests_counter[name].items()
            for index in range(num_tests))
        for test_name in test_names:
            try:
                getattr(self, test_name)()
            except unittest.SkipTest:
                print(f'Skipped {test_name}')

    return test_template


# Add one `test_<name>_all` method per test case name; it runs all tests
# generated for that name in a single batch.
for name in tests_counter:
    test_method = batch_generator(name)
    test_method.__name__ = f'test_{name}_all'
    # Batch tests carry an empty add_ie, which TestDownload.__str__ reads
    test_method.add_ie = ''
    setattr(TestDownload, test_method.__name__, test_method)
# Drop the loop helper from the module namespace.
# NOTE(review): this would raise NameError if tests_counter were empty, since
# test_method is only bound at module level inside the loop above.
del test_method


# Run the tests through unittest when this file is executed directly
if __name__ == '__main__':
    unittest.main()
Commit	Line	Data
cc52de43	1	#!/usr/bin/env python3
54007a45	2
44a5f171 PH	3	# Allow direct execution
	4	import os
	5	import sys
	6	import unittest
f8271158	7
44a5f171 PH	8	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
44a5f171 PH	9
ac668111	10
f2e8dbcc	11	import collections
ac668111	12	import hashlib
54007a45	13	import http.client
ac668111	14	import json
	15	import socket
	16	import urllib.error
ac668111	17
dd508b7c	18	from test.helper import (
0990305d	19	assertGreaterEqual,
060ac762	20	expect_info_dict,
70b7e3fb	21	expect_warnings,
dd508b7c	22	get_params,
ff14fc49	23	gettestcases,
f2e8dbcc	24	getwebpagetestcases,
060ac762	25	is_download_test,
257cfebf	26	report_warning,
060ac762	27	try_rm,
dd508b7c	28	)
54007a45	29
ac668111	30	import yt_dlp.YoutubeDL # isort: split
f8271158	31	from yt_dlp.extractor import get_info_extractor
7a5c1cfe	32	from yt_dlp.utils import (
44a5f171 PH	33	DownloadError,
	34	ExtractorError,
	35	UnavailableVideoError,
f8271158	36	format_bytes,
f2e8dbcc	37	join_nonempty,
44a5f171	38	)
fd5ff020	39
8cc83b8d FV	40	RETRIES = 3
8cc83b8d FV	41
5f6a1245	42
7a5c1cfe	43	class YoutubeDL(yt_dlp.YoutubeDL):
fd5ff020	44	def __init__(self, args, *kwargs):
fd5ff020	45	self.to_stderr = self.to_screen
0eaf520d	46	self.processed_info_dicts = []
86e5f3ed	47	super().__init__(args, *kwargs)
5f6a1245	48
f0500bd1	49	def report_warning(self, message, args, *kwargs):
be95cac1 FV	50	# Don't accept warnings during tests
be95cac1 FV	51	raise ExtractorError(message)
5f6a1245	52
0eaf520d	53	def process_info(self, info_dict):
f46e2f9d	54	self.processed_info_dicts.append(info_dict.copy())
86e5f3ed	55	return super().process_info(info_dict)
1535ac2a	56
5f6a1245	57
fd5ff020 FV	58	def _file_md5(fn):
	59	with open(fn, 'rb') as f:
	60	return hashlib.md5(f.read()).hexdigest()
	61
582be358	62
f2e8dbcc	63	normal_test_cases = gettestcases()
	64	webpage_test_cases = getwebpagetestcases()
	65	tests_counter = collections.defaultdict(collections.Counter)
6b47c7f2	66
0eaf520d	67
060ac762	68	@is_download_test
1535ac2a	69	class TestDownload(unittest.TestCase):
8936f68a YCH	70	# Parallel testing in nosetests. See
	71	# http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html
	72	_multiprocess_shared_ = True
	73
744435f2	74	maxDiff = None
5f6a1245	75
243c57cf	76	COMPLETED_TESTS = {}
243c57cf	77
c6c22e98 JH	78	def __str__(self):
c6c22e98 JH	79	"""Identify each test with the `add_ie` attribute, if available."""
f2e8dbcc	80	cls, add_ie = type(self), getattr(self, self._testMethodName).add_ie
f2e8dbcc	81	return f'{self._testMethodName} ({cls.__module__}.{cls.__name__}){f" [{add_ie}]" if add_ie else ""}:'
c6c22e98	82
fd5ff020	83
5f6a1245 JW	84	# Dynamically generate tests
5f6a1245 JW	85
8936f68a	86	def generator(test_case, tname):
1535ac2a	87	def test_template(self):
243c57cf	88	if self.COMPLETED_TESTS.get(tname):
	89	return
	90	self.COMPLETED_TESTS[tname] = True
7a5c1cfe	91	ie = yt_dlp.extractor.get_info_extractor(test_case['name'])()
655c4100	92	other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])]
e8ee972c PH	93	is_playlist = any(k.startswith('playlist') for k in test_case)
	94	test_cases = test_case.get(
	95	'playlist', [] if is_playlist else [test_case])
	96
bc2884af JMF	97	def print_skipping(reason):
bc2884af JMF	98	print('Skipping %s: %s' % (test_case['name'], reason))
6d1b3489	99	self.skipTest(reason)
6d1b3489	100
9ee2b5f6	101	if not ie.working():
bc2884af	102	print_skipping('IE marked as not _WORKING')
e8ee972c PH	103
	104	for tc in test_cases:
	105	info_dict = tc.get('info_dict', {})
0855702f	106	params = tc.get('params', {})
0855702f	107	if not info_dict.get('id'):
d7118397	108	raise Exception(f'Test {tname} definition incorrect - "id" key is not present')
0855702f	109	elif not info_dict.get('ext'):
	110	if params.get('skip_download') and params.get('ignore_no_formats_error'):
	111	continue
d7118397	112	raise Exception(f'Test {tname} definition incorrect - "ext" key must be present to define the output file')
e8ee972c	113
fd5ff020	114	if 'skip' in test_case:
bc2884af	115	print_skipping(test_case['skip'])
6d1b3489	116
9ee2b5f6 JMF	117	for other_ie in other_ies:
9ee2b5f6 JMF	118	if not other_ie.working():
e075a44a	119	print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
0eaf520d	120
44a5f171	121	params = get_params(test_case.get('params', {}))
8936f68a	122	params['outtmpl'] = tname + '_' + params['outtmpl']
e8ee972c	123	if is_playlist and 'playlist' not in test_case:
65d49afa	124	params.setdefault('extract_flat', 'in_playlist')
6911e11e	125	params.setdefault('playlistend', test_case.get('playlist_mincount'))
e8ee972c	126	params.setdefault('skip_download', True)
0eaf520d	127
ac35c266	128	ydl = YoutubeDL(params, auto_init=False)
023fa8c4	129	ydl.add_default_info_extractors()
bffbd5f0	130	finished_hook_called = set()
5f6a1245	131
bffbd5f0 PH	132	def _hook(status):
	133	if status['status'] == 'finished':
	134	finished_hook_called.add(status['filename'])
933605d7	135	ydl.add_progress_hook(_hook)
70b7e3fb	136	expect_warnings(ydl, test_case.get('expected_warnings', []))
5c892b0b	137
702665c0	138	def get_tc_filename(tc):
ad3dc496	139	return ydl.prepare_filename(dict(tc.get('info_dict', {})))
702665c0	140
28570840	141	res_dict = None
5f6a1245	142
28570840 PH	143	def try_rm_tcs_files(tcs=None):
	144	if tcs is None:
	145	tcs = test_cases
	146	for tc in tcs:
702665c0 JMF	147	tc_filename = get_tc_filename(tc)
	148	try_rm(tc_filename)
	149	try_rm(tc_filename + '.part')
4eb92208	150	try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
702665c0	151	try_rm_tcs_files()
5c892b0b	152	try:
dd508b7c FV	153	try_num = 1
dd508b7c FV	154	while True:
8cc83b8d	155	try:
3bef10a5	156	# We're not using .download here since that is just a shim
e8ee972c PH	157	# for outside error handling, and returns the exit code
e8ee972c PH	158	# instead of the result dict.
308cfe0a S	159	res_dict = ydl.extract_info(
	160	test_case['url'],
	161	force_generic_extractor=params.get('force_generic_extractor', False))
8cc83b8d	162	except (DownloadError, ExtractorError) as err:
8cc83b8d	163	# Check if the exception is not a network related one
d7118397	164	if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine)
	165	or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)):
	166	err.msg = f'{getattr(err, "msg", err)} ({tname})'
8cc83b8d FV	167	raise
8cc83b8d FV	168
dd508b7c	169	if try_num == RETRIES:
8936f68a	170	report_warning('%s failed due to network errors, skipping...' % tname)
dd508b7c FV	171	return
dd508b7c FV	172
86e5f3ed	173	print(f'Retrying: {try_num} failed tries\n\n##########\n\n')
dd508b7c FV	174
dd508b7c FV	175	try_num += 1
8cc83b8d FV	176	else:
8cc83b8d FV	177	break
5c892b0b	178
e8ee972c	179	if is_playlist:
880ee801	180	self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
d6e6a422	181	self.assertTrue('entries' in res_dict)
f74b341d	182	expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
d6e6a422	183
e8ee972c	184	if 'playlist_mincount' in test_case:
0990305d PH	185	assertGreaterEqual(
0990305d PH	186	self,
e8ee972c PH	187	len(res_dict['entries']),
	188	test_case['playlist_mincount'],
	189	'Expected at least %d in playlist %s, but got only %d' % (
	190	test_case['playlist_mincount'], test_case['url'],
	191	len(res_dict['entries'])))
829476b8 PH	192	if 'playlist_count' in test_case:
	193	self.assertEqual(
	194	len(res_dict['entries']),
	195	test_case['playlist_count'],
28570840	196	'Expected %d entries in playlist %s, but got %d.' % (
22a6f150	197	test_case['playlist_count'],
28570840	198	test_case['url'],
22a6f150 PH	199	len(res_dict['entries']),
22a6f150 PH	200	))
28570840 PH	201	if 'playlist_duration_sum' in test_case:
	202	got_duration = sum(e['duration'] for e in res_dict['entries'])
	203	self.assertEqual(
	204	test_case['playlist_duration_sum'], got_duration)
e8ee972c	205
364a69e8 S	206	# Generalize both playlists and single videos to unified format for
	207	# simplicity
	208	if 'entries' not in res_dict:
	209	res_dict['entries'] = [res_dict]
	210
80b2fdf9	211	for tc_num, tc in enumerate(test_cases):
364a69e8 S	212	tc_res_dict = res_dict['entries'][tc_num]
364a69e8 S	213	# First, check test cases' data against extracted data alone
80b2fdf9	214	expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
364a69e8	215	# Now, check downloaded file consistency
702665c0	216	tc_filename = get_tc_filename(tc)
511eda8e	217	if not test_case.get('params', {}).get('skip_download', False):
702665c0 JMF	218	self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
702665c0 JMF	219	self.assertTrue(tc_filename in finished_hook_called)
08a36c35 S	220	expected_minsize = tc.get('file_minsize', 10000)
	221	if expected_minsize is not None:
	222	if params.get('test'):
	223	expected_minsize = max(expected_minsize, 10000)
	224	got_fsize = os.path.getsize(tc_filename)
	225	assertGreaterEqual(
	226	self, got_fsize, expected_minsize,
	227	'Expected %s to be at least %s, but it\'s only %s ' %
	228	(tc_filename, format_bytes(expected_minsize),
	229	format_bytes(got_fsize)))
	230	if 'md5' in tc:
	231	md5_for_file = _file_md5(tc_filename)
374560f0	232	self.assertEqual(tc['md5'], md5_for_file)
364a69e8 S	233	# Finally, check test cases' data again but this time against
364a69e8 S	234	# extracted data from info JSON file written during processing
4eb92208	235	info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
f744c0f3 PH	236	self.assertTrue(
	237	os.path.exists(info_json_fn),
	238	'Missing info file %s' % info_json_fn)
86e5f3ed	239	with open(info_json_fn, encoding='utf-8') as infof:
5c892b0b	240	info_dict = json.load(infof)
f74b341d	241	expect_info_dict(self, info_dict, tc.get('info_dict', {}))
5c892b0b	242	finally:
702665c0	243	try_rm_tcs_files()
d6e6a422	244	if is_playlist and res_dict is not None and res_dict.get('entries'):
28570840 PH	245	# Remove all other files that may have been extracted if the
	246	# extractor returns full results even with extract_flat
	247	res_tcs = [{'info_dict': e} for e in res_dict['entries']]
	248	try_rm_tcs_files(res_tcs)
fd5ff020	249
1535ac2a	250	return test_template
fd5ff020	251
582be358	252
5f6a1245	253	# And add them to TestDownload
f2e8dbcc	254	def inject_tests(test_cases, label=''):
	255	for test_case in test_cases:
	256	name = test_case['name']
	257	tname = join_nonempty('test', name, label, tests_counter[name][label], delim='_')
	258	tests_counter[name][label] += 1
cdab8aa3	259
f2e8dbcc	260	test_method = generator(test_case, tname)
	261	test_method.__name__ = tname
	262	test_method.add_ie = ','.join(test_case.get('add_ie', []))
	263	setattr(TestDownload, test_method.__name__, test_method)
cdab8aa3	264
243c57cf	265
f2e8dbcc	266	inject_tests(normal_test_cases)
	267
	268	# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
	269	inject_tests(webpage_test_cases, 'webpage')
	270
	271
	272	def batch_generator(name):
243c57cf	273	def test_template(self):
f2e8dbcc	274	for label, num_tests in tests_counter[name].items():
	275	for i in range(num_tests):
	276	test_name = join_nonempty('test', name, label, i, delim='_')
	277	try:
	278	getattr(self, test_name)()
	279	except unittest.SkipTest:
	280	print(f'Skipped {test_name}')
243c57cf	281
	282	return test_template
	283
	284
f2e8dbcc	285	for name in tests_counter:
f2e8dbcc	286	test_method = batch_generator(name)
243c57cf	287	test_method.__name__ = f'test_{name}_all'
	288	test_method.add_ie = ''
	289	setattr(TestDownload, test_method.__name__, test_method)
f2e8dbcc	290	del test_method
243c57cf	291
243c57cf	292
cdab8aa3 PH	293	if __name__ == '__main__':
cdab8aa3 PH	294	unittest.main()