[yt-dlp.git] / test / test_download.py

#!/usr/bin/env python3

# Allow direct execution
import os
import sys
import unittest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import collections
import hashlib
import http.client
import json
import socket
import urllib.error

from test.helper import (
    assertGreaterEqual,
    expect_info_dict,
    expect_warnings,
    get_params,
    gettestcases,
    getwebpagetestcases,
    is_download_test,
    report_warning,
    try_rm,
)

import yt_dlp.YoutubeDL  # isort: split
from yt_dlp.extractor import get_info_extractor
from yt_dlp.utils import (
    DownloadError,
    ExtractorError,
    UnavailableVideoError,
    format_bytes,
    join_nonempty,
)

# Number of extraction attempts made for a test before a network-related
# failure is reported as a warning and the test returns without asserting
RETRIES = 3


class YoutubeDL(yt_dlp.YoutubeDL):
    """yt_dlp.YoutubeDL subclass used by the download tests.

    It routes `to_stderr` to `to_screen`, raises on warnings and keeps a
    copy of every info dict handed to `process_info`.
    """

    def __init__(self, *args, **kwargs):
        # Both attributes are set up before the base initializer runs
        self.processed_info_dicts = []
        self.to_stderr = self.to_screen
        super().__init__(*args, **kwargs)

    def report_warning(self, message, *args, **kwargs):
        """Raise ExtractorError with *message*; warnings are not accepted during tests."""
        raise ExtractorError(message)

    def process_info(self, info_dict):
        """Record a shallow copy of *info_dict*, then defer to the base class."""
        snapshot = info_dict.copy()
        self.processed_info_dicts.append(snapshot)
        return super().process_info(info_dict)


def _file_md5(fn):
    """Return the hexadecimal MD5 digest of the file at path *fn*.

    The file is hashed in fixed-size chunks, so a large downloaded file is
    never loaded into memory in one piece. The digest is the same as hashing
    the whole content at once.
    """
    digest = hashlib.md5()
    with open(fn, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Test case definitions supplied by the test helpers
normal_test_cases = gettestcases()
webpage_test_cases = getwebpagetestcases()
# Extractor name -> label -> number of tests injected so far; the count is
# used when building each generated test method's name
tests_counter = collections.defaultdict(collections.Counter)


@is_download_test
class TestDownload(unittest.TestCase):
    # Parallel testing in nosetests. See
    # http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html
    _multiprocess_shared_ = True

    # Never truncate assertion diffs
    maxDiff = None

    # Names of generated tests that have already been started, shared by all
    # instances so a test is not run a second time
    COMPLETED_TESTS = {}

    def __str__(self):
        """Describe the test, appending its `add_ie` attribute when it is non-empty."""
        test_method = getattr(self, self._testMethodName)
        add_ie = test_method.add_ie
        owner = type(self)
        suffix = f' [{add_ie}]' if add_ie else ''
        return f'{self._testMethodName} ({owner.__module__}.{owner.__name__}){suffix}:'


# Dynamically generate tests

def generator(test_case, tname):
    """Build the test method for a single extractor test case.

    @param test_case  Test definition dict. Keys read here: 'name', 'url',
                      'add_ie', 'playlist', 'playlist_mincount',
                      'playlist_count', 'playlist_duration_sum', 'params',
                      'skip', 'expected_warnings', 'info_dict'; playlist
                      entries may additionally carry 'md5' and 'file_minsize'
    @param tname      Name of the generated test; used as the key in
                      COMPLETED_TESTS and as prefix of the output template
    @returns          A function taking the TestCase instance as `self`
    """
    def test_template(self):
        # Run every generated test at most once per process; the `_all` batch
        # tests call the individual test methods directly
        if self.COMPLETED_TESTS.get(tname):
            return
        self.COMPLETED_TESTS[tname] = True
        ie = yt_dlp.extractor.get_info_extractor(test_case['name'])()
        other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])]
        # Any key starting with 'playlist' marks the test as a playlist test
        is_playlist = any(k.startswith('playlist') for k in test_case)
        # Explicit playlist entries if given; otherwise no per-entry checks for
        # playlists, or the test case itself for a single video
        test_cases = test_case.get(
            'playlist', [] if is_playlist else [test_case])

        def print_skipping(reason):
            # Announce the skip on stdout, then skip via unittest
            print('Skipping %s: %s' % (test_case['name'], reason))
            self.skipTest(reason)

        if not ie.working():
            print_skipping('IE marked as not _WORKING')

        # Validate the test definitions before doing any work
        for tc in test_cases:
            info_dict = tc.get('info_dict', {})
            params = tc.get('params', {})
            if not info_dict.get('id'):
                raise Exception('Test definition incorrect. \'id\' key is not present')
            elif not info_dict.get('ext'):
                # A missing 'ext' is tolerated only when nothing is downloaded
                # and missing formats are ignored
                if params.get('skip_download') and params.get('ignore_no_formats_error'):
                    continue
                raise Exception('Test definition incorrect. The output file cannot be known. \'ext\' key is not present')

        if 'skip' in test_case:
            print_skipping(test_case['skip'])

        for other_ie in other_ies:
            if not other_ie.working():
                print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())

        params = get_params(test_case.get('params', {}))
        # Prefix the output template with the test name
        params['outtmpl'] = tname + '_' + params['outtmpl']
        if is_playlist and 'playlist' not in test_case:
            # No per-entry expectations: default to flat extraction, limited to
            # `playlist_mincount` entries, without downloading
            params.setdefault('extract_flat', 'in_playlist')
            params.setdefault('playlistend', test_case.get('playlist_mincount'))
            params.setdefault('skip_download', True)

        ydl = YoutubeDL(params, auto_init=False)
        ydl.add_default_info_extractors()
        # Filenames for which the progress hook reported status 'finished'
        finished_hook_called = set()

        def _hook(status):
            if status['status'] == 'finished':
                finished_hook_called.add(status['filename'])
        ydl.add_progress_hook(_hook)
        expect_warnings(ydl, test_case.get('expected_warnings', []))

        def get_tc_filename(tc):
            # Pass a copy so the test definition's info_dict object is not handed out
            return ydl.prepare_filename(dict(tc.get('info_dict', {})))

        res_dict = None

        def try_rm_tcs_files(tcs=None):
            # Remove the output file, its '.part' file and its '.info.json'
            # for each given test case (defaults to this test's cases)
            if tcs is None:
                tcs = test_cases
            for tc in tcs:
                tc_filename = get_tc_filename(tc)
                try_rm(tc_filename)
                try_rm(tc_filename + '.part')
                try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
        # Start from a clean state
        try_rm_tcs_files()
        try:
            try_num = 1
            while True:
                try:
                    # We're not using .download here since that is just a shim
                    # for outside error handling, and returns the exit code
                    # instead of the result dict.
                    res_dict = ydl.extract_info(
                        test_case['url'],
                        force_generic_extractor=params.get('force_generic_extractor', False))
                except (DownloadError, ExtractorError) as err:
                    # Check if the exception is not a network related one
                    # NOTE(review): the membership test compares the exception
                    # class for equality, so HTTPError is not matched by
                    # URLError and is already re-raised by the first operand;
                    # the 503 clause therefore cannot change the outcome -
                    # confirm the intended handling of HTTP 503
                    if not err.exc_info[0] in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine) or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503):
                        raise

                    # Out of attempts: warn and return without asserting anything
                    if try_num == RETRIES:
                        report_warning('%s failed due to network errors, skipping...' % tname)
                        return

                    print(f'Retrying: {try_num} failed tries\n\n##########\n\n')

                    try_num += 1
                else:
                    break

            if is_playlist:
                self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
                self.assertTrue('entries' in res_dict)
                expect_info_dict(self, res_dict, test_case.get('info_dict', {}))

            if 'playlist_mincount' in test_case:
                assertGreaterEqual(
                    self,
                    len(res_dict['entries']),
                    test_case['playlist_mincount'],
                    'Expected at least %d in playlist %s, but got only %d' % (
                        test_case['playlist_mincount'], test_case['url'],
                        len(res_dict['entries'])))
            if 'playlist_count' in test_case:
                self.assertEqual(
                    len(res_dict['entries']),
                    test_case['playlist_count'],
                    'Expected %d entries in playlist %s, but got %d.' % (
                        test_case['playlist_count'],
                        test_case['url'],
                        len(res_dict['entries']),
                    ))
            if 'playlist_duration_sum' in test_case:
                got_duration = sum(e['duration'] for e in res_dict['entries'])
                self.assertEqual(
                    test_case['playlist_duration_sum'], got_duration)

            # Generalize both playlists and single videos to unified format for
            # simplicity
            if 'entries' not in res_dict:
                res_dict['entries'] = [res_dict]

            # Test cases are matched to the extracted entries by position
            for tc_num, tc in enumerate(test_cases):
                tc_res_dict = res_dict['entries'][tc_num]
                # First, check test cases' data against extracted data alone
                expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
                # Now, check downloaded file consistency
                tc_filename = get_tc_filename(tc)
                if not test_case.get('params', {}).get('skip_download', False):
                    self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
                    self.assertTrue(tc_filename in finished_hook_called)
                    # An explicit `file_minsize` of None disables the size check
                    expected_minsize = tc.get('file_minsize', 10000)
                    if expected_minsize is not None:
                        if params.get('test'):
                            expected_minsize = max(expected_minsize, 10000)
                        got_fsize = os.path.getsize(tc_filename)
                        assertGreaterEqual(
                            self, got_fsize, expected_minsize,
                            'Expected %s to be at least %s, but it\'s only %s ' %
                            (tc_filename, format_bytes(expected_minsize),
                                format_bytes(got_fsize)))
                    if 'md5' in tc:
                        md5_for_file = _file_md5(tc_filename)
                        self.assertEqual(tc['md5'], md5_for_file)
                # Finally, check test cases' data again but this time against
                # extracted data from info JSON file written during processing
                info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
                self.assertTrue(
                    os.path.exists(info_json_fn),
                    'Missing info file %s' % info_json_fn)
                with open(info_json_fn, encoding='utf-8') as infof:
                    info_dict = json.load(infof)
                expect_info_dict(self, info_dict, tc.get('info_dict', {}))
        finally:
            # Clean up whether the test passed, failed or returned early
            try_rm_tcs_files()
            if is_playlist and res_dict is not None and res_dict.get('entries'):
                # Remove all other files that may have been extracted if the
                # extractor returns full results even with extract_flat
                res_tcs = [{'info_dict': e} for e in res_dict['entries']]
                try_rm_tcs_files(res_tcs)

    return test_template


# And add them to TestDownload
def inject_tests(test_cases, label=''):
    """Generate one test method per test case and attach it to TestDownload.

    @param test_cases  Iterable of test definition dicts; each must have 'name'
    @param label       Optional name component inserted after the extractor name

    The method name is built from 'test', the extractor name, the label and
    the number of tests already injected for that name/label pair.
    """
    for case in test_cases:
        ie_name = case['name']
        per_label = tests_counter[ie_name]
        method_name = join_nonempty('test', ie_name, label, per_label[label], delim='_')
        per_label[label] += 1

        method = generator(case, method_name)
        method.__name__ = method_name
        method.add_ie = ','.join(case.get('add_ie', []))
        setattr(TestDownload, method_name, method)


# Regular test cases are injected without a label
inject_tests(normal_test_cases)

# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
# Webpage test cases get 'webpage' as an extra component of the test name
inject_tests(webpage_test_cases, 'webpage')


def batch_generator(name):
    """Build a test method that runs every generated test registered under *name*.

    @param name  Key into `tests_counter`
    @returns     A function taking the TestCase instance as `self`
    """
    def test_template(self):
        # Rebuild the names of the individual tests from the per-label counts
        for label, count in tests_counter[name].items():
            test_names = [
                join_nonempty('test', name, label, index, delim='_')
                for index in range(count)]
            for test_name in test_names:
                try:
                    getattr(self, test_name)()
                except unittest.SkipTest:
                    # A skipped test must not abort the rest of the batch
                    print(f'Skipped {test_name}')

    return test_template


# Add one `test_<name>_all` method per extractor name that runs all of the
# generated tests registered under that name
for name in tests_counter:
    test_method = batch_generator(name)
    test_method.__name__ = f'test_{name}_all'
    # TestDownload.__str__ reads `add_ie` from every test method
    test_method.add_ie = ''
    setattr(TestDownload, test_method.__name__, test_method)
# Drop the module-level reference to the last generated method
# NOTE(review): raises NameError if tests_counter is empty, since the name is
# only bound inside the loop
del test_method


# Run the tests when this file is executed directly
if __name__ == '__main__':
    unittest.main()
Commit	Line	Data
cc52de43	1	#!/usr/bin/env python3
54007a45	2
44a5f171 PH	3	# Allow direct execution
	4	import os
	5	import sys
	6	import unittest
f8271158	7
44a5f171 PH	8	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
44a5f171 PH	9
ac668111	10
f2e8dbcc	11	import collections
ac668111	12	import hashlib
54007a45	13	import http.client
ac668111	14	import json
	15	import socket
	16	import urllib.error
ac668111	17
dd508b7c	18	from test.helper import (
0990305d	19	assertGreaterEqual,
060ac762	20	expect_info_dict,
70b7e3fb	21	expect_warnings,
dd508b7c	22	get_params,
ff14fc49	23	gettestcases,
f2e8dbcc	24	getwebpagetestcases,
060ac762	25	is_download_test,
257cfebf	26	report_warning,
060ac762	27	try_rm,
dd508b7c	28	)
54007a45	29
ac668111	30	import yt_dlp.YoutubeDL # isort: split
f8271158	31	from yt_dlp.extractor import get_info_extractor
7a5c1cfe	32	from yt_dlp.utils import (
44a5f171 PH	33	DownloadError,
	34	ExtractorError,
	35	UnavailableVideoError,
f8271158	36	format_bytes,
f2e8dbcc	37	join_nonempty,
44a5f171	38	)
fd5ff020	39
8cc83b8d FV	40	RETRIES = 3
8cc83b8d FV	41
5f6a1245	42
7a5c1cfe	43	class YoutubeDL(yt_dlp.YoutubeDL):
fd5ff020	44	def __init__(self, args, *kwargs):
fd5ff020	45	self.to_stderr = self.to_screen
0eaf520d	46	self.processed_info_dicts = []
86e5f3ed	47	super().__init__(args, *kwargs)
5f6a1245	48
f0500bd1	49	def report_warning(self, message, args, *kwargs):
be95cac1 FV	50	# Don't accept warnings during tests
be95cac1 FV	51	raise ExtractorError(message)
5f6a1245	52
0eaf520d	53	def process_info(self, info_dict):
f46e2f9d	54	self.processed_info_dicts.append(info_dict.copy())
86e5f3ed	55	return super().process_info(info_dict)
1535ac2a	56
5f6a1245	57
fd5ff020 FV	58	def _file_md5(fn):
	59	with open(fn, 'rb') as f:
	60	return hashlib.md5(f.read()).hexdigest()
	61
582be358	62
f2e8dbcc	63	normal_test_cases = gettestcases()
	64	webpage_test_cases = getwebpagetestcases()
	65	tests_counter = collections.defaultdict(collections.Counter)
6b47c7f2	66
0eaf520d	67
060ac762	68	@is_download_test
1535ac2a	69	class TestDownload(unittest.TestCase):
8936f68a YCH	70	# Parallel testing in nosetests. See
	71	# http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html
	72	_multiprocess_shared_ = True
	73
744435f2	74	maxDiff = None
5f6a1245	75
243c57cf	76	COMPLETED_TESTS = {}
243c57cf	77
c6c22e98 JH	78	def __str__(self):
c6c22e98 JH	79	"""Identify each test with the `add_ie` attribute, if available."""
f2e8dbcc	80	cls, add_ie = type(self), getattr(self, self._testMethodName).add_ie
f2e8dbcc	81	return f'{self._testMethodName} ({cls.__module__}.{cls.__name__}){f" [{add_ie}]" if add_ie else ""}:'
c6c22e98	82
fd5ff020	83
5f6a1245 JW	84	# Dynamically generate tests
5f6a1245 JW	85
8936f68a	86	def generator(test_case, tname):
1535ac2a	87	def test_template(self):
243c57cf	88	if self.COMPLETED_TESTS.get(tname):
	89	return
	90	self.COMPLETED_TESTS[tname] = True
7a5c1cfe	91	ie = yt_dlp.extractor.get_info_extractor(test_case['name'])()
655c4100	92	other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])]
e8ee972c PH	93	is_playlist = any(k.startswith('playlist') for k in test_case)
	94	test_cases = test_case.get(
	95	'playlist', [] if is_playlist else [test_case])
	96
bc2884af JMF	97	def print_skipping(reason):
bc2884af JMF	98	print('Skipping %s: %s' % (test_case['name'], reason))
6d1b3489	99	self.skipTest(reason)
6d1b3489	100
9ee2b5f6	101	if not ie.working():
bc2884af	102	print_skipping('IE marked as not _WORKING')
e8ee972c PH	103
	104	for tc in test_cases:
	105	info_dict = tc.get('info_dict', {})
0855702f	106	params = tc.get('params', {})
	107	if not info_dict.get('id'):
	108	raise Exception('Test definition incorrect. \'id\' key is not present')
	109	elif not info_dict.get('ext'):
	110	if params.get('skip_download') and params.get('ignore_no_formats_error'):
	111	continue
	112	raise Exception('Test definition incorrect. The output file cannot be known. \'ext\' key is not present')
e8ee972c	113
fd5ff020	114	if 'skip' in test_case:
bc2884af	115	print_skipping(test_case['skip'])
6d1b3489	116
9ee2b5f6 JMF	117	for other_ie in other_ies:
9ee2b5f6 JMF	118	if not other_ie.working():
e075a44a	119	print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
0eaf520d	120
44a5f171	121	params = get_params(test_case.get('params', {}))
8936f68a	122	params['outtmpl'] = tname + '_' + params['outtmpl']
e8ee972c	123	if is_playlist and 'playlist' not in test_case:
65d49afa	124	params.setdefault('extract_flat', 'in_playlist')
6911e11e	125	params.setdefault('playlistend', test_case.get('playlist_mincount'))
e8ee972c	126	params.setdefault('skip_download', True)
0eaf520d	127
ac35c266	128	ydl = YoutubeDL(params, auto_init=False)
023fa8c4	129	ydl.add_default_info_extractors()
bffbd5f0	130	finished_hook_called = set()
5f6a1245	131
bffbd5f0 PH	132	def _hook(status):
	133	if status['status'] == 'finished':
	134	finished_hook_called.add(status['filename'])
933605d7	135	ydl.add_progress_hook(_hook)
70b7e3fb	136	expect_warnings(ydl, test_case.get('expected_warnings', []))
5c892b0b	137
702665c0	138	def get_tc_filename(tc):
ad3dc496	139	return ydl.prepare_filename(dict(tc.get('info_dict', {})))
702665c0	140
28570840	141	res_dict = None
5f6a1245	142
28570840 PH	143	def try_rm_tcs_files(tcs=None):
	144	if tcs is None:
	145	tcs = test_cases
	146	for tc in tcs:
702665c0 JMF	147	tc_filename = get_tc_filename(tc)
	148	try_rm(tc_filename)
	149	try_rm(tc_filename + '.part')
4eb92208	150	try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
702665c0	151	try_rm_tcs_files()
5c892b0b	152	try:
dd508b7c FV	153	try_num = 1
dd508b7c FV	154	while True:
8cc83b8d	155	try:
3bef10a5	156	# We're not using .download here since that is just a shim
e8ee972c PH	157	# for outside error handling, and returns the exit code
e8ee972c PH	158	# instead of the result dict.
308cfe0a S	159	res_dict = ydl.extract_info(
	160	test_case['url'],
	161	force_generic_extractor=params.get('force_generic_extractor', False))
8cc83b8d	162	except (DownloadError, ExtractorError) as err:
8cc83b8d	163	# Check if the exception is not a network related one
14f25df2	164	if not err.exc_info[0] in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine) or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503):
8cc83b8d FV	165	raise
8cc83b8d FV	166
dd508b7c	167	if try_num == RETRIES:
8936f68a	168	report_warning('%s failed due to network errors, skipping...' % tname)
dd508b7c FV	169	return
dd508b7c FV	170
86e5f3ed	171	print(f'Retrying: {try_num} failed tries\n\n##########\n\n')
dd508b7c FV	172
dd508b7c FV	173	try_num += 1
8cc83b8d FV	174	else:
8cc83b8d FV	175	break
5c892b0b	176
e8ee972c	177	if is_playlist:
880ee801	178	self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
d6e6a422	179	self.assertTrue('entries' in res_dict)
f74b341d	180	expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
d6e6a422	181
e8ee972c	182	if 'playlist_mincount' in test_case:
0990305d PH	183	assertGreaterEqual(
0990305d PH	184	self,
e8ee972c PH	185	len(res_dict['entries']),
	186	test_case['playlist_mincount'],
	187	'Expected at least %d in playlist %s, but got only %d' % (
	188	test_case['playlist_mincount'], test_case['url'],
	189	len(res_dict['entries'])))
829476b8 PH	190	if 'playlist_count' in test_case:
	191	self.assertEqual(
	192	len(res_dict['entries']),
	193	test_case['playlist_count'],
28570840	194	'Expected %d entries in playlist %s, but got %d.' % (
22a6f150	195	test_case['playlist_count'],
28570840	196	test_case['url'],
22a6f150 PH	197	len(res_dict['entries']),
22a6f150 PH	198	))
28570840 PH	199	if 'playlist_duration_sum' in test_case:
	200	got_duration = sum(e['duration'] for e in res_dict['entries'])
	201	self.assertEqual(
	202	test_case['playlist_duration_sum'], got_duration)
e8ee972c	203
364a69e8 S	204	# Generalize both playlists and single videos to unified format for
	205	# simplicity
	206	if 'entries' not in res_dict:
	207	res_dict['entries'] = [res_dict]
	208
80b2fdf9	209	for tc_num, tc in enumerate(test_cases):
364a69e8 S	210	tc_res_dict = res_dict['entries'][tc_num]
364a69e8 S	211	# First, check test cases' data against extracted data alone
80b2fdf9	212	expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
364a69e8	213	# Now, check downloaded file consistency
702665c0	214	tc_filename = get_tc_filename(tc)
511eda8e	215	if not test_case.get('params', {}).get('skip_download', False):
702665c0 JMF	216	self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
702665c0 JMF	217	self.assertTrue(tc_filename in finished_hook_called)
08a36c35 S	218	expected_minsize = tc.get('file_minsize', 10000)
	219	if expected_minsize is not None:
	220	if params.get('test'):
	221	expected_minsize = max(expected_minsize, 10000)
	222	got_fsize = os.path.getsize(tc_filename)
	223	assertGreaterEqual(
	224	self, got_fsize, expected_minsize,
	225	'Expected %s to be at least %s, but it\'s only %s ' %
	226	(tc_filename, format_bytes(expected_minsize),
	227	format_bytes(got_fsize)))
	228	if 'md5' in tc:
	229	md5_for_file = _file_md5(tc_filename)
374560f0	230	self.assertEqual(tc['md5'], md5_for_file)
364a69e8 S	231	# Finally, check test cases' data again but this time against
364a69e8 S	232	# extracted data from info JSON file written during processing
4eb92208	233	info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
f744c0f3 PH	234	self.assertTrue(
	235	os.path.exists(info_json_fn),
	236	'Missing info file %s' % info_json_fn)
86e5f3ed	237	with open(info_json_fn, encoding='utf-8') as infof:
5c892b0b	238	info_dict = json.load(infof)
f74b341d	239	expect_info_dict(self, info_dict, tc.get('info_dict', {}))
5c892b0b	240	finally:
702665c0	241	try_rm_tcs_files()
d6e6a422	242	if is_playlist and res_dict is not None and res_dict.get('entries'):
28570840 PH	243	# Remove all other files that may have been extracted if the
	244	# extractor returns full results even with extract_flat
	245	res_tcs = [{'info_dict': e} for e in res_dict['entries']]
	246	try_rm_tcs_files(res_tcs)
fd5ff020	247
1535ac2a	248	return test_template
fd5ff020	249
582be358	250
5f6a1245	251	# And add them to TestDownload
f2e8dbcc	252	def inject_tests(test_cases, label=''):
	253	for test_case in test_cases:
	254	name = test_case['name']
	255	tname = join_nonempty('test', name, label, tests_counter[name][label], delim='_')
	256	tests_counter[name][label] += 1
cdab8aa3	257
f2e8dbcc	258	test_method = generator(test_case, tname)
	259	test_method.__name__ = tname
	260	test_method.add_ie = ','.join(test_case.get('add_ie', []))
	261	setattr(TestDownload, test_method.__name__, test_method)
cdab8aa3	262
243c57cf	263
f2e8dbcc	264	inject_tests(normal_test_cases)
	265
	266	# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
	267	inject_tests(webpage_test_cases, 'webpage')
	268
	269
	270	def batch_generator(name):
243c57cf	271	def test_template(self):
f2e8dbcc	272	for label, num_tests in tests_counter[name].items():
	273	for i in range(num_tests):
	274	test_name = join_nonempty('test', name, label, i, delim='_')
	275	try:
	276	getattr(self, test_name)()
	277	except unittest.SkipTest:
	278	print(f'Skipped {test_name}')
243c57cf	279
	280	return test_template
	281
	282
f2e8dbcc	283	for name in tests_counter:
f2e8dbcc	284	test_method = batch_generator(name)
243c57cf	285	test_method.__name__ = f'test_{name}_all'
	286	test_method.add_ie = ''
	287	setattr(TestDownload, test_method.__name__, test_method)
f2e8dbcc	288	del test_method
243c57cf	289
243c57cf	290
cdab8aa3 PH	291	if __name__ == '__main__':
cdab8aa3 PH	292	unittest.main()