[yt-dlp.git] / test / test_download.py

#!/usr/bin/env python3

# Allow direct execution
import os
import sys
import unittest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import collections
import hashlib
import http.client
import json
import socket
import urllib.error

from test.helper import (
    assertGreaterEqual,
    expect_info_dict,
    expect_warnings,
    get_params,
    gettestcases,
    getwebpagetestcases,
    is_download_test,
    report_warning,
    try_rm,
)

import yt_dlp.YoutubeDL  # isort: split
from yt_dlp.extractor import get_info_extractor
from yt_dlp.utils import (
    DownloadError,
    ExtractorError,
    UnavailableVideoError,
    format_bytes,
    join_nonempty,
)

# Number of extraction attempts a test makes before a network-related failure
# is reported as a warning and the test returns without failing
RETRIES = 3


class YoutubeDL(yt_dlp.YoutubeDL):
    """yt_dlp.YoutubeDL subclass used by the download tests.

    Differences from the base class that are visible here:
    * ``to_stderr`` is replaced by ``to_screen``;
    * ``report_warning`` raises ExtractorError instead of reporting;
    * every info dict handed to ``process_info`` is copied into
      ``processed_info_dicts`` before normal processing continues.
    """

    def __init__(self, *args, **kwargs):
        # Test-specific attributes are set up before the base initialiser runs
        self.processed_info_dicts = []
        self.to_stderr = self.to_screen
        super().__init__(*args, **kwargs)

    def report_warning(self, message, *args, **kwargs):
        # Don't accept warnings during tests: any warning becomes an error
        raise ExtractorError(message)

    def process_info(self, info_dict):
        # Keep a shallow copy so the recorded entry is a separate dict object
        snapshot = info_dict.copy()
        self.processed_info_dicts.append(snapshot)
        return super().process_info(info_dict)


def _file_md5(fn):
    with open(fn, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()


# Test definitions obtained from the test.helper functions; each list is later
# passed to inject_tests()
normal_test_cases = gettestcases()
webpage_test_cases = getwebpagetestcases()
# tests_counter[name][label] is the number of tests generated so far for that
# test-case name and label; it is also used to number the generated test names
tests_counter = collections.defaultdict(collections.Counter)


@is_download_test
class TestDownload(unittest.TestCase):
    """Container for the dynamically generated download tests.

    The class body defines no tests itself; test methods are attached with
    setattr() further down in this module.
    """

    # Parallel testing in nosetests. See
    # http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html
    _multiprocess_shared_ = True

    # Show complete diffs when an equality assertion fails
    maxDiff = None

    # Maps a generated test name to True once that test has started. The dict
    # is deliberately shared by all instances so that a test reached both
    # directly and through its `test_<name>_all` batch wrapper runs only once.
    COMPLETED_TESTS = {}

    def __str__(self):
        """Identify each test with the `add_ie` attribute, if available."""
        cls = type(self)
        # Test methods that were not generated by this module carry no
        # `add_ie` attribute; treat that the same as an empty one instead of
        # raising AttributeError while the test is being reported.
        add_ie = getattr(getattr(self, self._testMethodName), 'add_ie', None)
        suffix = f' [{add_ie}]' if add_ie else ''
        return f'{self._testMethodName} ({cls.__module__}.{cls.__name__}){suffix}:'


# Dynamically generate tests

def generator(test_case, tname):
    """Build the test method that runs a single extractor test case.

    test_case is the test definition dict. This function reads its 'name',
    'url', 'params', 'info_dict', 'add_ie', 'skip', 'expected_warnings',
    'playlist' and 'playlist_*' keys. tname is the name of the generated
    test; it is used as the COMPLETED_TESTS key, as a prefix of the output
    template and in error messages.

    Returns the test_template closure, which takes the TestCase instance as
    its only argument.
    """
    def test_template(self):
        # Run each generated test only once: the same method can also be
        # invoked by the `test_<name>_all` batch wrappers
        if self.COMPLETED_TESTS.get(tname):
            return
        self.COMPLETED_TESTS[tname] = True
        ie = yt_dlp.extractor.get_info_extractor(test_case['name'])()
        other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])]
        # Any key starting with 'playlist' marks the case as a playlist test
        is_playlist = any(k.startswith('playlist') for k in test_case)
        # Sub-cases to check: the explicit 'playlist' list if present,
        # nothing for count-only playlist tests, otherwise the case itself
        test_cases = test_case.get(
            'playlist', [] if is_playlist else [test_case])

        def print_skipping(reason):
            """Print the skip reason and skip the current test."""
            print('Skipping %s: %s' % (test_case['name'], reason))
            self.skipTest(reason)

        if not ie.working():
            print_skipping('IE marked as not _WORKING')

        # Validate the definitions before running anything: every sub-case
        # needs an 'id', and video sub-cases need an 'ext' unless both the
        # skip_download and ignore_no_formats_error params are set
        for tc in test_cases:
            info_dict = tc.get('info_dict', {})
            params = tc.get('params', {})
            if not info_dict.get('id'):
                raise Exception(f'Test {tname} definition incorrect - "id" key is not present')
            elif not info_dict.get('ext') and info_dict.get('_type', 'video') == 'video':
                if params.get('skip_download') and params.get('ignore_no_formats_error'):
                    continue
                raise Exception(f'Test {tname} definition incorrect - "ext" key must be present to define the output file')

        if 'skip' in test_case:
            print_skipping(test_case['skip'])

        for other_ie in other_ies:
            if not other_ie.working():
                print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())

        # From here on `params` is the parameter dict of the whole test case
        # (the loop above only used it for per-sub-case validation)
        params = get_params(test_case.get('params', {}))
        # Prefix the output template with the test name
        # (presumably so files from different tests do not collide - confirm)
        params['outtmpl'] = tname + '_' + params['outtmpl']
        if is_playlist and 'playlist' not in test_case:
            # Count-only playlist tests: set defaults without overriding
            # anything the test case specified itself
            params.setdefault('extract_flat', 'in_playlist')
            # Evaluates to -1 when neither count key is declared
            # (presumably meaning "no limit" - confirm against YoutubeDL)
            params.setdefault('playlistend', test_case.get(
                'playlist_mincount', test_case.get('playlist_count', -2) + 1))
            params.setdefault('skip_download', True)

        ydl = YoutubeDL(params, auto_init=False)
        ydl.add_default_info_extractors()
        # Filenames for which a progress hook reported status 'finished'
        finished_hook_called = set()

        def _hook(status):
            """Progress hook that records the filename of finished downloads."""
            if status['status'] == 'finished':
                finished_hook_called.add(status['filename'])
        ydl.add_progress_hook(_hook)
        # Hand the test's 'expected_warnings' list to the expect_warnings helper
        expect_warnings(ydl, test_case.get('expected_warnings', []))

        def get_tc_filename(tc):
            """Return the output filename ydl derives from tc's 'info_dict'."""
            # dict(...) passes a copy of the test's info_dict to prepare_filename
            return ydl.prepare_filename(dict(tc.get('info_dict', {})))

        # Stays None until extract_info succeeds; checked in `finally` below
        res_dict = None

        def try_rm_tcs_files(tcs=None):
            """Remove the media, '.part' and '.info.json' files of each case.

            Defaults to the sub-cases of the current test when tcs is None.
            """
            if tcs is None:
                tcs = test_cases
            for tc in tcs:
                tc_filename = get_tc_filename(tc)
                try_rm(tc_filename)
                try_rm(tc_filename + '.part')
                try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
        # Remove leftovers of earlier runs before extracting
        try_rm_tcs_files()
        try:
            try_num = 1
            while True:
                try:
                    # We're not using .download here since that is just a shim
                    # for outside error handling, and returns the exit code
                    # instead of the result dict.
                    res_dict = ydl.extract_info(
                        test_case['url'],
                        force_generic_extractor=params.get('force_generic_extractor', False))
                except (DownloadError, ExtractorError) as err:
                    # Check if the exception is not a network related one
                    # Only the exact types listed below are retried; `not in`
                    # compares types for equality, so subclasses such as
                    # urllib.error.HTTPError are re-raised by the first clause.
                    # NOTE(review): that makes the HTTP 503 clause look
                    # redundant - confirm how 503 is meant to be handled.
                    if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine)
                            or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)):
                        # Tag the error message with the test name before re-raising
                        err.msg = f'{getattr(err, "msg", err)} ({tname})'
                        raise

                    # Out of attempts: warn and return instead of failing the test
                    if try_num == RETRIES:
                        report_warning('%s failed due to network errors, skipping...' % tname)
                        return

                    print(f'Retrying: {try_num} failed tries\n\n##########\n\n')

                    try_num += 1
                else:
                    break

            if is_playlist:
                self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
                self.assertTrue('entries' in res_dict)
                expect_info_dict(self, res_dict, test_case.get('info_dict', {}))

            if 'playlist_mincount' in test_case:
                assertGreaterEqual(
                    self,
                    len(res_dict['entries']),
                    test_case['playlist_mincount'],
                    'Expected at least %d in playlist %s, but got only %d' % (
                        test_case['playlist_mincount'], test_case['url'],
                        len(res_dict['entries'])))
            if 'playlist_count' in test_case:
                self.assertEqual(
                    len(res_dict['entries']),
                    test_case['playlist_count'],
                    'Expected %d entries in playlist %s, but got %d.' % (
                        test_case['playlist_count'],
                        test_case['url'],
                        len(res_dict['entries']),
                    ))
            if 'playlist_duration_sum' in test_case:
                got_duration = sum(e['duration'] for e in res_dict['entries'])
                self.assertEqual(
                    test_case['playlist_duration_sum'], got_duration)

            # Generalize both playlists and single videos to unified format for
            # simplicity
            if 'entries' not in res_dict:
                res_dict['entries'] = [res_dict]

            # Sub-cases are matched to result entries by position
            for tc_num, tc in enumerate(test_cases):
                tc_res_dict = res_dict['entries'][tc_num]
                # First, check test cases' data against extracted data alone
                expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
                # Entries that are not plain videos get no file checks
                if tc_res_dict.get('_type', 'video') != 'video':
                    continue
                # Now, check downloaded file consistency
                tc_filename = get_tc_filename(tc)
                # Note: this reads skip_download from the test case's own
                # 'params', not from the sub-case or the processed `params`
                if not test_case.get('params', {}).get('skip_download', False):
                    self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
                    self.assertTrue(tc_filename in finished_hook_called)
                    # An explicit 'file_minsize' of None disables the size check
                    expected_minsize = tc.get('file_minsize', 10000)
                    if expected_minsize is not None:
                        # With the 'test' param set, never expect less than 10000 bytes
                        if params.get('test'):
                            expected_minsize = max(expected_minsize, 10000)
                        got_fsize = os.path.getsize(tc_filename)
                        assertGreaterEqual(
                            self, got_fsize, expected_minsize,
                            'Expected %s to be at least %s, but it\'s only %s ' %
                            (tc_filename, format_bytes(expected_minsize),
                                format_bytes(got_fsize)))
                    if 'md5' in tc:
                        md5_for_file = _file_md5(tc_filename)
                        self.assertEqual(tc['md5'], md5_for_file)
                # Finally, check test cases' data again but this time against
                # extracted data from info JSON file written during processing
                info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
                self.assertTrue(
                    os.path.exists(info_json_fn),
                    'Missing info file %s' % info_json_fn)
                with open(info_json_fn, encoding='utf-8') as infof:
                    info_dict = json.load(infof)
                expect_info_dict(self, info_dict, tc.get('info_dict', {}))
        finally:
            # Always clean up, whether the test passed, failed or returned early
            try_rm_tcs_files()
            if is_playlist and res_dict is not None and res_dict.get('entries'):
                # Remove all other files that may have been extracted if the
                # extractor returns full results even with extract_flat
                res_tcs = [{'info_dict': e} for e in res_dict['entries']]
                try_rm_tcs_files(res_tcs)

    return test_template


# And add them to TestDownload
def inject_tests(test_cases, label=''):
    """Attach one generated test method to TestDownload per test case.

    Each method name is produced by join_nonempty from 'test', the case's
    'name', *label* and the number of tests already generated for that
    name/label pair, joined with underscores. The pair's counter in
    tests_counter is incremented for every case processed.
    """
    for case in test_cases:
        ie_name = case['name']
        per_label = tests_counter[ie_name]
        tname = join_nonempty('test', ie_name, label, per_label[label], delim='_')
        per_label[label] += 1

        method = generator(case, tname)
        method.__name__ = tname
        # Comma-separated extra extractors; read back by TestDownload.__str__
        method.add_ie = ','.join(case.get('add_ie', []))
        setattr(TestDownload, tname, method)


# Register the regular test cases under the default (empty) label
inject_tests(normal_test_cases)

# Register the webpage test cases under the 'webpage' label
# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
inject_tests(webpage_test_cases, 'webpage')


def batch_generator(name):
    """Build a test method that runs every generated test recorded for *name*.

    The returned closure walks all labels and counts stored in
    tests_counter[name], rebuilds each generated test's name and calls it on
    the same TestCase instance. A skipped member test is printed and does not
    stop the remaining ones.
    """
    def test_template(self):
        for label, num_tests in tests_counter[name].items():
            member_names = (
                join_nonempty('test', name, label, idx, delim='_')
                for idx in range(num_tests))
            for member_name in member_names:
                run_member = getattr(self, member_name)
                try:
                    run_member()
                except unittest.SkipTest:
                    print(f'Skipped {member_name}')

    return test_template


# Add one `test_<name>_all` method per test-case name; it runs every test
# generated for that name
for name in tests_counter:
    test_method = batch_generator(name)
    test_method.__name__ = f'test_{name}_all'
    # Empty add_ie: TestDownload.__str__ then appends no extractor suffix
    test_method.add_ie = ''
    setattr(TestDownload, test_method.__name__, test_method)
# Remove the leftover loop variable from the module namespace
# NOTE(review): presumably so test collectors do not pick up a module-level
# `test_method` function - confirm. Raises NameError if tests_counter is empty.
del test_method


if __name__ == '__main__':
    unittest.main()
Commit	Line	Data
cc52de43	1	#!/usr/bin/env python3
54007a45	2
44a5f171 PH	3	# Allow direct execution
	4	import os
	5	import sys
	6	import unittest
f8271158	7
44a5f171 PH	8	sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
44a5f171 PH	9
ac668111	10
f2e8dbcc	11	import collections
ac668111	12	import hashlib
54007a45	13	import http.client
ac668111	14	import json
	15	import socket
	16	import urllib.error
ac668111	17
dd508b7c	18	from test.helper import (
0990305d	19	assertGreaterEqual,
060ac762	20	expect_info_dict,
70b7e3fb	21	expect_warnings,
dd508b7c	22	get_params,
ff14fc49	23	gettestcases,
f2e8dbcc	24	getwebpagetestcases,
060ac762	25	is_download_test,
257cfebf	26	report_warning,
060ac762	27	try_rm,
dd508b7c	28	)
54007a45	29
ac668111	30	import yt_dlp.YoutubeDL # isort: split
f8271158	31	from yt_dlp.extractor import get_info_extractor
7a5c1cfe	32	from yt_dlp.utils import (
44a5f171 PH	33	DownloadError,
	34	ExtractorError,
	35	UnavailableVideoError,
f8271158	36	format_bytes,
f2e8dbcc	37	join_nonempty,
44a5f171	38	)
fd5ff020	39
8cc83b8d FV	40	RETRIES = 3
8cc83b8d FV	41
5f6a1245	42
7a5c1cfe	43	class YoutubeDL(yt_dlp.YoutubeDL):
fd5ff020	44	def __init__(self, args, *kwargs):
fd5ff020	45	self.to_stderr = self.to_screen
0eaf520d	46	self.processed_info_dicts = []
86e5f3ed	47	super().__init__(args, *kwargs)
5f6a1245	48
f0500bd1	49	def report_warning(self, message, args, *kwargs):
be95cac1 FV	50	# Don't accept warnings during tests
be95cac1 FV	51	raise ExtractorError(message)
5f6a1245	52
0eaf520d	53	def process_info(self, info_dict):
f46e2f9d	54	self.processed_info_dicts.append(info_dict.copy())
86e5f3ed	55	return super().process_info(info_dict)
1535ac2a	56
5f6a1245	57
fd5ff020 FV	58	def _file_md5(fn):
	59	with open(fn, 'rb') as f:
	60	return hashlib.md5(f.read()).hexdigest()
	61
582be358	62
f2e8dbcc	63	normal_test_cases = gettestcases()
	64	webpage_test_cases = getwebpagetestcases()
	65	tests_counter = collections.defaultdict(collections.Counter)
6b47c7f2	66
0eaf520d	67
060ac762	68	@is_download_test
1535ac2a	69	class TestDownload(unittest.TestCase):
8936f68a YCH	70	# Parallel testing in nosetests. See
	71	# http://nose.readthedocs.org/en/latest/doc_tests/test_multiprocess/multiprocess.html
	72	_multiprocess_shared_ = True
	73
744435f2	74	maxDiff = None
5f6a1245	75
243c57cf	76	COMPLETED_TESTS = {}
243c57cf	77
c6c22e98 JH	78	def __str__(self):
c6c22e98 JH	79	"""Identify each test with the `add_ie` attribute, if available."""
f2e8dbcc	80	cls, add_ie = type(self), getattr(self, self._testMethodName).add_ie
f2e8dbcc	81	return f'{self._testMethodName} ({cls.__module__}.{cls.__name__}){f" [{add_ie}]" if add_ie else ""}:'
c6c22e98	82
fd5ff020	83
5f6a1245 JW	84	# Dynamically generate tests
5f6a1245 JW	85
8936f68a	86	def generator(test_case, tname):
1535ac2a	87	def test_template(self):
243c57cf	88	if self.COMPLETED_TESTS.get(tname):
	89	return
	90	self.COMPLETED_TESTS[tname] = True
7a5c1cfe	91	ie = yt_dlp.extractor.get_info_extractor(test_case['name'])()
655c4100	92	other_ies = [get_info_extractor(ie_key)() for ie_key in test_case.get('add_ie', [])]
e8ee972c PH	93	is_playlist = any(k.startswith('playlist') for k in test_case)
	94	test_cases = test_case.get(
	95	'playlist', [] if is_playlist else [test_case])
	96
bc2884af JMF	97	def print_skipping(reason):
bc2884af JMF	98	print('Skipping %s: %s' % (test_case['name'], reason))
6d1b3489	99	self.skipTest(reason)
6d1b3489	100
9ee2b5f6	101	if not ie.working():
bc2884af	102	print_skipping('IE marked as not _WORKING')
e8ee972c PH	103
	104	for tc in test_cases:
	105	info_dict = tc.get('info_dict', {})
0855702f	106	params = tc.get('params', {})
0855702f	107	if not info_dict.get('id'):
d7118397	108	raise Exception(f'Test {tname} definition incorrect - "id" key is not present')
495322b9	109	elif not info_dict.get('ext') and info_dict.get('_type', 'video') == 'video':
0855702f	110	if params.get('skip_download') and params.get('ignore_no_formats_error'):
0855702f	111	continue
d7118397	112	raise Exception(f'Test {tname} definition incorrect - "ext" key must be present to define the output file')
e8ee972c	113
fd5ff020	114	if 'skip' in test_case:
bc2884af	115	print_skipping(test_case['skip'])
6d1b3489	116
9ee2b5f6 JMF	117	for other_ie in other_ies:
9ee2b5f6 JMF	118	if not other_ie.working():
e075a44a	119	print_skipping('test depends on %sIE, marked as not WORKING' % other_ie.ie_key())
0eaf520d	120
44a5f171	121	params = get_params(test_case.get('params', {}))
8936f68a	122	params['outtmpl'] = tname + '_' + params['outtmpl']
e8ee972c	123	if is_playlist and 'playlist' not in test_case:
65d49afa	124	params.setdefault('extract_flat', 'in_playlist')
c9bd6518 AK	125	params.setdefault('playlistend', test_case.get(
c9bd6518 AK	126	'playlist_mincount', test_case.get('playlist_count', -2) + 1))
e8ee972c	127	params.setdefault('skip_download', True)
0eaf520d	128
ac35c266	129	ydl = YoutubeDL(params, auto_init=False)
023fa8c4	130	ydl.add_default_info_extractors()
bffbd5f0	131	finished_hook_called = set()
5f6a1245	132
bffbd5f0 PH	133	def _hook(status):
	134	if status['status'] == 'finished':
	135	finished_hook_called.add(status['filename'])
933605d7	136	ydl.add_progress_hook(_hook)
70b7e3fb	137	expect_warnings(ydl, test_case.get('expected_warnings', []))
5c892b0b	138
702665c0	139	def get_tc_filename(tc):
ad3dc496	140	return ydl.prepare_filename(dict(tc.get('info_dict', {})))
702665c0	141
28570840	142	res_dict = None
5f6a1245	143
28570840 PH	144	def try_rm_tcs_files(tcs=None):
	145	if tcs is None:
	146	tcs = test_cases
	147	for tc in tcs:
702665c0 JMF	148	tc_filename = get_tc_filename(tc)
	149	try_rm(tc_filename)
	150	try_rm(tc_filename + '.part')
4eb92208	151	try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
702665c0	152	try_rm_tcs_files()
5c892b0b	153	try:
dd508b7c FV	154	try_num = 1
dd508b7c FV	155	while True:
8cc83b8d	156	try:
3bef10a5	157	# We're not using .download here since that is just a shim
e8ee972c PH	158	# for outside error handling, and returns the exit code
e8ee972c PH	159	# instead of the result dict.
308cfe0a S	160	res_dict = ydl.extract_info(
	161	test_case['url'],
	162	force_generic_extractor=params.get('force_generic_extractor', False))
8cc83b8d	163	except (DownloadError, ExtractorError) as err:
8cc83b8d	164	# Check if the exception is not a network related one
d7118397	165	if (err.exc_info[0] not in (urllib.error.URLError, socket.timeout, UnavailableVideoError, http.client.BadStatusLine)
	166	or (err.exc_info[0] == urllib.error.HTTPError and err.exc_info[1].code == 503)):
	167	err.msg = f'{getattr(err, "msg", err)} ({tname})'
8cc83b8d FV	168	raise
8cc83b8d FV	169
dd508b7c	170	if try_num == RETRIES:
8936f68a	171	report_warning('%s failed due to network errors, skipping...' % tname)
dd508b7c FV	172	return
dd508b7c FV	173
86e5f3ed	174	print(f'Retrying: {try_num} failed tries\n\n##########\n\n')
dd508b7c FV	175
dd508b7c FV	176	try_num += 1
8cc83b8d FV	177	else:
8cc83b8d FV	178	break
5c892b0b	179
e8ee972c	180	if is_playlist:
880ee801	181	self.assertTrue(res_dict['_type'] in ['playlist', 'multi_video'])
d6e6a422	182	self.assertTrue('entries' in res_dict)
f74b341d	183	expect_info_dict(self, res_dict, test_case.get('info_dict', {}))
d6e6a422	184
e8ee972c	185	if 'playlist_mincount' in test_case:
0990305d PH	186	assertGreaterEqual(
0990305d PH	187	self,
e8ee972c PH	188	len(res_dict['entries']),
	189	test_case['playlist_mincount'],
	190	'Expected at least %d in playlist %s, but got only %d' % (
	191	test_case['playlist_mincount'], test_case['url'],
	192	len(res_dict['entries'])))
829476b8 PH	193	if 'playlist_count' in test_case:
	194	self.assertEqual(
	195	len(res_dict['entries']),
	196	test_case['playlist_count'],
28570840	197	'Expected %d entries in playlist %s, but got %d.' % (
22a6f150	198	test_case['playlist_count'],
28570840	199	test_case['url'],
22a6f150 PH	200	len(res_dict['entries']),
22a6f150 PH	201	))
28570840 PH	202	if 'playlist_duration_sum' in test_case:
	203	got_duration = sum(e['duration'] for e in res_dict['entries'])
	204	self.assertEqual(
	205	test_case['playlist_duration_sum'], got_duration)
e8ee972c	206
364a69e8 S	207	# Generalize both playlists and single videos to unified format for
	208	# simplicity
	209	if 'entries' not in res_dict:
	210	res_dict['entries'] = [res_dict]
	211
80b2fdf9	212	for tc_num, tc in enumerate(test_cases):
364a69e8 S	213	tc_res_dict = res_dict['entries'][tc_num]
364a69e8 S	214	# First, check test cases' data against extracted data alone
80b2fdf9	215	expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
495322b9	216	if tc_res_dict.get('_type', 'video') != 'video':
495322b9	217	continue
364a69e8	218	# Now, check downloaded file consistency
702665c0	219	tc_filename = get_tc_filename(tc)
511eda8e	220	if not test_case.get('params', {}).get('skip_download', False):
702665c0 JMF	221	self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
702665c0 JMF	222	self.assertTrue(tc_filename in finished_hook_called)
08a36c35 S	223	expected_minsize = tc.get('file_minsize', 10000)
	224	if expected_minsize is not None:
	225	if params.get('test'):
	226	expected_minsize = max(expected_minsize, 10000)
	227	got_fsize = os.path.getsize(tc_filename)
	228	assertGreaterEqual(
	229	self, got_fsize, expected_minsize,
	230	'Expected %s to be at least %s, but it\'s only %s ' %
	231	(tc_filename, format_bytes(expected_minsize),
	232	format_bytes(got_fsize)))
	233	if 'md5' in tc:
	234	md5_for_file = _file_md5(tc_filename)
374560f0	235	self.assertEqual(tc['md5'], md5_for_file)
364a69e8 S	236	# Finally, check test cases' data again but this time against
364a69e8 S	237	# extracted data from info JSON file written during processing
4eb92208	238	info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
f744c0f3 PH	239	self.assertTrue(
	240	os.path.exists(info_json_fn),
	241	'Missing info file %s' % info_json_fn)
86e5f3ed	242	with open(info_json_fn, encoding='utf-8') as infof:
5c892b0b	243	info_dict = json.load(infof)
f74b341d	244	expect_info_dict(self, info_dict, tc.get('info_dict', {}))
5c892b0b	245	finally:
702665c0	246	try_rm_tcs_files()
d6e6a422	247	if is_playlist and res_dict is not None and res_dict.get('entries'):
28570840 PH	248	# Remove all other files that may have been extracted if the
	249	# extractor returns full results even with extract_flat
	250	res_tcs = [{'info_dict': e} for e in res_dict['entries']]
	251	try_rm_tcs_files(res_tcs)
fd5ff020	252
1535ac2a	253	return test_template
fd5ff020	254
582be358	255
5f6a1245	256	# And add them to TestDownload
f2e8dbcc	257	def inject_tests(test_cases, label=''):
	258	for test_case in test_cases:
	259	name = test_case['name']
	260	tname = join_nonempty('test', name, label, tests_counter[name][label], delim='_')
	261	tests_counter[name][label] += 1
cdab8aa3	262
f2e8dbcc	263	test_method = generator(test_case, tname)
	264	test_method.__name__ = tname
	265	test_method.add_ie = ','.join(test_case.get('add_ie', []))
	266	setattr(TestDownload, test_method.__name__, test_method)
cdab8aa3	267
243c57cf	268
f2e8dbcc	269	inject_tests(normal_test_cases)
	270
	271	# TODO: disable redirection to the IE to ensure we are actually testing the webpage extraction
	272	inject_tests(webpage_test_cases, 'webpage')
	273
	274
	275	def batch_generator(name):
243c57cf	276	def test_template(self):
f2e8dbcc	277	for label, num_tests in tests_counter[name].items():
	278	for i in range(num_tests):
	279	test_name = join_nonempty('test', name, label, i, delim='_')
	280	try:
	281	getattr(self, test_name)()
	282	except unittest.SkipTest:
	283	print(f'Skipped {test_name}')
243c57cf	284
	285	return test_template
	286
	287
f2e8dbcc	288	for name in tests_counter:
f2e8dbcc	289	test_method = batch_generator(name)
243c57cf	290	test_method.__name__ = f'test_{name}_all'
	291	test_method.add_ie = ''
	292	setattr(TestDownload, test_method.__name__, test_method)
f2e8dbcc	293	del test_method
243c57cf	294
243c57cf	295
cdab8aa3 PH	296	if __name__ == '__main__':
cdab8aa3 PH	297	unittest.main()