X-Git-Url: https://jfr.im/git/yt-dlp.git/blobdiff_plain/177877c54493d0cb32f65e87ff9ed88a030cfbdb..ac668111128b5f124b4271b3aa4c35f6e71a4749:/test/test_InfoExtractor.py diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index c4b7f689e..f0571c41a 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1,27 +1,31 @@ -#!/usr/bin/env python - -from __future__ import unicode_literals - +#!/usr/bin/env python3 # Allow direct execution -import io import os import sys import unittest + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value, http_server_port -from yt_dlp.compat import compat_etree_fromstring, compat_http_server -from yt_dlp.extractor.common import InfoExtractor -from yt_dlp.extractor import YoutubeIE, get_info_extractor -from yt_dlp.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError + +import http.server import threading +from test.helper import FakeYDL, expect_dict, expect_value, http_server_port +from yt_dlp.compat import compat_etree_fromstring +from yt_dlp.extractor import YoutubeIE, get_info_extractor +from yt_dlp.extractor.common import InfoExtractor +from yt_dlp.utils import ( + ExtractorError, + RegexNotFoundError, + encode_data_uri, + strip_jsonp, +) TEAPOT_RESPONSE_STATUS = 418 TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" -class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): +class InfoExtractorTestRequestHandler(http.server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass @@ -35,13 +39,13 @@ def do_GET(self): assert False -class TestIE(InfoExtractor): +class DummyIE(InfoExtractor): pass class TestInfoExtractor(unittest.TestCase): def setUp(self): - self.ie = TestIE(FakeYDL()) + self.ie = DummyIE(FakeYDL()) def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) @@ -99,10 +103,10 @@ def test_html_search_meta(self): self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) def test_search_json_ld_realworld(self): - # https://github.com/ytdl-org/youtube-dl/issues/23306 - expect_dict( - self, - self.ie._search_json_ld(r'''''', None), - { - 'title': '1 On 1 With Kleio', - 'description': 'Kleio Valentien', - 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', - 'timestamp': 1449347075, - 'duration': 743.0, - 'view_count': 1120958, - 'width': 1920, - 'height': 1080, - }) + ''', + { + 'title': '1 On 1 With Kleio', + 'description': 'Kleio Valentien', + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'timestamp': 1449347075, + 'duration': 743.0, + 'view_count': 1120958, + 'width': 1920, + 'height': 1080, + }, + {}, + ), + ( + r'''''', + { + 'timestamp': 1636523400, + 'title': 'md5:91fe569e952e4d146485740ae927662b', + }, + {'expected_type': 'NewsArticle'}, + ), + ( + r''' + ''', + { + 'chapters': [ + {"title": "Explosie Turnhout", "start_time": 70, "end_time": 440}, + {"title": "Jaarwisseling", "start_time": 440, "end_time": 1179}, + {"title": "Natuurbranden Colorado", "start_time": 1179, "end_time": 1263}, + {"title": "Klimaatverandering", "start_time": 1263, "end_time": 1367}, + {"title": "Zacht weer", "start_time": 1367, "end_time": 1383}, + {"title": "Financiële balans", "start_time": 1383, "end_time": 1484}, + {"title": "Club Brugge", "start_time": 1484, "end_time": 1575}, + {"title": "Mentale gezondheid bij topsporters", "start_time": 1575, "end_time": 1728}, + {"title": "Olympische Winterspelen", "start_time": 1728, "end_time": 1873}, + {"title": "Sober oudjaar in Nederland", "start_time": 1873, "end_time": 2079.23} + ], + 'title': 'Het journaal - Aflevering 365 (Seizoen 2021)' + }, {} + ), + ( + # test multiple thumbnails in a list + r''' +''', + { + 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], + }, + {}, + ), + ( + # test single thumbnail + r''' +''', + { + 'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}], + }, + {}, + ) + ] + for html, expected_dict, search_json_ld_kwargs in _TESTS: + expect_dict( + self, + self.ie._search_json_ld(html, None, **search_json_ld_kwargs), + expected_dict + ) def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') @@ -346,6 +504,24 @@ def test_parse_html5_media_entries(self): }], }) + # from https://0000.studio/ + # with type attribute but without extension in URL + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://0000.studio', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92', + 'ext': 'mp4', + }], + }) + def test_extract_jwplayer_data_realworld(self): # from http://www.suffolk.edu/sjc/ expect_dict( @@ -857,8 +1033,7 @@ def test_parse_m3u8_formats(self): ] for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES: - with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, - mode='r', encoding='utf-8') as f: + with open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, encoding='utf-8') as f: formats, subs = self.ie._parse_m3u8_formats_and_subtitles( f.read(), m3u8_url, ext='mp4') self.ie._sort_formats(formats) @@ -1203,10 +1378,9 @@ def test_parse_mpd_formats(self): ] for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES: - with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, - mode='r', encoding='utf-8') as f: + with open('./test/testdata/mpd/%s.mpd' % mpd_file, encoding='utf-8') as f: formats, subtitles = self.ie._parse_mpd_formats_and_subtitles( - compat_etree_fromstring(f.read().encode('utf-8')), + compat_etree_fromstring(f.read().encode()), mpd_base_url=mpd_base_url, mpd_url=mpd_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) @@ -1395,10 +1569,9 @@ def test_parse_ism_formats(self): ] for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES: - with io.open('./test/testdata/ism/%s.Manifest' % ism_file, - mode='r', encoding='utf-8') as f: + with open('./test/testdata/ism/%s.Manifest' % ism_file, encoding='utf-8') as f: formats, subtitles = self.ie._parse_ism_formats_and_subtitles( - compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url) + compat_etree_fromstring(f.read().encode()), ism_url=ism_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) expect_value(self, subtitles, expected_subtitles, None) @@ -1422,10 +1595,9 @@ def test_parse_f4m_formats(self): ] for f4m_file, f4m_url, expected_formats in _TEST_CASES: - with io.open('./test/testdata/f4m/%s.f4m' % f4m_file, - mode='r', encoding='utf-8') as f: + with open('./test/testdata/f4m/%s.f4m' % f4m_file, encoding='utf-8') as f: formats = self.ie._parse_f4m_formats( - compat_etree_fromstring(f.read().encode('utf-8')), + compat_etree_fromstring(f.read().encode()), f4m_url, None) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) @@ -1470,10 +1642,9 @@ def test_parse_xspf(self): ] for xspf_file, xspf_url, expected_entries in _TEST_CASES: - with io.open('./test/testdata/xspf/%s.xspf' % xspf_file, - mode='r', encoding='utf-8') as f: + with open('./test/testdata/xspf/%s.xspf' % xspf_file, encoding='utf-8') as f: entries = self.ie._parse_xspf( - compat_etree_fromstring(f.read().encode('utf-8')), + compat_etree_fromstring(f.read().encode()), xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url) expect_value(self, entries, expected_entries, None) for i in range(len(entries)): @@ -1486,7 +1657,7 @@ def test_response_with_expected_status_returns_content(self): # or the underlying `_download_webpage_handle` returning no content # when a response matches `expected_status`. - httpd = compat_http_server.HTTPServer( + httpd = http.server.HTTPServer( ('127.0.0.1', 0), InfoExtractorTestRequestHandler) port = http_server_port(httpd) server_thread = threading.Thread(target=httpd.serve_forever)