jfr.im git - yt-dlp.git/blobdiff - test/test_InfoExtractor.py
[compat] Remove more functions
[yt-dlp.git] / test / test_InfoExtractor.py
index c4b7f689e3210efabddaa48d49c6ed0c60394749..f0571c41a92ec94724c04f69240eb43c1858e87f 100644 (file)
@@ -1,27 +1,31 @@
-#!/usr/bin/env python
-
-from __future__ import unicode_literals
-
+#!/usr/bin/env python3
 # Allow direct execution
-import io
 import os
 import sys
 import unittest
+
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
-from yt_dlp.compat import compat_etree_fromstring, compat_http_server
-from yt_dlp.extractor.common import InfoExtractor
-from yt_dlp.extractor import YoutubeIE, get_info_extractor
-from yt_dlp.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
+
+import http.server
 import threading
 
+from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
+from yt_dlp.compat import compat_etree_fromstring
+from yt_dlp.extractor import YoutubeIE, get_info_extractor
+from yt_dlp.extractor.common import InfoExtractor
+from yt_dlp.utils import (
+    ExtractorError,
+    RegexNotFoundError,
+    encode_data_uri,
+    strip_jsonp,
+)
 
 TEAPOT_RESPONSE_STATUS = 418
 TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
 
 
-class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
+class InfoExtractorTestRequestHandler(http.server.BaseHTTPRequestHandler):
     def log_message(self, format, *args):
         pass
 
@@ -35,13 +39,13 @@ def do_GET(self):
             assert False
 
 
-class TestIE(InfoExtractor):
+class DummyIE(InfoExtractor):
     pass
 
 
 class TestInfoExtractor(unittest.TestCase):
     def setUp(self):
-        self.ie = TestIE(FakeYDL())
+        self.ie = DummyIE(FakeYDL())
 
     def test_ie_key(self):
         self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE)
@@ -99,10 +103,10 @@ def test_html_search_meta(self):
         self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
 
     def test_search_json_ld_realworld(self):
-        # https://github.com/ytdl-org/youtube-dl/issues/23306
-        expect_dict(
-            self,
-            self.ie._search_json_ld(r'''<script type="application/ld+json">
+        _TESTS = [
+            # https://github.com/ytdl-org/youtube-dl/issues/23306
+            (
+                r'''<script type="application/ld+json">
 {
 "@context": "http://schema.org/",
 "@type": "VideoObject",
@@ -135,17 +139,171 @@ def test_search_json_ld_realworld(self):
 "name": "Kleio Valentien",
 "url": "https://www.eporner.com/pornstar/kleio-valentien/"
 }]}
-</script>''', None),
-            {
-                'title': '1 On 1 With Kleio',
-                'description': 'Kleio Valentien',
-                'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
-                'timestamp': 1449347075,
-                'duration': 743.0,
-                'view_count': 1120958,
-                'width': 1920,
-                'height': 1080,
-            })
+                </script>''',
+                {
+                    'title': '1 On 1 With Kleio',
+                    'description': 'Kleio Valentien',
+                    'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4',
+                    'timestamp': 1449347075,
+                    'duration': 743.0,
+                    'view_count': 1120958,
+                    'width': 1920,
+                    'height': 1080,
+                },
+                {},
+            ),
+            (
+                r'''<script type="application/ld+json">
+      {
+      "@context": "https://schema.org",
+      "@graph": [
+      {
+      "@type": "NewsArticle",
+      "mainEntityOfPage": {
+      "@type": "WebPage",
+      "@id": "https://www.ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn"
+      },
+      "headline": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν",
+      "name": "Συμμορία ανηλίκων – δικηγόρος θυμάτων: ήθελαν να τους αποτελειώσουν",
+      "description": "Τα παιδιά δέχθηκαν την επίθεση επειδή αρνήθηκαν να γίνουν μέλη της συμμορίας, ανέφερε ο Γ. Ζαχαρόπουλος.",
+      "image": {
+      "@type": "ImageObject",
+      "url": "https://ant1media.azureedge.net/imgHandler/1100/a635c968-be71-447c-bf9c-80d843ece21e.jpg",
+      "width": 1100,
+      "height": 756            },
+      "datePublished": "2021-11-10T08:50:00+03:00",
+      "dateModified": "2021-11-10T08:52:53+03:00",
+      "author": {
+      "@type": "Person",
+      "@id": "https://www.ant1news.gr/",
+      "name": "Ant1news",
+      "image": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png",
+      "url": "https://www.ant1news.gr/"
+      },
+      "publisher": {
+      "@type": "Organization",
+      "@id": "https://www.ant1news.gr#publisher",
+      "name": "Ant1news",
+      "url": "https://www.ant1news.gr",
+      "logo": {
+      "@type": "ImageObject",
+      "url": "https://www.ant1news.gr/images/logo-e5d7e4b3e714c88e8d2eca96130142f6.png",
+      "width": 400,
+      "height": 400                },
+      "sameAs": [
+      "https://www.facebook.com/Ant1news.gr",
+      "https://twitter.com/antennanews",
+      "https://www.youtube.com/channel/UC0smvAbfczoN75dP0Hw4Pzw",
+      "https://www.instagram.com/ant1news/"
+      ]
+      },
+
+      "keywords": "μαχαίρωμα,συμμορία ανηλίκων,ΕΙΔΗΣΕΙΣ,ΕΙΔΗΣΕΙΣ ΣΗΜΕΡΑ,ΝΕΑ,Κοινωνία - Ant1news",
+
+
+      "articleSection": "Κοινωνία"
+      }
+      ]
+      }
+                </script>''',
+                {
+                    'timestamp': 1636523400,
+                    'title': 'md5:91fe569e952e4d146485740ae927662b',
+                },
+                {'expected_type': 'NewsArticle'},
+            ),
+            (
+                r'''<script type="application/ld+json">
+                {"url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
+                "name":"Het journaal 19u",
+                "description":"Het journaal 19u van vrijdag 31 december 2021.",
+                "potentialAction":{"url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8","@type":"ShareAction"},
+                "mainEntityOfPage":{"@id":"1640092242445","@type":"WebPage"},
+                "publication":[{
+                    "startDate":"2021-12-31T19:00:00.000+01:00",
+                    "endDate":"2022-01-30T23:55:00.000+01:00",
+                    "publishedBy":{"name":"een","@type":"Organization"},
+                    "publishedOn":{"url":"https://www.vrt.be/vrtnu/","name":"VRT NU","@type":"BroadcastService"},
+                    "@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8",
+                    "@type":"BroadcastEvent"
+                    }],
+                "video":{
+                    "name":"Het journaal - Aflevering 365 (Seizoen 2021)",
+                    "description":"Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.",
+                    "thumbnailUrl":"//images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg",
+                    "expires":"2022-01-30T23:55:00.000+01:00",
+                    "hasPart":[
+                        {"name":"Explosie Turnhout","startOffset":70,"@type":"Clip"},
+                        {"name":"Jaarwisseling","startOffset":440,"@type":"Clip"},
+                        {"name":"Natuurbranden Colorado","startOffset":1179,"@type":"Clip"},
+                        {"name":"Klimaatverandering","startOffset":1263,"@type":"Clip"},
+                        {"name":"Zacht weer","startOffset":1367,"@type":"Clip"},
+                        {"name":"Financiële balans","startOffset":1383,"@type":"Clip"},
+                        {"name":"Club Brugge","startOffset":1484,"@type":"Clip"},
+                        {"name":"Mentale gezondheid bij topsporters","startOffset":1575,"@type":"Clip"},
+                        {"name":"Olympische Winterspelen","startOffset":1728,"@type":"Clip"},
+                        {"name":"Sober oudjaar in Nederland","startOffset":1873,"@type":"Clip"}
+                        ],
+                    "duration":"PT34M39.23S",
+                    "uploadDate":"2021-12-31T19:00:00.000+01:00",
+                    "@id":"vid-9457d0c6-b8ac-4aba-b5e1-15aa3a3295b5",
+                    "@type":"VideoObject"
+                },
+                "genre":["Nieuws en actua"],
+                "episodeNumber":365,
+                "partOfSeries":{"name":"Het journaal","@id":"222831405527","@type":"TVSeries"},
+                "partOfSeason":{"name":"Seizoen 2021","@id":"961809365527","@type":"TVSeason"},
+                "@context":"https://schema.org","@id":"961685295527","@type":"TVEpisode"}</script>
+                ''',
+                {
+                    'chapters': [
+                        {"title": "Explosie Turnhout", "start_time": 70, "end_time": 440},
+                        {"title": "Jaarwisseling", "start_time": 440, "end_time": 1179},
+                        {"title": "Natuurbranden Colorado", "start_time": 1179, "end_time": 1263},
+                        {"title": "Klimaatverandering", "start_time": 1263, "end_time": 1367},
+                        {"title": "Zacht weer", "start_time": 1367, "end_time": 1383},
+                        {"title": "Financiële balans", "start_time": 1383, "end_time": 1484},
+                        {"title": "Club Brugge", "start_time": 1484, "end_time": 1575},
+                        {"title": "Mentale gezondheid bij topsporters", "start_time": 1575, "end_time": 1728},
+                        {"title": "Olympische Winterspelen", "start_time": 1728, "end_time": 1873},
+                        {"title": "Sober oudjaar in Nederland", "start_time": 1873, "end_time": 2079.23}
+                    ],
+                    'title': 'Het journaal - Aflevering 365 (Seizoen 2021)'
+                }, {}
+            ),
+            (
+                # test multiple thumbnails in a list
+                r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":["https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"]}
+</script>''',
+                {
+                    'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+                },
+                {},
+            ),
+            (
+                # test single thumbnail
+                r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"}
+</script>''',
+                {
+                    'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+                },
+                {},
+            )
+        ]
+        for html, expected_dict, search_json_ld_kwargs in _TESTS:
+            expect_dict(
+                self,
+                self.ie._search_json_ld(html, None, **search_json_ld_kwargs),
+                expected_dict
+            )
 
     def test_download_json(self):
         uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
@@ -346,6 +504,24 @@ def test_parse_html5_media_entries(self):
                 }],
             })
 
+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                    controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(
@@ -857,8 +1033,7 @@ def test_parse_m3u8_formats(self):
         ]
 
         for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
-            with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, encoding='utf-8') as f:
                 formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
                     f.read(), m3u8_url, ext='mp4')
                 self.ie._sort_formats(formats)
@@ -1203,10 +1378,9 @@ def test_parse_mpd_formats(self):
         ]
 
         for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
-            with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/mpd/%s.mpd' % mpd_file, encoding='utf-8') as f:
                 formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
-                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    compat_etree_fromstring(f.read().encode()),
                     mpd_base_url=mpd_base_url, mpd_url=mpd_url)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
@@ -1395,10 +1569,9 @@ def test_parse_ism_formats(self):
         ]
 
         for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES:
-            with io.open('./test/testdata/ism/%s.Manifest' % ism_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/ism/%s.Manifest' % ism_file, encoding='utf-8') as f:
                 formats, subtitles = self.ie._parse_ism_formats_and_subtitles(
-                    compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url)
+                    compat_etree_fromstring(f.read().encode()), ism_url=ism_url)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
                 expect_value(self, subtitles, expected_subtitles, None)
@@ -1422,10 +1595,9 @@ def test_parse_f4m_formats(self):
         ]
 
         for f4m_file, f4m_url, expected_formats in _TEST_CASES:
-            with io.open('./test/testdata/f4m/%s.f4m' % f4m_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/f4m/%s.f4m' % f4m_file, encoding='utf-8') as f:
                 formats = self.ie._parse_f4m_formats(
-                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    compat_etree_fromstring(f.read().encode()),
                     f4m_url, None)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
@@ -1470,10 +1642,9 @@ def test_parse_xspf(self):
         ]
 
         for xspf_file, xspf_url, expected_entries in _TEST_CASES:
-            with io.open('./test/testdata/xspf/%s.xspf' % xspf_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/xspf/%s.xspf' % xspf_file, encoding='utf-8') as f:
                 entries = self.ie._parse_xspf(
-                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    compat_etree_fromstring(f.read().encode()),
                     xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url)
                 expect_value(self, entries, expected_entries, None)
                 for i in range(len(entries)):
@@ -1486,7 +1657,7 @@ def test_response_with_expected_status_returns_content(self):
         # or the underlying `_download_webpage_handle` returning no content
         # when a response matches `expected_status`.
 
-        httpd = compat_http_server.HTTPServer(
+        httpd = http.server.HTTPServer(
             ('127.0.0.1', 0), InfoExtractorTestRequestHandler)
         port = http_server_port(httpd)
         server_thread = threading.Thread(target=httpd.serve_forever)