jfr.im git - yt-dlp.git/blobdiff - test/test_InfoExtractor.py
[cleanup] Consistent style for file heads
[yt-dlp.git] / test / test_InfoExtractor.py
index cf06dbde46d2130869e8a6d8e6cd0bcbc7701bb3..f57a29ffc7becd1fb0cf808b258d5a94544cb49f 100644 (file)
@@ -1,27 +1,32 @@
 #!/usr/bin/env python3
 
-from __future__ import unicode_literals
-
 # Allow direct execution
-import io
 import os
 import sys
 import unittest
+
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
-from yt_dlp.compat import compat_etree_fromstring, compat_http_server
-from yt_dlp.extractor.common import InfoExtractor
-from yt_dlp.extractor import YoutubeIE, get_info_extractor
-from yt_dlp.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
+
+import http.server
 import threading
 
+from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
+from yt_dlp.compat import compat_etree_fromstring
+from yt_dlp.extractor import YoutubeIE, get_info_extractor
+from yt_dlp.extractor.common import InfoExtractor
+from yt_dlp.utils import (
+    ExtractorError,
+    RegexNotFoundError,
+    encode_data_uri,
+    strip_jsonp,
+)
 
 TEAPOT_RESPONSE_STATUS = 418
 TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"
 
 
-class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
+class InfoExtractorTestRequestHandler(http.server.BaseHTTPRequestHandler):
     def log_message(self, format, *args):
         pass
 
@@ -208,6 +213,91 @@ def test_search_json_ld_realworld(self):
                 },
                 {'expected_type': 'NewsArticle'},
             ),
+            (
+                r'''<script type="application/ld+json">
+                {"url":"/vrtnu/a-z/het-journaal/2021/het-journaal-het-journaal-19u-20211231/",
+                "name":"Het journaal 19u",
+                "description":"Het journaal 19u van vrijdag 31 december 2021.",
+                "potentialAction":{"url":"https://vrtnu.page.link/pfVy6ihgCAJKgHqe8","@type":"ShareAction"},
+                "mainEntityOfPage":{"@id":"1640092242445","@type":"WebPage"},
+                "publication":[{
+                    "startDate":"2021-12-31T19:00:00.000+01:00",
+                    "endDate":"2022-01-30T23:55:00.000+01:00",
+                    "publishedBy":{"name":"een","@type":"Organization"},
+                    "publishedOn":{"url":"https://www.vrt.be/vrtnu/","name":"VRT NU","@type":"BroadcastService"},
+                    "@id":"pbs-pub-3a7ec233-da95-4c1e-9b2b-cf5fdfebcbe8",
+                    "@type":"BroadcastEvent"
+                    }],
+                "video":{
+                    "name":"Het journaal - Aflevering 365 (Seizoen 2021)",
+                    "description":"Het journaal 19u van vrijdag 31 december 2021. Bekijk aflevering 365 van seizoen 2021 met VRT NU via de site of app.",
+                    "thumbnailUrl":"//images.vrt.be/width1280/2021/12/31/80d5ed00-6a64-11ec-b07d-02b7b76bf47f.jpg",
+                    "expires":"2022-01-30T23:55:00.000+01:00",
+                    "hasPart":[
+                        {"name":"Explosie Turnhout","startOffset":70,"@type":"Clip"},
+                        {"name":"Jaarwisseling","startOffset":440,"@type":"Clip"},
+                        {"name":"Natuurbranden Colorado","startOffset":1179,"@type":"Clip"},
+                        {"name":"Klimaatverandering","startOffset":1263,"@type":"Clip"},
+                        {"name":"Zacht weer","startOffset":1367,"@type":"Clip"},
+                        {"name":"Financiële balans","startOffset":1383,"@type":"Clip"},
+                        {"name":"Club Brugge","startOffset":1484,"@type":"Clip"},
+                        {"name":"Mentale gezondheid bij topsporters","startOffset":1575,"@type":"Clip"},
+                        {"name":"Olympische Winterspelen","startOffset":1728,"@type":"Clip"},
+                        {"name":"Sober oudjaar in Nederland","startOffset":1873,"@type":"Clip"}
+                        ],
+                    "duration":"PT34M39.23S",
+                    "uploadDate":"2021-12-31T19:00:00.000+01:00",
+                    "@id":"vid-9457d0c6-b8ac-4aba-b5e1-15aa3a3295b5",
+                    "@type":"VideoObject"
+                },
+                "genre":["Nieuws en actua"],
+                "episodeNumber":365,
+                "partOfSeries":{"name":"Het journaal","@id":"222831405527","@type":"TVSeries"},
+                "partOfSeason":{"name":"Seizoen 2021","@id":"961809365527","@type":"TVSeason"},
+                "@context":"https://schema.org","@id":"961685295527","@type":"TVEpisode"}</script>
+                ''',
+                {
+                    'chapters': [
+                        {"title": "Explosie Turnhout", "start_time": 70, "end_time": 440},
+                        {"title": "Jaarwisseling", "start_time": 440, "end_time": 1179},
+                        {"title": "Natuurbranden Colorado", "start_time": 1179, "end_time": 1263},
+                        {"title": "Klimaatverandering", "start_time": 1263, "end_time": 1367},
+                        {"title": "Zacht weer", "start_time": 1367, "end_time": 1383},
+                        {"title": "Financiële balans", "start_time": 1383, "end_time": 1484},
+                        {"title": "Club Brugge", "start_time": 1484, "end_time": 1575},
+                        {"title": "Mentale gezondheid bij topsporters", "start_time": 1575, "end_time": 1728},
+                        {"title": "Olympische Winterspelen", "start_time": 1728, "end_time": 1873},
+                        {"title": "Sober oudjaar in Nederland", "start_time": 1873, "end_time": 2079.23}
+                    ],
+                    'title': 'Het journaal - Aflevering 365 (Seizoen 2021)'
+                }, {}
+            ),
+            (
+                # test multiple thumbnails in a list
+                r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":["https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"]}
+</script>''',
+                {
+                    'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+                },
+                {},
+            ),
+            (
+                # test single thumbnail
+                r'''
+<script type="application/ld+json">
+{"@context":"https://schema.org",
+"@type":"VideoObject",
+"thumbnailUrl":"https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg"}
+</script>''',
+                {
+                    'thumbnails': [{'url': 'https://www.rainews.it/cropgd/640x360/dl/img/2021/12/30/1640886376927_GettyImages.jpg'}],
+                },
+                {},
+            )
         ]
         for html, expected_dict, search_json_ld_kwargs in _TESTS:
             expect_dict(
@@ -415,6 +505,24 @@ def test_parse_html5_media_entries(self):
                 }],
             })
 
+        # from https://0000.studio/
+        # with type attribute but without extension in URL
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://0000.studio',
+                r'''
+                <video src="https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92"
+                    controls="controls" type="video/mp4" preload="metadata" autoplay="autoplay" playsinline class="object-contain">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://d1ggyt9m8pwf3g.cloudfront.net/protected/ap-northeast-1:1864af40-28d5-492b-b739-b32314b1a527/archive/clip/838db6a7-8973-4cd6-840d-8517e4093c92',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(
@@ -926,8 +1034,7 @@ def test_parse_m3u8_formats(self):
         ]
 
         for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
-            with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, encoding='utf-8') as f:
                 formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
                     f.read(), m3u8_url, ext='mp4')
                 self.ie._sort_formats(formats)
@@ -1272,10 +1379,9 @@ def test_parse_mpd_formats(self):
         ]
 
         for mpd_file, mpd_url, mpd_base_url, expected_formats, expected_subtitles in _TEST_CASES:
-            with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/mpd/%s.mpd' % mpd_file, encoding='utf-8') as f:
                 formats, subtitles = self.ie._parse_mpd_formats_and_subtitles(
-                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    compat_etree_fromstring(f.read().encode()),
                     mpd_base_url=mpd_base_url, mpd_url=mpd_url)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
@@ -1464,10 +1570,9 @@ def test_parse_ism_formats(self):
         ]
 
         for ism_file, ism_url, expected_formats, expected_subtitles in _TEST_CASES:
-            with io.open('./test/testdata/ism/%s.Manifest' % ism_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/ism/%s.Manifest' % ism_file, encoding='utf-8') as f:
                 formats, subtitles = self.ie._parse_ism_formats_and_subtitles(
-                    compat_etree_fromstring(f.read().encode('utf-8')), ism_url=ism_url)
+                    compat_etree_fromstring(f.read().encode()), ism_url=ism_url)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
                 expect_value(self, subtitles, expected_subtitles, None)
@@ -1491,10 +1596,9 @@ def test_parse_f4m_formats(self):
         ]
 
         for f4m_file, f4m_url, expected_formats in _TEST_CASES:
-            with io.open('./test/testdata/f4m/%s.f4m' % f4m_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/f4m/%s.f4m' % f4m_file, encoding='utf-8') as f:
                 formats = self.ie._parse_f4m_formats(
-                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    compat_etree_fromstring(f.read().encode()),
                     f4m_url, None)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
@@ -1539,10 +1643,9 @@ def test_parse_xspf(self):
         ]
 
         for xspf_file, xspf_url, expected_entries in _TEST_CASES:
-            with io.open('./test/testdata/xspf/%s.xspf' % xspf_file,
-                         mode='r', encoding='utf-8') as f:
+            with open('./test/testdata/xspf/%s.xspf' % xspf_file, encoding='utf-8') as f:
                 entries = self.ie._parse_xspf(
-                    compat_etree_fromstring(f.read().encode('utf-8')),
+                    compat_etree_fromstring(f.read().encode()),
                     xspf_file, xspf_url=xspf_url, xspf_base_url=xspf_url)
                 expect_value(self, entries, expected_entries, None)
                 for i in range(len(entries)):
@@ -1555,7 +1658,7 @@ def test_response_with_expected_status_returns_content(self):
         # or the underlying `_download_webpage_handle` returning no content
         # when a response matches `expected_status`.
 
-        httpd = compat_http_server.HTTPServer(
+        httpd = http.server.HTTPServer(
             ('127.0.0.1', 0), InfoExtractorTestRequestHandler)
         port = http_server_port(httpd)
         server_thread = threading.Thread(target=httpd.serve_forever)