[youtube] Prefer UTC upload date for videos (#2223)

[yt-dlp.git] / yt_dlp / extractor / generic.py
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py

index baff0280f79755d34b95811ddf1a7c7101a00596..0ddd050ff12056dc5d9239ffdc50498d0d9f19eb 100644 (file)
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -28,6 +28,7 @@
      mimetype2ext,
      orderedSet,
      parse_duration,
+    parse_resolution,
      sanitized_Request,
      smuggle_url,
      unescapeHTML,
@@ -100,6 +101,9 @@
  from .arte import ArteTVEmbedIE
  from .videopress import VideoPressIE
  from .rutube import RutubeIE
+from .glomex import GlomexEmbedIE
+from .megatvcom import MegaTVComEmbedIE
+from .ant1newsgr import Ant1NewsGrEmbedIE
  from .limelight import LimelightBaseIE
  from .anvato import AnvatoIE
  from .washingtonpost import WashingtonPostIE
@@ -112,6 +116,7 @@
  from .vshare import VShareIE
  from .mediasite import MediasiteIE
  from .springboardplatform import SpringboardPlatformIE
+from .ted import TedEmbedIE
  from .yapfiles import YapFilesIE
  from .vice import ViceIE
  from .xfileshare import XFileShareIE
@@ -135,8 +140,11 @@
  from .medialaan import MedialaanIE
  from .simplecast import SimplecastIE
  from .wimtv import WimTVIE
+from .tvopengr import TVOpenGrEmbedIE
+from .ertgr import ERTWebtvEmbedIE
  from .tvp import TVPEmbedIE
  from .blogger import BloggerIE
+from .mainstreaming import MainStreamingIE
  from .gfycat import GfycatIE
  
  
@@ -206,7 +214,7 @@ class GenericIE(InfoExtractor):
          {
              'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
              'info_dict': {
-                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'id': 'https://phihag.de/2014/youtube-dl/rss2.xml',
                  'title': 'Zero Punctuation',
                  'description': 're:.*groundbreaking video review series.*'
              },
@@ -251,6 +259,9 @@ class GenericIE(InfoExtractor):
                      'episode_number': 1,
                      'season_number': 1,
                      'age_limit': 0,
+                    'season': 'Season 1',
+                    'direct': True,
+                    'episode': 'Episode 1',
                  },
              }],
              'params': {
@@ -267,6 +278,16 @@ class GenericIE(InfoExtractor):
              },
              'playlist_mincount': 100,
          },
+        # RSS feed with guid
+        {
+            'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
+            'info_dict': {
+                'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
+                'description': 'md5:be809a44b63b0c56fb485caf68685520',
+                'title': 'The Little Red Podcast',
+            },
+            'playlist_mincount': 76,
+        },
          # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
          {
              'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
@@ -1449,24 +1470,6 @@ class GenericIE(InfoExtractor):
                  'duration': 45.115,
              },
          },
-        # 5min embed
-        {
-            'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
-            'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
-            'info_dict': {
-                'id': '518726732',
-                'ext': 'mp4',
-                'title': 'Facebook Creates "On This Day" | Crunch Report',
-                'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild',
-                'timestamp': 1427237531,
-                'uploader': 'Crunch Report',
-                'upload_date': '20150324',
-            },
-            'params': {
-                # m3u8 download
-                'skip_download': True,
-            },
-        },
          # Crooks and Liars embed
          {
              'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@@ -1870,6 +1873,62 @@ class GenericIE(InfoExtractor):
              },
              'add_ie': [RutubeIE.ie_key()],
          },
+        {
+            # glomex:embed
+            'url': 'https://www.skai.gr/news/world/iatrikos-syllogos-tourkias-to-turkovac-aplo-dialyma-erntogan-eiste-apateones-kai-pseytes',
+            'info_dict': {
+                'id': 'v-ch2nkhcirwc9-sf',
+                'ext': 'mp4',
+                'title': 'md5:786e1e24e06c55993cee965ef853a0c1',
+                'description': 'md5:8b517a61d577efe7e36fde72fd535995',
+                'timestamp': 1641885019,
+                'upload_date': '20220111',
+                'duration': 460000,
+                'thumbnail': 'https://i3thumbs.glomex.com/dC1idjJwdndiMjRzeGwvMjAyMi8wMS8xMS8wNy8xMF8zNV82MWRkMmQ2YmU5ZTgyLmpwZw==/profile:player-960x540',
+            },
+        },
+        {
+            # megatvcom:embed
+            'url': 'https://www.in.gr/2021/12/18/greece/apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize/',
+            'info_dict': {
+                'id': 'apokalypsi-mega-poios-parelave-tin-ereyna-tsiodra-ek-merous-tis-kyvernisis-o-prothypourgos-telika-gnorize',
+                'title': 'md5:5e569cf996ec111057c2764ec272848f',
+            },
+            'playlist': [{
+                'md5': '1afa26064ff00ccb91617957dbc73dc1',
+                'info_dict': {
+                    'ext': 'mp4',
+                    'id': '564916',
+                    'display_id': 'md5:6cdf22d3a2e7bacb274b7295089a1770',
+                    'title': 'md5:33b9dd39584685b62873043670eb52a6',
+                    'description': 'md5:c1db7310f390518ac36dd69d947ef1a1',
+                    'timestamp': 1639753145,
+                    'upload_date': '20211217',
+                    'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/prezerakos-1024x597.jpg',
+                },
+            }, {
+                'md5': '4a1c220695f1ef865a8b7966a53e2474',
+                'info_dict': {
+                    'ext': 'mp4',
+                    'id': '564905',
+                    'display_id': 'md5:ead15695e485e649aed2b81ebd699b88',
+                    'title': 'md5:2b71fd54249a3ca34609fe39ae31c47b',
+                    'description': 'md5:c42e12f638d0a97d6de4508e2c4df982',
+                    'timestamp': 1639753047,
+                    'upload_date': '20211217',
+                    'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/12/tsiodras-mitsotakis-1024x545.jpg',
+                },
+            }]
+        },
+        {
+            'url': 'https://www.ertnews.gr/video/manolis-goyalles-o-anthropos-piso-apo-ti-diadiktyaki-vasilopita/',
+            'info_dict': {
+                'id': '2022/tv/news-themata-ianouarios/20220114-apotis6-gouales-pita.mp4',
+                'ext': 'mp4',
+                'title': 'md5:df64f5b61c06d0e9556c0cdd5cf14464',
+                'thumbnail': 'https://www.ert.gr/themata/photos/2021/20220114-apotis6-gouales-pita.jpg',
+            },
+        },
          {
              # ThePlatform embedded with whitespaces in URLs
              'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
@@ -2175,6 +2234,22 @@ class GenericIE(InfoExtractor):
                  'skip_download': True,
              },
          },
+        {
+            # tvopengr:embed
+            'url': 'https://www.ethnos.gr/World/article/190604/hparosiaxekinoynoisynomiliessthgeneyhmethskiatoypolemoypanoapothnoykrania',
+            'md5': 'eb0c3995d0a6f18f6538c8e057865d7d',
+            'info_dict': {
+                'id': '101119',
+                'ext': 'mp4',
+                'display_id': 'oikarpoitondiapragmateyseonhparosias',
+                'title': 'md5:b979f4d640c568617d6547035528a149',
+                'description': 'md5:e54fc1977c7159b01cc11cd7d9d85550',
+                'timestamp': 1641772800,
+                'upload_date': '20220110',
+                'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/70bc39fa-895b-4918-a364-c39d2135fc6d.jpg',
+
+            }
+        },
          {
              # blogger embed
              'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html',
@@ -2384,6 +2459,19 @@ class GenericIE(InfoExtractor):
                  'upload_date': '20211113'
              }
          },
+        {
+            # MainStreaming player
+            'url': 'https://www.lactv.it/2021/10/03/lac-news24-la-settimana-03-10-2021/',
+            'info_dict': {
+                'id': 'EUlZfGWkGpOd',
+                'title': 'La Settimana ',
+                'description': '03 Ottobre ore 02:00',
+                'ext': 'mp4',
+                'live_status': 'not_live',
+                'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
+                'duration': 1512
+            }
+        },
          {
              # Multiple gfycat iframe embeds
              'url': 'https://www.gezip.net/bbs/board.php?bo_table=entertaine&wr_id=613422',
@@ -2411,7 +2499,6 @@ class GenericIE(InfoExtractor):
              },
              'playlist_count': 9
          }
-        #
      ]
  
      def report_following_redirect(self, new_url):
@@ -2445,6 +2532,9 @@ def _extract_rss(self, url, video_id, doc):
              if not next_url:
                  continue
  
+            if xpath_text(it, 'guid') is not None:  # <guid> is optional: it.find('guid') is None when absent, so .text would raise
+                next_url = smuggle_url(next_url, {'force_videoid': xpath_text(it, 'guid')})
+
              def itunes(key):
                  return xpath_text(
                      it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
@@ -3111,10 +3201,9 @@ def _real_extract(self, url):
              return self.url_result(mobj.group('url'), 'Tvigle')
  
          # Look for embedded TED player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'TED')
+        ted_urls = TedEmbedIE._extract_urls(webpage)
+        if ted_urls:
+            return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key())
  
          # Look for embedded Ustream videos
          ustream_url = UstreamIE._extract_url(webpage)
@@ -3247,12 +3336,6 @@ def _real_extract(self, url):
          if mobj is not None:
              return self.url_result(mobj.group('url'))
  
-        # Look for 5min embeds
-        mobj = re.search(
-            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
-        if mobj is not None:
-            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
-
          # Look for Crooks and Liars embeds
          mobj = re.search(
              r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
@@ -3450,6 +3533,24 @@ def _real_extract(self, url):
              return self.playlist_from_matches(
                  rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
  
+        # Look for Glomex embeds
+        glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url))
+        if glomex_urls:
+            return self.playlist_from_matches(
+                glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key())
+
+        # Look for megatv.com embeds
+        megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage))
+        if megatvcom_urls:
+            return self.playlist_from_matches(
+                megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key())
+
+        # Look for ant1news.gr embeds
+        ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage))
+        if ant1newsgr_urls:
+            return self.playlist_from_matches(
+                ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key())
+
          # Look for WashingtonPost embeds
          wapo_urls = WashingtonPostIE._extract_urls(webpage)
          if wapo_urls:
@@ -3596,14 +3697,32 @@ def _real_extract(self, url):
              return self.playlist_from_matches(
                  rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
  
+        # Look for (tvopen|ethnos).gr embeds
+        tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage))
+        if tvopengr_urls:
+            return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key())
+
+        # Look for ert.gr webtv embeds
+        ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage))
+        if len(ertwebtv_urls) == 1:
+            return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True)
+        elif ertwebtv_urls:
+            return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key())
+
          tvp_urls = TVPEmbedIE._extract_urls(webpage)
          if tvp_urls:
              return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key())
  
+        # Look for MainStreaming embeds
+        mainstreaming_urls = MainStreamingIE._extract_urls(webpage)
+        if mainstreaming_urls:
+            return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key())
+
          # Look for Gfycat Embeds
          gfycat_urls = GfycatIE._extract_urls(webpage)
          if gfycat_urls:
              return self.playlist_from_matches(gfycat_urls, video_id, video_title, ie=GfycatIE.ie_key())
+
          # Look for HTML5 media
          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
          if entries:
@@ -3695,12 +3814,16 @@ def _real_extract(self, url):
  
          # Looking for http://schema.org/VideoObject
          json_ld = self._search_json_ld(webpage, video_id, default={})
-        if json_ld.get('url'):
+        if json_ld.get('url') not in (url, None):
              self.report_detected('JSON LD')
-            if determine_ext(json_ld.get('url')) == 'm3u8':
+            if determine_ext(json_ld['url']) == 'm3u8':
                  json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
                      json_ld['url'], video_id, 'mp4')
                  json_ld.pop('url')
+                self._sort_formats(json_ld['formats'])
+            else:
+                json_ld['_type'] = 'url_transparent'
+                json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True})
              return merge_dicts(json_ld, info_dict)
  
          def check_video(vurl):
@@ -3755,20 +3878,21 @@ def filter_video(urls):
                      protocol, _, _ = url.partition('/')
                      thumbnail = protocol + thumbnail
  
+                url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
                  formats = []
-                for key in ('video_url', 'video_alt_url', 'video_alt_url2'):
-                    if key in flashvars and '/get_file/' in flashvars[key]:
-                        next_format = {
-                            'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
-                            'format_id': flashvars.get(key + '_text', key),
-                            'ext': 'mp4',
-                        }
-                        height = re.search(r'%s_(\d+)p\.mp4(?:/[?].*)?$' % flashvars['video_id'], flashvars[key])
-                        if height:
-                            next_format['height'] = int(height.group(1))
-                        else:
-                            next_format['quality'] = 1
-                        formats.append(next_format)
+                for key in url_keys:
+                    if '/get_file/' not in flashvars[key]:
+                        continue
+                    format_id = flashvars.get(f'{key}_text', key)
+                    formats.append({
+                        'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
+                        'format_id': format_id,
+                        'ext': 'mp4',
+                        **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
+                    })
+                    if not formats[-1].get('height'):
+                        formats[-1]['quality'] = 1
+
                  self._sort_formats(formats)
  
                  return {
@@ -3874,12 +3998,16 @@ def filter_video(urls):
  
              # here's a fun little line of code for you:
              video_id = os.path.splitext(video_id)[0]
+            headers = {
+                'referer': full_response.geturl()
+            }
  
              entry_info_dict = {
                  'id': video_id,
                  'uploader': video_uploader,
                  'title': video_title,
                  'age_limit': age_limit,
+                'http_headers': headers,
              }
  
              if RtmpIE.suitable(video_url):
@@ -3897,11 +4025,11 @@ def filter_video(urls):
              elif ext == 'xspf':
                  return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
              elif ext == 'm3u8':
-                entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4')
+                entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
              elif ext == 'mpd':
-                entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id)
+                entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
              elif ext == 'f4m':
-                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
+                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
              elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
                  # Just matching .ism/manifest is not enough to be reliably sure
                  # whether it's actually an ISM manifest or some other streaming