[ie/Sejm,RedCDNLivx] Add extractors (#8676)

author lauren n. liberda <redacted>

Sun, 21 Jan 2024 02:22:26 +0000 (03:22 +0100)

committer GitHub <redacted>

Sun, 21 Jan 2024 02:22:26 +0000 (03:22 +0100)
author lauren n. liberda <redacted>
Sun, 21 Jan 2024 02:22:26 +0000 (03:22 +0100)
committer GitHub <redacted>
Sun, 21 Jan 2024 02:22:26 +0000 (03:22 +0100)
diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py

index 3d5c3eb60f319f788b22e771eca73c53bfb842fd..31bef1eb5d5851fb0b1396ac911b6d0eee55ee87 100644 (file)
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1593,6 +1593,7 @@
      RedBullIE,
  )
  from .reddit import RedditIE
      RedBullIE,
  )
  from .reddit import RedditIE
+from .redge import RedCDNLivxIE
  from .redgifs import (
      RedGifsIE,
      RedGifsSearchIE,
  from .redgifs import (
      RedGifsIE,
      RedGifsSearchIE,
@@ -1727,6 +1728,7 @@
  )
  from .scrolller import ScrolllerIE
  from .seeker import SeekerIE
  )
  from .scrolller import ScrolllerIE
  from .seeker import SeekerIE
+from .sejmpl import SejmIE
  from .senalcolombia import SenalColombiaLiveIE
  from .senategov import SenateISVPIE, SenateGovIE
  from .sendtonews import SendtoNewsIE
  from .senalcolombia import SenalColombiaLiveIE
  from .senategov import SenateISVPIE, SenateGovIE
  from .sendtonews import SendtoNewsIE
diff --git a/yt_dlp/extractor/redge.py b/yt_dlp/extractor/redge.py

new file mode 100644 (file)

index 0000000..875d6f8
--- /dev/null
+++ b/yt_dlp/extractor/redge.py
@@ -0,0 +1,135 @@
+import functools
+
+from .common import InfoExtractor
+from ..networking import HEADRequest
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    join_nonempty,
+    parse_qs,
+    update_url_query,
+)
+from ..utils.traversal import traverse_obj
+
+
+class RedCDNLivxIE(InfoExtractor):
+    _VALID_URL = r'https?://[^.]+\.(?:dcs\.redcdn|atmcdn)\.pl/(?:live(?:dash|hls|ss)|nvr)/o2/(?P<tenant>[^/?#]+)/(?P<id>[^?#]+)\.livx'
+    IE_NAME = 'redcdnlivx'
+
+    _TESTS = [{
+        'url': 'https://r.dcs.redcdn.pl/livedash/o2/senat/ENC02/channel.livx?indexMode=true&startTime=638272860000&stopTime=638292544000',
+        'info_dict': {
+            'id': 'ENC02-638272860000-638292544000',
+            'ext': 'mp4',
+            'title': 'ENC02',
+            'duration': 19683.982,
+            'live_status': 'was_live',
+        },
+    }, {
+        'url': 'https://r.dcs.redcdn.pl/livedash/o2/sejm/ENC18/live.livx?indexMode=true&startTime=722333096000&stopTime=722335562000',
+        'info_dict': {
+            'id': 'ENC18-722333096000-722335562000',
+            'ext': 'mp4',
+            'title': 'ENC18',
+            'duration': 2463.995,
+            'live_status': 'was_live',
+        },
+    }, {
+        'url': 'https://r.dcs.redcdn.pl/livehls/o2/sportevolution/live/triathlon2018/warsaw.livx/playlist.m3u8?startTime=550305000000&stopTime=550327620000',
+        'info_dict': {
+            'id': 'triathlon2018-warsaw-550305000000-550327620000',
+            'ext': 'mp4',
+            'title': 'triathlon2018/warsaw',
+            'duration': 22619.98,
+            'live_status': 'was_live',
+        },
+    }, {
+        'url': 'https://n-25-12.dcs.redcdn.pl/nvr/o2/sejm/Migacz-ENC01/1.livx?startTime=722347200000&stopTime=722367345000',
+        'only_matching': True,
+    }, {
+        'url': 'https://redir.atmcdn.pl/nvr/o2/sejm/ENC08/1.livx?startTime=503831270000&stopTime=503840040000',
+        'only_matching': True,
+    }]
+
+    """
+    Known methods (first in url path):
+    - `livedash` - DASH MPD
+    - `livehls` - HTTP Live Streaming
+    - `livess` - IIS Smooth Streaming
+    - `nvr` - CCTV mode, directly returns a file, typically flv, avc1, aac
+    - `sc` - shoutcast/icecast (audio streams, like radio)
+    """
+
+    def _real_extract(self, url):
+        tenant, path = self._match_valid_url(url).group('tenant', 'id')
+        qs = parse_qs(url)
+        start_time = traverse_obj(qs, ('startTime', 0, {int_or_none}))
+        stop_time = traverse_obj(qs, ('stopTime', 0, {int_or_none}))
+
+        def livx_mode(mode):
+            suffix = ''
+            if mode == 'livess':
+                suffix = '/manifest'
+            elif mode == 'livehls':
+                suffix = '/playlist.m3u8'
+            file_qs = {}
+            if start_time:
+                file_qs['startTime'] = start_time
+            if stop_time:
+                file_qs['stopTime'] = stop_time
+            if mode == 'nvr':
+                file_qs['nolimit'] = 1
+            elif mode != 'sc':
+                file_qs['indexMode'] = 'true'
+            return update_url_query(f'https://r.dcs.redcdn.pl/{mode}/o2/{tenant}/{path}.livx{suffix}', file_qs)
+
+        # no id or title for a transmission. making ones up.
+        title = path \
+            .replace('/live', '').replace('live/', '') \
+            .replace('/channel', '').replace('channel/', '') \
+            .strip('/')
+        video_id = join_nonempty(title.replace('/', '-'), start_time, stop_time)
+
+        formats = []
+        # downloading the manifest separately here instead of _extract_ism_formats to also get some stream metadata
+        ism_res = self._download_xml_handle(
+            livx_mode('livess'), video_id,
+            note='Downloading ISM manifest',
+            errnote='Failed to download ISM manifest',
+            fatal=False)
+        ism_doc = None
+        if ism_res is not False:
+            ism_doc, ism_urlh = ism_res
+            formats, _ = self._parse_ism_formats_and_subtitles(ism_doc, ism_urlh.url, 'ss')
+
+        nvr_urlh = self._request_webpage(
+            HEADRequest(livx_mode('nvr')), video_id, 'Follow flv file redirect', fatal=False,
+            expected_status=lambda _: True)
+        if nvr_urlh and nvr_urlh.status == 200:
+            formats.append({
+                'url': nvr_urlh.url,
+                'ext': 'flv',
+                'format_id': 'direct-0',
+                'preference': -1,   # might be slow
+            })
+        formats.extend(self._extract_mpd_formats(livx_mode('livedash'), video_id, mpd_id='dash', fatal=False))
+        formats.extend(self._extract_m3u8_formats(
+            livx_mode('livehls'), video_id, m3u8_id='hls', ext='mp4', fatal=False))
+
+        time_scale = traverse_obj(ism_doc, ('@TimeScale', {int_or_none})) or 10000000
+        duration = traverse_obj(
+            ism_doc, ('@Duration', {functools.partial(float_or_none, scale=time_scale)})) or None
+
+        live_status = None
+        if traverse_obj(ism_doc, '@IsLive') == 'TRUE':
+            live_status = 'is_live'
+        elif duration:
+            live_status = 'was_live'
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'duration': duration,
+            'live_status': live_status,
+        }
diff --git a/yt_dlp/extractor/sejmpl.py b/yt_dlp/extractor/sejmpl.py

new file mode 100644 (file)

index 0000000..29cb015
--- /dev/null
+++ b/yt_dlp/extractor/sejmpl.py
@@ -0,0 +1,218 @@
+import datetime
+
+from .common import InfoExtractor
+from .redge import RedCDNLivxIE
+from ..utils import (
+    clean_html,
+    join_nonempty,
+    js_to_json,
+    strip_or_none,
+    update_url_query,
+)
+from ..utils.traversal import traverse_obj
+
+
+def is_dst(date):
+    last_march = datetime.datetime(date.year, 3, 31)
+    last_october = datetime.datetime(date.year, 10, 31)
+    last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7)
+    last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7)
+    return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
+
+
+def rfc3339_to_atende(date):
+    date = datetime.datetime.fromisoformat(date)
+    date = date + datetime.timedelta(hours=1 if is_dst(date) else 0)
+    return int((date.timestamp() - 978307200) * 1000)
+
+
+class SejmIE(InfoExtractor):
+    _VALID_URL = (
+        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
+        r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
+        r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
+    )
+    IE_NAME = 'sejm'
+
+    _TESTS = [{
+        # multiple cameras, polish SL iterpreter
+        'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
+        'info_dict': {
+            'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
+            'title': '1. posiedzenie Sejmu X kadencji',
+            'duration': 20145,
+            'live_status': 'was_live',
+            'location': 'Sala Posiedzeń',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'ENC01-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC01',
+                'live_status': 'was_live',
+            },
+        }, {
+            'info_dict': {
+                'id': 'ENC30-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC30',
+                'live_status': 'was_live',
+            },
+        }, {
+            'info_dict': {
+                'id': 'ENC31-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC31',
+                'live_status': 'was_live',
+            },
+        }, {
+            'info_dict': {
+                'id': 'ENC32-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - ENC32',
+                'live_status': 'was_live',
+            },
+        }, {
+            # sign lang interpreter
+            'info_dict': {
+                'id': 'Migacz-ENC01-1-722340000000-722360145000',
+                'ext': 'mp4',
+                'duration': 20145,
+                'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
+                'live_status': 'was_live',
+            },
+        }],
+    }, {
+        'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
+        'info_dict': {
+            'id': '9377A9D65518E9A5C125808E002E9FF2',
+            'title': 'Debata "Lepsza Polska: obywatelska"',
+            'description': 'KP .Nowoczesna',
+            'duration': 8770,
+            'live_status': 'was_live',
+            'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'ENC08-1-503831270000-503840040000',
+                'ext': 'mp4',
+                'duration': 8770,
+                'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
+                'live_status': 'was_live',
+            },
+        }],
+    }, {
+        # 7th term is very special, since it does not use redcdn livx
+        'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
+        'info_dict': {
+            'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
+            'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
+            'description': 'SLD - Biuro Prasowe Klubu',
+            'duration': 514,
+            'location': 'sala 101/bud. C',
+            'live_status': 'was_live',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
+                'ext': 'mp4',
+                'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
+                'duration': 514,
+            },
+        }],
+    }, {
+        'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        term, video_id = self._match_valid_url(url).group('term', 'id')
+        frame = self._download_webpage(
+            f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
+            video_id)
+        # despite it says "transmisje_arch", it works for live streams too!
+        data = self._download_json(
+            f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
+            video_id)
+        params = data['params']
+
+        title = strip_or_none(data.get('title'))
+
+        if data.get('status') == 'VIDEO_ENDED':
+            live_status = 'was_live'
+        elif data.get('status') == 'VIDEO_PLAYING':
+            live_status = 'is_live'
+        else:
+            live_status = None
+            self.report_warning(f'unknown status: {data.get("status")}')
+
+        start_time = rfc3339_to_atende(params['start'])
+        # current streams have a stop time of *expected* end of session, but actual times
+        # can change during the transmission. setting a stop_time would artificially
+        # end the stream at that time, while the session actually keeps going.
+        if live_status == 'was_live':
+            stop_time = rfc3339_to_atende(params['stop'])
+            duration = (stop_time - start_time) // 1000
+        else:
+            stop_time, duration = None, None
+
+        entries = []
+
+        def add_entry(file, legacy_file=False):
+            if not file:
+                return
+            file = self._proto_relative_url(file)
+            if not legacy_file:
+                file = update_url_query(file, {'startTime': start_time})
+                if stop_time is not None:
+                    file = update_url_query(file, {'stopTime': stop_time})
+                stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
+            common_info = {
+                'url': file,
+                'duration': duration,
+            }
+            if legacy_file:
+                entries.append({
+                    **common_info,
+                    'id': video_id,
+                    'title': title,
+                })
+            else:
+                entries.append({
+                    **common_info,
+                    '_type': 'url_transparent',
+                    'ie_key': RedCDNLivxIE.ie_key(),
+                    'id': stream_id,
+                    'title': join_nonempty(title, stream_id, delim=' - '),
+                })
+
+        cameras = self._search_json(
+            r'var\s+cameras\s*=', frame, 'camera list', video_id,
+            contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
+            fatal=False) or []
+        for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
+            if camera_file.get('flv'):
+                add_entry(camera_file['flv'])
+            elif camera_file.get('mp4'):
+                # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
+                add_entry(camera_file['mp4'], legacy_file=True)
+            else:
+                self.report_warning('Unknown camera stream type found')
+
+        if params.get('mig'):
+            add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': video_id,
+            'title': title,
+            'description': clean_html(data.get('desc')) or None,
+            'duration': duration,
+            'live_status': live_status,
+            'location': strip_or_none(data.get('location')),
+        }
author	lauren n. liberda <redacted>
	Sun, 21 Jan 2024 02:22:26 +0000 (03:22 +0100)
committer	GitHub <redacted>
	Sun, 21 Jan 2024 02:22:26 +0000 (03:22 +0100)
yt_dlp/extractor/_extractors.py		patch \| blob \| blame \| history
yt_dlp/extractor/redge.py	[new file with mode: 0644]	patch \| blob
yt_dlp/extractor/sejmpl.py	[new file with mode: 0644]	patch \| blob