yt_dlp/extractor/parlview.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..compat import compat_str
   6 from ..utils import (
   7     int_or_none,
   8     try_get,
   9     unified_timestamp,
  10 )
  11
  12
  13 class ParlviewIE(InfoExtractor):
  14
  15     _VALID_URL = r'https?://(?:www\.)?parlview\.aph\.gov\.au/(?:[^/]+)?\bvideoID=(?P<id>\d{6})'
  16     _TESTS = [{
  17         'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=542661',
  18         'info_dict': {
  19             'id': '542661',
  20             'ext': 'mp4',
  21             'title': "Australia's Family Law System [Part 2]",
  22             'duration': 5799,
  23             'description': 'md5:7099883b391619dbae435891ca871a62',
  24             'timestamp': 1621430700,
  25             'upload_date': '20210519',
  26             'uploader': 'Joint Committee',
  27         },
  28         'params': {
  29             'skip_download': True,
  30         }
  31     }, {
  32         'url': 'https://parlview.aph.gov.au/mediaPlayer.php?videoID=539936',
  33         'only_matching': True,
  34     }]
  35     _API_URL = 'https://parlview.aph.gov.au/api_v3/1/playback/getUniversalPlayerConfig?videoID=%s&format=json'
  36     _MEDIA_INFO_URL = 'https://parlview.aph.gov.au/ajaxPlayer.php?videoID=%s&tabNum=4&action=loadTab'
  37
  38     def _real_extract(self, url):
  39         video_id = self._match_id(url)
  40         webpage = self._download_webpage(url, video_id)
  41         media = self._download_json(self._API_URL % video_id, video_id).get('media')
  42         timestamp = try_get(media, lambda x: x['timeMap']['source']['timecode_offsets'][0], compat_str) or '/'
  43
  44         stream = try_get(media, lambda x: x['renditions'][0], dict)
  45         if not stream:
  46             self.raise_no_formats('No streams were detected')
  47         elif stream.get('streamType') != 'VOD':
  48             self.raise_no_formats('Unknown type of stream was detected: "%s"' % str(stream.get('streamType')))
  49         formats = self._extract_m3u8_formats(stream['url'], video_id, 'mp4', 'm3u8_native')
  50         self._sort_formats(formats)
  51
  52         media_info = self._download_webpage(
  53             self._MEDIA_INFO_URL % video_id, video_id, note='Downloading media info', fatal=False)
  54
  55         return {
  56             'id': video_id,
  57             'url': url,
  58             'title': self._html_search_regex(r'<h2>([^<]+)<', webpage, 'title', fatal=False),
  59             'formats': formats,
  60             'duration': int_or_none(media.get('duration')),
  61             'timestamp': unified_timestamp(timestamp.split('/', 1)[1].replace('_', ' ')),
  62             'description': self._html_search_regex(
  63                 r'<div[^>]+class="descripti?on"[^>]*>[^>]+<strong>[^>]+>[^>]+>([^<]+)',
  64                 webpage, 'description', fatal=False),
  65             'uploader': self._html_search_regex(
  66                 r'<td>[^>]+>Channel:[^>]+>([^<]+)', media_info, 'channel', fatal=False),
  67             'thumbnail': media.get('staticImage'),
  68         }