yt_dlp/extractor/nfb.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import int_or_none
   6
   7
   8 class NFBIE(InfoExtractor):
   9     _VALID_URL = r'https?://(?:www\.)?nfb\.ca/film/(?P<id>[^/?#&]+)'
  10     _TESTS = [{
  11         'url': 'https://www.nfb.ca/film/trafficopter/',
  12         'info_dict': {
  13             'id': 'trafficopter',
  14             'ext': 'mp4',
  15             'title': 'Trafficopter',
  16             'description': 'md5:060228455eb85cf88785c41656776bc0',
  17             'thumbnail': r're:^https?://.*\.jpg$',
  18             'uploader': 'Barrie Howells',
  19             'release_year': 1972,
  20         },
  21     }]
  22
  23     def _real_extract(self, url):
  24         video_id = self._match_id(url)
  25
  26         webpage = self._download_webpage('https://www.nfb.ca/film/%s/' % video_id, video_id)
  27
  28         iframe = self._html_search_regex(
  29             r'<[^>]+\bid=["\']player-iframe["\'][^>]*src=["\']([^"\']+)',
  30             webpage, 'iframe', default=None, fatal=True)
  31         if iframe.startswith('/'):
  32             iframe = f'https://www.nfb.ca{iframe}'
  33
  34         player = self._download_webpage(iframe, video_id)
  35
  36         source = self._html_search_regex(
  37             r'source:\s*\'([^\']+)',
  38             player, 'source', default=None, fatal=True)
  39
  40         formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4')
  41         self._sort_formats(formats)
  42
  43         return {
  44             'id': video_id,
  45             'title': self._html_search_regex(
  46                 r'<[^>]+\bid=["\']titleHeader["\'][^>]*>\s*<h1[^>]*>\s*([^<]+?)\s*</h1>',
  47                 webpage, 'title', default=None),
  48             'description': self._html_search_regex(
  49                 r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
  50                 webpage, 'description', default=None),
  51             'thumbnail': self._html_search_regex(
  52                 r'poster:\s*\'([^\']+)',
  53                 player, 'thumbnail', default=None),
  54             'uploader': self._html_search_regex(
  55                 r'<[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
  56                 webpage, 'uploader', default=None),
  57             'release_year': int_or_none(self._html_search_regex(
  58                 r'<[^>]+\bitemprop=["\']datePublished["\'][^>]*>([^<]+)',
  59                 webpage, 'release_year', default=None)),
  60             'formats': formats,
  61             'subtitles': subtitles,
  62         }