]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/nfb.py
[viewster] extract all http formats
[yt-dlp.git] / youtube_dl / extractor / nfb.py
CommitLineData
94c4abce
S
1from __future__ import unicode_literals
2
94c4abce 3from .common import InfoExtractor
6e6bc8da
S
4from ..utils import (
5 sanitized_Request,
6 urlencode_postdata,
7)
94c4abce
S
8
9
class NFBIE(InfoExtractor):
    """Information extractor for film pages on nfb.ca / onf.ca.

    The film page is scraped for the director (reported as uploader), and
    the player configuration XML is queried for the title, description,
    duration, poster image and the RTMP stream variants.
    """

    IE_NAME = 'nfb'
    IE_DESC = 'National Film Board of Canada'
    _VALID_URL = r'https?://(?:www\.)?(?:nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'

    _TEST = {
        'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
        'info_dict': {
            'id': 'qallunaat_why_white_people_are_funny',
            'ext': 'mp4',
            'title': 'Qallunaat! Why White People Are Funny ',
            'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
            'duration': 3128,
            'uploader': 'Mark Sandiford',
            'uploader_id': 'mark-sandiford',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the film addressed by *url*.

        Returns a dict with the keys 'id', 'title', 'description',
        'thumbnail', 'duration', 'uploader', 'uploader_id' and 'formats'.
        Every field except 'id' and 'title' is best-effort: when the data
        is absent or malformed in the player config the field is None
        (or the stream variant is left out of 'formats') instead of the
        extraction being aborted.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(
            'https://www.nfb.ca/film/%s' % video_id, video_id,
            'Downloading film page')

        # The director is optional metadata, hence fatal=False.
        uploader_id = self._html_search_regex(
            r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
            page, 'director id', fatal=False)
        uploader = self._html_search_regex(
            r'<em class="director-name" itemprop="name">([^<]+)</em>',
            page, 'director name', fatal=False)

        # The player config is requested with a form-encoded POST body.
        request = sanitized_Request(
            'https://www.nfb.ca/film/%s/player_config' % video_id,
            urlencode_postdata({'getConfig': 'true'}))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')

        config = self._download_xml(request, video_id, 'Downloading player config XML')

        def text_of(node, path):
            """Return the text of the first element at *path*, or None."""
            found = node.find(path)
            # Compare against None explicitly: an Element without children
            # is falsy, so a plain truth test would reject valid matches.
            return found.text if found is not None else None

        def extract_thumbnail(media):
            """Return the poster URL, preferring the 'high' quality asset."""
            thumbnails = {}
            for asset in media.findall('assets/asset'):
                thumb_url = text_of(asset, 'default/url')
                if thumb_url:
                    thumbnails[asset.get('quality')] = thumb_url
            if not thumbnails:
                return None
            if 'high' in thumbnails:
                return thumbnails['high']
            return next(iter(thumbnails.values()))

        title = None
        description = None
        thumbnail = None
        duration = None
        formats = []

        for media in config.findall('./player/stream/media'):
            media_type = media.get('type')
            if media_type == 'posterImage':
                thumbnail = extract_thumbnail(media)
            elif media_type == 'video':
                # The duration attribute is optional metadata; a missing or
                # non-numeric value must not abort the extraction.
                try:
                    duration = int(media.get('duration'))
                except (TypeError, ValueError):
                    duration = None
                # The title is a mandatory field, so a missing <title>
                # element is deliberately left to fail loudly here.
                title = media.find('title').text
                description = text_of(media, 'description')
                # It seems assets always go from lower to better quality, so no need to sort
                for asset in media.findall('assets/asset'):
                    quality = asset.get('quality')
                    for variant in asset:
                        streamer_uri = text_of(variant, 'streamerURI')
                        play_path = text_of(variant, 'url')
                        if not streamer_uri or not play_path:
                            # Not a usable stream description.
                            continue
                        # 'app' is everything after the third slash of the
                        # streamer URI; skip URIs too short to contain one.
                        uri_parts = streamer_uri.split('/', 3)
                        if len(uri_parts) < 4:
                            continue
                        formats.append({
                            'url': streamer_uri,
                            'app': uri_parts[3],
                            'play_path': play_path,
                            'rtmp_live': False,
                            'ext': 'mp4',
                            'format_id': '%s-%s' % (variant.tag, quality),
                        })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'formats': formats,
        }