]> jfr.im git - yt-dlp.git/blob - youtube_dlc/extractor/elonet.py
Merge branch 'elonet' of https://github.com/tpikonen/youtube-dl into tpikonen-elonet
[yt-dlp.git] / youtube_dlc / extractor / elonet.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import os
5 import re
6 import tempfile
7
8 from .common import InfoExtractor
9 from ..utils import (
10 base_url,
11 ExtractorError,
12 try_get,
13 )
14 from ..compat import compat_str
15 from ..downloader.hls import HlsFD
16
17
class ElonetIE(InfoExtractor):
    """Extractor for film records on elonet.finna.fi.

    The record page carries a ``data-video-sources`` attribute holding a JSON
    list; the ``src`` of its first entry is treated as either an HLS (m3u8)
    or a DASH (mpd) manifest, from which formats and WebVTT subtitles are
    read.
    """

    _VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
        'md5': '8efc954b96c543711707f87de757caea',
        'info_dict': {
            'id': '107867',
            'ext': 'mp4',
            'title': 'Valkoinen peura',
            'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
            'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
        },
    }

    def _download_m3u8_chunked_subtitle(self, chunklist_url):
        """
        Download VTT subtitles from pieces in manifest URL.
        Return a string containing joined chunks with extra headers removed.

        The chunks are fetched with HlsFD into a temporary file, which is
        removed again before returning, including when the download or the
        read fails.
        """
        # Local import keeps this block self-contained; io.open accepts an
        # explicit encoding on both Python 2 and 3.
        import io

        # Only the unique name is wanted: leaving the `with` block deletes the
        # file, and the downloader below is handed that name as its target.
        with tempfile.NamedTemporaryFile(delete=True) as outfile:
            fname = outfile.name
        try:
            hlsdl = HlsFD(self._downloader, {})
            hlsdl.download(compat_str(fname), {"url": chunklist_url})
            # WebVTT is UTF-8; do not rely on the platform default encoding.
            with io.open(fname, 'r', encoding='utf-8') as fin:
                # Remove (some) headers
                fdata = re.sub(r'X-TIMESTAMP-MAP.*\n+|WEBVTT\n+', '', fin.read())
        finally:
            # The file only exists if the downloader got far enough to write it.
            if os.path.exists(fname):
                os.remove(fname)
        return "WEBVTT\n\n" + fdata

    def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
        """
        Parse subtitles from HLS / m3u8 manifest.

        Return a dict mapping each language to a one-element list holding the
        downloaded VTT data. Playlist entries lacking LANGUAGE or URI are
        skipped.
        """
        subtitles = {}
        baseurl = m3u8_url[:m3u8_url.rindex('/') + 1]
        for line in m3u8_doc.split('\n'):
            if 'EXT-X-MEDIA:TYPE=SUBTITLES' not in line:
                continue
            lang = self._search_regex(
                r'LANGUAGE="(.+?)"', line, 'lang', default=False)
            uri = self._search_regex(
                r'URI="(.+?)"', line, 'uri', default=False)
            if not (lang and uri):
                continue
            # An absolute URI must not be glued onto the manifest directory.
            sub_url = uri if re.match(r'https?://', uri) else baseurl + uri
            data = self._download_m3u8_chunked_subtitle(sub_url)
            subtitles[lang] = [{'ext': 'vtt', 'data': data}]
        return subtitles

    def _parse_mpd_subtitles(self, mpd_doc):
        """
        Parse subtitles from MPD manifest.

        Return a dict mapping each language ('unk' when the AdaptationSet has
        no lang attribute) to a one-element list holding the subtitle URL.
        AdaptationSets without a Representation/BaseURL text are skipped.
        """
        ns = '{urn:mpeg:dash:schema:mpd:2011}'
        subtitles = {}
        for aset in mpd_doc.findall(".//%sAdaptationSet[@mimeType='text/vtt']" % (ns)):
            lang = aset.attrib.get('lang', 'unk')
            base = aset.find("./%sRepresentation/%sBaseURL" % (ns, ns))
            # find() returns None when the element is absent.
            if base is None or not base.text:
                continue
            subtitles[lang] = [{'ext': 'vtt', 'url': base.text}]
        return subtitles

    def _get_subtitles(self, fmt, doc, url):
        """Dispatch subtitle parsing on the manifest kind ('m3u8' or 'mpd').

        Any other kind produces a warning and an empty dict.
        """
        if fmt == 'm3u8':
            subs = self._parse_m3u8_subtitles(doc, url)
        elif fmt == 'mpd':
            subs = self._parse_mpd_subtitles(doc)
        else:
            self._downloader.report_warning(
                "Cannot download subtitles from '%s' streams." % (fmt))
            subs = {}
        return subs

    def _real_extract(self, url):
        """Build the info dict for a record page.

        Raises ExtractorError when no source URL can be read from the page or
        when the source is neither an m3u8 nor an mpd manifest.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta .*property="og&#x3A;title" .*content="(.+?)"', webpage, 'title')
        # Description and thumbnail are optional: a missing tag should not
        # abort the whole extraction.
        description = self._html_search_regex(
            r'<meta .*property="og&#x3A;description" .*content="(.+?)"', webpage, 'description',
            fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta .*property="og&#x3A;image" .*content="(.+?)"', webpage, 'thumbnail',
            fatal=False)

        json_s = self._html_search_regex(
            r'data-video-sources="(.+?)"', webpage, 'json')
        src = try_get(
            self._parse_json(json_s, video_id),
            lambda x: x[0]["src"], compat_str)
        if not src:
            # try_get yields None for an empty list or a missing/non-string src.
            raise ExtractorError("Unable to find a video source URL")

        formats = []
        subtitles = {}
        if re.search(r'\.m3u8\??', src):
            res = self._download_webpage_handle(
                # elonet servers have certificate problems
                src.replace('https:', 'http:'), video_id,
                note='Downloading m3u8 information',
                errnote='Failed to download m3u8 information')
            if res:
                doc, urlh = res
                manifest_url = urlh.geturl()
                formats = self._parse_m3u8_formats(doc, manifest_url)
                for f in formats:
                    f['ext'] = 'mp4'
                subtitles = self.extract_subtitles('m3u8', doc, manifest_url)
        elif re.search(r'\.mpd\??', src):
            res = self._download_xml_handle(
                src, video_id,
                note='Downloading MPD manifest',
                errnote='Failed to download MPD manifest')
            if res:
                doc, urlh = res
                mpd_base = base_url(urlh.geturl())
                formats = self._parse_mpd_formats(doc, mpd_base_url=mpd_base)
                subtitles = self.extract_subtitles('mpd', doc, mpd_base)
        else:
            raise ExtractorError("Unknown streaming format")

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'formats': formats,
            'subtitles': subtitles,
        }