]> jfr.im git - yt-dlp.git/blob - yt_dlp/downloader/f4m.py
[cleanup] Minor fixes (See desc)
[yt-dlp.git] / yt_dlp / downloader / f4m.py
1 import io
2 import itertools
3 import time
4
5 from .fragment import FragmentFD
6 from ..compat import (
7 compat_b64decode,
8 compat_etree_fromstring,
9 compat_struct_pack,
10 compat_struct_unpack,
11 compat_urllib_error,
12 compat_urllib_parse_urlparse,
13 compat_urlparse,
14 )
15 from ..utils import fix_xml_ampersands, xpath_text
16
17
class DataTruncatedError(Exception):
    """Raised by FlvReader.read_bytes when fewer bytes are available than requested."""
20
21
class FlvReader(io.BytesIO):
    """
    Reader for Flv files
    The file format is documented in https://www.adobe.com/devnet/f4v.html

    Wraps an in-memory byte buffer and adds helpers for reading big-endian
    integers, NUL-terminated strings and the boxes found in an HDS bootstrap
    ('abst', 'asrt' and 'afrt').
    """

    def read_bytes(self, n):
        """
        Read exactly n bytes from the current position.

        Raises DataTruncatedError when fewer than n bytes are left.
        """
        data = self.read(n)
        if len(data) < n:
            raise DataTruncatedError(
                'FlvReader error: need %d bytes while only %d bytes got' % (
                    n, len(data)))
        return data

    @staticmethod
    def _check_box_type(actual, expected):
        """
        Ensure a box has the expected four-byte type.

        Raises AssertionError explicitly (rather than via an `assert`
        statement) so that the check is still performed under `python -O`.
        """
        if actual != expected:
            raise AssertionError(
                'FlvReader error: expected %r box but got %r' % (expected, actual))

    # Utility functions for reading numbers and strings
    def read_unsigned_long_long(self):
        """Read a big-endian unsigned 64-bit integer."""
        return compat_struct_unpack('!Q', self.read_bytes(8))[0]

    def read_unsigned_int(self):
        """Read a big-endian unsigned 32-bit integer."""
        return compat_struct_unpack('!I', self.read_bytes(4))[0]

    def read_unsigned_char(self):
        """Read a single unsigned byte."""
        return compat_struct_unpack('!B', self.read_bytes(1))[0]

    def read_string(self):
        """
        Read a NUL-terminated byte string and return it without the NUL.

        Raises DataTruncatedError if the buffer ends before a NUL is found.
        """
        # iter(callable, sentinel) keeps reading single bytes until the
        # terminator shows up; join avoids repeated bytes concatenation.
        return b''.join(iter(lambda: self.read_bytes(1), b'\x00'))

    def read_box_info(self):
        """
        Read a box and return the info as a tuple: (box_size, box_type, box_data)
        """
        real_size = size = self.read_unsigned_int()
        box_type = self.read_bytes(4)
        header_end = 8
        if size == 1:
            # A 32-bit size of 1 means the real size follows as a 64-bit
            # integer, which makes the header 16 bytes long instead of 8.
            real_size = self.read_unsigned_long_long()
            header_end = 16
        return real_size, box_type, self.read_bytes(real_size - header_end)

    def read_asrt(self):
        """
        Parse the payload of an 'asrt' box.

        Returns a dict whose 'segment_run' key holds a list of
        (first_segment, fragments_per_segment) tuples.
        """
        # version
        self.read_unsigned_char()
        # flags
        self.read_bytes(3)
        quality_entry_count = self.read_unsigned_char()
        # QualityEntryCount
        for _ in range(quality_entry_count):
            self.read_string()

        segment_run_count = self.read_unsigned_int()
        # Tuple elements are evaluated left to right, so each entry is read
        # as first_segment followed by fragments_per_segment.
        segments = [
            (self.read_unsigned_int(), self.read_unsigned_int())
            for _ in range(segment_run_count)]

        return {
            'segment_run': segments,
        }

    def read_afrt(self):
        """
        Parse the payload of an 'afrt' box.

        Returns a dict whose 'fragments' key holds one dict per entry with
        the keys 'first', 'ts', 'duration' and 'discontinuity_indicator'.
        """
        # version
        self.read_unsigned_char()
        # flags
        self.read_bytes(3)
        # time scale
        self.read_unsigned_int()

        quality_entry_count = self.read_unsigned_char()
        # QualitySegmentUrlModifiers
        for _ in range(quality_entry_count):
            self.read_string()

        fragments_count = self.read_unsigned_int()
        fragments = []
        for _ in range(fragments_count):
            first = self.read_unsigned_int()
            first_ts = self.read_unsigned_long_long()
            duration = self.read_unsigned_int()
            # The discontinuity byte is only present for zero-duration entries
            if duration == 0:
                discontinuity_indicator = self.read_unsigned_char()
            else:
                discontinuity_indicator = None
            fragments.append({
                'first': first,
                'ts': first_ts,
                'duration': duration,
                'discontinuity_indicator': discontinuity_indicator,
            })

        return {
            'fragments': fragments,
        }

    def read_abst(self):
        """
        Parse the payload of an 'abst' box.

        Returns a dict with the parsed 'asrt' boxes under 'segments', the
        parsed 'afrt' boxes under 'fragments' and a boolean 'live' flag.
        Raises AssertionError if a nested box does not have the expected type.
        """
        # version
        self.read_unsigned_char()
        # flags
        self.read_bytes(3)

        self.read_unsigned_int()  # BootstrapinfoVersion
        # Profile,Live,Update,Reserved
        flags = self.read_unsigned_char()
        live = flags & 0x20 != 0
        # time scale
        self.read_unsigned_int()
        # CurrentMediaTime
        self.read_unsigned_long_long()
        # SmpteTimeCodeOffset
        self.read_unsigned_long_long()

        self.read_string()  # MovieIdentifier
        server_count = self.read_unsigned_char()
        # ServerEntryTable
        for _ in range(server_count):
            self.read_string()
        quality_count = self.read_unsigned_char()
        # QualityEntryTable
        for _ in range(quality_count):
            self.read_string()
        # DrmData
        self.read_string()
        # MetaData
        self.read_string()

        segments_count = self.read_unsigned_char()
        segments = []
        for _ in range(segments_count):
            _box_size, box_type, box_data = self.read_box_info()
            self._check_box_type(box_type, b'asrt')
            segments.append(FlvReader(box_data).read_asrt())
        fragments_run_count = self.read_unsigned_char()
        fragments = []
        for _ in range(fragments_run_count):
            _box_size, box_type, box_data = self.read_box_info()
            self._check_box_type(box_type, b'afrt')
            fragments.append(FlvReader(box_data).read_afrt())

        return {
            'segments': segments,
            'fragments': fragments,
            'live': live,
        }

    def read_bootstrap_info(self):
        """
        Read one box, require it to be 'abst' and return its parsed content.

        Raises AssertionError if the box has a different type.
        """
        _total_size, box_type, box_data = self.read_box_info()
        self._check_box_type(box_type, b'abst')
        return FlvReader(box_data).read_abst()
177
178
def read_bootstrap_info(bootstrap_bytes):
    """Parse raw bootstrap bytes with FlvReader and return the resulting dict."""
    reader = FlvReader(bootstrap_bytes)
    return reader.read_bootstrap_info()
181
182
def build_fragments_list(boot_info):
    """ Return a list of (segment, fragment) for each fragment in the video """
    segment_table = boot_info['segments'][0]
    fragment_entries = boot_info['fragments'][0]['fragments']
    # Fragment numbers run consecutively, starting at the first table entry
    frag_numbers = itertools.count(fragment_entries[0]['first'])

    pairs = []
    for seg_number, frag_total in segment_table['segment_run']:
        # In some live HDS streams (for example Rai), the fragment count is
        # abnormal and causing out-of-memory errors. It's OK to change the
        # number of fragments for live streams as they are updated periodically
        if frag_total == 4294967295 and boot_info['live']:
            frag_total = 2
        pairs.extend(
            (seg_number, frag_number)
            for frag_number in itertools.islice(frag_numbers, frag_total))

    # Only the two most recent fragments are kept for live streams
    return pairs[-2:] if boot_info['live'] else pairs
203
204
def write_unsigned_int(stream, val):
    """Write val to stream as a big-endian unsigned 32-bit integer."""
    packed = compat_struct_pack('!I', val)
    stream.write(packed)
207
208
def write_unsigned_int_24(stream, val):
    """Write the three low-order bytes of val to stream in big-endian order."""
    packed = compat_struct_pack('!I', val)
    # The packed value is four bytes long; the most significant one is dropped
    stream.write(packed[-3:])
211
212
def write_flv_header(stream):
    """Writes the FLV header to stream"""
    # FLV header, written as four consecutive chunks (13 bytes in total)
    header_chunks = (
        b'FLV\x01',           # 'FLV' followed by the byte 0x01
        b'\x05',              # presumably the audio/video flags byte -- TODO confirm against the FLV spec
        b'\x00\x00\x00\x09',  # big-endian 9, matching the 9 bytes written up to this point
        b'\x00\x00\x00\x00',  # four zero bytes closing the header
    )
    for chunk in header_chunks:
        stream.write(chunk)
220
221
def write_metadata_tag(stream, metadata):
    """Writes optional metadata tag to stream"""
    # Nothing is written when there is no metadata (None or empty)
    if not metadata:
        return

    script_tag = b'\x12'
    tag_header_len = 11
    payload_len = len(metadata)

    stream.write(script_tag)
    write_unsigned_int_24(stream, payload_len)
    # Remaining seven header bytes, all zero (1 + 3 + 7 = 11 header bytes)
    stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
    stream.write(metadata)
    # Trailer: combined length of the tag header and the payload
    write_unsigned_int(stream, tag_header_len + payload_len)
233
234
def remove_encrypted_media(media):
    """Return a list of the media nodes that carry neither DRM header attribute."""
    drm_attributes = ('drmAdditionalHeaderId', 'drmAdditionalHeaderSetId')
    return [
        node for node in media
        if not any(attr in node.attrib for attr in drm_attributes)]
239
240
241 def _add_ns(prop, ver=1):
242 return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop)
243
244
def get_base_url(manifest):
    """
    Return the manifest's baseURL text with surrounding whitespace removed.

    The element is looked up under both the 1.0 and the 2.0 f4m namespace.
    A falsy lookup result (None or an empty string) is returned unchanged.
    """
    candidate_paths = [_add_ns('baseURL'), _add_ns('baseURL', 2)]
    found = xpath_text(manifest, candidate_paths, 'base URL', default=None)
    return found.strip() if found else found
252
253
class F4mFD(FragmentFD):
    """
    A downloader for f4m manifests or AdobeHDS.
    """

    FD_NAME = 'f4m'

    def _get_unencrypted_media(self, doc):
        """
        Return the media nodes of the manifest doc that can be downloaded.

        Unless the 'allow_unplayable_formats' param is set, nodes that
        reference a DRM header are filtered out. Problems are reported
        through self.report_error.
        """
        media_nodes = doc.findall(_add_ns('media'))
        if not media_nodes:
            self.report_error('No media found')
        if not self.params.get('allow_unplayable_formats'):
            drm_headers = (
                doc.findall(_add_ns('drmAdditionalHeader'))
                + doc.findall(_add_ns('drmAdditionalHeaderSet')))
            for header in drm_headers:
                # If id attribute is missing it's valid for all media nodes
                # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute
                if 'id' not in header.attrib:
                    self.report_error('Missing ID in f4m DRM')
            media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            self.report_error('Unsupported DRM')
        return media_nodes

    def _get_bootstrap_from_url(self, bootstrap_url):
        """Download the bootstrap at bootstrap_url and return it parsed."""
        response = self.ydl.urlopen(bootstrap_url)
        return read_bootstrap_info(response.read())

    def _update_live_fragments(self, bootstrap_url, latest_fragment):
        """
        Poll the bootstrap for fragments numbered after latest_fragment.

        The bootstrap is fetched up to 30 times with a 5 second pause after
        every unsuccessful attempt. An error is reported if no newer
        fragment shows up. Returns the list of newer (segment, fragment)
        tuples, which may be empty.
        """
        newer_fragments = []
        for _ in range(30):
            boot_info = self._get_bootstrap_from_url(bootstrap_url)
            newer_fragments = [
                entry for entry in build_fragments_list(boot_info)
                if entry[1] > latest_fragment]
            if newer_fragments:
                break
            # Retry after a while
            time.sleep(5.0)

        if not newer_fragments:
            self.report_error('Failed to update fragments')

        return newer_fragments

    def _parse_bootstrap_node(self, node, base_url):
        """
        Return (boot_info, bootstrap_url) for a bootstrapInfo node.

        bootstrap_url is None when the bootstrap is taken from the node's
        inline base64 text.
        """
        # Sometimes non empty inline bootstrap info can be specified along
        # with bootstrap url attribute (e.g. dummy inline bootstrap info
        # contains whitespace characters in [1]). We will prefer bootstrap
        # url over inline bootstrap info when present.
        # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m
        url_attr = node.get('url')
        if not url_attr:
            inline_bootstrap = compat_b64decode(node.text)
            return read_bootstrap_info(inline_bootstrap), None

        bootstrap_url = compat_urlparse.urljoin(base_url, url_attr)
        return self._get_bootstrap_from_url(bootstrap_url), bootstrap_url

    def real_download(self, filename, info_dict):
        """
        Download the stream described by the f4m manifest at info_dict['url'].

        Returns True once all fragments were processed and False as soon as
        a fragment download reports failure.
        """
        man_url = info_dict['url']
        requested_bitrate = info_dict.get('tbr')
        self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)

        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
        man_url = urlh.geturl()
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244
        # and https://github.com/ytdl-org/youtube-dl/issues/7823)
        manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip()

        doc = compat_etree_fromstring(manifest)
        formats = [
            (int(node.attrib.get('bitrate', -1)), node)
            for node in self._get_unencrypted_media(doc)]
        if requested_bitrate is None or len(formats) == 1:
            # get the best format
            _, media = sorted(formats, key=lambda fmt: fmt[0])[-1]
        else:
            matching = [fmt for fmt in formats if int(fmt[0]) == requested_bitrate]
            _, media = matching[0]

        # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec.
        man_base_url = get_base_url(doc) or man_url

        base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url'])
        bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
        boot_info, bootstrap_url = self._parse_bootstrap_node(
            bootstrap_node, man_base_url)
        live = boot_info['live']
        metadata_node = media.find(_add_ns('metadata'))
        metadata = (
            None if metadata_node is None
            else compat_b64decode(metadata_node.text))

        fragments_list = build_fragments_list(boot_info)
        test = self.params.get('test', False)
        if test:
            # We only download the first fragment
            fragments_list = fragments_list[:1]
        total_frags = len(fragments_list)
        # For some akamai manifests we'll need to add a query to the fragment url
        akamai_pv = xpath_text(doc, _add_ns('pv-2.0'))

        ctx = {
            'filename': filename,
            'total_frags': total_frags,
            'live': bool(live),
        }

        self._prepare_frag_download(ctx)

        dest_stream = ctx['dest_stream']

        # The headers are only written when no fragment bytes are recorded yet
        if ctx['complete_frags_downloaded_bytes'] == 0:
            write_flv_header(dest_stream)
            if not live:
                write_metadata_tag(dest_stream, metadata)

        base_url_parsed = compat_urllib_parse_urlparse(base_url)

        self._start_frag_download(ctx, info_dict)

        frag_index = 0
        while fragments_list:
            seg_i, frag_i = fragments_list.pop(0)
            frag_index += 1
            # Skip fragments at or below the index recorded in ctx
            if frag_index <= ctx['fragment_index']:
                continue

            name = 'Seg%d-Frag%d' % (seg_i, frag_i)
            query_parts = []
            if base_url_parsed.query:
                query_parts.append(base_url_parsed.query)
            if akamai_pv:
                query_parts.append(akamai_pv.strip(';'))
            extra_param = info_dict.get('extra_param_to_segment_url')
            if extra_param:
                query_parts.append(extra_param)
            frag_url = base_url_parsed._replace(
                path=base_url_parsed.path + name,
                query='&'.join(query_parts)).geturl()

            try:
                success, down_data = self._download_fragment(ctx, frag_url, info_dict)
                if not success:
                    return False
                reader = FlvReader(down_data)
                # Walk the boxes until the first 'mdat' box and append its payload
                while True:
                    try:
                        _, box_type, box_data = reader.read_box_info()
                    except DataTruncatedError:
                        if not test:
                            raise
                        # In tests, segments may be truncated, and thus
                        # FlvReader may not be able to parse the whole
                        # chunk. If so, write the segment as is
                        # See https://github.com/ytdl-org/youtube-dl/issues/9214
                        dest_stream.write(down_data)
                        break
                    if box_type == b'mdat':
                        self._append_fragment(ctx, box_data)
                        break
            except compat_urllib_error.HTTPError as err:
                if not (live and err.code in (404, 410)):
                    raise
                # We didn't keep up with the live window. Continue
                # with the next available fragment.
                self.report_warning('Fragment %d unavailable' % frag_i)
                fragments_list = []

            if live and bootstrap_url and not test and not fragments_list:
                fragments_list = self._update_live_fragments(bootstrap_url, frag_i)
                total_frags += len(fragments_list)
                if fragments_list:
                    missed = fragments_list[0][1] - (frag_i + 1)
                    if missed > 0:
                        self.report_warning('Missed %d fragments' % missed)

        self._finish_frag_download(ctx, info_dict)

        return True