]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
twitch.tv chapters (#810): print out start and end time
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import gzip
5 import io
6 import json
7 import locale
8 import os
9 import re
10 import sys
11 import traceback
12 import zlib
13 import email.utils
14 import json
15 import datetime
16
17 try:
18 import urllib.request as compat_urllib_request
19 except ImportError: # Python 2
20 import urllib2 as compat_urllib_request
21
22 try:
23 import urllib.error as compat_urllib_error
24 except ImportError: # Python 2
25 import urllib2 as compat_urllib_error
26
27 try:
28 import urllib.parse as compat_urllib_parse
29 except ImportError: # Python 2
30 import urllib as compat_urllib_parse
31
32 try:
33 from urllib.parse import urlparse as compat_urllib_parse_urlparse
34 except ImportError: # Python 2
35 from urlparse import urlparse as compat_urllib_parse_urlparse
36
37 try:
38 import http.cookiejar as compat_cookiejar
39 except ImportError: # Python 2
40 import cookielib as compat_cookiejar
41
42 try:
43 import html.entities as compat_html_entities
44 except ImportError: # Python 2
45 import htmlentitydefs as compat_html_entities
46
47 try:
48 import html.parser as compat_html_parser
49 except ImportError: # Python 2
50 import HTMLParser as compat_html_parser
51
52 try:
53 import http.client as compat_http_client
54 except ImportError: # Python 2
55 import httplib as compat_http_client
56
57 try:
58 from subprocess import DEVNULL
59 compat_subprocess_get_DEVNULL = lambda: DEVNULL
60 except ImportError:
61 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
62
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with *encoding*/*errors*, so multi-byte characters survive.
        Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # Python 2's hex codec reports non-hex or odd-length input
                # with TypeError (Python 3's bytes.fromhex uses ValueError),
                # so both must be treated as "not a valid escape".
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields without a value are
        dropped unless keep_blank_values is true; with strict_parsing a field
        without '=' raises ValueError.
        """
        _coerce_result = unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
141
# Text type and "code point -> character" function for the running
# interpreter: the unicode flavours on Python 2, the builtins on Python 3.
if sys.version_info < (3, 0):
    compat_str = unicode # Python 2
    compat_chr = unichr # Python 2
else:
    compat_str = str
    compat_chr = chr

# Default HTTP headers, keyed by header name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
159
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    Falls back to 'UTF-8' when the locale cannot be queried or names an
    encoding Python cannot actually encode with.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, ...), but unlike a
        # bare except this lets KeyboardInterrupt/SystemExit through.
        pref = 'UTF-8'

    return pref
173
def compat_print(s):
    """Print the text *s* to standard output on both Python 2 and 3.

    On Python 2 the text is first encoded with the preferred encoding,
    replacing unencodable characters by XML character references.
    """
    if sys.version_info >= (3, 0):
        assert type(s) == type(u'')
        print(s)
        return
    print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
181
def write_json_file(obj, fn):
    """Serialize *obj* as JSON into the file named *fn*.

    json.dump wants a byte stream on Python 2.x and a character stream on
    Python 3.x, so the file is opened accordingly.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream:
        json.dump(obj, stream)
192
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity name
    without the surrounding '&' and ';'.

    Returns the decoded character, or the literal '&name;' text when the
    entity is unknown or denotes an invalid code point.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity: decimal (#47) or hexadecimal (#x2F). The hex branch
    # must accept the digits a-f, otherwise '#x2F' would be read as '#x2'.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the entity as text.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
217
# Replace the start-tag regular expression used by the HTML parser module
# with the one below (marked upstream as a backported bugfix).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    Feed a document with loads(), then call get_result() to obtain the text
    between the opening and closing tag of the first element whose
    `attribute` equals `value`.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_position, end_position] once a match is complete.
        self.result = None
        # True while the parser is inside the matching element.
        self.started = False
        # Per-tag-name count of start tags minus end tags seen while started.
        self.depth = {}
        self.html = None
        # True right after the matching start tag, until the next parser event.
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by dropping input lines and resuming.

        Raises HTMLParseError once more than 10 errors were seen, or when
        the error occurs inside the element being captured.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Restart on the document lines that follow the current line number.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Remember the document, parse it completely and close the parser."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also an event that can fix the start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete when every start tag carrying the
            # matched tag's name has been closed again.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos as well.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text of the matched element, or None.

        None is returned when no element matched or when the start and end
        positions were not both recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, offset) pairs; keep start line through end line.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line, whose beginning was already cut off
            # above, so the end offset is shifted by the start offset.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        # NOTE(review): this truncation also runs in the single-line case,
        # on top of the one just above.
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python older than 2.7.3, step over the literal text </scr'+'ipt>
# instead of handing it to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
290
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals *id* in the passed HTML document"""
    content = get_element_by_attribute("id", id, html)
    return content
294
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag whose *attribute* equals *value* in the passed HTML document

    Returns None when the parser did not isolate such a tag.
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever the parser collected before it gave up.
        pass
    return finder.get_result()
303
304
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br />
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p>
        ('<.*?>', ''),                             # any remaining tag
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
316
317
def sanitize_open(filename, open_mode):
    """Open *filename* in *open_mode*, retrying once with a tweaked name.

    The name u'-' selects standard output (its binary buffer when there is
    one). If opening fails with IOError/OSError, characters that are
    forbidden on win32 are replaced by '#' and the open is attempted again;
    a failure of that second attempt propagates to the caller, like the
    standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(encodeFilename(filename), open_mode), filename)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        cleaned = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(encodeFilename(cleaned), open_mode), cleaned)
343
344
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
352
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Map one input character to its (possibly empty) replacement.
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the original as possible.
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
384
def orderedSet(iterable):
    """ Return a list of the items of the iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
392
def unescapeHTML(s):
    """
    Replace the HTML entities in s by the characters they stand for.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
401
def encodeFilename(s):
    """
    Return s in the form the file APIs of this platform expect.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API, and so does Python 2 on Windows 2000 and up
    # when handed u'' strings directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    takes_unicode = (
        sys.version_info >= (3, 0)
        or (sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    )
    if takes_unicode:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    return s.encode(fs_encoding, 'ignore')
423
def decodeOption(optval):
    """Return the option value as text, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return None

    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(text, compat_str)
    return text
432
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: one hour or more gets the hour
    field, one minute or more gets the minute field.
    """
    # >= rather than >: exactly 3600 must read '1:00:00' (not '60:00') and
    # exactly 60 must read '1:00' (not '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
440
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None):
        """ tb, if given, is the original traceback (so that it can be printed out). """
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info() # preserve original exception
        self.traceback = tb

    def format_traceback(self):
        """Return the stored traceback formatted as text, or None without one."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
453
454
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
466
467
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
475
476
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        """ msg is the description of the failure; it is kept in self.msg. """
        # Initialize the base class explicitly, like the sibling exceptions
        # in this module, so the message is part of the standard exception
        # state (args / str()).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
485
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
489
490
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
498
499
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults for the two sizes (both in bytes); every instance
    # overrides them in __init__.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ downloaded and expected are stored on the instance under the same names. """
        self.expected = expected
        self.downloaded = downloaded
514
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Installed with an OpenerDirector, this class automatically adds the
    standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. To avoid compression in a
    particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data*, trying a raw deflate stream first, then a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying *code*.

        When addinfourl has no getcode attribute, the status code is not
        passed to the constructor and is set on the instance instead.
        """
        factory = compat_urllib_request.addinfourl
        if hasattr(factory, 'getcode'):
            return factory(stream, headers, url, code)
        response = factory(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Apply the standard headers and the Youtubedl-* pseudo headers to *req*."""
        headers = req.headers
        for name, content in std_headers.items():
            headers.pop(name, None)
            req.add_header(name, content)
        if 'Youtubedl-no-compression' in headers:
            # The caller asked for an uncompressed response.
            headers.pop('Accept-encoding', None)
            del headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in headers:
            # The caller supplied its own user agent for this request.
            headers.pop('User-agent', None)
            headers['User-agent'] = headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        """Return *resp*, transparently decompressed when gzip/deflate encoded."""
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            return resp
        decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded

    https_request = http_request
    https_response = http_response
580
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    The known date layouts are tried in order and the first one that
    parses wins. Returns None when date_str matches none of them.
    """
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
    for expression in format_expressions:
        try:
            return datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Wrong layout for this string (or a date strftime cannot
            # render): try the next one.
            continue
    return None
595
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError when date_str matches neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid string escape.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        # timedelta has no month/year units, so both are converted to days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
621
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError when start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds