]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Merge remote-tracking branch 'origin/master'
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd
FV
10import locale
11import os
4eb7f1d1 12import pipes
c496ca96 13import platform
d77c3dfd 14import re
c496ca96 15import socket
d77c3dfd 16import sys
01951dda 17import traceback
d77c3dfd 18import zlib
d77c3dfd 19
01ba00ca 20try:
59ae15a5 21 import urllib.request as compat_urllib_request
01ba00ca 22except ImportError: # Python 2
59ae15a5 23 import urllib2 as compat_urllib_request
01ba00ca
PH
24
25try:
59ae15a5 26 import urllib.error as compat_urllib_error
01ba00ca 27except ImportError: # Python 2
59ae15a5 28 import urllib2 as compat_urllib_error
01ba00ca
PH
29
30try:
59ae15a5 31 import urllib.parse as compat_urllib_parse
01ba00ca 32except ImportError: # Python 2
59ae15a5 33 import urllib as compat_urllib_parse
01ba00ca 34
799c0763
PH
35try:
36 from urllib.parse import urlparse as compat_urllib_parse_urlparse
37except ImportError: # Python 2
38 from urlparse import urlparse as compat_urllib_parse_urlparse
39
6543f0dc
JMF
40try:
41 import urllib.parse as compat_urlparse
42except ImportError: # Python 2
43 import urlparse as compat_urlparse
44
01ba00ca 45try:
59ae15a5 46 import http.cookiejar as compat_cookiejar
01ba00ca 47except ImportError: # Python 2
59ae15a5 48 import cookielib as compat_cookiejar
01ba00ca 49
3e669f36 50try:
59ae15a5 51 import html.entities as compat_html_entities
9f37a959 52except ImportError: # Python 2
59ae15a5 53 import htmlentitydefs as compat_html_entities
3e669f36 54
a8156c1d 55try:
59ae15a5 56 import html.parser as compat_html_parser
9f37a959 57except ImportError: # Python 2
59ae15a5 58 import HTMLParser as compat_html_parser
a8156c1d 59
348d0a7a 60try:
59ae15a5 61 import http.client as compat_http_client
9f37a959 62except ImportError: # Python 2
59ae15a5 63 import httplib as compat_http_client
348d0a7a 64
2eabb802 65try:
0e283428 66 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
67except ImportError: # Python 2
68 from urllib2 import HTTPError as compat_HTTPError
69
e0df6211
PH
70try:
71 from urllib.request import urlretrieve as compat_urlretrieve
72except ImportError: # Python 2
73 from urllib import urlretrieve as compat_urlretrieve
74
75
5910e210
PH
76try:
77 from subprocess import DEVNULL
78 compat_subprocess_get_DEVNULL = lambda: DEVNULL
79except ImportError:
80 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
81
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The functions below are the parse_qs implementation from
    # cpython 3's stdlib. Python 2's version is apparently totally broken.

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No percent sign at all, nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pending: contiguous sequence of percent-encoded bytes, not yet
        # decoded to text
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # This chunk was just a single percent-encoded byte. It
                    # may be part of a multi-byte sequence, so keep it in
                    # pending and delay decoding.
                    continue
            except ValueError:
                tail = '%' + chunk
            # Reached non-percent-encoded characters: flush pending bytes.
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left at the end of the string
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs."""
        def _decode(component):
            # '+' means space in query strings; then undo the %-escaping
            component = component.replace('+', ' ')
            component = _unquote(component, encoding=encoding, errors=errors)
            return unicode(component)

        fields = [f for part in qs.split('&') for f in part.split(';')]
        parsed = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            nv = field.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                nv.append('')
            if len(nv[1]) or keep_blank_values:
                parsed.append((_decode(nv[0]), _decode(nv[1])))
        return parsed

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 160
# Text type and "code point -> character" function of the running interpreter.
try:
    compat_str, compat_chr = unicode, unichr  # Python 2
except NameError:
    compat_str, compat_chr = str, chr
3e669f36 170
b31756c1
FV
def compat_ord(c):
    """Return c unchanged if it is already an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
174
468e2e92
FV
# This is not clearly defined otherwise
compiled_regex_type = re.compile('').__class__

# Default headers; YoutubeDLHandler.http_request sets these on every request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 185
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the reported
    encoding cannot be obtained or cannot encode text, 'UTF-8' is returned
    instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: the locale may report a name Python cannot use.
        u'TEST'.encode(pref)
    except Exception:
        # Any ordinary failure means the reported encoding is unusable.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        pref = 'UTF-8'

    return pref
d77c3dfd 199
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 207
f4bfd65f
PH
# In Python 3.x, json.dump writes to a character stream.
# In Python 2.x, it expects a bytestream.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as stream:
            json.dump(obj, stream)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as stream:
            json.dump(obj, stream)
218
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are pasted into the expression unescaped, so only
        # values matching these patterns are accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
232
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are left untouched.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
245
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Returns the decoded character, or the literal '&entity;' text when the
    entity is unknown or cannot be represented.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#38) or hexadecimal (#x2F).
    # Hexadecimal digits include a-f, which a plain \d+ would not match.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the supported range: fall through and
            # return the entity unchanged.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 270
a8156c1d 271compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse."""

    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        # Full text of the document most recently passed to loads()
        self.html = None

    def loads(self, html):
        """Store html on the instance, then parse it completely."""
        self.html = html
        self.feed(html)
        self.close()
281
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start position, end position]
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name, since the match
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Error hook: skip the offending line and resume parsing.

        Gives up (raises HTMLParseError) after more than 10 errors, or as
        soon as the wanted tag has been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining_lines = self.html.split('\n')[self.getpos()[0]:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once every tag of its kind is closed again
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())
    # Every other parser event also marks the start of the content
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None
        when no complete element was found."""
        if self.result is None or len(self.result) != 3:
            return None
        start_line, start_col = self.result[1]
        end_line, end_col = self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line; the line was already shortened
            # by start_col above
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_hack(self, i):
        """Step over the literal "</scr'+'ipt>" instead of parsing it."""
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_hack
9e6dd238
FV
347
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on the "id" attribute
    return get_element_by_attribute(attribute="id", value=id, html=html)
351
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return finder.get_result()
9e6dd238 360
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        # content attribute of the last matching meta tag seen so far
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')

    def get_result(self):
        """Return the recorded content attribute, or None if nothing matched."""
        return self.result
381
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the parse error
        pass
    return meta_parser.get_result()
392
9e6dd238
FV
393
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only <br> and paragraph breaks do
    html = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),            # <br /> -> newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),      # </p><p> -> newline
        ('<.*?>', ''),                               # strip remaining tags
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
405
406
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries with the
    characters forbidden on win32 replaced by '#' in the final path
    component. If that also fails, the exception propagates to the caller,
    like with the standard open() function.

    The filename u'-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming the file
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # last path component is rewritten: the directory part legitimately
        # contains separators (and a drive colon on Windows) that the
        # pattern would otherwise destroy.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
440
441
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 449
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
d77c3dfd
FV
481
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # A list (not a set) keeps first-seen order and allows unhashable items
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
489
def unescapeHTML(s):
    """
    Replace every '&...;' entity in s using htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
498
def encodeFilename(s):
    """
    Return s in the form expected by the file system functions.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 520
8271226a
PH
def decodeOption(optval):
    """Return optval as text; bytes are decoded with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return None
    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
1c256f70 529
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used; exactly one hour is '1:00:00'
    and exactly one minute is '1:00'.
    """
    # >= (not >) so that the boundary values roll over to the next unit
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
537
ea6d901e
PH
def make_HTTPS_handler(opts):
    """Build the HTTPS handler matching opts.no_check_certificate."""
    if sys.version_info < (3, 2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()

    import ssl
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.set_default_verify_paths()

    if opts.no_check_certificate:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    return compat_urllib_request.HTTPSHandler(context=context)
ea6d901e 551
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Errors raised while handling one of these exceptions are always
        # treated as expected.
        active_exception_type = sys.exc_info()[0]
        if active_exception_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
573
1c256f70 574
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
586
587
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
595
596
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """msg is the error description; it is also kept as self.msg."""
        # Initialise the base class like the other exceptions in this file
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
605
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
609
610
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
618
619
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both sizes are in bytes
    downloaded = expected = None

    def __init__(self, downloaded, expected):
        """Record the received size and the announced size (both in bytes)."""
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 634
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, trying raw deflate first and zlib format second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the given status code."""
        response_cls = compat_urllib_request.addinfourl
        if hasattr(response_cls, 'getcode'):
            return response_cls(stream, headers, url, code)
        # This addinfourl takes no code argument; set the attribute by hand
        wrapped = response_cls(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        # The standard headers replace whatever the request carried
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)
        # Internal marker header: do not ask the server for compression
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request user agent override
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic goes through the same processing
    https_request = http_request
    https_response = http_response
bf50b038
JMF
715
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a fixed list of known date formats; None is
    returned when none of them matches.
    """
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            return datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Format does not fit this string; try the next one.
            continue
    return None
740
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, or return default_ext.

    The guess is the text after the last '.' of the part before the first
    '?'; it is accepted only if it is purely alphanumeric.
    """
    before_query = url.partition(u'?')[0]
    candidate = before_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 747
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
750
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string in
    neither format makes strptime raise ValueError.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation? timedelta has no month or year unit.
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta keyword arguments are plural: days / weeks
        delta = datetime.timedelta(**{unit + 's': amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
776
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest / latest
        representable date."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are converted first; date objects are used as they are
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
802
803
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # A byte string was returned; decode it to text
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
812
813
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (default: sys.stderr) and flush it."""
    assert type(s) == type(u'')
    stream = sys.stderr if out is None else out

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        s = s.encode(preferredencoding(), 'ignore')
    stream.write(s)
    stream.flush()
824
825
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are characters (e.g. Python 2 str)
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
833
c257baff 834
def intlist_to_bytes(xs):
    """Return the ints of xs packed into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    return ''.join([chr(x) for x in xs])  # Python 2
c38b1e77
PH
842
843
def get_cachedir(params=None):
    """Return the cache directory.

    params['cachedir'] wins if present. Otherwise the result is the
    'youtube-dl' directory below $XDG_CACHE_HOME, which itself defaults
    to ~/.cache.
    """
    # None instead of a shared mutable {} default
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
848
849
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass it back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared fcntl lock on the open file f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the fcntl lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
913
914
class locked_file(object):
    """File wrapper that holds a file lock while used as a context manager.

    Mode 'r' takes a shared lock; 'a' and 'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
944
945
def shell_quote(args):
    """Return args joined into one shell-safe command line string."""
    # pipes is deprecated and removed in Python 3.13; shlex.quote is the
    # same function where it exists.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote
    return ' '.join(quote(arg) for arg in args)
9d4660ca
PH
948
949
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for element in seq:
        # Yield first, test afterwards: the failing element is included
        yield element
        if pred(element):
            continue
        break
957
958
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # data is serialized as JSON and appended as a URL fragment
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
965
966
def unsmuggle_url(smug_url):
    """Split a URL built by smuggle_url into (url, data).

    data is None when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)