]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
release 2013.11.18
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd
FV
10import locale
11import os
4eb7f1d1 12import pipes
c496ca96 13import platform
d77c3dfd 14import re
c496ca96 15import socket
d77c3dfd 16import sys
01951dda 17import traceback
d77c3dfd 18import zlib
d77c3dfd 19
01ba00ca 20try:
59ae15a5 21 import urllib.request as compat_urllib_request
01ba00ca 22except ImportError: # Python 2
59ae15a5 23 import urllib2 as compat_urllib_request
01ba00ca
PH
24
25try:
59ae15a5 26 import urllib.error as compat_urllib_error
01ba00ca 27except ImportError: # Python 2
59ae15a5 28 import urllib2 as compat_urllib_error
01ba00ca
PH
29
30try:
59ae15a5 31 import urllib.parse as compat_urllib_parse
01ba00ca 32except ImportError: # Python 2
59ae15a5 33 import urllib as compat_urllib_parse
01ba00ca 34
799c0763
PH
35try:
36 from urllib.parse import urlparse as compat_urllib_parse_urlparse
37except ImportError: # Python 2
38 from urlparse import urlparse as compat_urllib_parse_urlparse
39
6543f0dc
JMF
40try:
41 import urllib.parse as compat_urlparse
42except ImportError: # Python 2
43 import urlparse as compat_urlparse
44
01ba00ca 45try:
59ae15a5 46 import http.cookiejar as compat_cookiejar
01ba00ca 47except ImportError: # Python 2
59ae15a5 48 import cookielib as compat_cookiejar
01ba00ca 49
3e669f36 50try:
59ae15a5 51 import html.entities as compat_html_entities
9f37a959 52except ImportError: # Python 2
59ae15a5 53 import htmlentitydefs as compat_html_entities
3e669f36 54
a8156c1d 55try:
59ae15a5 56 import html.parser as compat_html_parser
9f37a959 57except ImportError: # Python 2
59ae15a5 58 import HTMLParser as compat_html_parser
a8156c1d 59
348d0a7a 60try:
59ae15a5 61 import http.client as compat_http_client
9f37a959 62except ImportError: # Python 2
59ae15a5 63 import httplib as compat_http_client
348d0a7a 64
2eabb802 65try:
0e283428 66 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
67except ImportError: # Python 2
68 from urllib2 import HTTPError as compat_HTTPError
69
e0df6211
PH
70try:
71 from urllib.request import urlretrieve as compat_urlretrieve
72except ImportError: # Python 2
73 from urllib import urlretrieve as compat_urlretrieve
74
75
5910e210
PH
try:
    from subprocess import DEVNULL
except ImportError:
    # subprocess.DEVNULL is unavailable here: hand out a freshly opened,
    # writable handle on the null device on every call instead.
    def compat_subprocess_get_DEVNULL():
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        return DEVNULL
81
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string, decoding byte runs with encoding/errors."""
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent sign at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # The first two characters after '%' are the hex byte value
                # (str.decode('hex') only exists on Python 2).
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text literally, '%' included.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs."""
        qs, _coerce_result = qs, unicode
        # Fields may be separated by either '&' or ';'.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' stands for a space and must be replaced before unquoting.
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of its values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            # Repeated names accumulate their values in order of appearance.
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 160
# Text string type: use `unicode` where that builtin exists, else `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` where it exists, else `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 170
b31756c1
FV
def compat_ord(c):
    """Return c unchanged if it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
174
468e2e92
FV
175# This is not clearly defined otherwise
176compiled_regex_type = type(re.compile(''))
177
# Default HTTP headers; YoutubeDLHandler.http_request adds each of these to
# every request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome) (iPhone)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 185
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually be used to
    encode text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken encoding name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 199
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s (must be a unicode string)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded in the preferred encoding; unencodable
        characters become XML character references."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 207
f4bfd65f
PH
208# In Python 2.x, json.dump expects a bytestream.
209# In Python 3.x, it writes to a character stream
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text stream)."""
        with open(fn, 'w', encoding='utf-8') as out:
            json.dump(obj, out)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (byte stream)."""
        with open(fn, 'wb') as out:
            json.dump(obj, out)
218
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath whose attribute key
        equals val, or None (manual scan instead of an attribute predicate) """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so they are
        # restricted to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))
232
d7e66d39
JMF
233# On python2.6 the xml.etree.ElementTree.Element methods don't support
234# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
245
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities, decimal references (#47) and hexadecimal references
    (#x2F, #X2f) are decoded. Anything else, including numeric references
    outside the valid code point range, is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full set of hex digits; matching only \d would
    # cut '#x2F' down to '#x2' and decode the wrong character.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 270
# Replace the module-level start-tag pattern of the HTML parser module with
# the one below (marked as a backported bugfix).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse.

    The source text is kept in self.html (None until loads() is called) so
    that subclasses can slice the original markup by parser position.
    """
    def __init__(self):
        # This method was previously spelled `__init`, so it was never run
        # as the constructor and self.html stayed undefined until loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html, feed it to the parser in one go and finish parsing."""
        self.html = html
        self.feed(html)
        self.close()
281
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start_pos, end_pos]; positions come from getpos().
        self.result = None
        # True while inside the element that matched attribute == value.
        self.started = False
        # Number of currently open tags per tag name, counted from the match on.
        self.depth = {}
        # True between seeing the matching start tag and recording start_pos.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by restarting on the following lines.

        Raises HTMLParseError after more than 10 errors, or as soon as an
        error occurs inside the element being extracted.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first thing after the opening tag: the
            # content starts here.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element ends once every tag with the matched name is closed.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of content also marks where the element body begins.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped markup between the recorded start and end
        positions, or None if no complete element was found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            # Start or end position missing (element never opened or closed).
            return None
        lines = self.html.split('\n')
        # Line numbers in the recorded positions are 1-based.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column has to be shifted by
            # the prefix that was just cut off.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal text "</scr'+'ipt>"
# instead of passing it to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
347
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the
    passed HTML document (shortcut for get_element_by_attribute)."""
    return get_element_by_attribute(attribute="id", value=id, html=html)
351
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in html whose attribute equals
    value, as extracted by AttrParser (None when nothing was isolated)."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error.
        pass
    return finder.get_result()
9e6dd238 360
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the content attribute of a meta tag
    with a given name attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching meta tag overwrites an earlier one.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content found so far, or None."""
        return self.result
381
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in html whose name
    attribute equals name (None if there is none).
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the parse error.
        pass
    return meta_parser.get_result()
392
9e6dd238
FV
393
def clean_html(html):
    """Turn an HTML snippet into readable plain text"""
    # Literal newlines carry no meaning in HTML; only <br> and paragraph
    # boundaries become line breaks.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip all remaining tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Decode entities last, then trim surrounding whitespace
    return unescapeHTML(text).strip()
9e6dd238
FV
405
406
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it replaces the
    characters that are forbidden in win32 file names with '#' and tries
    again with the modified name. A permission error (EACCES), or a failure
    for a name that needs no modification, is re-raised like the standard
    open() function would.

    The special name u'-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # Stop Windows from translating newlines in binary output.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join() takes the parts as separate arguments, so the
        # sanitized components have to be unpacked.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Retry with the sanitized name, not the one that just failed.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
440
441
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 449
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    With restricted set, a stricter subset of characters is allowed:
    whitespace, shell-special punctuation and non-ASCII characters all
    become '_'.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible; the cleanup of underscores is then skipped.
    """
    def replace_insane(char):
        code = ord(char)
        # '?' and control characters are dropped altogether.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends.
    result = re.sub(u'_{2,}', u'_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd
FV
481
def orderedSet(iterable):
    """ Return a list of the elements of iterable without duplicates,
    keeping the order of first appearance """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
489
def unescapeHTML(s):
    """
    Replace the HTML entities in s with the characters they stand for.

    @param s a (unicode) string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
498
def encodeFilename(s):
    """
    Return s in the form the file system APIs of this interpreter expect.

    @param s The name of the file (a unicode string)
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API.
    # On Windows 2000 and up, pass u'' directly to use the Unicode APIs as
    # well. (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.version_info >= (3, 0) or (
            sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5):
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 520
8271226a
PH
def decodeOption(optval):
    """Return optval as a text string, decoding bytes with the preferred
    encoding; None is passed through unchanged."""
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 529
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: exactly one hour is '1:00:00' and
    exactly one minute is '1:00'.
    """
    # The boundaries are inclusive; with '>' a value of 3600 was rendered as
    # '60:00' and a value of 60 as '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
537
ea6d901e
PH
def make_HTTPS_handler(opts):
    """Build the HTTPS handler for the opener.

    From Python 3.2 on, the handler gets an SSL context that uses the
    default verify paths and checks certificates unless
    opts.no_check_certificate is set.
    """
    if sys.version_info >= (3, 2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
ea6d901e 551
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # An error raised while one of these exceptions is being handled is
        # always treated as expected.
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
573
1c256f70 574
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
578
579
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
591
592
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
600
601
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialise the Exception base as well, so that str(exc) and
        # exc.args carry the message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
610
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
614
615
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
623
624
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a message; without the base-class call,
        # str(exc) was empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 639
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also when
        the addinfourl class has no getcode and takes no code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply std_headers and the Youtubedl-* pseudo headers to req."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Pseudo header: send no Accept-encoding for this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Pseudo header: its value replaces the User-agent of this request.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate body transparently decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure.
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
bf50b038
JMF
720
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD.

    date_str is tried against a list of known date formats. None is
    returned when date_str is None or matches none of them.
    """
    if date_str is None:
        return None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
            return parsed.strftime('%Y%m%d')
        except ValueError:
            # Wrong format for this string (or a year strftime cannot
            # render): try the next expression.
            continue
    return None
745
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url: the text after the last '.' of
    the part before '?', if purely alphanumeric, else default_ext."""
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 752
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
755
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError when the string has neither form.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no months or years: approximate them with 30 and
        # 365 days respectively.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
781
class DateRange(object):
    """A closed interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound means the earliest/latest possible date"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
807
808
def platform_name():
    """ Return platform.platform() as a compat_str, decoding it with the
    preferred encoding if it comes back as bytes """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
817
818
7459e3a2
PH
def write_string(s, out=None):
    """Write the unicode string s to out (sys.stderr by default) and flush.

    The text is encoded with the preferred encoding first when out is
    opened in binary mode, and always on Python 2.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == type(u'')

    binary_mode = 'b' in getattr(stream, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_mode or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    stream.write(s)
    stream.flush()
829
830
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields 1-character strings here, not ints.
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
838
c257baff 839
def intlist_to_bytes(xs):
    """Return a byte string built from the list of byte values xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields byte strings
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
847
848
def get_cachedir(params={}):
    """Return params['cachedir'] if present, else the 'youtube-dl'
    directory below $XDG_CACHE_HOME (default: ~/.cache)."""
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
c1c9a79c
PH
853
854
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes structure handed to LockFileEx / UnlockFileEx below.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high parts of the byte count passed to the lock calls.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same pointer.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf (LOCK_EX or LOCK_SH)."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock on f with fcntl.lockf(LOCK_UN)."""
        fcntl.lockf(f, fcntl.LOCK_UN)
918
919
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context
    manager: exclusive for modes 'a' and 'w', non-exclusive for 'r'."""
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the handle when the lock cannot be taken.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking fails.
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
949
950
def shell_quote(args):
    """Return the arguments in args quoted for a POSIX shell and joined
    with spaces."""
    # pipes.quote was never a documented API and the pipes module has been
    # removed from recent Python versions; shlex.quote is its replacement.
    try:
        from shlex import quote
    except ImportError:  # Python < 3.3
        from pipes import quote
    return ' '.join(map(quote, args))
9d4660ca
PH
953
954
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element e for
    which pred(e) is false, and stop right after it """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
962
963
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach data, JSON-encoded, to url as a URL fragment for internal use. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
970
971
def unsmuggle_url(smug_url):
    """ Split a URL produced by smuggle_url into (url, data); data is None
    when the URL carries no smuggled fragment. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition(u'#')
        encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, None
978 return url, data