]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[dailymotion] Extract view count (#1895)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import email.utils
6 import errno
7 import gzip
8 import io
9 import json
10 import locale
11 import math
12 import os
13 import pipes
14 import platform
15 import re
16 import ssl
17 import socket
18 import sys
19 import traceback
20 import zlib
21
22 try:
23 import urllib.request as compat_urllib_request
24 except ImportError: # Python 2
25 import urllib2 as compat_urllib_request
26
27 try:
28 import urllib.error as compat_urllib_error
29 except ImportError: # Python 2
30 import urllib2 as compat_urllib_error
31
32 try:
33 import urllib.parse as compat_urllib_parse
34 except ImportError: # Python 2
35 import urllib as compat_urllib_parse
36
37 try:
38 from urllib.parse import urlparse as compat_urllib_parse_urlparse
39 except ImportError: # Python 2
40 from urlparse import urlparse as compat_urllib_parse_urlparse
41
42 try:
43 import urllib.parse as compat_urlparse
44 except ImportError: # Python 2
45 import urlparse as compat_urlparse
46
47 try:
48 import http.cookiejar as compat_cookiejar
49 except ImportError: # Python 2
50 import cookielib as compat_cookiejar
51
52 try:
53 import html.entities as compat_html_entities
54 except ImportError: # Python 2
55 import htmlentitydefs as compat_html_entities
56
57 try:
58 import html.parser as compat_html_parser
59 except ImportError: # Python 2
60 import HTMLParser as compat_html_parser
61
62 try:
63 import http.client as compat_http_client
64 except ImportError: # Python 2
65 import httplib as compat_http_client
66
67 try:
68 from urllib.error import HTTPError as compat_HTTPError
69 except ImportError: # Python 2
70 from urllib2 import HTTPError as compat_HTTPError
71
72 try:
73 from urllib.request import urlretrieve as compat_urlretrieve
74 except ImportError: # Python 2
75 from urllib import urlretrieve as compat_urlretrieve
76
77
# subprocess.DEVNULL is not available on every supported interpreter; when
# the import fails, hand out a writable handle on the null device instead.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable file object on os.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL sentinel."""
        return DEVNULL
83
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors policy. Malformed escapes
        are left in the output literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # Python 2's hex codec signals non-hex digits and odd-length
                # input with TypeError, not ValueError; both mean "this was
                # not a valid escape", so keep the text verbatim.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields without '=' raise
        ValueError when strict_parsing is set; otherwise they are dropped
        unless keep_blank_values is set.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
162
# Text type: ``unicode`` on Python 2, ``str`` where that name does not exist.
try:
    unicode
except NameError:
    compat_str = str
else:
    compat_str = unicode # Python 2

# Code point -> one-character text string: ``unichr`` on Python 2, else ``chr``.
try:
    unichr
except NameError:
    compat_chr = chr
else:
    compat_chr = unichr # Python 2
172
def compat_ord(c):
    """Return c unchanged when it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
176
# The type of compiled regular expressions is not clearly defined
# otherwise, so take it from an actual compiled (empty) pattern.
compiled_regex_type = re.compile('').__class__

# Default HTTP request headers.
std_headers = {
    'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset':
        'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding':
        'gzip, deflate',
    'Accept-Language':
        'en-us,en;q=0.5',
}
187
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown encoding name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any locale/codec failure means "fall back"),
        # but no longer a bare except, so KeyboardInterrupt and SystemExit
        # propagate.
        pref = 'UTF-8'

    return pref
201
def compat_print(s):
    """Print the text string s to standard output.

    On Python 2 the text is first encoded with the preferred encoding,
    replacing unencodable characters by XML character references.
    """
    if sys.version_info < (3,0):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
        return
    assert type(s) == type(u'')
    print(s)
209
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file named fn."""
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3,0):
        open_args, open_kwargs = (fn, 'wb'), {}
    else:
        open_args, open_kwargs = (fn, 'w'), {'encoding': 'utf-8'}
    with open(*open_args, **open_kwargs) as outf:
        json.dump(obj, outf)
220
if sys.version_info < (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element under xpath whose attribute key equals val """
        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so restrict them to
        # characters that cannot alter its structure.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
234
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Rewrite every ``prefix:tag`` step of path as ``{uri}tag``.

    ns_map maps each prefix used in path to its namespace URI. Steps
    without a prefix are kept unchanged.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
247
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'. Entities that are neither a known
    name nor a valid numeric reference are returned in literal form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference: decimal digits, or x/X followed by hex digits.
    # The whole entity must match, so e.g. "#x2F" is not cut short to "#x2".
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the range the interpreter can represent;
            # fall through and keep the entity text as-is.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
272
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed in ``self.html``."""
    def __init__(self):
        # Was misspelled ``__init`` and therefore never ran, leaving
        # self.html undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html, parse it completely and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
283
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        """Look for the first tag whose `attribute` equals `value`."""
        self.attribute = attribute
        self.value = value
        # None until the tag is seen; then [tag name, start pos, end pos],
        # where positions are the tuples returned by getpos().
        self.result = None
        # True while parsing inside the matched element.
        self.started = False
        # Open-tag counter per tag name, maintained while started is set.
        self.depth = {}
        # Set when the matched start tag was just seen, so that the next
        # parser event records the content start position.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Tolerate a limited number of parse errors outside the matched tag.

        Raises HTMLParseError once more than 10 errors have been skipped or
        when the error happens inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also an event that can mark the
            # beginning of the matched element's content.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element ends when the counter for its own tag
            # name drops back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos as well.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete element was isolated."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() line numbers are used 1-based here; offsets index into the line.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end offset has to be
            # shifted by the prefix that was just cut off.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal "</scr'+'ipt>"
# instead of handing it to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
349
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
353
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in html whose attribute equals value.

    Parse errors are ignored; whatever the parser isolated up to that
    point (possibly None) is returned.
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and report what was found so far.
        pass
    return attr_parser.get_result()
362
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that records the content attribute of meta tags
    whose name attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching tag overwrites an earlier one.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the recorded content value, or None if nothing matched."""
        return self.result
383
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in html whose name
    attribute equals name. Parse errors are ignored.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever was found before the error.
        pass
    return meta_parser.get_result()
394
395
def clean_html(html):
    """Turn an HTML snippet into readable plain text."""
    # Source newlines carry no meaning in HTML; real line breaks come
    # from <br> and from paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip every remaining tag
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
407
408
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special name u'-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final path component is rewritten: substituting in the
        # directory part would also replace path separators (and a drive
        # colon) and so point to a different location.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
442
443
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
451
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    With restricted set, only a stricter subset of characters survives.
    With is_id set, s is treated as an identifier and the result is not
    post-processed (no underscore collapsing or stripping).
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks, control characters and DEL are dropped.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()
                           or code > 127):
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
483
def orderedSet(iterable):
    """ Return a list of the distinct elements of iterable, in first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
491
def unescapeHTML(s):
    """
    Replace every &...; sequence in s by the result of htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
500
def encodeFilename(s):
    """
    Return s in the form expected by the file system functions.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    return s.encode(fs_encoding, 'ignore')
522
def decodeOption(optval):
    """Return optval as text, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return None
    value = optval
    if isinstance(value, bytes):
        value = value.decode(preferredencoding())

    assert isinstance(value, compat_str)
    return value
531
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The boundaries are inclusive, so exactly one hour is '1:00:00' and
    exactly one minute is '1:00' (formerly '60:00' and '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
539
def make_HTTPS_handler(opts_no_check_certificate):
    """Return the urllib handler used for https:// requests.

    opts_no_check_certificate: when true, server certificates are not
    verified. Only the Python >= 3.2 branch reads this flag.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if self._tunnel_host:
                    self.sock = sock
                    self._tunnel()
                # Legacy interpreters: try SSLv3 first and fall back to the
                # auto-negotiating protocol if the handshake fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        # PROTOCOL_SSLv23 lets client and server negotiate the best protocol
        # version both support. Forcing PROTOCOL_SSLv3 pins an obsolete,
        # insecure protocol and raises AttributeError on builds whose ssl
        # module no longer defines that constant.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
570
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # An error raised while one of these exceptions is being handled
        # counts as expected, whatever the caller passed.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        current_exc_type = sys.exc_info()[0]
        if current_exc_type in expected_causes:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
592
593
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
597
598
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
610
611
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
619
620
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal
    an error in the postprocessing task.
    """
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
629
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
633
634
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
642
643
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        Exception.__init__(self, downloaded, expected)
        self.downloaded, self.expected = downloaded, expected
658
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, first as a raw stream and, if that
        fails, as a stream with zlib header."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also on
        implementations whose addinfourl has no getcode()."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-*
        pseudo headers, which are removed from the request afterwards."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # NOTE(review): the lookups below use the 'Xxx-yyy' spelling,
        # presumably the form in which add_header stores names - confirm.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by its
        decompressed content; other responses are returned unchanged."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off and keep the
                # first attempt that decompresses.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first failure.
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
739
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or matches none of the known
    formats.
    """
    if date_str is None:
        return None
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            # Not this format; try the next one.
            continue
        return parsed.strftime('%Y%m%d')
    return None
766
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    Takes the text after the last dot of the part before the first '?'.
    If that text is not purely alphanumeric, default_ext is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
773
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
776
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days.
    Raises ValueError when the string matches neither format."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        amount = int(match.group('time'))
        if sign == '-':
            amount = -amount
        unit = match.group('unit')
        #A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names (days=, weeks=).
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
802
class DateRange(object):
    """A closed interval of dates, from start to end inclusive"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest/latest representable date"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
828
829
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
838
839
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    The text is encoded with the preferred encoding when out is a binary
    stream or when running on Python 2.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == type(u'')

    wants_bytes = ('b' in getattr(stream, 'mode', '') or
                   sys.version_info[0] < 3) # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        s = s.encode(preferredencoding(), 'ignore')
    stream.write(s)
    stream.flush()
850
851
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and 1-char strings on Python 2.
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else [ord(c) for c in bs]
859
860
def intlist_to_bytes(xs):
    """Return the byte string whose byte values are the ints in xs."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces single-byte strings.
    return ''.join([chr(x) for x in xs])
868
869
def get_cachedir(params=None):
    """Return the cache directory to use.

    params: optional dict of options; its 'cachedir' entry, when present,
    is returned as-is. Otherwise the result is the 'youtube-dl' directory
    below $XDG_CACHE_HOME, or below ~/.cache when that variable is unset.
    """
    # None instead of a shared mutable {} default.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
874
875
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on win32, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx as their last argument.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count passed to both calls; the
    # names suggest this is meant to cover the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same pointer.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock, 0x0 otherwise
        # (0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API).
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf (LOCK_EX if exclusive, else LOCK_SH)."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock on f with fcntl.lockf(LOCK_UN)."""
        fcntl.lockf(f, fcntl.LOCK_UN)
939
940
class locked_file(object):
    """File wrapper that holds a lock on the file inside a ``with`` block.

    The file is opened on construction, locked on entering the block and
    unlocked and closed on leaving it. Reading mode takes a shared lock,
    the writing modes an exclusive one.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leave the file open when the lock cannot be taken.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
970
971
def shell_quote(args):
    """Return the arguments in args as one shell-escaped command line string.

    Byte string arguments are decoded with the file system encoding
    (utf-8 if that is unknown) before quoting.
    """
    # shlex.quote is the documented API; pipes.quote is its undocumented
    # predecessor and the only one available on old interpreters.
    try:
        from shlex import quote as quote_arg
    except ImportError:
        from pipes import quote as quote_arg

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote_arg(a))
    return u' '.join(quoted_args)
983
984
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends
    the run (the first element e for which pred(e) is false) """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
992
993
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1000
1001
def unsmuggle_url(smug_url):
    """Split a URL made by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged with data None.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
1009
1010
def format_bytes(bytes):
    """Format a byte count as a string with two decimals and a binary
    unit suffix (B, KiB, ... YiB).

    bytes may be a number, a numeric str, or None (rendered as 'N/A').
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the exponent inside the suffix table: values below 1 would
        # yield a negative index (selecting 'YiB'), huge values an IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1023
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators."""
    return int(re.sub(r'[,\.]', u'', int_str))