]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Added '--xattrs' option which writes metadata to the file's extended attributes using...
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import ctypes
5 import datetime
6 import email.utils
7 import errno
8 import gzip
9 import io
10 import json
11 import locale
12 import math
13 import os
14 import pipes
15 import platform
16 import re
17 import ssl
18 import socket
19 import subprocess
20 import sys
21 import traceback
22 import zlib
23
24 try:
25 import urllib.request as compat_urllib_request
26 except ImportError: # Python 2
27 import urllib2 as compat_urllib_request
28
29 try:
30 import urllib.error as compat_urllib_error
31 except ImportError: # Python 2
32 import urllib2 as compat_urllib_error
33
34 try:
35 import urllib.parse as compat_urllib_parse
36 except ImportError: # Python 2
37 import urllib as compat_urllib_parse
38
39 try:
40 from urllib.parse import urlparse as compat_urllib_parse_urlparse
41 except ImportError: # Python 2
42 from urlparse import urlparse as compat_urllib_parse_urlparse
43
44 try:
45 import urllib.parse as compat_urlparse
46 except ImportError: # Python 2
47 import urlparse as compat_urlparse
48
49 try:
50 import http.cookiejar as compat_cookiejar
51 except ImportError: # Python 2
52 import cookielib as compat_cookiejar
53
54 try:
55 import html.entities as compat_html_entities
56 except ImportError: # Python 2
57 import htmlentitydefs as compat_html_entities
58
59 try:
60 import html.parser as compat_html_parser
61 except ImportError: # Python 2
62 import HTMLParser as compat_html_parser
63
64 try:
65 import http.client as compat_http_client
66 except ImportError: # Python 2
67 import httplib as compat_http_client
68
69 try:
70 from urllib.error import HTTPError as compat_HTTPError
71 except ImportError: # Python 2
72 from urllib2 import HTTPError as compat_HTTPError
73
74 try:
75 from urllib.request import urlretrieve as compat_urlretrieve
76 except ImportError: # Python 2
77 from urllib import urlretrieve as compat_urlretrieve
78
79
80 try:
81 from subprocess import DEVNULL
82 compat_subprocess_get_DEVNULL = lambda: DEVNULL
83 except ImportError:
84 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
85
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode a query-string component (py2-only backport helper).
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Each item starts with the two hex digits of a %XX escape.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Backport of urllib.parse.parse_qsl: returns a list of (name, value).
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Backport of urllib.parse.parse_qs: maps each name to a list of values.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
164
165 try:
166 compat_str = unicode # Python 2
167 except NameError:
168 compat_str = str
169
170 try:
171 compat_chr = unichr # Python 2
172 except NameError:
173 compat_chr = chr
174
def compat_ord(c):
    """Return *c* unchanged when it is already an int, otherwise ord(c).

    Smooths over the Python 2/3 difference where indexing a byte string
    yields a 1-char string on 2.x but an int on 3.x.
    """
    return c if type(c) is int else ord(c)
178
179 # This is not clearly defined otherwise
180 compiled_regex_type = type(re.compile(''))
181
182 std_headers = {
183 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
184 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
185 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
186 'Accept-Encoding': 'gzip, deflate',
187 'Accept-Language': 'en-us,en;q=0.5',
188 }
189
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Sanity-check that the reported encoding actually works.
        u'TEST'.encode(pref)
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Any lookup/encode failure -> UTF-8.
        pref = 'UTF-8'

    return pref
203
# Printing unicode requires explicit encoding on Python 2; Python 3's
# print() handles unicode natively.
if sys.version_info < (3,0):
    def compat_print(s):
        # Unencodable characters are emitted as XML character references.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Only unicode strings are accepted, by design.
        assert type(s) == type(u'')
        print(s)
211
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (Python 2 variant)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (Python 3 variant)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
222
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key/val are spliced directly into the XPath expression below, so
        # restrict them to characters that cannot break the syntax.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6's ElementTree lacks [@key='val'] support; filter manually.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
236
237 # On python2.6 the xml.etree.ElementTree.Element methods don't support
238 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps in *path* to '{uri}tag' using *ns_map*.

    Needed on Python 2.6, where the ElementTree methods do not support the
    namespaces parameter.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
249
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
    # The previous pattern x?\d+ never matched the hex digits a-f, so
    # entities such as &#x2019; fell through to the literal branch below.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
274
275 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser subclass that keeps the whole document around in self.html.

    Subclasses (AttrParser, MetaParser) rely on self.html for their
    error-recovery and slicing logic.
    """
    def __init__(self):
        # Was misspelled '__init', so it never ran as the constructor and
        # self.html did not exist until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the given HTML document string."""
        self.html = html
        self.feed(html)
        self.close()
285
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Target attribute/value pair to search for, e.g. ('id', 'player').
        self.attribute = attribute
        self.value = value
        # Grows to [tag_name, start_pos, end_pos]; consumed by get_result().
        self.result = None
        self.started = False
        # Per-tag-name open-tag nesting count, used to find the matching end tag.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Error recovery: drop the offending line and resume parsing, but
        # give up after 10 errors or once inside the target tag.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # First event after the target's opening tag records its start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # Nesting of the target tag dropped to zero: its end was reached.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event right after the opening tag may establish the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Return the raw text between the recorded start and end positions,
        # or None if no complete match [tag, start, end] was collected.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single-line result: end column must be shifted by the start
            # column removed above. NOTE(review): the unconditional slice
            # below then uses a larger bound and is a no-op in this case.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# HTMLParser before Python 2.7.3 chokes on the literal "</scr'+'ipt>"
# sequence found in some pages' inline JavaScript; skip over it as text.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
351
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals *id*.

    Thin convenience wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
355
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in *html* carrying the given
    attribute/value pair, or None when there is no such tag."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Malformed documents are tolerated; whatever was found so far wins.
        pass
    return finder.get_result()
364
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates the <meta> tag whose name attribute
    matches the requested name and captures its content attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> elements matter; skip everything else.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        """Return the matched content value, or None when nothing matched."""
        return self.result
385
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name
    attribute, or None if the document has no such tag.
    """
    finder = MetaParser(name)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed HTML.
        pass
    return finder.get_result()
396
397
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines become spaces; <br> and </p><p> become real breaks.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Drop the remaining markup, then resolve entities.
    html = re.sub('<.*?>', '', html)
    return unescapeHTML(html).strip()
409
410
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems will not be fixed by renaming; re-raise.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join() takes the path parts as separate arguments;
        # previously a single generator was passed, which raised TypeError.
        alt_filename = os.path.join(*(
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, not the name that just failed.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
444
445
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparsable input yields None, mirroring parsedate_tz.
        return None
    return email.utils.mktime_tz(parsed)
453
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are never representable in a filename.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim them from both ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
485
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order. """
    # List membership (not a set) keeps unhashable elements working.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
493
def unescapeHTML(s):
    """Replace HTML entities in the unicode string *s* with their characters.

    @param s a string
    """
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
502
def encodeFilename(s):
    """Encode a unicode filename for the current platform/Python version.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
524
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
533
def formatSeconds(secs):
    """Format a duration in seconds as '[H:]M[M]:SS'-style text.

    The boundaries are inclusive: previously exact multiples fell through
    (60 -> '60', 3600 -> '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
541
def make_HTTPS_handler(opts_no_check_certificate):
    """Build an HTTPS handler that prefers SSLv3, falling back as needed.

    On Python < 3.2 there is no usable SSLContext, so a custom connection
    class wraps the socket by hand; newer versions use a configured context.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first, then fall back to auto-negotiation (SSLv23).
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        # Certificate checking is opt-out via --no-check-certificate.
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context)
575
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network errors are always "expected" (not a youtube-dl bug); this
        # relies on being constructed inside the original except block.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as unicode, or None."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
597
598
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Raised by extractors when a required pattern is absent from the page.
    pass
602
603
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects that are not configured to continue
    on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
615
616
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
624
625
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal a failure in the
    postprocessing task. Note: the message is kept on .msg only and is not
    forwarded to Exception.__init__ (matching historical behaviour).
    """

    def __init__(self, msg):
        self.msg = msg
634
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
638
639
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
647
648
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller than
    the size the server announced first, indicating the connection was
    probably interrupted.
    """
    # Both in bytes; class-level defaults kept from the original definition.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
663
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate (no zlib header) first, then standard zlib format.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl implementations do not accept a code argument.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-add the standard headers so they override any urllib defaults.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip compression support for this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-Agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes shaved off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
744
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD, or None if
    *date_str* matches none of the known formats."""
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Was a bare `except:`; strptime only raises ValueError on a
            # format mismatch, so nothing else should be swallowed here.
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
775
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from *url*; fall back to *default_ext* when
    the candidate contains anything but alphanumerics."""
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
782
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: '<base>.<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
785
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # Rough approximation: months/years as fixed day counts.
        if unit == 'month':
            unit, amount = 'day', amount * 30
        elif unit == 'year':
            unit, amount = 'day', amount * 365
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
811
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Anything that is not exactly 8 digits passes through untouched.
        return date_str
    return '-'.join(parts.groups())
820
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
846
847
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
856
857
def write_string(s, out=None):
    """Write the unicode string *s* to *out* (default sys.stderr), encoding
    it first for byte streams (and always on Python 2, whose sys.stderr
    lies about its mode)."""
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    needs_encoding = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if needs_encoding:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
868
869
def bytes_to_intlist(bs):
    """Turn a byte string into the list of its integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing yields 1-char strings.
    return [ord(ch) for ch in bs]
877
878
def intlist_to_bytes(xs):
    """Turn a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: chr() already produces bytes.
        return ''.join(chr(x) for x in xs)
    return bytes(xs)
886
887
def get_cachedir(params={}):
    """Return the cache directory: the 'cachedir' param when present,
    otherwise $XDG_CACHE_HOME/youtube-dl (with ~/.cache as XDG fallback)."""
    default_root = os.environ.get('XDG_CACHE_HOME',
                                  os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(default_root, 'youtube-dl'))
892
893
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct required by Lock/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range [0, 0x7fffffffffffffff).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; shared lock otherwise.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Reuses the OVERLAPPED pointer stashed on the file object by _lock_file.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX advisory locking: exclusive for writers, shared for readers.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
957
958
class locked_file(object):
    """Context manager wrapping io.open() with an advisory file lock.

    Read mode acquires a shared lock; write/append modes an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Never leak the file descriptor when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
988
989
def shell_quote(args):
    """Join *args* into one shell-safe command-line string."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return u' '.join(quoted)
1001
1002
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if not pred(element):
            break
1010
1011
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload travels JSON-encoded inside the fragment's query string.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, payload)
1018
1019
def unsmuggle_url(smug_url):
    """Inverse of smuggle_url(): return (url, data), with data None for
    URLs carrying no smuggled payload."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1027
1028
def format_bytes(bytes):
    """Human-readable byte count: 1024 -> '1.00KiB'; None -> 'N/A'."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1041
1042
def str_to_int(int_str):
    """Parse an int from a string that may use ',' or '.' as thousands separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1046
1047
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be determined.

    Checks the COLUMNS environment variable first, then falls back to
    asking `stty size`.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # Was a bare `except:`: stty missing, not attached to a tty, or
        # garbled output — in every case fall through to None.
        pass
    return None
1062
1063
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1074
1075
def fix_xml_all_ampersand(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Deliberately unconditional: existing '&amp;' becomes '&amp;amp;' too.
    return xml_str.replace(u'&', u'&amp;')
1079
1080
def setproctitle(title):
    """Best-effort: set the process name via glibc prctl(PR_SET_NAME).

    Silently does nothing when libc.so.6 cannot be loaded (non-glibc
    platforms) or when the loaded libc lacks prctl.
    """
    assert isinstance(title, type(u''))
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer by the encoded byte length, not the character count:
    # multi-byte UTF-8 titles used to overflow the buffer (ValueError).
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded) + 1)
    buf.value = encoded
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return # Strange libc, just skip this
1094
1095
def remove_start(s, start):
    """Return *s* without the prefix *start* (unchanged when absent)."""
    return s[len(start):] if s.startswith(start) else s
1100
1101
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded by
    urlparse; trailing slashes stripped)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
1105
1106
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of the default GET."""

    def get_method(self):
        return "HEAD"