# youtube_dl/utils.py
# (header reconstructed: the original three lines were gitweb navigation
# residue from the page this file was scraped from)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import email.utils
6 import errno
7 import gzip
8 import io
9 import json
10 import locale
11 import math
12 import os
13 import pipes
14 import platform
15 import re
16 import ssl
17 import socket
18 import subprocess
19 import sys
20 import traceback
21 import zlib
22
23 try:
24 import urllib.request as compat_urllib_request
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_request
27
28 try:
29 import urllib.error as compat_urllib_error
30 except ImportError: # Python 2
31 import urllib2 as compat_urllib_error
32
33 try:
34 import urllib.parse as compat_urllib_parse
35 except ImportError: # Python 2
36 import urllib as compat_urllib_parse
37
38 try:
39 from urllib.parse import urlparse as compat_urllib_parse_urlparse
40 except ImportError: # Python 2
41 from urlparse import urlparse as compat_urllib_parse_urlparse
42
43 try:
44 import urllib.parse as compat_urlparse
45 except ImportError: # Python 2
46 import urlparse as compat_urlparse
47
48 try:
49 import http.cookiejar as compat_cookiejar
50 except ImportError: # Python 2
51 import cookielib as compat_cookiejar
52
53 try:
54 import html.entities as compat_html_entities
55 except ImportError: # Python 2
56 import htmlentitydefs as compat_html_entities
57
58 try:
59 import html.parser as compat_html_parser
60 except ImportError: # Python 2
61 import HTMLParser as compat_html_parser
62
63 try:
64 import http.client as compat_http_client
65 except ImportError: # Python 2
66 import httplib as compat_http_client
67
68 try:
69 from urllib.error import HTTPError as compat_HTTPError
70 except ImportError: # Python 2
71 from urllib2 import HTTPError as compat_HTTPError
72
73 try:
74 from urllib.request import urlretrieve as compat_urlretrieve
75 except ImportError: # Python 2
76 from urllib import urlretrieve as compat_urlretrieve
77
78
79 try:
80 from subprocess import DEVNULL
81 compat_subprocess_get_DEVNULL = lambda: DEVNULL
82 except ImportError:
83 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
84
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode `string`. Bytes from consecutive '%XX' escapes are
        # accumulated in `pct_sequence` and decoded with `encoding` in one
        # go, so multi-byte (e.g. UTF-8) sequences decode correctly.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' present: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex chars into one byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        # Split a query string into a list of (name, value) pairs.
        # `unicode` is the Python 2 text type; this branch only runs there.
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        # Group the (name, value) pairs into a dict of name -> [values],
        # matching urllib.parse.parse_qs semantics.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
163
# Text/character compatibility aliases: on Python 2 the unicode-aware
# builtins are `unicode`/`unichr`; on Python 3 `str`/`chr` already are.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
173
def compat_ord(c):
    """Return the integer code point for `c`, which may already be an int
    (indexing bytes on Python 3) or a one-character string (Python 2)."""
    return c if type(c) is int else ord(c)
177
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers attached to every request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
188
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; some platforms
        # report an encoding that cannot be used.
        u'TEST'.encode(pref)
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        pref = 'UTF-8'

    return pref
202
# Print a unicode string without crashing on narrow/byte-oriented stdout.
if sys.version_info < (3,0):
    def compat_print(s):
        # Python 2: stdout expects bytes; unencodable characters are
        # emitted as XML character references instead of raising.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Python 3: print handles unicode natively; enforce text input.
        assert type(s) == type(u'')
        print(s)
210
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        # Serialize `obj` as JSON to file path `fn` (binary mode on py2).
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        # Serialize `obj` as JSON to file path `fn`, always UTF-8 encoded.
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
221
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # The expression is built by string interpolation, so restrict key
        # and value to safe character sets to avoid breaking the xpath.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6's ElementTree lacks attribute predicates; emulate by
        # scanning all matches for the first with attrib[key] == val.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
235
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand each 'prefix:tag' step of `path` to '{uri}tag' using `ns_map`."""
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
248
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
    # BUG FIX: the old pattern used \d+ for both forms, so hex references
    # containing the digits a-f (e.g. &#x2F;) were never matched.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
273
# NOTE(review): replaces HTMLParser's start-tag-end regex with a backported
# cpython bugfix version — TODO confirm the exact upstream issue it mirrors.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser subclass that keeps the full document text around.

    Subclasses (AttrParser, MetaParser) rely on `self.html` for error
    recovery and for slicing out the final result.
    """
    def __init__(self):
        # BUG FIX: this was misspelled `__init`, so it never ran as a
        # constructor and `self.html` was undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Feed the complete document `html` and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
284
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute  # attribute name to look for
        self.value = value          # required attribute value
        self.result = None          # becomes [tag, startpos, endpos]
        self.started = False        # True while inside the wanted element
        self.depth = {}             # per-tag open-nesting counts while started
        self.watch_startpos = False # next event records the content start
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Recover from malformed HTML by re-parsing from the line after the
        # error position, up to 10 times — unless we are already inside the
        # wanted element, in which case the result would be corrupted.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        # NOTE(review): goahead() is a private HTMLParser API — resumes parsing.
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # When the nesting count of the opening tag returns to zero we
            # have found the matching close tag: record the end position.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event right after the opening tag pins down where the
    # element content starts.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the inner text of the matched element, or None if no
        # complete [tag, start, end] triple was recorded.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() positions are (1-based line, 0-based column).
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column must be shifted
            # by the amount already trimmed from the start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python < 2.7.3, skip over literal "</scr'+'ipt>" sequences (as emitted
# by document.write-style scripts) instead of letting parse_endtag choke.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
350
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute lookup.
    return get_element_by_attribute("id", id, html)
354
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: the parser may already have captured the element
        # before choking on malformed markup.
        pass
    return finder.get_result()
363
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> tags are of interest; record `content` as soon as a
        # tag with the requested `name` attribute shows up.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        return self.result
384
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    finder = MetaParser(name)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Malformed markup: return whatever was captured before the error.
        pass
    return finder.get_result()
395
396
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Collapse literal newlines; <br> and </p><p> boundaries become newlines.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop all remaining tags, then decode HTML entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
408
409
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems won't be fixed by renaming; re-raise.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # BUG FIX: os.path.join() takes *args, not a single iterable —
        # passing a generator produced a bogus path. Unpack the parts.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # BUG FIX: previously re-opened `filename` (which just failed)
            # instead of the sanitized `alt_filename`.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
443
444
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date.
        return None
    return email.utils.mktime_tz(parsed)
452
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters, DEL and '?' are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
484
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    result = []
    for item in iterable:
        # List membership (rather than a set) keeps this working for
        # unhashable elements as well.
        if item not in result:
            result.append(item)
    return result
492
def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')
    # Delegate each &...; occurrence to htmlentity_transform.
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
501
def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # Python 2 on non-Windows: encode with the filesystem encoding.
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
523
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        # Byte strings come from the py2 command line; decode with the
        # locale's preferred encoding.
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
532
def formatSeconds(secs):
    """Format a duration in seconds as '[H:]M[M]:SS'-style text.

    BUG FIX: the boundaries were exclusive (`>`), so exactly one hour was
    rendered as '60:00' and exactly one minute as '60'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
540
def make_HTTPS_handler(opts_no_check_certificate):
    """Build an HTTPS handler for urllib.

    On Python < 3.2 (no SSLContext support in HTTPSHandler) a custom
    connection class is used that tries SSLv3 first and falls back to
    auto-negotiation; certificate checking is not available there.
    On Python >= 3.2 an SSLContext is configured, honouring
    `opts_no_check_certificate`.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # HTTP CONNECT tunnelling (proxy) before the TLS wrap.
                    self.sock = sock
                    self._tunnel()
                try:
                    # Try SSLv3 first; some servers mishandle auto-negotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context)
574
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as "expected" (i.e.
        # not a youtube-dl bug), regardless of what the caller passed.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        # Render the stored traceback as a single string, or None when no
        # traceback was captured at construction time.
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
596
597
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Same behavior as ExtractorError; the distinct type lets callers
    # catch failed-regex extraction specifically.
    pass
601
602
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
614
615
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
623
624
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # BUG FIX: also pass the message to Exception so that str(e) shows
        # it (previously str() on this error returned an empty string).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
633
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
637
638
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
646
647
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # downloaded: bytes actually received; expected: bytes announced
        # by the server (e.g. via Content-Length).
        self.downloaded = downloaded
        self.expected = expected
662
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Handle both raw deflate streams and zlib-wrapped ones.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl versions take no `code` argument; set it after
        # construction there.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers onto every request (overriding any
        # same-named header already present).
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip Accept-encoding to disable
        # compression for this request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively fewer trailing bytes until the
                # stream decompresses cleanly.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
743
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD.

    Tries a list of known date formats in order; returns None when none of
    them matches.
    """
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # This expression didn't match; try the next one. (Was a bare
            # `except:` that also hid real errors such as TypeError.)
            pass
    return upload_date
770
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from a URL: the text after the last '.' in the
    pre-query part, falling back to `default_ext` when it is not purely
    alphanumeric."""
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
777
def subtitles_filename(filename, sub_lang, sub_format):
    """Build '<base>.<lang>.<format>' from a media filename."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
780
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad aproximation: months and years become fixed day counts.
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    return today + datetime.timedelta(**{unit + 's': amount})
806
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
832
833
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back a byte string; normalize to unicode.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
842
843
def write_string(s, out=None):
    """Write a unicode string to `out` (default: sys.stderr) and flush."""
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    needs_bytes = ('b' in getattr(out, 'mode', '')
                   or sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if needs_bytes:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
854
855
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing yields 1-char strings.
    return [ord(c) for c in bs]
863
864
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: str is bytes, build via chr().
        return ''.join([chr(x) for x in xs])
    # Python 3: bytes() accepts an iterable of ints directly.
    return bytes(xs)
872
873
def get_cachedir(params=None):
    """Return the cache directory: `params['cachedir']` if given, otherwise
    $XDG_CACHE_HOME/youtube-dl (defaulting to ~/.cache/youtube-dl).

    BUG FIX: `params` previously defaulted to a mutable `{}`; use None to
    avoid the shared-mutable-default pitfall (backward compatible).
    """
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return (params or {}).get('cachedir', os.path.join(cache_root, 'youtube-dl'))
878
879
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure used by LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file for (un)lock calls.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock the entire file; 0x2 == LOCKFILE_EXCLUSIVE_LOCK.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX advisory lock: exclusive for writers, shared for readers.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
943
944
class locked_file(object):
    """File wrapper holding an advisory lock for the duration of a `with`
    block: lock on __enter__, unlock and close on __exit__."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock; writers and appenders need exclusivity.
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
974
975
def shell_quote(args):
    """Return `args` joined into a single shell-safe command string."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return u' '.join(quoted)
987
988
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if not pred(item):
            break
996
997
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment, which servers never see.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, payload)
1004
1005
def unsmuggle_url(smug_url):
    """Undo smuggle_url(): return (url, data), where data is None when the
    URL carries no smuggled payload."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1013
1014
def format_bytes(bytes):
    """Human-readable size with binary (1024-based) prefixes, e.g. '1.00MiB'."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1027
1028
def str_to_int(int_str):
    """Parse an int from a string that may use ',' or '.' as thousands separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1032
1033
def get_term_width():
    """Return the terminal width in columns, or None when it cannot be
    determined.

    Honours the COLUMNS environment variable first, then falls back to
    asking `stty size`.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Was a bare `except:`. stty may be missing or its output
        # unparsable — either way fall through to "unknown".
        pass
    return None
1048
1049
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        # BUG FIX: the first entry was misspelled u'Januar', so 'January'
        # could never be resolved.
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        return None
1060
1061
def fix_xml_all_ampersand(xml_str):
    """Replace stray '&' by '&amp;' in XML.

    BUG FIX: the previous version blindly replaced every '&', so an
    already-escaped reference like '&amp;' became the double-escaped
    '&amp;amp;'. Existing entity references (named, decimal or hex numeric)
    are now left untouched.
    """
    return re.sub(r'&(?![A-Za-z]+;|#[0-9]+;|#x[0-9a-fA-F]+;)', u'&amp;', xml_str)