]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Add format to unified_strdate
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import contextlib
5 import ctypes
6 import datetime
7 import email.utils
8 import errno
9 import gzip
10 import itertools
11 import io
12 import json
13 import locale
14 import math
15 import os
16 import pipes
17 import platform
18 import re
19 import ssl
20 import socket
21 import struct
22 import subprocess
23 import sys
24 import traceback
25 import xml.etree.ElementTree
26 import zlib
27
28 try:
29 import urllib.request as compat_urllib_request
30 except ImportError: # Python 2
31 import urllib2 as compat_urllib_request
32
33 try:
34 import urllib.error as compat_urllib_error
35 except ImportError: # Python 2
36 import urllib2 as compat_urllib_error
37
38 try:
39 import urllib.parse as compat_urllib_parse
40 except ImportError: # Python 2
41 import urllib as compat_urllib_parse
42
43 try:
44 from urllib.parse import urlparse as compat_urllib_parse_urlparse
45 except ImportError: # Python 2
46 from urlparse import urlparse as compat_urllib_parse_urlparse
47
48 try:
49 import urllib.parse as compat_urlparse
50 except ImportError: # Python 2
51 import urlparse as compat_urlparse
52
53 try:
54 import http.cookiejar as compat_cookiejar
55 except ImportError: # Python 2
56 import cookielib as compat_cookiejar
57
58 try:
59 import html.entities as compat_html_entities
60 except ImportError: # Python 2
61 import htmlentitydefs as compat_html_entities
62
63 try:
64 import html.parser as compat_html_parser
65 except ImportError: # Python 2
66 import HTMLParser as compat_html_parser
67
68 try:
69 import http.client as compat_http_client
70 except ImportError: # Python 2
71 import httplib as compat_http_client
72
73 try:
74 from urllib.error import HTTPError as compat_HTTPError
75 except ImportError: # Python 2
76 from urllib2 import HTTPError as compat_HTTPError
77
78 try:
79 from urllib.request import urlretrieve as compat_urlretrieve
80 except ImportError: # Python 2
81 from urllib import urlretrieve as compat_urlretrieve
82
83
84 try:
85 from subprocess import DEVNULL
86 compat_subprocess_get_DEVNULL = lambda: DEVNULL
87 except ImportError:
88 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
89
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string* (backport of Python 3's urllib.parse.unquote).

        Contiguous %XX escapes are collected into one byte sequence before
        decoding, so multi-byte UTF-8 escapes decode correctly.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent escapes at all; nothing to do.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 str.decode('hex'): two hex digits -> one raw byte.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) pairs."""
        # _coerce_result: hand unicode strings back to Python 2 callers.
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                # Repeated keys accumulate all of their values.
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
168
try:
    compat_str = unicode  # Python 2
except NameError:
    # Python 3: the unified str type is already unicode.
    compat_str = str

try:
    compat_chr = unichr  # Python 2
except NameError:
    # Python 3: chr() covers the full Unicode range.
    compat_chr = chr

try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
183
def compat_ord(c):
    """Return the integer value of *c*.

    Iterating a Python 3 bytes object yields ints (returned unchanged),
    while Python 2 byte strings yield 1-character strings (converted with
    ord()). isinstance() is the idiomatic type check.
    """
    if isinstance(c, int):
        return c
    return ord(c)
187
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers applied to every outgoing HTTP request (any header the
# caller already set with the same name is replaced; see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
198
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually works; some platforms report
        # bogus or unknown codec names.
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare `except:` (which also caught SystemExit and
        # KeyboardInterrupt); fall back to UTF-8 on any broken locale.
        pref = 'UTF-8'

    return pref
212
if sys.version_info < (3, 0):
    def compat_print(s):
        # Python 2: stdout expects bytes; characters the locale encoding
        # cannot represent become &#NNN; numeric references.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Python 3 prints unicode natively; just reject byte strings.
        assert type(s) == type(u'')
        print(s)
220
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (binary stream)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON to the file named fn (UTF-8 text stream)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
231
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key/val are interpolated into the XPath expression, so restrict
        # them to characters that cannot change the expression's meaning.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    # Python 2.6's ElementTree does not support attribute predicates in
    # find(); emulate them with a manual scan over findall().
    def find_xpath_attr(node, xpath, key, val):
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
245
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix on this step; keep it verbatim.
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
258
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#47;) or hexadecimal (&#x2F;).
    # The previous pattern x?\d+ could not match hex digits a-f, so
    # references like &#x2F; were returned literally instead of decoded.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # 'x2F' -> '0x2F' so int() accepts it
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
283
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser base that remembers the full document for later slicing."""
    def __init__(self):
        # BUGFIX: this was `def __init(self)` (missing trailing underscores),
        # so the constructor never ran and self.html was never initialized.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None  # full document text, set by loads()

    def loads(self, html):
        """Feed the complete document and finish parsing."""
        self.html = html
        self.feed(html)
        self.close()
294
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute        # attribute name to look for
        self.value = value                # required attribute value
        self.result = None                # becomes [tag, startpos, endpos]
        self.started = False              # True while inside the target tag
        self.depth = {}                   # per-tag-name open-tag counters
        self.watch_startpos = False       # next parse event records content start
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Recover from malformed HTML by skipping the offending line and
        # resuming; give up after 10 errors or once the target tag started.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any start tag after the opening one fixes the content start.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The target tag is closed once its own nesting level hits zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every event kind immediately following the opening tag can mark the
    # start of the tag's content.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Requires tag name plus both positions to have been recorded.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() positions are (1-based line, 0-based column).
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
360
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Convenience wrapper over the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
364
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    extractor = AttrParser(attribute, value)
    try:
        extractor.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed input: whatever was captured before the
        # parser gave up is still returned below.
        pass
    return extractor.get_result()
373
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name        # target value of the meta tag's name attr
        self.content = None
        self.result = None      # captured content attribute, if found

    def handle_starttag(self, tag, attrs):
        # Only <meta> elements are of interest; skip everything else early.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        """Return the captured content attribute, or None."""
        return self.result
394
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass  # a partial parse may already hold the answer
    return meta_parser.get_result()
405
406
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
418
419
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors won't be fixed by renaming; re-raise immediately.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # BUGFIX: os.path.join was called with a single generator argument
        # (a TypeError at runtime); the components must be unpacked.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # BUGFIX: previously re-opened the original (failing) filename
            # instead of the sanitized alternative.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
453
454
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising.
        return None
    return email.utils.mktime_tz(parsed)
462
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters, DEL and '?' are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            # Restricted mode flattens all non-ASCII characters.
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if not is_id:
        # Collapse the underscore runs introduced above and trim the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
494
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Keeps the first occurrence of each element, preserving order.
    # List membership (not a set) so unhashable elements keep working.
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
502
def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    # Every &name; / &#N; entity is decoded by htmlentity_transform.
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
511
512
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    # Characters the target encoding cannot represent are silently dropped.
    return s.encode(encoding, 'ignore')
539
540
def decodeOption(optval):
    """Decode a command-line option value to a unicode string (None passes through)."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        # Byte strings arrive from Python 2 argv; decode with the locale.
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
549
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or S text.

    Boundaries use >= so that exactly one hour renders as '1:00:00' and
    exactly one minute as '1:00' (previously `>` let 3600 fall through to
    the minute form, producing '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
557
558
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPS handler that connects with SSLv3 where possible.

    NOTE(review): PROTOCOL_SSLv3 is insecure and missing from modern ssl
    builds -- confirm against the Python/OpenSSL versions being supported.
    """
    if sys.version_info < (3, 2):
        # Python < 3.2 has no ssl.SSLContext; use a custom connection class
        # that retries the handshake with SSLv23 if SSLv3 fails.
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Proxy tunnelling (CONNECT) happens before the TLS handshake.
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    # Server refused SSLv3; let OpenSSL negotiate a version.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        # Certificate checking is optional (--no-check-certificate).
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
592
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as expected (i.e. not
        # youtube-dl bugs), regardless of what the caller passed.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Render the stored traceback as a string, or None if absent."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
614
615
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Distinct subclass so callers can catch missing-pattern failures
    # separately from other ExtractorErrors.
    pass
619
620
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
632
633
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Marker exception only; carries no extra state.
    pass
641
642
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialize Exception too, so str(exc) shows the message (the
        # previous implementation left it empty); keep the .msg attribute
        # that existing callers read.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
651
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Used as a control-flow signal to stop further downloads.
    pass
655
656
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Marker exception only; carries no extra state.
    pass
664
665
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Record actual vs announced size for the caller's error message.
        self.downloaded, self.expected = downloaded, expected
680
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate first; some servers send zlib-wrapped data instead.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib versions do not accept `code` in the constructor.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-apply the standard headers so they override urllib's defaults
        # (delete-then-add avoids case-variant duplicates).
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: strip it and disable compression.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: per-request User-Agent override.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively shorter payloads until it parses.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests/responses get identical treatment.
    https_request = http_request
    https_response = http_response
761
762
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    # Commas are noise for all supported formats.
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    upload_date = None
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
799
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    # Chop off the query string, then take whatever follows the last dot.
    candidate = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
806
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
809
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # Months and years are approximated as fixed day counts.
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta wants the plural keyword (days=, weeks=).
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
835
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything that is not exactly eight digits passes through untouched.
        return date_str
    return '-'.join(match.groups())
844
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable interval.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
870
871
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; normalize to unicode.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
880
881
def write_string(s, out=None):
    """Write the unicode string s to out (default: sys.stderr), working
    around platform- and version-specific encoding quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if ('b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        # Binary streams need bytes; unencodable characters are dropped.
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if sys.platform == 'win32' and hasattr(out, 'encoding'):
            # Round-trip through the stream's own codec, dropping anything
            # it cannot represent, then retry the write.
            s = s.encode(out.encoding, 'ignore').decode(out.encoding)
            out.write(s)
        else:
            raise

    out.flush()
902
903
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing yields 1-character strings.
    return [ord(c) for c in bs]
911
912
def intlist_to_bytes(xs):
    """Turn a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: chr() already produces byte strings.
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
920
921
def get_cachedir(params=None):
    """Return the cache directory youtube-dl should use.

    Honors params['cachedir'] when present; otherwise falls back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl).

    The old signature used a mutable default argument (params={});
    None is the safe idiom and is fully backward-compatible.
    """
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
926
927
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct passed to LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte-range length: covers the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep a reference on the file object so the struct outlives the call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 requests an exclusive lock; 0x0 a shared one.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX: advisory lock over the whole file.
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
991
992
class locked_file(object):
    """File wrapper that holds an OS-level lock for the duration of a `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writers ('a'/'w') need an exclusive lock; readers can share.
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1022
1023
def shell_quote(args):
    """Return a single shell-escaped command line for the given argv list."""
    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # NOTE(review): pipes.quote is undocumented and deprecated;
        # shlex.quote is the Python 3 equivalent — kept as-is here for
        # Python 2 compatibility. Confirm before migrating.
        quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)
1035
1036
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if not pred(element):
            break
1044
1045
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    smuggled = json.dumps(data)
    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': smuggled})
    return u'#'.join((url, sdata))
1052
1053
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data).

    When the URL carries no smuggled payload, return (smug_url, default)
    unchanged.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1061
1062
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1024 -> u'1.00KiB'.

    Accepts None (rendered as u'N/A'), ints, floats, and numeric strings.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp: values in (0, 1) would otherwise yield a negative index
        # (picking u'YiB' from the end), and values >= 1024**9 would
        # overflow the suffix table with an IndexError.
        exponent = min(max(int(math.log(bytes, 1024.0)), 0), len(suffixes) - 1)
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1075
1076
def str_to_int(int_str):
    """Parse an integer from a string, ignoring ',' and '.' group separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1080
1081
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be determined.

    Honours the COLUMNS environment variable first, then falls back to
    asking ``stty size``.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # stty is missing (e.g. on Windows), we are not attached to a
        # terminal, or its output was unparsable.  Note: the previous
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        pass
    return None
1096
1097
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, english_name in enumerate(ENGLISH_NAMES, start=1):
        if english_name == name:
            return number
    return None
1108
1109
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing entity and character
    # references are left alone.
    lone_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return lone_amp.sub(u'&amp;', xml_str)
1116
1117
def setproctitle(title):
    """Set the process name shown by tools like ps/top (Linux only).

    Uses prctl(PR_SET_NAME) via libc; silently does nothing when libc or
    prctl is unavailable (non-Linux platforms).
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer by the *encoded* byte length: sizing it by the
    # character count (as before) raised ValueError for non-ASCII titles,
    # since their UTF-8 encoding is longer than len(title).
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1131
1132
def remove_start(s, start):
    """Strip the prefix *start* from *s*, if present."""
    return s[len(start):] if s.startswith(start) else s
1137
1138
def url_basename(url):
    """Return the last component of the URL's path, e.g.
    'http://x/a/b.mp4?q=1' -> u'b.mp4'."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip(u'/').split(u'/')
    return components[-1]
1142
1143
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that issues a HEAD request instead of GET."""
    def get_method(self):
        return "HEAD"
1147
1148
def int_or_none(v, scale=1):
    """Convert *v* to an int divided by *scale*; pass None through unchanged."""
    if v is None:
        return None
    return int(v) // scale
1151
1152
def parse_duration(s):
    """Parse a duration string like '1:02:03', '3m10s' or '45' into seconds.

    Returns None when *s* is None or does not look like a duration.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if m is None:
        return None

    hours, mins, secs = m.group('hours'), m.group('mins'), m.group('secs')
    duration = int(secs)
    if mins is not None:
        duration += 60 * int(mins)
    if hours is not None:
        duration += 3600 * int(hours)
    return duration
1167
1168
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension:
    ('video.mp4', 'temp') -> u'video.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
1172
1173
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # args=None sentinel instead of a mutable default list (the shared
    # default-list anti-pattern); behavior is unchanged for all callers.
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1182
1183
class PagedList(object):
    """Lazy list over a paginated data source.

    pagefunc(pagenum) must return an iterable with the contents of page
    ``pagenum`` (0-based); every page except possibly the last is assumed
    to hold exactly ``pagesize`` entries.
    """

    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable pagenum -> iterable of that page's results
        # pagesize: number of entries per full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the results with indices in [start, end) as a list."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` within this page (0 unless start falls here)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-end offset within this page, or None for "all"
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1229
1230
1231 def uppercase_escape(s):
1232 return re.sub(
1233 r'\\U([0-9a-fA-F]{8})',
1234 lambda m: compat_chr(int(m.group(1), base=16)), s)
1235
# Feature probe: on Python 2.6 (and some 2.7 builds) struct.pack rejects
# a unicode format spec, so wrap it; elsewhere use struct directly.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode the text format spec to bytes before delegating
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same workaround as struct_pack, for unpacking
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1252
1253
def read_batch_urls(batch_fd):
    """Read a batch file descriptor and return the list of URLs in it.

    Lines are stripped, a leading BOM is removed, and empty lines as well
    as comment lines (starting with '#', ';' or ']') are skipped.
    The descriptor is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF, so the old comparison against the
        # raw bytes u'\xef\xbb\xbf' never matched properly decoded input;
        # strip both the decoded codepoint and the raw-bytes (mojibake) form.
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1268
1269
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes, ready to
    be used as a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1272
1273
def parse_xml(s):
    """Parse the XML document in the text string *s*, ignoring any DOCTYPE
    declaration, and return the root element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)