]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[YoutubeDL] Do not require default output template to be set
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
01951dda 27import traceback
bcf89ce6 28import xml.etree.ElementTree
d77c3dfd 29import zlib
d77c3dfd 30
01ba00ca 31try:
59ae15a5 32 import urllib.request as compat_urllib_request
01ba00ca 33except ImportError: # Python 2
59ae15a5 34 import urllib2 as compat_urllib_request
01ba00ca
PH
35
36try:
59ae15a5 37 import urllib.error as compat_urllib_error
01ba00ca 38except ImportError: # Python 2
59ae15a5 39 import urllib2 as compat_urllib_error
01ba00ca
PH
40
41try:
59ae15a5 42 import urllib.parse as compat_urllib_parse
01ba00ca 43except ImportError: # Python 2
59ae15a5 44 import urllib as compat_urllib_parse
01ba00ca 45
799c0763
PH
46try:
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
50
6543f0dc
JMF
51try:
52 import urllib.parse as compat_urlparse
53except ImportError: # Python 2
54 import urlparse as compat_urlparse
55
01ba00ca 56try:
59ae15a5 57 import http.cookiejar as compat_cookiejar
01ba00ca 58except ImportError: # Python 2
59ae15a5 59 import cookielib as compat_cookiejar
01ba00ca 60
3e669f36 61try:
59ae15a5 62 import html.entities as compat_html_entities
9f37a959 63except ImportError: # Python 2
59ae15a5 64 import htmlentitydefs as compat_html_entities
3e669f36 65
a8156c1d 66try:
59ae15a5 67 import html.parser as compat_html_parser
9f37a959 68except ImportError: # Python 2
59ae15a5 69 import HTMLParser as compat_html_parser
a8156c1d 70
348d0a7a 71try:
59ae15a5 72 import http.client as compat_http_client
9f37a959 73except ImportError: # Python 2
59ae15a5 74 import httplib as compat_http_client
348d0a7a 75
2eabb802 76try:
0e283428 77 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
78except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
80
e0df6211
PH
81try:
82 from urllib.request import urlretrieve as compat_urlretrieve
83except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
85
86
5910e210
PH
87try:
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
90except ImportError:
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
92
9f37a959 93try:
59ae15a5 94 from urllib.parse import parse_qs as compat_parse_qs
9f37a959 95except ImportError: # Python 2
59ae15a5
PH
96 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
97 # Python 2's version is apparently totally broken
98 def _unquote(string, encoding='utf-8', errors='replace'):
99 if string == '':
100 return string
101 res = string.split('%')
102 if len(res) == 1:
103 return string
104 if encoding is None:
105 encoding = 'utf-8'
106 if errors is None:
107 errors = 'replace'
108 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
109 pct_sequence = b''
110 string = res[0]
111 for item in res[1:]:
112 try:
113 if not item:
114 raise ValueError
115 pct_sequence += item[:2].decode('hex')
116 rest = item[2:]
117 if not rest:
118 # This segment was just a single percent-encoded character.
119 # May be part of a sequence of code units, so delay decoding.
120 # (Stored in pct_sequence).
121 continue
122 except ValueError:
123 rest = '%' + item
124 # Encountered non-percent-encoded characters. Flush the current
125 # pct_sequence.
126 string += pct_sequence.decode(encoding, errors) + rest
127 pct_sequence = b''
128 if pct_sequence:
129 # Flush the final pct_sequence
130 string += pct_sequence.decode(encoding, errors)
131 return string
132
133 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
134 encoding='utf-8', errors='replace'):
135 qs, _coerce_result = qs, unicode
136 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
137 r = []
138 for name_value in pairs:
139 if not name_value and not strict_parsing:
140 continue
141 nv = name_value.split('=', 1)
142 if len(nv) != 2:
143 if strict_parsing:
144 raise ValueError("bad query field: %r" % (name_value,))
145 # Handle case of a control-name with no equal sign
146 if keep_blank_values:
147 nv.append('')
148 else:
149 continue
150 if len(nv[1]) or keep_blank_values:
151 name = nv[0].replace('+', ' ')
152 name = _unquote(name, encoding=encoding, errors=errors)
153 name = _coerce_result(name)
154 value = nv[1].replace('+', ' ')
155 value = _unquote(value, encoding=encoding, errors=errors)
156 value = _coerce_result(value)
157 r.append((name, value))
158 return r
159
160 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
161 encoding='utf-8', errors='replace'):
162 parsed_result = {}
163 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
164 encoding=encoding, errors=errors)
165 for name, value in pairs:
166 if name in parsed_result:
167 parsed_result[name].append(value)
168 else:
169 parsed_result[name] = [value]
170 return parsed_result
348d0a7a 171
3e669f36 172try:
59ae15a5 173 compat_str = unicode # Python 2
3e669f36 174except NameError:
59ae15a5 175 compat_str = str
3e669f36
PH
176
177try:
59ae15a5 178 compat_chr = unichr # Python 2
3e669f36 179except NameError:
59ae15a5 180 compat_chr = chr
3e669f36 181
f7300c5c
JMF
182try:
183 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
184except ImportError: # Python 2.6
185 from xml.parsers.expat import ExpatError as compat_xml_parse_error
186
b31756c1
FV
def compat_ord(c):
    """Return the integer code for *c*.

    Plain ints are handed back untouched; anything else goes through ord().
    """
    return c if type(c) is int else ord(c)
190
468e2e92
FV
191# This is not clearly defined otherwise
192compiled_regex_type = type(re.compile(''))
193
3e669f36 194std_headers = {
ae8f7871 195 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
196 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
197 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
198 'Accept-Encoding': 'gzip, deflate',
199 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 200}
f427df17 201
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the
    reported encoding cannot be looked up or cannot encode a simple
    probe string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or unusable name raises here
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately not a bare except: KeyboardInterrupt and SystemExit
        # must still propagate to the caller.
        pref = 'UTF-8'

    return pref
d77c3dfd 215
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string *s* to standard output."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print *s* after encoding it with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 223
f4bfd65f
PH
224# In Python 2.x, json.dump expects a bytestream.
225# In Python 3.x, it writes to a character stream
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as stream:
            json.dump(obj, stream)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (byte stream)."""
        with open(fn, 'wb') as stream:
            json.dump(obj, stream)
234
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element under *xpath* whose *key* attribute equals *val*."""
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Key and value are interpolated into the expression, so only
        # characters that cannot break out of the predicate are accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
248
d7e66d39
JMF
249# On python2.6 the xml.etree.ElementTree.Element methods don't support
250# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of *path* into ``{uri}tag`` form.

    *ns_map* maps each prefix to its namespace URI. Steps without a
    prefix are kept as they are.
    """
    def expand(parts):
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    steps = [step.split(':') for step in path.split('/')]
    return '/'.join([expand(parts) for parts in steps])
261
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal (#38) or hexadecimal (#x2F) character
    references are decoded; anything else is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references may contain a-f digits; the whole entity has to
    # match so that e.g. '#x2F' is not truncated to '#x2'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the literal text
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 286
a8156c1d 287compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed.

    Subclasses read the stored text back through ``self.html``.
    """

    def __init__(self):
        # This was spelled ``__init``, which Python treats as an unrelated
        # name-mangled method, so ``self.html`` was never initialised.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store *html*, parse it completely and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
297
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] once the element
        # has been seen completely
        self.result = None
        self.started = False
        # Open-tag counter per tag name, maintained while inside the element
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by resuming on the following line.

        Gives up (raises HTMLParseError) after more than ten errors or once
        the wanted element has been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once every tag of its own kind is closed
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        start_pos, end_pos = self.result[1], self.result[2]
        lines = self.html.split('\n')[start_pos[0] - 1:end_pos[0]]
        lines[0] = lines[0][start_pos[1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column is relative to
            # the text that has just been cut off
            lines[-1] = lines[-1][:end_pos[1] - start_pos[1]]
        lines[-1] = lines[-1][:end_pos[1]]
        return '\n'.join(lines).strip()
3b024e17
PH
357# Hack for https://github.com/rg3/youtube-dl/issues/662
358if sys.version_info < (2, 7, 3):
359 AttrParser.parse_endtag = (lambda self, i:
360 i + len("</scr'+'ipt>")
361 if self.rawdata[i:].startswith("</scr'+'ipt>")
362 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
363
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id* in the passed HTML document"""
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
367
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: whatever was collected before the error is returned
        pass
    return finder.get_result()
9e6dd238 376
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute recorded while parsing, if any."""
        return self.result
397
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the parse error
        pass
    return meta_parser.get_result()
408
9e6dd238
FV
409
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the markup are dropped; <br> tags and paragraph
    boundaries become newlines, all remaining tags are removed and HTML
    entities are decoded. None is passed through unchanged so that optional
    fields can be cleaned without a prior check.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
421
422
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, the characters
    that are forbidden on win32 are replaced with '#' in the last path
    component and the open is retried once. If that fails too, or if there
    was nothing to replace, the exception is raised like the standard
    open() function would do. Permission errors (EACCES) are never retried.

    The filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final component is rewritten: the directory part
        # legitimately contains separators and possibly a drive colon.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
456
457
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
1c469a94 465
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are returned exactly as mapped above
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
497
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 505
912b38b4 506
def unescapeHTML(s):
    """Replace the HTML entities in *s* by the characters they stand for.

    None is passed through; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 514
8bf48f23
PH
515
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept unicode file names directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
542
8271226a
PH
def decodeOption(optval):
    """Return the option value *optval* as text.

    Byte strings are decoded with the preferred encoding; None is passed
    through unchanged.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 551
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The boundaries are inclusive, so exactly one hour is rendered as
    '1:00:00' and exactly one minute as '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
559
a0ddb8a2
PH
560
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler to install in the URL opener.

    opts_no_check_certificate disables certificate verification on the
    Python >= 3.2 code path. Remaining keyword arguments are forwarded to
    the handler constructor.
    """
    # NOTE(review): both code paths pin SSLv3 first; confirm that this is
    # still wanted and that the ssl module in use provides PROTOCOL_SSLv3.
    if sys.version_info >= (3, 2):
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        if opts_no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)

    import httplib

    class HTTPSConnectionV3(httplib.HTTPSConnection):
        def __init__(self, *args, **kwargs):
            httplib.HTTPSConnection.__init__(self, *args, **kwargs)

        def connect(self):
            sock = socket.create_connection((self.host, self.port), self.timeout)
            if getattr(self, '_tunnel_host', False):
                self.sock = sock
                self._tunnel()
            # Try SSLv3 first and fall back to SSLv23 negotiation
            try:
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
            except ssl.SSLError:
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

    class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
        def https_open(self, req):
            return self.do_open(HTTPSConnectionV3, req)

    return HTTPSHandlerV3(**kwargs)
ea6d901e 594
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is put in front of the message.
        """
        # Errors raised while one of these is being handled count as expected
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        active_exc_info = sys.exc_info()
        if active_exc_info[0] in expected_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
619
1c256f70 620
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
624
625
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
637
638
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
646
647
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error description; it is also available as self.msg. """
        # Hand the message to Exception as well, so that str(exc) and
        # tracebacks show it instead of an empty string.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
656
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
660
661
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
669
670
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ downloaded and expected are the received and announced sizes. """
        # Give the exception a message; without it str(exc) was empty.
        super(ContentTooShortError, self).__init__(
            u'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 685
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    Once installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and transparently decodes gzipped and
    deflated responses. A request that must not be compressed only has to
    carry the HTTP header "Youtubedl-No-Compression"; the header is removed
    before the real request is made. In the same way a
    "Youtubedl-user-agent" header replaces the User-agent header.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data*, trying a raw deflate stream before a zlib one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl carrying *code*, whether or not the
        constructor accepts it."""
        factory = compat_urllib_request.addinfourl
        if hasattr(factory, 'getcode'):
            return factory(stream, headers, url, code)
        wrapped = factory(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)

        headers = req.headers
        if 'Youtubedl-no-compression' in headers:
            headers.pop('Accept-encoding', None)
            del headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in headers:
            headers.pop('User-agent', None)
            headers['User-agent'] = headers['Youtubedl-user-agent']
            del headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 766
5de90176 767
912b38b4
PH
def parse_iso8601(date_str):
    """ Return a UNIX timestamp from the given date

    The date has to look like YYYY-MM-DDTHH:MM:SS, optionally followed by
    'Z' or a numeric UTC offset such as +0100 or -05:00. None is passed
    through.
    """
    if date_str is None:
        return None

    offset_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    timezone = datetime.timedelta()
    if offset_match:
        # Cut the offset off; strptime below does not understand it
        date_str = date_str[:-len(offset_match.group(0))]
        if offset_match.group('sign'):
            sign = 1 if offset_match.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(offset_match.group('hours')),
                minutes=sign * int(offset_match.group('minutes')))

    utc_time = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
    return calendar.timegm(utc_time.timetuple())
791
792
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None for None and for strings that match none of the known
    formats.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    upload_date = None
    # Every format is tried; the last one that parses wins
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
833
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from *url*.

    The text after the last dot of the part before the first '?' is
    returned if it is purely alphanumeric, default_ext otherwise.
    """
    without_query = url.partition(u'?')[0]
    guess = without_query.rpartition(u'.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
73e79f2a 840
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: *filename* without its last extension,
    followed by the language and the subtitle format."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
843
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9]+(day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string in
    neither format makes strptime raise ValueError.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a normal literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta wants the plural keyword (days=, weeks=)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
869
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other format are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
878
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
904
905
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
914
915
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    wintypes = ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    fileno = out.fileno()
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        ("GetStdHandle", kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)(("WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(wintypes.DWORD, wintypes.DWORD)(("GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        ("GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        for pos, char in enumerate(s):
            if ord(char) > 0xffff:
                return pos
        return len(s)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP one
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
982
983
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console API is
    tried first. Otherwise s is encoded (unencodable characters are
    dropped) for byte streams, or written as text when out only accepts
    text. """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1004
1005
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Return the byte values of bs as a list of integers. """
    if not bs:
        return []
    first = bs[0]
    if not isinstance(first, int):
        # Python 2: indexing a byte string yields one-character strings
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1013
c257baff 1014
def intlist_to_bytes(xs):
    """ Build a byte string from a list of integer byte values. """
    if not xs:
        return b''
    python2 = isinstance(chr(0), bytes)
    if not python2:
        return bytes(xs)
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
1022
1023
def get_cachedir(params=None):
    """ Return the directory used for cached data.

    params is an optional options dict; its 'cachedir' entry wins if
    present. Otherwise the result is '<cache root>/youtube-dl', where the
    cache root is $XDG_CACHE_HOME or ~/.cache when that is unset. """
    if params is None:
        # Avoid a shared mutable default argument; None means "no options"
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
1028
1029
1030# Cross-platform file locking
1031if sys.platform == 'win32':
1032 import ctypes.wintypes
1033 import msvcrt
1034
1035 class OVERLAPPED(ctypes.Structure):
1036 _fields_ = [
1037 ('Internal', ctypes.wintypes.LPVOID),
1038 ('InternalHigh', ctypes.wintypes.LPVOID),
1039 ('Offset', ctypes.wintypes.DWORD),
1040 ('OffsetHigh', ctypes.wintypes.DWORD),
1041 ('hEvent', ctypes.wintypes.HANDLE),
1042 ]
1043
1044 kernel32 = ctypes.windll.kernel32
1045 LockFileEx = kernel32.LockFileEx
1046 LockFileEx.argtypes = [
1047 ctypes.wintypes.HANDLE, # hFile
1048 ctypes.wintypes.DWORD, # dwFlags
1049 ctypes.wintypes.DWORD, # dwReserved
1050 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1051 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1052 ctypes.POINTER(OVERLAPPED) # Overlapped
1053 ]
1054 LockFileEx.restype = ctypes.wintypes.BOOL
1055 UnlockFileEx = kernel32.UnlockFileEx
1056 UnlockFileEx.argtypes = [
1057 ctypes.wintypes.HANDLE, # hFile
1058 ctypes.wintypes.DWORD, # dwReserved
1059 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1060 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1061 ctypes.POINTER(OVERLAPPED) # Overlapped
1062 ]
1063 UnlockFileEx.restype = ctypes.wintypes.BOOL
1064 whole_low = 0xffffffff
1065 whole_high = 0x7fffffff
1066
1067 def _lock_file(f, exclusive):
1068 overlapped = OVERLAPPED()
1069 overlapped.Offset = 0
1070 overlapped.OffsetHigh = 0
1071 overlapped.hEvent = 0
1072 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1073 handle = msvcrt.get_osfhandle(f.fileno())
1074 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1075 whole_low, whole_high, f._lock_file_overlapped_p):
1076 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1077
1078 def _unlock_file(f):
1079 assert f._lock_file_overlapped_p
1080 handle = msvcrt.get_osfhandle(f.fileno())
1081 if not UnlockFileEx(handle, 0,
1082 whole_low, whole_high, f._lock_file_overlapped_p):
1083 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1084
1085else:
1086 import fcntl
1087
1088 def _lock_file(f, exclusive):
1089 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1090
1091 def _unlock_file(f):
1092 fcntl.lockf(f, fcntl.LOCK_UN)
1093
1094
class locked_file(object):
    """ Context manager around a file that is locked while it is in use.

    The file is opened in the constructor; entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' and 'w'.
    Leaving the context releases the lock and closes the file. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is not an IOError on Python 2; catch both so that the file is
            # never left open when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1124
1125
def shell_quote(args):
    """ Return a single shell-escaped command line for the list args.

    Elements may be text or bytes; bytes are decoded with the file system
    encoding (utf-8 if that is unknown) before being quoted. """
    # The pipes module is deprecated and no longer exists in Python 3.13+;
    # shlex.quote is the same function where it is available.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1137
1138
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1146
1147
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1154
1155
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    A URL without smuggled data is returned as (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    serialized = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
02dbf93f
PH
1163
1164
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a human-readable string such as u'1.50MiB'.

    bytes may be a number, a numeric string, or None (giving u'N/A').
    Values below one byte are reported in B and values beyond the largest
    known unit are reported in YiB. """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available units: a negative exponent (tiny values)
        # would silently pick a unit from the end of the list, and a too
        # large one would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1177
1c088fa8 1178
f53c966a
JMF
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' as digit-group separators """
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1c088fa8
PH
1182
1183
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is consulted first; if it is unset,
    empty or not a number, the output of `stty size` is used. """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value, ask stty instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing or print nothing usable.
        # KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return None
caefb1de
PH
1198
1199
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return 1 + ENGLISH_NAMES.index(name)
18258362
JMF
1210
1211
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not already start an entity by '&amp;'

    The predefined entities (amp, lt, gt, apos, quot) and numeric character
    references of up to four digits are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1218
1219
def setproctitle(title):
    """ Set the process name to title through prctl of libc.so.6.

    Does nothing when libc.so.6 cannot be loaded or has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes allocates one extra byte for the
    # terminating NUL. A buffer of exactly len(title_bytes) bytes would hand
    # prctl an unterminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1233
1234
def remove_start(s, start):
    """ Return s without the prefix start; s is unchanged if it does not
    begin with start """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1239
1240
def url_basename(url):
    """ Return the last segment of the path of url (surrounding slashes
    are ignored) """
    path = compat_urlparse.urlparse(url).path
    segments = path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1244
1245
class HEADRequest(compat_urllib_request.Request):
    """ A Request whose HTTP method is always HEAD """

    def get_method(self):
        method = "HEAD"
        return method
7217e148
PH
1249
1250
28746fbd
PH
def int_or_none(v, scale=1, default=None, get_attr=None):
    """ Return int(v) // scale, or default when there is no value.

    If get_attr is given, the value is read from that attribute of v
    (a missing attribute counts as no value). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) // scale
608d11f5
PH
1256
1257
9271bc83
PH
def float_or_none(v, scale=1, default=None):
    """ Return float(v) / scale, or default when v is None """
    if v is None:
        return default
    return float(v) / scale
43f775e4
PH
1260
1261
608d11f5
PH
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3m5s' or '42' into seconds.

    Returns None if s is None or is not in a recognized format. A trailing
    ':<digits>' part is accepted and ignored. """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None

    hours, mins = m.group('hours'), m.group('mins')
    total = int(m.group('secs'))
    if mins:
        total += 60 * int(mins)
    if hours:
        total += 60 * 60 * int(hours)
    return total
91d7d0b3
JMF
1276
1277
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename
    (e.g. 'video.mp4', 'temp' -> 'video.temp.mp4') """
    parts = os.path.splitext(filename)
    stem, real_ext = parts
    return u'{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1281
1282
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary cannot be started. """
    if args is None:
        # Avoid a shared mutable default argument
        args = []
    try:
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1291
1292
class PagedList(object):
    """ Sequence-like view over entries that are fetched one page at a time.

    pagefunc(pagenum) returns an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list,
        fetching only the pages that are needed; end=None means "up to the
        last entry". """
        collected = []
        first_page = start // self._pagesize
        for pagenum in itertools.count(first_page):
            page_first = pagenum * self._pagesize
            next_first = page_first + self._pagesize
            if start >= next_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if page_first <= start < next_first:
                lo = start % self._pagesize
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= next_first:
                hi = ((end - 1) % self._pagesize) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first:
                break
        return collected
81c2f20b
PH
1338
1339
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape sequence in s (eight hex
    digits) by the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode_escape(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1
PH
1346
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept a
# text format string on interpreters where struct insists on bytes.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1363
1364
def read_batch_urls(batch_fd):
    """ Read a batch file and return the list of URLs it contains.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) with a
    close() method; it is closed before returning. Blank lines and lines
    starting with '#', ';' or ']' are skipped. """
    # A UTF-8 byte order mark, both as it looks when the bytes were decoded
    # one character per byte and as the real U+FEFF character produced by a
    # proper UTF-8 decode (including the decode below).
    BOM_PREFIXES = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_PREFIXES:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1379
1380
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given data (same arguments as urlencode) and return
    the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1383
1384
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element.

    Doctype declarations are ignored. """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The parser argument is only passed on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    return etree.XML(s.encode('utf-8'), **kwargs)
e68301af
PH
1393
1394
# compat_getpass: getpass.getpass, except on Python 2 under Windows where a
# text prompt is first encoded with the preferred encoding.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1402
1403
# US content rating label -> integer value
# NOTE(review): the values look like minimum viewer ages - confirm with callers
US_RATINGS = dict([
    ('G', 0),
    ('PG', 10),
    ('PG-13', 13),
    ('R', 16),
    ('NC', 18),
])
fac55558
PH
1411
1412
def strip_jsonp(code):
    """ Strip a JSONP wrapper of the form 'callback(...);' and return what
    is between the parentheses; other input is returned unchanged """
    jsonp_wrapper = r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1415
1416
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in
    quality_ids, or to -1 if it is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1425
acd69589
PH
1426
# Default output template, built from the title, id and ext fields
DEFAULT_OUTTMPL = (
    '%(title)s-%(id)s.%(ext)s'
)