]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[ard] Show error message for videos that are no longer available (#3422)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
01951dda 27import traceback
bcf89ce6 28import xml.etree.ElementTree
d77c3dfd 29import zlib
d77c3dfd 30
01ba00ca 31try:
59ae15a5 32 import urllib.request as compat_urllib_request
01ba00ca 33except ImportError: # Python 2
59ae15a5 34 import urllib2 as compat_urllib_request
01ba00ca
PH
35
36try:
59ae15a5 37 import urllib.error as compat_urllib_error
01ba00ca 38except ImportError: # Python 2
59ae15a5 39 import urllib2 as compat_urllib_error
01ba00ca
PH
40
41try:
59ae15a5 42 import urllib.parse as compat_urllib_parse
01ba00ca 43except ImportError: # Python 2
59ae15a5 44 import urllib as compat_urllib_parse
01ba00ca 45
799c0763
PH
46try:
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
50
6543f0dc
JMF
51try:
52 import urllib.parse as compat_urlparse
53except ImportError: # Python 2
54 import urlparse as compat_urlparse
55
01ba00ca 56try:
59ae15a5 57 import http.cookiejar as compat_cookiejar
01ba00ca 58except ImportError: # Python 2
59ae15a5 59 import cookielib as compat_cookiejar
01ba00ca 60
3e669f36 61try:
59ae15a5 62 import html.entities as compat_html_entities
9f37a959 63except ImportError: # Python 2
59ae15a5 64 import htmlentitydefs as compat_html_entities
3e669f36 65
a8156c1d 66try:
59ae15a5 67 import html.parser as compat_html_parser
9f37a959 68except ImportError: # Python 2
59ae15a5 69 import HTMLParser as compat_html_parser
a8156c1d 70
348d0a7a 71try:
59ae15a5 72 import http.client as compat_http_client
9f37a959 73except ImportError: # Python 2
59ae15a5 74 import httplib as compat_http_client
348d0a7a 75
2eabb802 76try:
0e283428 77 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
78except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
80
e0df6211
PH
81try:
82 from urllib.request import urlretrieve as compat_urlretrieve
83except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
85
86
5910e210
PH
87try:
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
90except ImportError:
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
92
9f37a959 93try:
f1f725c6
PH
94 from urllib.parse import unquote as compat_urllib_parse_unquote
95except ImportError:
96 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
59ae15a5
PH
97 if string == '':
98 return string
99 res = string.split('%')
100 if len(res) == 1:
101 return string
102 if encoding is None:
103 encoding = 'utf-8'
104 if errors is None:
105 errors = 'replace'
106 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
107 pct_sequence = b''
108 string = res[0]
109 for item in res[1:]:
110 try:
111 if not item:
112 raise ValueError
113 pct_sequence += item[:2].decode('hex')
114 rest = item[2:]
115 if not rest:
116 # This segment was just a single percent-encoded character.
117 # May be part of a sequence of code units, so delay decoding.
118 # (Stored in pct_sequence).
119 continue
120 except ValueError:
121 rest = '%' + item
122 # Encountered non-percent-encoded characters. Flush the current
123 # pct_sequence.
124 string += pct_sequence.decode(encoding, errors) + rest
125 pct_sequence = b''
126 if pct_sequence:
127 # Flush the final pct_sequence
128 string += pct_sequence.decode(encoding, errors)
129 return string
130
f1f725c6
PH
131
132try:
133 from urllib.parse import parse_qs as compat_parse_qs
134except ImportError: # Python 2
135 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
136 # Python 2's version is apparently totally broken
137
59ae15a5
PH
138 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
139 encoding='utf-8', errors='replace'):
140 qs, _coerce_result = qs, unicode
141 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
142 r = []
143 for name_value in pairs:
144 if not name_value and not strict_parsing:
145 continue
146 nv = name_value.split('=', 1)
147 if len(nv) != 2:
148 if strict_parsing:
149 raise ValueError("bad query field: %r" % (name_value,))
150 # Handle case of a control-name with no equal sign
151 if keep_blank_values:
152 nv.append('')
153 else:
154 continue
155 if len(nv[1]) or keep_blank_values:
156 name = nv[0].replace('+', ' ')
f1f725c6
PH
157 name = compat_urllib_parse_unquote(
158 name, encoding=encoding, errors=errors)
59ae15a5
PH
159 name = _coerce_result(name)
160 value = nv[1].replace('+', ' ')
f1f725c6
PH
161 value = compat_urllib_parse_unquote(
162 value, encoding=encoding, errors=errors)
59ae15a5
PH
163 value = _coerce_result(value)
164 r.append((name, value))
165 return r
166
167 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
168 encoding='utf-8', errors='replace'):
169 parsed_result = {}
170 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
171 encoding=encoding, errors=errors)
172 for name, value in pairs:
173 if name in parsed_result:
174 parsed_result[name].append(value)
175 else:
176 parsed_result[name] = [value]
177 return parsed_result
348d0a7a 178
3e669f36 179try:
59ae15a5 180 compat_str = unicode # Python 2
3e669f36 181except NameError:
59ae15a5 182 compat_str = str
3e669f36
PH
183
184try:
59ae15a5 185 compat_chr = unichr # Python 2
3e669f36 186except NameError:
59ae15a5 187 compat_chr = chr
3e669f36 188
f7300c5c
JMF
189try:
190 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
191except ImportError: # Python 2.6
192 from xml.parsers.expat import ExpatError as compat_xml_parse_error
193
b31756c1
FV
def compat_ord(c):
    """Return ``c`` unchanged when it is already an int, otherwise ``ord(c)``."""
    return c if type(c) is int else ord(c)
197
468e2e92
FV
198# This is not clearly defined otherwise
199compiled_regex_type = type(re.compile(''))
200
3e669f36 201std_headers = {
ae8f7871 202 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
203 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
204 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
205 'Accept-Encoding': 'gzip, deflate',
206 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 207}
f427df17 208
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    If the locale lookup fails, or the reported encoding cannot encode a
    simple test string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (unknown codec, broken locale, ...), but no
        # longer a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 222
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string ``s``."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print ``s`` encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 230
f4bfd65f
PH
231# In Python 2.x, json.dump expects a bytestream.
232# In Python 3.x, it writes to a character stream
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize ``obj`` as JSON into the file named ``fn`` (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as out:
            json.dump(obj, out)
else:
    def write_json_file(obj, fn):
        """Serialize ``obj`` as JSON into the file named ``fn`` (binary mode)."""
        with open(fn, 'wb') as out:
            json.dump(obj, out)
241
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first xpath match whose attribute key equals val """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # restricted character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
255
d7e66d39
JMF
256# On python2.6 the xml.etree.ElementTree.Element methods don't support
257# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of ``path`` to ``{uri}tag`` using ``ns_map``.

    Steps without a colon are kept verbatim. ``ns_map`` maps each prefix
    to its namespace URI; an unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join([expand(step) for step in path.split('/')])
268
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal/hexadecimal character references are
    decoded; anything else is returned as the literal '&entity;' text.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references may contain a-f digits (e.g. '&#x2F;'), so they
    # need their own alternative instead of sharing the decimal pattern.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the literal text.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 293
a8156c1d 294compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse."""

    def __init__(self):
        # This must be spelled __init__: a method named ``__init`` is
        # name-mangled and never runs as the constructor.
        compat_html_parser.HTMLParser.__init__(self)
        # Raw markup of the last document passed to loads(), or None.
        self.html = None

    def loads(self, html):
        """Store ``html`` on the instance, then feed it and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
304
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    After loads(), get_result() returns the text found between the opening
    tag whose ``attribute`` equals ``value`` and its matching end tag.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # None until a match is seen, then [tag, start_pos, end_pos] where
        # the positions are (line, column) tuples from getpos().
        self.result = None
        # True while parsing inside the matched element.
        self.started = False
        # Open-tag counter per tag name, maintained while ``started``.
        self.depth = {}
        # True right after the matched start tag, until the next parser event.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead.

        Raises HTMLParseError once more than 10 errors were seen or when the
        error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is the first event after the matched tag.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element ends when the count for its own tag is back to 0.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos as well.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped inner text of the matched element, or None.

        None is returned when nothing matched or when the start/end
        positions were not both recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # getpos() lines are 1-based; keep the lines from start to end.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column has to be shifted
            # by the part already cut off at the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
364# Hack for https://github.com/rg3/youtube-dl/issues/662
365if sys.version_info < (2, 7, 3):
366 AttrParser.parse_endtag = (lambda self, i:
367 i + len("</scr'+'ipt>")
368 if self.rawdata[i:].startswith("</scr'+'ipt>")
369 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
370
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute("id", id, html).
    """
    return get_element_by_attribute("id", id, html)
374
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag whose ``attribute`` equals ``value``.

    Parse errors are ignored; whatever the parser collected before the
    error (possibly None) is returned.
    """
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with the partial parser state.
        pass
    return finder.get_result()
9e6dd238 383
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that records the ``content`` attribute of the meta
    tag whose ``name`` attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching meta tag overrides an earlier one.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the recorded content value, or None if nothing matched."""
        return self.result
404
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag named ``name`` in ``html``.

    Parse errors are ignored; the value found so far (or None) is returned.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the error.
        pass
    return meta_finder.get_result()
415
9e6dd238
FV
416
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string"""
    # Source newlines are not significant; <br> and </p><p> are.
    text = html.replace('\n', ' ')
    for line_break in (r'\s*<\s*br\s*/?\s*>\s*', r'<\s*/\s*p\s*>\s*<\s*p[^>]*>'):
        text = re.sub(line_break, '\n', text)
    # Drop the remaining tags, then decode the entities
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
9e6dd238
FV
428
429
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly (replacing characters that are forbidden on win32
    in the last path component with '#') and opens that instead. If that
    fails as well the exception is raised, like the standard open()
    function would.

    The special name u'-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming the file.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # final component is rewritten so the directory part stays valid.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
463
464
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 472
def sanitize_filename(s, restricted=False, is_id=False):
    """Make ``s`` usable as part of a filename.

    With ``restricted`` a stricter character subset is enforced (shell
    metacharacters, whitespace and non-ASCII characters become '_').
    Set ``is_id`` when ``s`` is an ID that should be kept as intact as
    possible: the underscore clean-up at the end is skipped then.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd
FV
504
def orderedSet(iterable):
    """ Return the items of iterable as a list, keeping the first occurrence of each """
    unique = []
    for item in iterable:
        # List membership (not a set) so unhashable items are supported too.
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 512
912b38b4 513
def unescapeHTML(s):
    """Replace the HTML entities in the text string ``s`` by their characters.

    None is passed through unchanged; any other value must be a compat_str.
    """
    if s is not None:
        assert type(s) == compat_str
        return re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
    return None
d77c3dfd 521
8bf48f23
PH
522
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name ``s`` in the form expected by the OS-level APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is going to be passed to a
                          subprocess call instead of a file API

    The text string is returned unchanged on Python 3, and on Python 2
    under Windows unless for_subprocess is set. Otherwise it is encoded to
    bytes, silently dropping characters the encoding cannot represent.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    # getfilesystemencoding() may return None; fall back to UTF-8 then.
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
549
f07b74fc
PH
550
def encodeArgument(s):
    """Encode the command-line argument ``s`` for use in a subprocess call.

    Byte strings (legacy callers) are decoded as ASCII first; the result is
    then passed through encodeFilename() with for_subprocess set.
    """
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
558
559
8271226a
PH
def decodeOption(optval):
    """Return the option value ``optval`` as a text string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 568
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: exactly one hour is '1:00:00'
    and exactly one minute is '1:00'.
    """
    # Inclusive bounds: with '>' 3600 came out as '60:00' and 60 as '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
576
a0ddb8a2
PH
577
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for https:// requests.

    opts_no_check_certificate -- when true, certificates are not verified
                                 (only honoured on Python >= 3.2, where an
                                 SSLContext is used)
    kwargs                    -- passed on to the handler constructor

    NOTE(review): both branches ask for SSLv3 first. Presumably this was a
    compatibility workaround; confirm it is still wanted, since SSLv3 may
    not be provided by the ssl module in use.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                # Set up the proxy tunnel first when one is configured.
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first, then fall back to SSLv23.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 611
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is stored unchanged in self.cause.
        video_id, if given, is prepended to the message and stored in self.video_id.
        """

        # An error raised while a network error or UnavailableVideoError is
        # being handled is always treated as expected.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        # Unexpected errors get a request to file a bug report appended.
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
636
1c256f70 637
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds no behaviour of its own; it only gives this case a distinct type.
    """
    pass
641
642
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the sys.exc_info() tuple of the exception that caused the trouble. """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
654
655
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
663
664
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialise the base class so str(e) and e.args carry the message
        # on every supported Python version.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
673
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
d77c3dfd
FV
677
678
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
686
687
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of leaving the
        # base class uninitialised.
        super(ContentTooShortError, self).__init__(
            u'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 702
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress ``data``, first as a raw deflate stream, then as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response object that carries the status code.

        The code is passed to the constructor when addinfourl has a
        getcode attribute, and set as the ``code`` attribute otherwise.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Set the standard headers on ``req`` and resolve the internal pseudo-headers."""
        # Replace any same-named header by the standard value.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Youtubedl-no-compression: drop Accept-encoding and the marker.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Youtubedl-user-agent: its value replaces the User-agent header.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return ``resp`` with a gzip or deflate encoded body decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if none of the
                # attempts works, report the original error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment.
    https_request = http_request
    https_response = http_response
bf50b038 783
5de90176 784
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str is 'YYYY-MM-DD<delimiter>HH:MM:SS', optionally followed by
    'Z' or a numeric UTC offset such as '+01:00' / '+0100'. None is
    returned for None; without a suffix the time is taken as UTC.
    """
    if date_str is None:
        return None

    suffix = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    offset = datetime.timedelta()
    if suffix:
        # Cut the timezone part off; strptime below does not parse it
        date_str = date_str[:-len(suffix.group(0))]
        if suffix.group('sign'):
            factor = 1 if suffix.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=factor * int(suffix.group('hours')),
                minutes=factor * int(suffix.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
808
809
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    None is returned for None and for strings that match none of the
    known formats (including the RFC 2822 fallback).
    """
    if date_str is None:
        return None

    # Commas are replaced by spaces
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    result = None
    # Every format is tried; the last one that parses determines the result.
    for fmt in known_formats:
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if result is None:
        # Last resort: RFC 2822 style dates
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
    return result
853
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from ``url``.

    The text after the last '.' of the part before the first '?' is
    returned when it is purely alphanumeric; otherwise, or when url is
    None, ``default_ext`` is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 862
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return ``filename`` with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
865
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string in
    neither format raises ValueError (from strptime).
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
891
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
900
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start/end means the earliest/latest representable date.
        Raises ValueError when start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
926
927
def platform_name():
    """ Returns the result of platform.platform() as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
936
937
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps a file descriptor to the id that is passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    wintypes = ctypes.wintypes
    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(wintypes.HANDLE, wintypes.DWORD)(
        ("GetStdHandle", kernel32))
    console = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR, wintypes.DWORD,
        ctypes.POINTER(wintypes.DWORD), wintypes.LPVOID)(
        ("WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(wintypes.DWORD, wintypes.DWORD)(
        ("GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, ctypes.POINTER(wintypes.DWORD))(
        ("GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0

    if not_a_console(console):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character; a count of 0 means s starts with one
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            console, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if count:
            assert written.value > 0
            s = s[written.value:]
        else:
            # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
    return True
1008
1009
def write_string(s, out=None, encoding=None):
    """ Write the compat_str s to out (sys.stderr if out is None) and flush.

    On win32, when no encoding is given and out has a fileno, the string is
    first offered to _windows_write_string. Otherwise it is written as
    bytes (binary-mode streams, Python 2, or via out.buffer when present)
    or as text. Characters the encoding cannot represent are dropped."""
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(enc, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1030
1031
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into the list of its integer byte values """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields 1-character strings here, so convert each one
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1039
c257baff 1040
def intlist_to_bytes(xs):
    """ Convert a list of integer byte values into a byte string """
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces byte strings
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
1048
1049
def get_cachedir(params=None):
    """ Return the cache directory to use.

    params is an optional dict; its 'cachedir' entry, when present, is
    returned as is. Otherwise the result is the 'youtube-dl' directory
    below $XDG_CACHE_HOME, or below ~/.cache if that variable is unset."""
    if params is None:
        # Avoid a shared mutable default argument
        params = {}
    cache_root = os.environ.get(
        'XDG_CACHE_HOME', os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
1054
1055
1056# Cross-platform file locking
1057if sys.platform == 'win32':
1058 import ctypes.wintypes
1059 import msvcrt
1060
1061 class OVERLAPPED(ctypes.Structure):
1062 _fields_ = [
1063 ('Internal', ctypes.wintypes.LPVOID),
1064 ('InternalHigh', ctypes.wintypes.LPVOID),
1065 ('Offset', ctypes.wintypes.DWORD),
1066 ('OffsetHigh', ctypes.wintypes.DWORD),
1067 ('hEvent', ctypes.wintypes.HANDLE),
1068 ]
1069
1070 kernel32 = ctypes.windll.kernel32
1071 LockFileEx = kernel32.LockFileEx
1072 LockFileEx.argtypes = [
1073 ctypes.wintypes.HANDLE, # hFile
1074 ctypes.wintypes.DWORD, # dwFlags
1075 ctypes.wintypes.DWORD, # dwReserved
1076 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1077 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1078 ctypes.POINTER(OVERLAPPED) # Overlapped
1079 ]
1080 LockFileEx.restype = ctypes.wintypes.BOOL
1081 UnlockFileEx = kernel32.UnlockFileEx
1082 UnlockFileEx.argtypes = [
1083 ctypes.wintypes.HANDLE, # hFile
1084 ctypes.wintypes.DWORD, # dwReserved
1085 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1086 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1087 ctypes.POINTER(OVERLAPPED) # Overlapped
1088 ]
1089 UnlockFileEx.restype = ctypes.wintypes.BOOL
1090 whole_low = 0xffffffff
1091 whole_high = 0x7fffffff
1092
1093 def _lock_file(f, exclusive):
1094 overlapped = OVERLAPPED()
1095 overlapped.Offset = 0
1096 overlapped.OffsetHigh = 0
1097 overlapped.hEvent = 0
1098 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1099 handle = msvcrt.get_osfhandle(f.fileno())
1100 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1101 whole_low, whole_high, f._lock_file_overlapped_p):
1102 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1103
1104 def _unlock_file(f):
1105 assert f._lock_file_overlapped_p
1106 handle = msvcrt.get_osfhandle(f.fileno())
1107 if not UnlockFileEx(handle, 0,
1108 whole_low, whole_high, f._lock_file_overlapped_p):
1109 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1110
1111else:
1112 import fcntl
1113
1114 def _lock_file(f, exclusive):
1115 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1116
1117 def _unlock_file(f):
1118 fcntl.lockf(f, fcntl.LOCK_UN)
1119
1120
class locked_file(object):
    """ Context manager around a file opened with io.open that holds a lock
    on it (via _lock_file) for the duration of the with block.

    Files opened with mode 'r' are locked non-exclusively, those opened with
    'a' or 'w' exclusively. Leaving the block unlocks and closes the file."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; close the file in either case before re-raising
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1150
1151
def shell_quote(args):
    """ Return a single string in which every element of args is quoted
    for use in a shell command line, separated by spaces.

    Byte string arguments are decoded with the filesystem encoding
    (utf-8 if that is unknown) before quoting."""
    # The pipes module is deprecated and removed in Python 3.13; shlex.quote
    # is the same function on Python 3.3+
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1163
1164
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops the
    iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1172
1173
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a url-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1180
1181
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into (url, data), where data is the JSON payload found
    in a '__youtubedl_smuggle' fragment.

    A URL without such a fragment is returned unchanged together with
    default. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The payload is everything after the last '#'
    url, _, sdata = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1189
1190
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string or None (which gives u'N/A').
    The value is shown with two decimals in the largest unit from B to YiB
    that keeps it at 1 or above; values outside that range use the nearest
    available unit. """
    # NOTE: the parameter shadows the builtin 'bytes'; the name is kept
    # because callers may pass it by keyword
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the known suffixes: without this, tiny values pick a
        # suffix through a negative index and huge ones raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
f53c966a 1203
1c088fa8 1204
1c088fa8
PH
def get_term_width():
    """ Return the width of the terminal in columns, or None if unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the output of 'stty size' is consulted. """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Not a number, ask stty instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing or print nothing usable. Unlike a
        # bare except, this lets KeyboardInterrupt and SystemExit through.
        pass
    return None
caefb1de
PH
1219
1220
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
18258362
JMF
1231
1232
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' in XML that does not already start one
    of the predefined entities or a numeric character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1239
1240
def setproctitle(title):
    """ Try to set the process title to title (a compat_str) through
    prctl in libc.so.6.

    Does nothing if that library cannot be loaded or has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item longer so it
    # ends with a NUL; a buffer of exactly len(title_bytes) has no room for
    # the terminator and prctl would read past its end
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is presumably PR_SET_NAME (see prctl(2)) - TODO confirm
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1254
1255
def remove_start(s, start):
    """ Return s without the prefix start; s unchanged if it has no such
    prefix """
    if not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1260
1261
def url_basename(url):
    """ Return the last segment of the path of url (query and fragment
    are not part of it); empty if the path has no segments """
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1265
1266
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return "HEAD"
7217e148
PH
1270
1271
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Return int(v) * invscale // scale, or default if v is None.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
1277
40a90862
JMF
def str_or_none(v, default=None):
    """ Return v converted to compat_str, or default if v is None """
    if v is None:
        return default
    return compat_str(v)
1280
9732d77e
PH
1281
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' as digit group separators;
    None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.]', u'', int_str)
    return int(digits)
608d11f5
PH
1287
1288
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Return float(v) * invscale / scale, or default if v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1291
1292
608d11f5
PH
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3m10s' or '45' and return it
    as a number of seconds; None if s is None or cannot be parsed.

    A trailing ':NN' group after the seconds is accepted and ignored. """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None

    total = int(m.group('secs'))
    for group, factor in (('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            total += int(value) * factor
    return total
91d7d0b3
JMF
1307
1308
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename
    (for example 'video.mp4' with 'temp' gives 'video.temp.mp4') """
    stem, suffix = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, suffix)
d70ad093
PH
1312
1313
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).

    Returns False if the binary could not be started. """
    # None instead of a shared mutable default; list() also accepts tuples
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1322
1323
class PagedList(object):
    """ Sequence whose items are fetched page by page on demand.

    pagefunc(pagenum) returns an iterable with the items of the page with
    that (0-based) number; pagesize is the number of items on a full page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the items with index start (inclusive) up to end
        (exclusive; None means to the end) as a list """
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_stop = page_first + size
            if start >= page_stop:
                continue

            page = list(self._pagefunc(pagenum))

            # Offsets into this page; only the pages containing start or
            # end need to be trimmed
            lo = start % size if page_first <= start < page_stop else 0
            if end is not None and page_first <= end <= page_stop:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                page = page[lo:hi]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_stop:
                break
        return collected
81c2f20b
PH
1369
1370
def uppercase_escape(s):
    r""" Replace every \UXXXXXXXX escape sequence (eight hex digits) in s
    by the character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, length) pair
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1
PH
1377
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept
# a text format specification
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _struct_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_struct_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_struct_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1394
1395
def read_batch_urls(batch_fd):
    """ Read the URLs from batch_fd, an iterable of lines (text or utf-8
    bytes), and close it.

    Surrounding whitespace and a leading byte order mark are removed; empty
    lines and lines starting with '#', ';' or ']' are skipped. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A decoded UTF-8 byte order mark is u'\ufeff'; the three-character
        # form is kept for input whose BOM was not decoded as UTF-8
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1410
1411
def urlencode_postdata(*args, **kargs):
    """ Like compat_urllib_parse.urlencode, but return ASCII-encoded
    bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1414
1415
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element;
    doctype declarations are ignored """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The parser argument is only passed on Python 2.7 and later
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    return etree.XML(s.encode('utf-8'), **kwargs)
e68301af
PH
1424
1425
# compat_getpass: getpass.getpass, except on Python 2 under win32 where a
# text prompt is first encoded with the preferred encoding
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1433
1434
# Number associated with each US rating label
# (presumably a minimum viewer age - TODO confirm against the callers)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1442
1443
def strip_jsonp(code):
    """ Return the argument of a JSONP call such as 'callback({...});';
    text that does not look like one is returned unchanged """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
478c2c61
PH
1446
1447
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function that maps a quality id to its position in
    quality_ids, or to -1 if it is not one of them. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1456
acd69589
PH
1457
# Default output template: title, id and extension of the video
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1459
# subprocess_check_output: subprocess.check_output, with a replacement for
# Python versions that do not provide it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """ Run a command and return what it wrote to stdout.

        Raises subprocess.CalledProcessError (with the output stored in its
        output attribute) if the command exits with a non-zero status. """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen objects have no 'args' attribute on the Python versions
            # that need this fallback, so recover the command from the call
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            # The 'output' keyword argument is not accepted there either
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output