]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Merge branch 'izlesene' of https://github.com/naglis/youtube-dl into naglis-izlesene
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
01951dda 27import traceback
bcf89ce6 28import xml.etree.ElementTree
d77c3dfd 29import zlib
d77c3dfd 30
01ba00ca 31try:
59ae15a5 32 import urllib.request as compat_urllib_request
01ba00ca 33except ImportError: # Python 2
59ae15a5 34 import urllib2 as compat_urllib_request
01ba00ca
PH
35
36try:
59ae15a5 37 import urllib.error as compat_urllib_error
01ba00ca 38except ImportError: # Python 2
59ae15a5 39 import urllib2 as compat_urllib_error
01ba00ca
PH
40
41try:
59ae15a5 42 import urllib.parse as compat_urllib_parse
01ba00ca 43except ImportError: # Python 2
59ae15a5 44 import urllib as compat_urllib_parse
01ba00ca 45
799c0763
PH
46try:
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
50
6543f0dc
JMF
51try:
52 import urllib.parse as compat_urlparse
53except ImportError: # Python 2
54 import urlparse as compat_urlparse
55
01ba00ca 56try:
59ae15a5 57 import http.cookiejar as compat_cookiejar
01ba00ca 58except ImportError: # Python 2
59ae15a5 59 import cookielib as compat_cookiejar
01ba00ca 60
3e669f36 61try:
59ae15a5 62 import html.entities as compat_html_entities
9f37a959 63except ImportError: # Python 2
59ae15a5 64 import htmlentitydefs as compat_html_entities
3e669f36 65
a8156c1d 66try:
59ae15a5 67 import html.parser as compat_html_parser
9f37a959 68except ImportError: # Python 2
59ae15a5 69 import HTMLParser as compat_html_parser
a8156c1d 70
348d0a7a 71try:
59ae15a5 72 import http.client as compat_http_client
9f37a959 73except ImportError: # Python 2
59ae15a5 74 import httplib as compat_http_client
348d0a7a 75
2eabb802 76try:
0e283428 77 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
78except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
80
e0df6211
PH
81try:
82 from urllib.request import urlretrieve as compat_urlretrieve
83except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
85
86
5910e210
PH
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        """Return subprocess.DEVNULL (available on this interpreter)."""
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable file object on os.path.devnull.

        Used when the subprocess module offers no DEVNULL constant.
        """
        return open(os.path.devnull, 'w')
92
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Python 2 fallback for urllib.parse.unquote.

        Replaces %xx escapes in *string* by the characters they encode.
        Runs of consecutive escapes are collected as bytes and decoded
        together with *encoding* / *errors* (None selects 'utf-8' and
        'replace' respectively).  Malformed escapes are left untouched.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent sign at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                # The hex codec reports odd-length or non-hexadecimal input
                # with TypeError on Python 2; treat that like any other
                # malformed escape and keep the text literally.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
130
f1f725c6
PH
131
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of decoded (name, value) pairs."""
        to_text = unicode
        fields = []
        for amp_part in qs.split('&'):
            fields.extend(amp_part.split(';'))

        decoded = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign is only kept when
                # blank values were asked for
                if not keep_blank_values:
                    continue
                parts.append('')
            raw_name, raw_value = parts
            if not (len(raw_value) or keep_blank_values):
                continue
            name = to_text(compat_urllib_parse_unquote(
                raw_name.replace('+', ' '), encoding=encoding, errors=errors))
            value = to_text(compat_urllib_parse_unquote(
                raw_value.replace('+', ' '), encoding=encoding, errors=errors))
            decoded.append((name, value))
        return decoded

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            if name not in grouped:
                grouped[name] = []
            grouped[name].append(value)
        return grouped
348d0a7a 178
3e669f36 179try:
59ae15a5 180 compat_str = unicode # Python 2
3e669f36 181except NameError:
59ae15a5 182 compat_str = str
3e669f36
PH
183
184try:
59ae15a5 185 compat_chr = unichr # Python 2
3e669f36 186except NameError:
59ae15a5 187 compat_chr = chr
3e669f36 188
f7300c5c
JMF
189try:
190 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
191except ImportError: # Python 2.6
192 from xml.parsers.expat import ExpatError as compat_xml_parse_error
193
b31756c1
FV
def compat_ord(c):
    """Return *c* unchanged if it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
197
468e2e92
FV
198# This is not clearly defined otherwise
199compiled_regex_type = type(re.compile(''))
200
3e669f36 201std_headers = {
ae8f7871 202 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
203 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
204 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
205 'Accept-Encoding': 'gzip, deflate',
206 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 207}
f427df17 208
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    cannot be queried, or the reported encoding cannot encode a simple
    test string, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale errors, unknown codecs, ...), but no
        # longer swallows KeyboardInterrupt / SystemExit like a bare except
        pref = 'UTF-8'

    return pref
d77c3dfd 222
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string *s* (must be a unicode string)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print *s* encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 230
f4bfd65f
PH
# json.dump expects a bytestream in Python 2.x but writes to a character
# stream in Python 3.x, so the target file is opened differently.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as stream:
            json.dump(obj, stream)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (binary mode)."""
        with open(fn, 'wb') as stream:
            json.dump(obj, stream)
241
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key equals val.

        Filters the findall() results by hand; returns None if nothing
        matches.
        """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
255
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of *path* into '{uri}tag' form.

    *ns_map* maps each prefix to its namespace URI.  Steps without a
    colon are kept as they are.
    """
    def expand(pieces):
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    split_steps = [step.split(':') for step in path.split('/')]
    return '/'.join([expand(pieces) for pieces in split_steps])
268
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.  Group 1 of the match must hold the entity
    text without the surrounding '&' and ';'.  Entities that cannot be
    resolved are returned in their literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference: 'x' followed by hex digits, or plain decimal
    # digits.  (The hex branch must accept a-f, otherwise e.g. '#x2F'
    # would be cut short to '#x2'.)
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the supported range: keep it literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 293
a8156c1d 294compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document handed to loads()."""

    def __init__(self):
        # Was misspelled "__init" and therefore never ran
        compat_html_parser.HTMLParser.__init__(self)
        # The document being parsed; set by loads()
        self.html = None

    def loads(self, html):
        """Store *html*, feed it to the parser and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
304
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] while parsing
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip a line and resume parsing, unless inside the result or after too many errors."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining_lines = self.html.split('\n')[self.getpos()[0]:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if not self.started:
            return
        self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The wanted element ends once every tag of its name is closed again
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Any other kind of content also marks where the result starts
    handle_entityref = find_startpos
    handle_charref = find_startpos
    handle_data = find_startpos
    handle_comment = find_startpos
    handle_decl = find_startpos
    handle_pi = find_startpos
    unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        (first_line, first_col), (last_line, last_col) = self.result[1:]
        span = self.html.split('\n')[first_line - 1:last_line]
        span[0] = span[0][first_col:]
        if len(span) == 1:
            span[-1] = span[-1][:last_col - first_col]
        span[-1] = span[-1][:last_col]
        return '\n'.join(span).strip()
3b024e17
PH
364# Hack for https://github.com/rg3/youtube-dl/issues/662
365if sys.version_info < (2, 7, 3):
366 AttrParser.parse_endtag = (lambda self, i:
367 i + len("</scr'+'ipt>")
368 if self.rawdata[i:].startswith("</scr'+'ipt>")
369 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
370
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals *id* in the passed HTML document"""
    element_id = id
    return get_element_by_attribute("id", element_id, html)
374
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; whatever was collected so far is used
        pass
    return finder.get_result()
9e6dd238 383
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        # "content" attribute of the most recent matching meta tag
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the recorded content value (None if no tag matched)."""
        return self.result
404
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; whatever was found so far is returned
        pass
    return meta_finder.get_result()
415
9e6dd238
FV
416
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries do.  Order matters: line breaks first, then all
    # remaining tags are stripped.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    text = unescapeHTML(text)
    return text.strip()
9e6dd238
FV
428
429
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the characters forbidden on win32 replaced by '#' in the last
    path component (the directory part is kept untouched so that path
    separators and drive letters survive). If that fails as well, the
    exception is raised, like the standard open() function would.

    The filename u'-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # Renaming cannot help against missing permissions
            raise

        # In case of error, try to remove win32 forbidden chars
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            # Nothing to replace, so a retry would fail the same way
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
463
464
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not parseable as an RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 472
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Question marks and control characters are dropped entirely
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if ord(char) > 127:
                return '_'
        return char

    cleaned = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are returned without any further tidying
        return cleaned

    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if not cleaned:
        cleaned = '_'
    return cleaned
d77c3dfd
FV
504
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Returns a list holding the first occurrence of each element, in the
    # order of appearance.  Membership is tested by equality, so
    # unhashable elements are supported.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 512
912b38b4 513
def unescapeHTML(s):
    """Replace the HTML entities in *s* via htmlentity_transform.

    None is passed through; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 521
8bf48f23
PH
522
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept u'' directly through the Unicode APIs
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
549
f07b74fc
PH
550
def encodeArgument(s):
    """Encode *s* like encodeFilename does for subprocess use."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
558
559
8271226a
PH
def decodeOption(optval):
    """Return *optval* as a compat_str; None is passed through.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 568
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: a full hour or more gets the
    hour field, a full minute or more gets the minute field.
    """
    # ">=" so that exactly 3600 gives '1:00:00' (not '60:00') and exactly
    # 60 gives '1:00' (not '60')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
576
a0ddb8a2
PH
577
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the HTTPS handler used for all https:// requests.

    opts_no_check_certificate -- if true, certificates are not verified
        (only honoured on Python >= 3.2; the handler built for older
        versions sets up no verification either way).
    kwargs -- passed on to the HTTPSHandler constructor.

    SSLv3 is tried first, as before, but only where the ssl module still
    provides it; otherwise SSLv23 is used directly instead of failing
    with an AttributeError.
    """
    # PROTOCOL_SSLv3 is missing when OpenSSL was built without SSLv3
    preferred_protocol = getattr(ssl, 'PROTOCOL_SSLv3', ssl.PROTOCOL_SSLv23)

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=preferred_protocol)
                except ssl.SSLError:
                    # Retry with SSLv23
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(preferred_protocol)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 611
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # An error raised while one of these exceptions is being handled
        # is always treated as expected
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.video_id = video_id
        self.cause = cause
        self.exc_info = sys.exc_info()  # preserve original exception
        self.traceback = tb

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
636
1c256f70 637
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
641
642
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors.  It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
654
655
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
663
664
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """msg is the error message; it is also kept in the msg attribute."""
        # Initialise the base class so that str(error) and error.args
        # carry the message as well
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
673
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
677
678
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
686
687
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """downloaded and expected are the received and announced sizes in bytes."""
        # Give the exception a readable message instead of an empty one
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 702
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data* as a raw deflate stream, falling back to zlib format."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting the code by hand where needed."""
        if not hasattr(compat_urllib_request.addinfourl, 'getcode'):
            # This addinfourl takes no status code argument
            response = compat_urllib_request.addinfourl(stream, headers, url)
            response.code = code
            return response
        return compat_urllib_request.addinfourl(stream, headers, url, code)

    @staticmethod
    def _gunzip(content):
        """Return a BytesIO holding the gunzipped *content*."""
        def inflate(raw):
            gz = gzip.GzipFile(fileobj=io.BytesIO(raw), mode='rb')
            return io.BytesIO(gz.read())

        try:
            return inflate(content)
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    return inflate(content[:-trim])
                except IOError:
                    continue
            raise original_ioerror

    def http_request(self, req):
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            uncompressed = self._gunzip(resp.read())
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed exactly like HTTP traffic
    https_request = http_request
    https_response = http_response
bf50b038 783
5de90176 784
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # A trailing "Z" or "+hh:mm" / "-hhmm" style offset is cut off and
    # applied by hand
    offset = datetime.timedelta()
    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
808
809
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A list of known formats is tried first, then RFC 2822 parsing via
    email.utils.  Returns None if date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        # "3rd" / "23rd" were missing from the ordinal suffixes
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # The first matching format is enough
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
853
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from *url*.

    Takes the text after the last dot of the part before the first '?';
    if that is not purely alphanumeric, *default_ext* is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 860
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return *filename* with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
863
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # Months and years are approximated as 30 and 365 days
    approx_days = {'month': 30, 'year': 365}
    if unit in approx_days:
        amount *= approx_days[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
889
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Not in YYYYMMDD form: hand it back unchanged
        return date_str
    return '-'.join(parts.groups())
898
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound defaults to the earliest / latest representable date
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
924
925
def platform_name():
    """ Return platform.platform() as a compat_str, decoding it with the
    preferred encoding when it comes back as bytes """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
934
935
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write; out is the stream whose fileno() selects the
    standard handle to write to.  Raises OSError when WriteConsoleW reports
    failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the stream's file descriptor to the identifier handed to
    # GetStdHandle (-11 and -12 are STD_OUTPUT_HANDLE and STD_ERROR_HANDLE
    # in the Win32 API).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, a handle whose file type
        # (ignoring the REMOTE bit) is not FILE_TYPE_CHAR, or one for which
        # GetConsoleMode fails.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write s in chunks of at most 1024 characters, each chunk stopping
    # before the next character above U+FFFF.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # with a length of 2 (presumably its UTF-16 surrogate pair).
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop only what was reported as written; the rest is retried
            s = s[written.value:]
    return True
1006
1007
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr when out is None) and flush it.

    On win32, when no encoding is given and out has a fileno, the console
    writer is tried first.  Otherwise s is encoded (unencodable characters
    are dropped) for binary-mode streams, Python 2, or streams exposing a
    .buffer, and written as text to anything else. """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    on_windows_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(stream, 'fileno'))
    if on_windows_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    needs_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if needs_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1028
1029
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into the list of its byte values """
    if not bs:
        return []
    # Indexing yields an int on Python 3 and a 1-character string on Python 2
    first = bs[0]
    return list(bs) if isinstance(first, int) else [ord(ch) for ch in bs]
1037
c257baff 1038
def intlist_to_bytes(xs):
    """ Convert a list of byte values into a byte string """
    if not xs:
        return b''
    running_python2 = isinstance(chr(0), bytes)
    if not running_python2:
        return bytes(xs)
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
1046
1047
def get_cachedir(params={}):
    """ Return params['cachedir'] when present, else the 'youtube-dl'
    directory under $XDG_CACHE_HOME (falling back to ~/.cache) """
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    fallback = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', fallback)
c1c9a79c
PH
1052
1053
1054# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED structure passed to
    # LockFileEx / UnlockFileEx below
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f through LockFileEx; raises OSError when
        the call reports failure. """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release the lock taken by _lock_file on f; raises OSError when
        UnlockFileEx reports failure. """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ Lock the open file f with fcntl.lockf: LOCK_EX when exclusive,
        LOCK_SH otherwise. """
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Release the fcntl.lockf lock held on f. """
        fcntl.lockf(f, fcntl.LOCK_UN)
1117
1118
class locked_file(object):
    """ Context manager around a file that is locked while the `with` block
    runs: the lock is exclusive unless the mode is 'r'.

    The file is opened in __init__ (with io.open), locked in __enter__, and
    unlocked and closed in __exit__. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation raises OSError, which is not an
            # IOError on Python 2; catch both so the handle never leaks.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1148
1149
def shell_quote(args):
    """ Return the arguments in args quoted for a POSIX shell and joined
    with single spaces.

    Byte strings are decoded with the filesystem encoding (UTF-8 when that
    is unknown) before quoting. """
    # pipes.quote is an undocumented alias and the pipes module is gone in
    # Python 3.13; shlex.quote is the documented equivalent (Python 3.3+).
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1161
1162
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Variant of itertools.takewhile that also yields the first element
    for which pred is false, and stops right after it """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1170
1171
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach data (JSON-encoded) to url as a '__youtubedl_smuggle'
    fragment, for internal use. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1178
1179
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data); a URL without
    the smuggle marker comes back as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition(u'#')
        encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1187
1188
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with two decimals and a binary
    suffix (B, KiB, ... YiB).

    bytes may be a number, a numeric string, or None (formatted as 'N/A').
    Values below one byte are reported in B and values beyond the largest
    suffix in YiB. """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    # Repeated division by 1024 is exact in floating point, unlike a
    # logarithm, and keeps the exponent inside the suffix table for tiny
    # and huge values alike.
    magnitude = abs(float(bytes))
    exponent = 0
    while magnitude >= 1024.0 and exponent < len(suffixes) - 1:
        magnitude /= 1024.0
        exponent += 1
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1201
1c088fa8 1202
1c088fa8
PH
def get_term_width():
    """ Return the terminal width in columns, or None if it cannot be
    determined.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field printed by `stty size` is used. """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS: fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt / SystemExit as a bare except would
        return None
caefb1de
PH
1217
1218
def month_by_name(name):
    """ Map an English month name (exact spelling, independent of the
    locale) to its number 1-12; None for anything else """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
18258362
JMF
1229
1230
def fix_xml_ampersands(xml_str):
    """Escape as '&amp;' every '&' that does not already begin one of the
    five predefined XML entities or a short numeric character reference"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
e3946f98
PH
1237
1238
def setproctitle(title):
    """ Best-effort attempt to rename the current process to title through
    libc's prctl; silently does nothing when libc.so.6 cannot be loaded or
    has no prctl.

    title must be a compat_str; it is passed on UTF-8 encoded. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes sizes it to len + 1, so the
    # string handed to prctl is NUL-terminated (a buffer of exactly
    # len(title_bytes) bytes has no room for the terminator).
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1252
1253
def remove_start(s, start):
    """ Return s without its leading start, or s unchanged when it does not
    begin with start """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1258
1259
def url_basename(url):
    """ Return the last segment of the URL's path, ignoring leading and
    trailing slashes """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1263
1264
class HEADRequest(compat_urllib_request.Request):
    """ Request subclass whose HTTP method is always HEAD """

    def get_method(self):
        method = "HEAD"
        return method
7217e148
PH
1268
1269
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Return int(v) * invscale // scale, or default when v is None.

    With get_attr set, the value is first read from that attribute of v
    (a missing attribute counts as None). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
1275
1276
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' digit separators; None is
    passed through """
    if int_str is None:
        return None
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
608d11f5
PH
1282
1283
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Return float(v) * invscale / scale, or default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1286
1287
608d11f5
PH
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3m5s' or '42' into a number of
    seconds; returns None for None or for text that does not match """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None
    total = int(m.group('secs'))
    # Minutes and hours are optional groups
    for group_name, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        value = m.group(group_name)
        if value:
            total += int(value) * seconds_per_unit
    return total
91d7d0b3
JMF
1302
1303
def prepend_extension(filename, ext):
    """ Insert ext in front of the filename's real extension, e.g.
    'video.mp4' + 'temp' -> 'video.temp.mp4' """
    stem, real_ext = os.path.splitext(filename)
    pieces = (stem, ext, real_ext)
    return u'{0}.{1}{2}'.format(*pieces)
d70ad093
PH
1307
1308
def check_executable(exe, args=[]):
    """ Try to run exe with args (ideally ones producing short output, like
    -version); return exe if it could be started, False when starting it
    raises OSError """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1317
1318
class PagedList(object):
    """ Sequence-like view over results that are fetched one page at a time:
    pagefunc(pagenum) returns an iterable with the entries of that page and
    pagesize is the number of entries on a full page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list,
        fetching only the pages needed; end=None means up to the last
        entry. """
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offsets inside this page; only the page containing start /
            # end gets trimmed
            lo = start % size if page_first <= start < page_limit else 0
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A page that is not "full" is taken to be the last one, so
            # there is no need to query any further
            if len(entries) + lo < size:
                break

            # The page was full, but the requested range ends exactly here
            if end == page_limit:
                break
        return collected
81c2f20b
PH
1364
1365
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape (eight hex digits) found in
    s by the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1
PH
1372
# Probe whether struct accepts a text format string; if it does not, wrap
# pack/unpack so that text formats are ASCII-encoded first.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _struct_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_struct_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_struct_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1389
1390
def read_batch_urls(batch_fd):
    """ Read the URLs listed one per line in the file object batch_fd and
    return them as a list; batch_fd is closed afterwards.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and blank lines as well as lines
    starting with '#', ';' or ']' are skipped. """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM reads as U+FEFF once decoded as UTF-8, and as the
        # three characters \xef\xbb\xbf when the bytes were decoded one
        # byte per character; strip either form.
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1405
1406
def urlencode_postdata(*args, **kargs):
    """ urlencode the given arguments and return the result as ASCII bytes,
    ready to be used as POST data """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1409
1410
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element;
    doctype declarations are ignored """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and later
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    return etree.XML(s.encode('utf-8'), **kwargs)
e68301af
PH
1419
1420
# getpass.getpass is used as is, except on Python 2 under Windows, where a
# text prompt is first encoded with the preferred encoding.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1428
1429
# NOTE(review): presumably maps US film rating labels to a minimum viewer
# age in years - confirm against the callers.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1437
1438
def strip_jsonp(code):
    """ Remove a JSONP wrapper of the form callback(...) with an optional
    trailing ';', returning what is between the parentheses; text that does
    not look like that is returned unchanged """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1441
1442
def qualities(quality_ids):
    """ Build a function that ranks a quality id by its position in
    quality_ids, giving -1 to ids that are not listed """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1451
acd69589
PH
1452
# %-style template using the title, id and ext keys.
# NOTE(review): presumably the default output filename template - confirm
# against the callers.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1454
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # subprocess.check_output is missing (Python 2.6): minimal replacement
    def subprocess_check_output(*args, **kwargs):
        """ Run a command with the given Popen arguments and return what it
        wrote to stdout; raises subprocess.CalledProcessError (carrying the
        output in .output) when the exit status is non-zero. """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # On the Python versions that need this fallback, Popen has no
            # .args attribute and CalledProcessError takes no output
            # keyword, so recover the command from the call arguments and
            # attach the output afterwards.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output