jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[ard] Make more robust against missing thumbnails
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
88try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:  # Python 2
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes by their decoded character equivalent.

        Python 2 fallback for urllib.parse.unquote.  Runs of percent-encoded
        bytes are collected and decoded together with `encoding`/`errors`, so
        multi-byte sequences decode correctly.  Malformed escapes (e.g. "%zz"
        or a trailing "%a") are left in the output unchanged.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # Python 2's hex codec raises TypeError (not ValueError) for
                # odd-length or non-hexadecimal input; treat both as "not an
                # escape" and keep the text literally.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Pairs are separated by '&' or ';'.  Fields without '=' are skipped
        unless keep_blank_values is set; with strict_parsing they raise
        ValueError instead.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 179
# Text type: `unicode` on Python 2, `str` on Python 3.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` on Python 2, `chr` on Python 3.
try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        """Return s wrapped in single quotes so a POSIX shell reads it as one word.

        Embedded single quotes are emitted as '"'"' (close quote, quoted
        apostrophe, reopen quote).
        """
        return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
def compat_ord(c):
    """Return the integer value of c.

    Indexing bytes yields ints on Python 3 but one-character strings on
    Python 2; this accepts either form.
    """
    if isinstance(c, int):
        return c
    return ord(c)
205
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers (YoutubeDLHandler adds them to outgoing requests).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec actually exists and can encode text.
        u'TEST'.encode(pref)
    except Exception:
        # Unknown/broken locale configuration: fall back to UTF-8.
        pref = 'UTF-8'

    return pref
d77c3dfd 230
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print text s, encoded for the terminal (unencodable characters
        become XML character references)."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print text s; s must be a text (unicode) string."""
        assert isinstance(s, type(u''))
        print(s)
d77c3dfd 238
f4bfd65f 239
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically

    The data is first written to a temporary file created next to fn and
    then renamed over fn.  On any failure the temporary file is removed
    and the original exception is re-raised.
    """

    args = {
        'suffix': '.tmp',
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except BaseException:
        # Clean up on *any* failure (including KeyboardInterrupt), then
        # propagate the original error unchanged.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key/val are interpolated into the expression, so restrict them to
        # characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
292
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand "prefix:tag" path components to ElementTree's "{uri}tag" form.

    ns_map maps each prefix used in path to its namespace URI; components
    without a prefix are kept as they are.
    """
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
305
d77c3dfd 306
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    If nothing matches, return None, or raise ExtractorError when fatal is
    set (name, defaulting to the xpath itself, is used in the message).
    """
    n = node.find(xpath)
    if n is None:
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n.text
316
317
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE)  # backport bugfix


class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed (in self.html)."""

    def __init__(self):
        # NOTE: this method used to be misspelled "__init", so it never ran
        # as the constructor and self.html was left unset until loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html, feed it to the parser and finish parsing."""
        self.html = html
        self.feed(html)
        self.close()
328
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result becomes [tag, start_pos, end_pos]; positions are the
        # (line, offset) pairs returned by getpos().
        self.result = None
        self.started = False
        # Number of currently open tags per tag name, counted from the
        # moment the wanted tag was found.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping a line, up to 10 times
        and only while the wanted tag has not been found yet."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth:
                self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The wanted element is complete once its own tag is balanced.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete element was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end offset must be
            # shifted by what was just cut from the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()


# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
394
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)
398
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was recorded before the parse error.
        pass
    return parser.get_result()
9e6dd238 407
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        attrs = dict(attrs)
        if attrs.get('name') == self.name:
            # A later matching <meta> tag overwrites an earlier one.
            self.result = attrs.get('content')

    def get_result(self):
        """Return the content attribute of the matched meta tag, or None."""
        return self.result
428
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    parser = MetaParser(name)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parse error.
        pass
    return parser.get_result()
439
9e6dd238
FV
440
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks come only from <br> tags and paragraph boundaries; all
    other tags are removed and HTML entities are decoded.  None is passed
    through unchanged.
    """
    if html is None:  # Convenience for sanitizing optional descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
452
453
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.  Only the
        # final path component is rewritten: touching the directory part
        # would destroy path separators and drive letters.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            # Nothing to sanitize; report the original error.
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
487
488
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
1c469a94 496
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # '?' and control characters are dropped entirely.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores and trim them from both ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
d77c3dfd
FV
528
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element, in
    input order.  Elements are compared with ==, so unhashable items work.
    """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
d77c3dfd 536
912b38b4 537
4e408e47
PH
538def _htmlentity_transform(entity):
539 """Transforms an HTML entity to a character."""
540 # Known non-numeric HTML entity
541 if entity in compat_html_entities.name2codepoint:
542 return compat_chr(compat_html_entities.name2codepoint[entity])
543
544 mobj = re.match(r'#(x?[0-9]+)', entity)
545 if mobj is not None:
546 numstr = mobj.group(1)
547 if numstr.startswith(u'x'):
548 base = 16
549 numstr = u'0%s' % numstr
550 else:
551 base = 10
552 return compat_chr(int(numstr, base))
553
554 # Unknown entity in name, return its literal representation
555 return (u'&%s;' % entity)
556
557
def unescapeHTML(s):
    """Replace the HTML entities in text s by the characters they stand for.

    None is passed through unchanged; unknown entities are kept literally.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 565
8bf48f23
PH
566
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file system APIs.

    @param s The name of the file
    @param for_subprocess Set if the result is passed to a subprocess
                          rather than to a file API (only matters on
                          Python 2 under Windows)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
593
f07b74fc
PH
594
def encodeArgument(s):
    """Return s (a command-line argument) encoded for use in a subprocess call."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
602
603
def decodeOption(optval):
    """Return option value optval as text.

    Byte strings are decoded with the preferred encoding; None is passed
    through unchanged.
    """
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
1c256f70 612
def formatSeconds(secs):
    """Format a number of seconds as "H:MM:SS", "M:SS" or plain seconds.

    The shortest form that can represent the value is used, so exactly one
    minute is "1:00" and exactly one hour is "1:00:00".
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
620
a0ddb8a2
PH
621
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for all https:// requests.

    opts_no_check_certificate -- if true, server certificates are not
                                 verified (where the Python version allows
                                 choosing)
    kwargs                    -- passed on to the HTTPSHandler constructor
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1, fall back to protocol auto-negotiation.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client authenticating a *server*, hence SERVER_AUTH.
        # (CLIENT_AUTH builds a server-side context that verifies nothing.)
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        # NOTE(security): deliberately re-enables the obsolete SSLv3 protocol.
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be off before verification can be disabled.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 661
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is stored as self.cause.
        video_id, if given, is prepended to the message and stored as self.video_id.
        """

        # Errors raised while handling a network failure or an unavailable
        # video are never treated as bugs.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as text, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
686
1c256f70 687
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
691
692
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
704
705
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
d77c3dfd
FV
713
714
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialise the base class too, so str(error) and tracebacks show
        # the message instead of an empty string.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
723
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
d77c3dfd
FV
727
728
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
d77c3dfd
FV
736
737
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of an empty one.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 752
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data sent as raw deflate or as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, also on Pythons whose addinfourl
        constructor does not accept a status code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h, v in std_headers.items():
            # urllib stores header names capitalized ("User-agent"), see
            # http://bugs.python.org/issue2275 .  Compare in that form so
            # that a header set by the caller is not overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 832
5de90176 833
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by fractional seconds (ignored) and by "Z" or a "+HH:MM" /
    "+HHMM" UTC offset.  Without an offset, UTC is assumed.  None is
    passed through unchanged.
    """

    if date_str is None:
        return None

    m = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    # The result has whole-second resolution, so drop fractional seconds
    # instead of letting strptime reject them.
    date_str = re.sub(r'\.[0-9]+$', '', date_str)
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
857
858
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A fixed list of date formats is tried first, then RFC 2822 parsing.
    Returns None if date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None

    upload_date = None
    #Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Every format is tried; if several match, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
904
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The part after the last '.' (query string excluded) is returned if it
    is purely alphanumeric; otherwise, or if url is None, default_ext.
    """
    if url is None:
        return default_ext
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
73e79f2a 913
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by ".<sub_lang>.<sub_format>"."""
    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
916
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units; approximate them in days
        # (a month counts as 30 days, a year as 365).
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
942
def hyphenate_date(date_str):
    """
    Turn a 'YYYYMMDD' string into 'YYYY-MM-DD'; any other string is
    returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
951
bd558525
JMF
class DateRange(object):
    """A closed interval of calendar dates (both endpoints included)"""

    def __init__(self, start=None, end=None):
        """start and end are strings understood by date_from_str.

        A bound left as None is replaced by the earliest / latest date the
        datetime module can represent.  ValueError is raised when start
        falls after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Build a range whose start and end are both the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Tell whether date (a date object or a date_from_str string)
        lies inside the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate and candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
977
978
def platform_name():
    """ Return platform.platform() as a compat_str, decoding it with the
    preferred encoding when the platform module hands back bytes """
    raw = platform.platform()
    name = raw.decode(preferredencoding()) if isinstance(raw, bytes) else raw

    assert isinstance(name, compat_str)
    return name
c257baff
PH
987
988
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write; out is the target stream.  False is returned
    when out has no fileno(), when its fileno is not 1 or 2, or when the
    corresponding Win32 handle is not a console.  OSError is raised when
    WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Prototype kernel32 functions by hand: (return type, *argument types)
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Out-parameter that WriteConsoleW fills with the amount written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, for a handle whose file type
        # (remote flag masked out) is not FILE_TYPE_CHAR, or when
        # GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, each chunk stopping short
    # of the next non-BMP character
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; ask for 2
        # units so that single character is written on its own
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop only what was actually written; the rest is retried
            s = s[written.value:]
    return True
1059
1060
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr when out is None) and flush.

    On win32, when no encoding is forced and out has a fileno attribute,
    _windows_write_string is tried first.  Otherwise s is encoded (errors
    ignored) and written as bytes when out is a binary-mode stream or on
    Python 2, encoded and written to out.buffer when that attribute exists,
    and written as text in every other case.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    wants_bytes = (
        'b' in getattr(out, 'mode', '')
        or sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1081
1082
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values
    (an empty input gives an empty list)."""
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3: indexing bytes yields ints
        return list(bs)
    return list(map(ord, bs))
1090
c257baff 1091
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string
    (an empty input gives b'')."""
    if not xs:
        return b''
    chr_gives_bytes = isinstance(chr(0), bytes)  # only true on Python 2
    if not chr_gives_bytes:
        return bytes(xs)
    return ''.join(chr(x) for x in xs)
c38b1e77
PH
1099
1100
c1c9a79c
PH
# Cross-platform file locking
#
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f, exclusively or shared
#   _unlock_file(f)          - release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the structure LockFileEx/UnlockFileEx take as their
    # last argument
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high words of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Only valid on a file previously passed to _lock_file
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # flock-based lock: exclusive (LOCK_EX) or shared (LOCK_SH)
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1164
1165
class locked_file(object):
    """Text file that is locked for the duration of a with-block.

    The file is opened by the constructor.  Entering the context takes a
    shared lock for mode 'r' and an exclusive lock for 'a' / 'w'; leaving
    it unlocks and closes the file.  Iteration, read() and write() are
    forwarded to the underlying file object.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the handle is never leaked
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1195
1196
def shell_quote(args):
    """Return args as a single shell-escaped, space-separated string.

    Byte strings (e.g. filenames produced by encodeFilename) are decoded
    with the filesystem encoding, falling back to UTF-8 when that is
    unknown, before being quoted.
    """
    # The pipes module is deprecated and gone from Python 3.13; shlex.quote
    # is the documented equivalent wherever it exists
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1208
1209
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Variant of itertools.takewhile that also yields the element on
    which pred first fails, and stops right after it """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1217
1218
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach extra data to a URL for internal use: data is JSON-encoded
    and appended as a '#__youtubedl_smuggle=...' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1225
1226
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    A URL without the smuggle marker is returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _sep, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
02dbf93f
PH
1234
1235
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric str, or None (rendered as 'N/A').
    The result carries two decimals, e.g. 1536 -> '1.50KiB'.  Values too
    large for the biggest unit are expressed in YiB and values below one
    byte in B.  A negative count raises ValueError (from math.log).
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp: an unclamped exponent raised IndexError for huge values
        # and, being negative for tiny fractions, picked the wrong suffix
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
f53c966a 1248
1c088fa8 1249
1c088fa8
PH
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence (a
    non-numeric value raises ValueError); otherwise the second field of
    `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty missing, or its output not "<rows> <cols>" (e.g. no tty).
        # Deliberately not a bare except, so Ctrl-C still propagates.
        pass
    return None
caefb1de
PH
1264
1265
def month_by_name(name):
    """ Map a full English month name to its number (1-12), independent of
    the current locale; unknown names give None """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1276
1277
def fix_xml_ampersands(xml_str):
    """Escape bare '&' characters as '&amp;', leaving the predefined XML
    entities and numeric character references untouched"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1284
1285
def setproctitle(title):
    """Best-effort attempt to set the process name via libc's prctl.

    title must be a compat_str.  Nothing happens when libc.so.6 cannot be
    loaded or exposes no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # One extra byte so the buffer is NUL-terminated: prctl reads a C
    # string, and a buffer of exactly len(title_bytes) has no terminator
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1299
1300
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1305
1306
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it
    does not finish with end (or when end is empty)."""
    # The explicit emptiness check matters: every string ends with '',
    # and s[:-0] is s[:0], which would wipe out the whole string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1311
1312
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and
    trailing slashes (an empty string when the path has no segments)."""
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1316
1317
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD"""

    def get_method(self):
        """Report the HTTP verb to use for this request"""
        return "HEAD"
7217e148
PH
1321
1322
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale / scale (floor division).

    When get_attr is given and v is not None, the attribute of that name is
    read from v first (None if it is missing).  None and the empty string
    give default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1330
9572013d 1331
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1334
9732d77e
PH
1335
def str_to_int(int_str):
    """ Lenient integer parser: commas, dots and plus signs are stripped
    before conversion; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
608d11f5
PH
1342
1343
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale / scale; None gives default."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1346
1347
608d11f5
PH
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min 5' or '12.5s' into seconds.

    Returns None for None input or an unrecognised format.  The result is
    an int unless a fractional part is present, in which case it is a float.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    fields = m.groupdict()
    total = int(fields['secs'])
    # Optional larger units, each weighted by its length in seconds
    for unit, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        if fields[unit]:
            total += int(fields[unit]) * seconds_per_unit
    if fields['ms']:
        total += float(fields['ms'])
    return total
91d7d0b3
JMF
1366
1367
def prepend_extension(filename, ext):
    """Insert ext in front of the file's real extension:
    'abc.ext' with ext 'temp' becomes 'abc.temp.ext'."""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1371
1372
def check_executable(exe, args=[]):
    """ Try to run exe with args (meant to be something with short output,
    like -version).  Returns exe if the process could be started and False
    if launching it raised OSError. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1381
1382
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) must return an iterable with the entries of the
    0-based page pagenum; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list, fetching only the
        pages that overlap that range (end=None means "until exhausted")."""
        size = self._pagesize
        collected = []
        for page_index in itertools.count(start // size):
            page_first = page_index * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(page_index))

            # Offsets inside this page; only the page holding start/end
            # needs trimming
            if page_first <= start < page_limit:
                lo = start % size
            else:
                lo = 0
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A page that is not "full" must be the last one, so there is
            # no point in querying further pages
            if len(entries) + lo < size:
                break

            # The page was full but the requested range ends exactly at
            # its boundary, so the next page is not needed either
            if end == page_limit:
                break
        return collected
81c2f20b
PH
1428
1429
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape sequence (8 hex digits) in s
    with the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed); keep the text
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1
PH
1436
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept
# a text (compat_str) format spec on interpreters that insist on bytes
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    # Text specs work natively; use the stdlib functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1453
1454
def read_batch_urls(batch_fd):
    """Read URLs from the open batch file batch_fd, then close it.

    One URL is read per line.  Byte lines are decoded as UTF-8, a leading
    byte order mark is removed, surrounding whitespace is stripped, and
    empty lines as well as lines starting with '#', ';' or ']' are skipped.
    Returns the list of remaining URLs.
    """
    # A UTF-8 BOM shows up as U+FEFF when decoded properly, and as the
    # three characters \xef\xbb\xbf when its bytes were decoded one by one
    BOM_MARKERS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_MARKERS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1469
1470
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode does and return the result
    as ASCII bytes, ready to be sent as a POST body."""
    encoded_text = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded_text.encode('ascii')
bcf89ce6
PH
1473
1474
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node, using
# Element.iter where the interpreter provides it
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1479
1480
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored.  On Python 2 any byte-string element
    text is decoded from UTF-8 so that text is always unicode.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The parser keyword is only passed from Python 2.7 onwards
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is None or isinstance(node.text, compat_str):
                continue
            node.text = node.text.decode('utf-8')
    return root
e68301af
PH
1496
1497
# compat_getpass: getpass.getpass, except on Python 2 under win32, where a
# text prompt is first encoded with the preferred encoding
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        encoded_prompt = (
            prompt.encode(preferredencoding())
            if isinstance(prompt, compat_str) else prompt)
        return getpass.getpass(encoded_prompt, *args, **kwargs)
a1a530b0
PH
1505
1506
# Maps US content rating labels to integers.
# NOTE(review): the numbers look like minimum viewer ages - confirm with the
# code that consumes this table.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1514
1515
def strip_jsonp(code):
    """Unwrap a JSONP response: 'callback(<payload>);' becomes '<payload>'.
    Input that does not look like a JSONP call is returned unchanged."""
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
478c2c61
PH
1518
1519
e05f6939
PH
1520def js_to_json(code):
1521 def fix_kv(m):
1522 key = m.group(2)
1523 if key.startswith("'"):
1524 assert key.endswith("'")
1525 assert '"' not in key
1526 key = '"%s"' % key[1:-1]
1527 elif not key.startswith('"'):
1528 key = '"%s"' % key
1529
1530 value = m.group(4)
1531 if value.startswith("'"):
1532 assert value.endswith("'")
1533 assert '"' not in value
1534 value = '"%s"' % value[1:-1]
1535
1536 return m.group(1) + key + m.group(3) + value
1537
1538 res = re.sub(r'''(?x)
1539 ([{,]\s*)
1540 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1541 (:\s*)
1542 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
1543 ''', fix_kv, code)
1544 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1545 return res
1546
1547
478c2c61
PH
def qualities(quality_ids):
    """ Build a ranking function from an ordered list of quality ids: the
    returned callable gives the position of an id in that list, or -1 for
    an id that is not in it """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1556
acd69589
PH
1557
# Default output filename template, written as a %-style format string with
# the mapping keys title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1559
# subprocess_check_output: subprocess.check_output, with a replacement for
# interpreters (Python 2.6) whose subprocess module lacks it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Takes the same arguments as subprocess.Popen, except stdout and
        input.  Raises subprocess.CalledProcessError, carrying the captured
        output in its output attribute, when the exit status is non-zero.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen.args and the output= keyword do not exist on the old
            # interpreters this fallback is for, so recover the command
            # from our own arguments and attach the output by hand
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output