]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[arte] Fix upload date extraction
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
88try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Fallback percent-decoder used when urllib.parse cannot be imported.

        Splits ``string`` on '%', collects runs of consecutive percent-encoded
        bytes and decodes each run with ``encoding``/``errors`` (which default
        to 'utf-8'/'replace' when passed as None). Returns the decoded string.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text verbatim, '%' included.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Pairs are separated by '&' or ';'. '+' is turned into a space and
        both halves are percent-decoded with compat_urllib_parse_unquote.
        Pairs with an empty value are dropped unless keep_blank_values is
        set; with strict_parsing a field without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of repeated names are appended in the order they appear.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 179
3e669f36 180try:
59ae15a5 181 compat_str = unicode # Python 2
3e669f36 182except NameError:
59ae15a5 183 compat_str = str
3e669f36
PH
184
185try:
59ae15a5 186 compat_chr = unichr # Python 2
3e669f36 187except NameError:
59ae15a5 188 compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
8d31fa3c
PH
195try:
196 from shlex import quote as shlex_quote
197except ImportError: # Python < 3.3
198 def shlex_quote(s):
199 return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
b31756c1
FV
def compat_ord(c):
    """Return the integer code of c; plain ints are passed through unchanged.

    Iterating over bytes yields ints on Python 3 but 1-character strings on
    Python 2, so callers can use this on either.
    """
    return c if type(c) is int else ord(c)
205
468e2e92
FV
206# This is not clearly defined otherwise
207compiled_regex_type = type(re.compile(''))
208
3e669f36 209std_headers = {
ae8f7871 210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 215}
f427df17 216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or names a codec that cannot encode a simple test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        # Any lookup/codec failure falls back to UTF-8. Deliberately not a
        # bare except, so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 230
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s (must be a unicode string)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 238
f4bfd65f 239
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        mode_args = {'mode': 'wb'}
    else:
        mode_args = {'mode': 'w', 'encoding': 'utf-8'}

    # The temporary file is created next to the target so that the final
    # rename stays within one directory.
    tf = tempfile.NamedTemporaryFile(
        suffix='.tmp',
        prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn),
        delete=False,
        **mode_args)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except:
        # Remove the leftover temporary file, then re-raise whatever failed.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so restrict them
        # to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
292
d7e66d39
JMF
293# On python2.6 the xml.etree.ElementTree.Element methods don't support
294# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI; steps without a prefix
    are left untouched.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
305
d77c3dfd 306
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    When nothing matches, return None, or raise ExtractorError if fatal is
    set (name, defaulting to the xpath itself, is used in the message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
319
320
a8156c1d 321compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed in self.html."""

    def __init__(self):
        # This used to be misspelled "__init", so it never ran and self.html
        # did not exist until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document html in one go."""
        self.html = html
        self.feed(html)
        self.close()
331
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start position, end position]; positions are
        # the (line, column) tuples returned by getpos().
        self.result = None
        # True while parsing inside the matched element.
        self.started = False
        # Number of currently open tags per tag name, counted from the match.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Give up after repeated errors or once the wanted element has begun;
        # otherwise drop the input up to the current line and keep parsing.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once its own tag count is zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also records the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None
        unless tag, start and end were all recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Line numbers are 1-based, columns 0-based.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line, and the start column was already
            # cut off above, so the end column is shifted by that amount.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
391# Hack for https://github.com/rg3/youtube-dl/issues/662
392if sys.version_info < (2, 7, 3):
393 AttrParser.parse_endtag = (lambda self, i:
394 i + len("</scr'+'ipt>")
395 if self.rawdata[i:].startswith("</scr'+'ipt>")
396 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
397
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
401
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error.
        pass
    return finder.get_result()
9e6dd238 410
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the content attribute of the <meta>
    tag whose name attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one.
                self.result = attr_map.get('content')

    def get_result(self):
        return self.result
431
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag whose name attribute is name.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the parse error.
        pass
    return meta_parser.get_result()
442
9e6dd238
FV
443
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> and paragraph boundaries become newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
455
456
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' stands for standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final path component is rewritten: replacing separators
        # or a drive colon in the directory part would point somewhere else.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
490
491
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 499
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-special characters, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the input as possible
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
531
def orderedSet(iterable):
    """ Return the items of iterable as a list without duplicates, keeping first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 539
912b38b4 540
4e408e47
PH
541def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
546
547 mobj = re.match(r'#(x?[0-9]+)', entity)
548 if mobj is not None:
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
551 base = 16
552 numstr = u'0%s' % numstr
553 else:
554 base = 10
555 return compat_chr(int(numstr, base))
556
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
559
560
def unescapeHTML(s):
    """Replace the HTML entities in s with their characters; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        # group(1) is the entity text between '&' and ';'
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode, s)
d77c3dfd 568
8bf48f23
PH
569
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return s.encode('utf-8' if encoding is None else encoding, 'ignore')
596
f07b74fc
PH
597
def encodeArgument(s):
    """Encode the command-line argument s for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
605
606
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a text string; None is passed through.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())

        assert isinstance(optval, compat_str)
    return optval
1c256f70 615
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The boundaries are inclusive: exactly one hour is '1:00:00' (not
    '60:00') and exactly one minute is '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
623
a0ddb8a2
PH
624
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for outgoing requests.

    opts_no_check_certificate disables certificate verification on Pythons
    that have ssl.SSLContext; the Python < 3.2 branch does not look at it.
    Remaining keyword arguments are passed to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first, fall back to the generic SSLv23 handshake
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client and authenticate servers, so the purpose is
        # SERVER_AUTH. CLIENT_AUTH builds a server-side context that does
        # not verify the peer certificate.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        # NOTE(review): deliberately re-enables SSLv3, which is insecure.
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be switched off before CERT_NONE is allowed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 664
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network and availability problems are never treated as bugs
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg += u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else u''.join(traceback.format_tb(tb))
689
1c256f70 690
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
694
695
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
707
708
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
716
717
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Hand the message to Exception as well, so that str(exc) and
        # exc.args are populated instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
726
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
730
731
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
739
740
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a message so that str(exc) is not empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 755
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as raw deflate, falling back to zlib-wrapped data."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response; older versions take no code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h, v in std_headers.items():
            # urllib stores header names as name.capitalize() (for example
            # 'User-agent'), so the lookup has to use that form too;
            # otherwise headers set by the caller are always overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 841
5de90176 842
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # A trailing 'Z' or '+hh:mm' / '-hhmm' style offset is cut off and
    # applied separately.
    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    offset = datetime.timedelta()
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_dt = datetime.datetime.strptime(date_str, date_format)
    return calendar.timegm((local_dt - offset).timetuple())
866
867
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A set of known date formats is tried first, then RFC 2822 parsing via
    email.utils. Returns None for None or when nothing matches.
    """

    if date_str is None:
        return None

    upload_date = None
    #Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %drd %Y %I:%M%p',  # "3rd"/"23rd" was missing next to st/nd/th
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # First matching format wins; no need to try the rest
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                # Parsed fields do not form a valid date
                pass
    return upload_date
914
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, or return default_ext.

    The guess is whatever follows the last '.' before any '?'; it is only
    accepted if it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 923
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
926
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days.
    Raises ValueError if date_str matches neither form."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
952
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
961
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None means an open end (earliest/latest possible date).

        Raises ValueError if start is later than end."""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range.

        date may be a datetime.date, a datetime.datetime (its date part is
        used) or a string accepted by date_from_str."""
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date but cannot be compared with one
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
987
988
def platform_name():
    """ Returns the platform name (as reported by platform.platform()) as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte string result: decode with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
997
998
b58ddb32
PH
def _windows_write_string(s, out):
    """ Write s to the Windows console behind out using WriteConsoleW.

    Returns True if the string was written using special methods,
    False if it has yet to be written out (out has no fileno, is not
    stdout/stderr, or its handle is not a console).
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character devices, and handles for
        # which GetConsoleMode fails (e.g. redirected output)
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before any
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2 (two UTF-16 code units)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
1069
1070
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no explicit encoding is requested, the console API is tried
    first. Otherwise s is encoded (unencodable characters are dropped) for
    byte-oriented streams and for the underlying buffer of text streams, or
    written as-is when the stream offers neither.
    """
    assert type(s) == compat_str
    stream = sys.stderr if out is None else out

    console_candidate = (
        sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'))
    if console_candidate and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr, so always send bytes there
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        target_enc = (
            encoding or getattr(stream, 'encoding', None) or preferredencoding())
        stream.buffer.write(s.encode(target_enc, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1091
1092
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints (empty list for empty input)."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing a byte string yields 1-character strings
        return [ord(ch) for ch in bs]
    # Python 3: bytes already iterate as ints
    return list(bs)
1100
c257baff 1101
def intlist_to_bytes(xs):
    """Return a byte string built from the list of byte values xs."""
    if not xs:
        return b''
    chr_gives_bytes = isinstance(chr(0), bytes)  # only true on Python 2
    if chr_gives_bytes:
        return ''.join(chr(x) for x in xs)
    return bytes(xs)
c38b1e77
PH
1109
1110
c1c9a79c
PH
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout passed by pointer to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """flock() f with LOCK_EX if exclusive, otherwise with LOCK_SH."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1174
1175
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    Files opened with mode 'r' are locked shared, 'a' and 'w' exclusively.
    The file is opened in the constructor and closed when the block exits.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open handle when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1205
1206
def shell_quote(args):
    """Return the items of args shell-quoted and joined with single spaces.

    Byte strings are decoded with the filesystem encoding (UTF-8 if that is
    unknown) before quoting.
    """
    # The pipes module is deprecated and removed in Python 3.13; it is kept
    # only as the fallback for Pythons that lack shlex.quote
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1218
1219
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1227
1228
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a '#__youtubedl_smuggle=...'
    fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1235
1236
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without the '#__youtubedl_smuggle' marker are returned unchanged,
    paired with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1244
1245
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit, e.g. u'1.50KiB'.

    bytes may be a number, a numeric str, or None (which gives u'N/A').
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the known units: tiny fractions would otherwise index the
        # list from the end, and huge values would run past it
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
f53c966a 1258
1c088fa8 1259
1c088fa8
PH
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be determined.

    The COLUMNS environment variable takes precedence; otherwise the output
    of `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, not a tty, unparsable output), but
        # do not swallow KeyboardInterrupt/SystemExit as a bare except would
        pass
    return None
caefb1de
PH
1274
1275
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, or None if name is not one """

    english_names = (
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December')
    if name in english_names:
        return english_names.index(name) + 1
    return None
18258362
JMF
1286
1287
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity or a numeric
    character reference by '&amp;' in XML"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1294
1295
def setproctitle(title):
    """Set the process name via prctl(); does nothing if libc.so.6 cannot be
    loaded or offers no prctl.

    title must be a compat_str; it is passed to libc UTF-8 encoded.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the data, so the string handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1309
1310
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it does not begin with it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1315
1316
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it does not end with it."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1321
1322
def url_basename(url):
    """Return the last segment of the path of url (query and fragment are ignored)."""
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1326
1327
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1331
1332
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default if v is None or ''.

    If get_attr is given and v is not None, the attribute of that name is
    read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1340
9572013d 1341
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1344
9732d77e
PH
1345
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before conversion; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
608d11f5
PH
1352
1353
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1356
1357
608d11f5
PH
def parse_duration(s):
    """Parse a duration such as '1:02:03', '1h 2m 3s' or '9.5s' into seconds.

    Returns None when s is None or not in a recognised format. The result is
    an int unless a fractional part is present, in which case it is a float.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    seconds = int(m.group('secs'))
    for group, factor in (('mins', 60), ('hours', 60 * 60)):
        if m.group(group):
            seconds += int(m.group(group)) * factor
    if m.group('ms'):
        # The 'ms' group holds the fractional part including the leading dot
        seconds += float(m.group('ms'))
    return seconds
91d7d0b3
JMF
1376
1377
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename ('a.mp4' -> 'a.<ext>.mp4')."""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1381
1382
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name if it could be
    started, False if starting it raised OSError.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1391
1392
class PagedList(object):
    """Base class for paged lists; subclasses are expected to provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1397
9c44d242
PH
1398
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one at a time until a short page or the
    requested end is reached."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) must return an iterable with the entries of that
        # (0-based) page; pagesize is the number of entries on a full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list
        (everything from start on if end is None)."""
        res = []
        # Begin with the page that contains the entry at index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this page and of the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 unless start falls in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry within this page, or
            # None if the slice does not end on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1440
1441
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list
        (everything from start on if end is None)."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        for page_index in range(first_page, last_page):
            items = list(self._pagefunc(page_index))
            if to_skip:
                items = items[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(items) >= remaining:
                    # This page completes the requested slice
                    collected.extend(items[:remaining])
                    break
                remaining -= len(items)
            collected.extend(items)
        return collected
1469
1470
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s by the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
b53466e1 1477
d05cfe06
S
1478
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_py2 = sys.version_info < (3, 0)
    if running_py2 and isinstance(s, unicode):
        # Python 2: unicode strings are UTF-8 encoded before quoting
        s = s.encode('utf-8')
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1484
1485
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1495
b53466e1
PH
# struct_pack / struct_unpack: struct.pack / struct.unpack that always accept
# a text format specification.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1512
1513
def read_batch_urls(batch_fd):
    """Read URLs from the file-like object batch_fd (one per line) and close it.

    Lines may be bytes (decoded as UTF-8) or text. A leading byte order mark
    and surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM shows up as U+FEFF when decoded as UTF-8, and as the
        # three characters \xef\xbb\xbf when the data was read as latin-1
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1528
1529
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1532
1533
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1538
1539
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML document in the text s and return its root element.

    Doctype declarations are ignored."""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
e68301af
PH
1555
1556
# compat_getpass: getpass.getpass that also accepts a text prompt on
# Python 2 under Windows
if sys.platform == 'win32' and sys.version_info < (3, 0):
    def compat_getpass(prompt, *args, **kwargs):
        encoded_prompt = (
            prompt.encode(preferredencoding())
            if isinstance(prompt, compat_str) else prompt)
        return getpass.getpass(encoded_prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
a1a530b0
PH
1564
1565
# Numeric value associated with each US rating label
# NOTE(review): presumably the minimum viewer age in years - confirm against callers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1573
1574
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' from code and return what is
    between the parentheses; text that does not look like JSONP is returned
    unchanged."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1577
1578
e05f6939
PH
def js_to_json(code):
    """Convert simple JavaScript object literals in code to JSON.

    Keys of key/value pairs are double-quoted, single-quoted keys and values
    are turned into double-quoted ones, and a comma directly before a closing
    ']' is removed.
    """
    def _requote(token):
        # 'text' -> "text"; the text itself must not contain double quotes
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        prefix, key, colon, value = m.groups()
        if key.startswith("'"):
            key = _requote(key)
        elif not key.startswith('"'):
            key = '"%s"' % key

        if value.startswith("'"):
            value = _requote(value)

        return prefix + key + colon + value

    res = re.sub(r'''(?x)
        ([{,]\s*)
        ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
        (:\s*)
        ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', res)
1605
1606
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in quality_ids,
    or to -1 if it is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        return position
    return q
1615
acd69589
PH
1616
# %-style mapping template built from the 'title', 'id' and 'ext' fields
# NOTE(review): presumably the default output filename template - confirm against callers
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1618
# subprocess_check_output: subprocess.check_output, with a replacement for
# Pythons that do not provide it.
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its stdout; raise
        subprocess.CalledProcessError if it exits with a non-zero status."""
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen objects have no .args attribute and CalledProcessError
            # takes no output argument on the old Pythons this fallback is
            # for, so recover the command from our own arguments and attach
            # the output afterwards
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
a020a0dc
PH
1630
1631
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long; None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES