]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[viki] Make uploader field optional (#1813)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd
FV
10import locale
11import os
4eb7f1d1 12import pipes
c496ca96 13import platform
d77c3dfd 14import re
13ebea79 15import ssl
c496ca96 16import socket
d77c3dfd 17import sys
01951dda 18import traceback
d77c3dfd 19import zlib
d77c3dfd 20
01ba00ca 21try:
59ae15a5 22 import urllib.request as compat_urllib_request
01ba00ca 23except ImportError: # Python 2
59ae15a5 24 import urllib2 as compat_urllib_request
01ba00ca
PH
25
26try:
59ae15a5 27 import urllib.error as compat_urllib_error
01ba00ca 28except ImportError: # Python 2
59ae15a5 29 import urllib2 as compat_urllib_error
01ba00ca
PH
30
31try:
59ae15a5 32 import urllib.parse as compat_urllib_parse
01ba00ca 33except ImportError: # Python 2
59ae15a5 34 import urllib as compat_urllib_parse
01ba00ca 35
799c0763
PH
36try:
37 from urllib.parse import urlparse as compat_urllib_parse_urlparse
38except ImportError: # Python 2
39 from urlparse import urlparse as compat_urllib_parse_urlparse
40
6543f0dc
JMF
41try:
42 import urllib.parse as compat_urlparse
43except ImportError: # Python 2
44 import urlparse as compat_urlparse
45
01ba00ca 46try:
59ae15a5 47 import http.cookiejar as compat_cookiejar
01ba00ca 48except ImportError: # Python 2
59ae15a5 49 import cookielib as compat_cookiejar
01ba00ca 50
3e669f36 51try:
59ae15a5 52 import html.entities as compat_html_entities
9f37a959 53except ImportError: # Python 2
59ae15a5 54 import htmlentitydefs as compat_html_entities
3e669f36 55
a8156c1d 56try:
59ae15a5 57 import html.parser as compat_html_parser
9f37a959 58except ImportError: # Python 2
59ae15a5 59 import HTMLParser as compat_html_parser
a8156c1d 60
348d0a7a 61try:
59ae15a5 62 import http.client as compat_http_client
9f37a959 63except ImportError: # Python 2
59ae15a5 64 import httplib as compat_http_client
348d0a7a 65
2eabb802 66try:
0e283428 67 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
68except ImportError: # Python 2
69 from urllib2 import HTTPError as compat_HTTPError
70
e0df6211
PH
71try:
72 from urllib.request import urlretrieve as compat_urlretrieve
73except ImportError: # Python 2
74 from urllib import urlretrieve as compat_urlretrieve
75
76
5910e210
PH
77try:
78 from subprocess import DEVNULL
79 compat_subprocess_get_DEVNULL = lambda: DEVNULL
80except ImportError:
81 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
82
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in *string* by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with *encoding* / *errors* (``None`` selects 'utf-8' /
        'replace'). A '%' that is not followed by two hex digits is kept
        literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the '%' and what followed it verbatim
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse the query string *qs* into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless *keep_blank_values* is true. With *strict_parsing*, a
        field without '=' raises ValueError instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings; undo that before unquoting
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse the query string *qs* into a dict mapping each name to the
        list of its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 161
# Text string type: ``unicode`` where that builtin exists (Python 2),
# otherwise ``str``.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: ``unichr`` where that builtin
# exists (Python 2), otherwise ``chr``.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 171
b31756c1
FV
def compat_ord(c):
    """Return the integer value of *c*.

    A value whose type is exactly ``int`` is handed back untouched; anything
    else is passed to ord().
    """
    return c if type(c) is int else ord(c)
175
468e2e92
FV
# This is not clearly defined otherwise
# (the type of compiled pattern objects is obtained by compiling an empty
# pattern and taking the type of the result)
compiled_regex_type = type(re.compile(''))
178
# Header name -> value table. YoutubeDLHandler.http_request (in this file)
# adds every entry to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 186
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    The encoding reported by the locale is only used if it can actually
    encode text; otherwise 'UTF-8' is returned.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding name raises here
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any locale/codec failure means "fall back"),
        # but no longer swallows KeyboardInterrupt / SystemExit.
        pref = 'UTF-8'

    return pref
d77c3dfd 200
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string *s* (must be a unicode string)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print *s* encoded in the preferred encoding; characters that
        cannot be encoded are written as XML character references."""
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
d77c3dfd 208
f4bfd65f
PH
# json.dump needs a byte stream on Python 2.x but a character stream on
# Python 3.x, so the file is opened differently for each.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
219
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching *xpath* under *node* whose
        attribute *key* equals *val*, or None."""
        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath[@key='val'] under *node*,
        or None."""
        # key and val are spliced into the expression, so only harmless
        # characters are accepted
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))
233
d7e66d39
JMF
# The xml.etree.ElementTree.Element methods of python2.6 have no namespace
# parameter, so prefixes are expanded into the path itself.
def xpath_with_ns(path, ns_map):
    """Expand every ``prefix:tag`` step of *path* to ``{uri}tag``, looking the
    prefix up in *ns_map*. Steps without a prefix are left alone."""
    def _expand(step_parts):
        if len(step_parts) == 1:
            return step_parts[0]
        prefix, local_name = step_parts
        return '{%s}%s' % (ns_map[prefix], local_name)

    split_steps = [step.split(':') for step in path.split('/')]
    return '/'.join([_expand(parts) for parts in split_steps])
246
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must be the entity text
    without the surrounding '&' and ';'.

    Named entities and numeric references (decimal ``#38`` or hexadecimal
    ``#x26``) are decoded; anything else is returned in its literal
    ``&...;`` form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The hexadecimal form must accept the
    # digits a-f as well, otherwise '#x2F' would be cut short to '#x2'.
    mobj = re.match(r'(?u)#([xX][0-9a-fA-F]+|\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point out of range: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 271
# Replace the parser module's module-level start-tag regex with this pattern.
# NOTE(review): presumably the pattern of a later CPython release, per the
# "backport bugfix" remark - confirm against the stdlib history.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document handed to loads().

    The raw document is kept in ``self.html`` (None until loads() is called)
    so that subclasses can slice results out of it.
    """
    def __init__(self):
        # Was misspelled ``__init``, so it never ran as the constructor and
        # ``self.html`` stayed undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document *html* in one go."""
        self.html = html
        self.feed(html)
        self.close()
282
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start position, end position]; positions are
        # whatever getpos() returned at that moment
        self.result = None
        # True while parsing inside the matched element
        self.started = False
        # tag name -> number of currently open tags of that name, counted
        # from the matched element onwards
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from parse errors outside the matched element by dropping
        the input up to the current line and resuming; give up (raise) after
        more than 10 errors or once inside the matched element."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is also an event that may follow the opening tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once its own tag count is back to 0
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event merely records the start position (once)
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete element was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, offset) pairs; the line is used 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Same line: the end offset is relative to the already cut start
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, when the data at an end-tag position
# starts with the literal text </scr'+'ipt>, step over exactly that text
# instead of handing it to the stock parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
348
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id* in the
    HTML document *html* (see get_element_by_attribute)."""
    element_content = get_element_by_attribute("id", id, html)
    return element_content
352
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag whose *attribute* equals *value* in the
    HTML document *html*; the result is whatever AttrParser.get_result()
    yields (None when nothing complete was found)."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return finder.get_result()
9e6dd238 361
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the ``content`` of the meta tag whose
    ``name`` attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching meta tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content recorded so far (None if nothing matched)."""
        return self.result
382
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in *html* whose name
    attribute is *name* (None if there is no such tag).
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep what was found before the parse error
        pass
    return meta_finder.get_result()
393
9e6dd238
FV
394
def clean_html(html):
    """Turn an HTML snippet into readable plain text."""
    # Source newlines carry no meaning; line breaks come from <br> and from
    # paragraph boundaries only
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    text = unescapeHTML(text)
    return text.strip()
9e6dd238
FV
406
407
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' selects standard output (its binary buffer where
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # (os.path.join takes the parts as separate arguments, so the
        # sanitized parts have to be unpacked)
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # (open the sanitized name, not the one that just failed)
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
441
442
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 450
def sanitize_filename(s, restricted=False, is_id=False):
    """Make *s* usable as part of a filename.

    With *restricted*, a stricter subset of characters is allowed.
    Pass *is_id* when *s* is an ID rather than an arbitrary string, so that
    it is kept as intact as possible (no underscore clean-up afterwards).
    """
    def _clean(ch):
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted:
            if ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127:
                return '_'
        return ch

    cleaned = u''.join([_clean(ch) for ch in s])
    if is_id:
        return cleaned

    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned if cleaned else '_'
d77c3dfd
FV
482
def orderedSet(iterable):
    """ Return the elements of iterable as a list without duplicates, each
    kept at the position of its first occurrence """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
490
def unescapeHTML(s):
    """
    Replace the HTML entities in s by the characters they stand for.

    @param s a (unicode) string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
499
def encodeFilename(s):
    """
    Return s in the form the file APIs of this interpreter expect.

    @param s The name of the file (a unicode string)
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 521
8271226a
PH
def decodeOption(optval):
    """Return the option value *optval* as a text string.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is None:
        return None
    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 530
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The largest unit that is at least 1 decides the form, so exactly one
    hour is '1:00:00' (not '60:00') and exactly one minute is '1:00'
    (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
538
13ebea79 539
def make_HTTPS_handler(opts):
    """Build the urllib HTTPS handler used for all HTTPS requests.

    opts must provide the boolean attribute ``no_check_certificate``; it is
    only consulted on Python >= 3.2, where it switches certificate
    verification off.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if self._tunnel_host:
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first, then let the library negotiate
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        try:
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        except (AttributeError, ValueError, ssl.SSLError):
            # The ssl module of this interpreter has no (usable) SSLv3:
            # negotiate the protocol version instead of failing outright.
            context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
ea6d901e 570
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is stored unchanged on the instance.
        """
        # Network trouble and unavailable videos are never worth a bug
        # report, whatever the caller passed for expected
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        currently_handled = sys.exc_info()[0]
        if currently_handled in expected_causes:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
592
1c256f70 593
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
597
598
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept as passed so callers can inspect or re-raise the root cause
        self.exc_info = exc_info
d77c3dfd
FV
610
611
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
619
620
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal an
    error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is exposed as the attribute ``msg``
        self.msg = msg
d77c3dfd
FV
629
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
633
634
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available for
    that video.
    """
d77c3dfd
FV
642
643
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 658
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate *data*: first as a raw stream (negative
        window bits), then, if zlib rejects that, with zlib's defaults."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying *code*, whether or not the
        addinfourl class of this interpreter takes the code as an argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the std_headers to *req* and consume the internal
        Youtubedl-* control headers. Returns the modified request."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Internal marker: do not ask for a compressed response
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            # Internal marker: per-request User-agent override
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return *resp*, replaced by a decompressed copy (same headers,
        url, code and msg) if its Content-encoding is gzip or deflate."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first
                # length that decompresses wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038
JMF
739
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a fixed list of date formats; the first one
    that parses it decides the result. Returns None if none of them fits.
    """
    upload_date = None
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Format does not fit (or the date cannot be rendered): try the next
            continue
        # Parsed successfully: no need to try the remaining formats
        break
    return upload_date
766
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from *url*: the text after the last '.' of
    the part before the first '?'. If that text is not purely alphanumeric,
    *default_ext* is returned instead."""
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 773
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return *filename* with its last extension replaced by
    '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
776
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str fits neither form.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation? timedelta knows neither months nor years
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days=, weeks=)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
802
class DateRange(object):
    """A closed interval of dates (both ends included)"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing end stands for the earliest / latest
        representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that consists of the given day only"""
        return cls(day,day)

    def __contains__(self, date):
        """Tell whether date (a date object or a string) lies in the range"""
        probe = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= probe <= self.end

    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
c496ca96
PH
828
829
def platform_name():
    """ Return platform.platform() as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
838
839
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string *s* to *out* (sys.stderr by default) and flush.

    The text is encoded with the preferred encoding first when the stream is
    in binary mode or the interpreter is Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    stream_mode = getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in stream_mode or sys.version_info[0] < 3
    if wants_bytes:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
850
851
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of the byte string *bs* as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing gave a one-character string: convert each one
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
859
c257baff 860
def intlist_to_bytes(xs):
    """Return a byte string made of the byte values in the list *xs*."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    return ''.join([chr(x) for x in xs])  # Python 2
c38b1e77
PH
868
869
def get_cachedir(params=None):
    """Return the cache directory to use.

    params is an optional dict; its 'cachedir' entry, if present, is
    returned as is. Otherwise the result is the 'youtube-dl' directory below
    $XDG_CACHE_HOME (or below ~/.cache if that variable is not set).
    """
    # None instead of a shared mutable {} default
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
874
875
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f) for an
# open file object f.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure handed to LockFileEx / UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file f from offset 0; raise OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file passes the same
        # pointer again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # NOTE(review): 0x2 presumably is LOCKFILE_EXCLUSIVE_LOCK - confirm
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
939
940
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context
    manager: a shared lock for mode 'r', an exclusive one for 'a' and 'w'."""
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
970
971
def shell_quote(args):
    """Return the argument list *args* as one shell-escaped command line.

    Byte string arguments are decoded with the file system encoding first.
    """
    # pipes is deprecated and gone in Python 3.13; on Python 3.3+
    # pipes.quote is the very same function as shlex.quote.
    try:
        from shlex import quote
    except ImportError:  # Python 2, Python 3.0 - 3.2
        from pipes import quote

    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
983
984
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, except that the first element for which
    pred is false is yielded as well before the iteration stops """
    for element in seq:
        yield element
        if pred(element):
            continue
        break
992
993
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach data (JSON-serialized) to url as a fragment, for internal use. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1000
1001
def unsmuggle_url(smug_url):
    """Undo smuggle_url: return (url, data), with data None when smug_url
    carries no smuggled data."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, None
    # The smuggled data is the last fragment of the URL
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)