1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import calendar
5 import codecs
6 import contextlib
7 import ctypes
8 import datetime
9 import email.utils
10 import errno
11 import getpass
12 import gzip
13 import itertools
14 import io
15 import json
16 import locale
17 import math
18 import os
19 import pipes
20 import platform
21 import re
22 import ssl
23 import socket
24 import struct
25 import subprocess
26 import sys
27 import tempfile
28 import traceback
29 import xml.etree.ElementTree
30 import zlib
31
32 try:
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
36
37 try:
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
41
42 try:
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
46
47 try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
52 try:
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
57 try:
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
61
62 try:
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
66
67 try:
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
71
72 try:
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
76
77 try:
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
82 try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
88 try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
94 try:
95 from urllib.parse import unquote as compat_urllib_parse_unquote
96 except ImportError:
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
98 if string == '':
99 return string
100 res = string.split('%')
101 if len(res) == 1:
102 return string
103 if encoding is None:
104 encoding = 'utf-8'
105 if errors is None:
106 errors = 'replace'
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
108 pct_sequence = b''
109 string = res[0]
110 for item in res[1:]:
111 try:
112 if not item:
113 raise ValueError
114 pct_sequence += item[:2].decode('hex')
115 rest = item[2:]
116 if not rest:
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
120 continue
121 except ValueError:
122 rest = '%' + item
123 # Encountered non-percent-encoded characters. Flush the current
124 # pct_sequence.
125 string += pct_sequence.decode(encoding, errors) + rest
126 pct_sequence = b''
127 if pct_sequence:
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
130 return string
131
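# Illustrative usage sketch (not from the original module; example values are
# assumed): percent-encoded sequences are decoded with the given encoding.
def _example_compat_urllib_parse_unquote():
    assert compat_urllib_parse_unquote('abc%20def%2F') == 'abc def/'
    assert compat_urllib_parse_unquote('') == ''
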
132
133 try:
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
138
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
143 r = []
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
146 continue
147 nv = name_value.split('=', 1)
148 if len(nv) != 2:
149 if strict_parsing:
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
153 nv.append('')
154 else:
155 continue
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
166 return r
167
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
170 parsed_result = {}
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
176 else:
177 parsed_result[name] = [value]
178 return parsed_result
179
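# Illustrative usage sketch (not from the original module; example values are
# assumed): repeated keys are collected into lists.
def _example_compat_parse_qs():
    parsed = compat_parse_qs('a=1&a=2&b=3')
    assert parsed['a'] == ['1', '2']
    assert parsed['b'] == ['3']
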
180 try:
181 compat_str = unicode # Python 2
182 except NameError:
183 compat_str = str
184
185 try:
186 compat_chr = unichr # Python 2
187 except NameError:
188 compat_chr = chr
189
190 try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
195 try:
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
198 def shlex_quote(s):
199 return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
202 def compat_ord(c):
203 if type(c) is int: return c
204 else: return ord(c)
205
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
208
209 std_headers = {
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
215 }
216
217 def preferredencoding():
218 """Get preferred encoding.
219
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
222 """
223 try:
224 pref = locale.getpreferredencoding()
225 u'TEST'.encode(pref)
226 except:
227 pref = 'UTF-8'
228
229 return pref
230
231 if sys.version_info < (3,0):
232 def compat_print(s):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
234 else:
235 def compat_print(s):
236 assert type(s) == type(u'')
237 print(s)
238
239
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
242
243 args = {
244 'suffix': '.tmp',
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
247 'delete': False,
248 }
249
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
253 args['mode'] = 'wb'
254 else:
255 args.update({
256 'mode': 'w',
257 'encoding': 'utf-8',
258 })
259
260 tf = tempfile.NamedTemporaryFile(**args)
261
262 try:
263 with tf:
264 json.dump(obj, tf)
265 os.rename(tf.name, fn)
266 except:
267 try:
268 os.remove(tf.name)
269 except OSError:
270 pass
271 raise
272
273
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
281 else:
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
287
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
290 return f
291 return None
292
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
297 replaced = []
298 for c in components:
299 if len(c) == 1:
300 replaced.append(c[0])
301 else:
302 ns, tag = c
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
305
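# Illustrative usage sketch (not from the original module; the namespace URI is
# an assumed example): prefixed tag names are expanded via the mapping.
def _example_xpath_with_ns():
    ns_map = {'media': 'http://search.yahoo.com/mrss/'}
    assert xpath_with_ns('media:thumbnail', ns_map) == \
        '{http://search.yahoo.com/mrss/}thumbnail'
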
306
307 def xpath_text(node, xpath, name=None, fatal=False):
308 if sys.version_info < (2, 7): # Crazy 2.6
309 xpath = xpath.encode('ascii')
310
311 n = node.find(xpath)
312 if n is None:
313 if fatal:
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element %s' % name)
316 else:
317 return None
318 return n.text
319
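# Illustrative usage sketch (not from the original module; the XML snippet is
# an assumed example): returns the text of the matched node, or None.
def _example_xpath_text():
    doc = xml.etree.ElementTree.fromstring('<root><title>abc</title></root>')
    assert xpath_text(doc, './/title') == 'abc'
    assert xpath_text(doc, './/missing') is None
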
320
321 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
322 class BaseHTMLParser(compat_html_parser.HTMLParser):
323 def __init__(self):
324 compat_html_parser.HTMLParser.__init__(self)
325 self.html = None
326
327 def loads(self, html):
328 self.html = html
329 self.feed(html)
330 self.close()
331
332 class AttrParser(BaseHTMLParser):
333 """Modified HTMLParser that isolates a tag with the specified attribute"""
334 def __init__(self, attribute, value):
335 self.attribute = attribute
336 self.value = value
337 self.result = None
338 self.started = False
339 self.depth = {}
340 self.watch_startpos = False
341 self.error_count = 0
342 BaseHTMLParser.__init__(self)
343
344 def error(self, message):
345 if self.error_count > 10 or self.started:
346 raise compat_html_parser.HTMLParseError(message, self.getpos())
347 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
348 self.error_count += 1
349 self.goahead(1)
350
351 def handle_starttag(self, tag, attrs):
352 attrs = dict(attrs)
353 if self.started:
354 self.find_startpos(None)
355 if self.attribute in attrs and attrs[self.attribute] == self.value:
356 self.result = [tag]
357 self.started = True
358 self.watch_startpos = True
359 if self.started:
360 if not tag in self.depth: self.depth[tag] = 0
361 self.depth[tag] += 1
362
363 def handle_endtag(self, tag):
364 if self.started:
365 if tag in self.depth: self.depth[tag] -= 1
366 if self.depth[self.result[0]] == 0:
367 self.started = False
368 self.result.append(self.getpos())
369
370 def find_startpos(self, x):
371 """Needed to put the start position of the result (self.result[1])
372 after the opening tag with the requested id"""
373 if self.watch_startpos:
374 self.watch_startpos = False
375 self.result.append(self.getpos())
376 handle_entityref = handle_charref = handle_data = handle_comment = \
377 handle_decl = handle_pi = unknown_decl = find_startpos
378
379 def get_result(self):
380 if self.result is None:
381 return None
382 if len(self.result) != 3:
383 return None
384 lines = self.html.split('\n')
385 lines = lines[self.result[1][0]-1:self.result[2][0]]
386 lines[0] = lines[0][self.result[1][1]:]
387 if len(lines) == 1:
388 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
389 lines[-1] = lines[-1][:self.result[2][1]]
390 return '\n'.join(lines).strip()
391 # Hack for https://github.com/rg3/youtube-dl/issues/662
392 if sys.version_info < (2, 7, 3):
393 AttrParser.parse_endtag = (lambda self, i:
394 i + len("</scr'+'ipt>")
395 if self.rawdata[i:].startswith("</scr'+'ipt>")
396 else compat_html_parser.HTMLParser.parse_endtag(self, i))
397
398 def get_element_by_id(id, html):
399 """Return the content of the tag with the specified ID in the passed HTML document"""
400 return get_element_by_attribute("id", id, html)
401
402 def get_element_by_attribute(attribute, value, html):
403 """Return the content of the tag with the specified attribute in the passed HTML document"""
404 parser = AttrParser(attribute, value)
405 try:
406 parser.loads(html)
407 except compat_html_parser.HTMLParseError:
408 pass
409 return parser.get_result()
410
411 class MetaParser(BaseHTMLParser):
412 """
413 Modified HTMLParser that isolates a meta tag with the specified name
414 attribute.
415 """
416 def __init__(self, name):
417 BaseHTMLParser.__init__(self)
418 self.name = name
419 self.content = None
420 self.result = None
421
422 def handle_starttag(self, tag, attrs):
423 if tag != 'meta':
424 return
425 attrs = dict(attrs)
426 if attrs.get('name') == self.name:
427 self.result = attrs.get('content')
428
429 def get_result(self):
430 return self.result
431
432 def get_meta_content(name, html):
433 """
434 Return the content attribute from the meta tag with the given name attribute.
435 """
436 parser = MetaParser(name)
437 try:
438 parser.loads(html)
439 except compat_html_parser.HTMLParseError:
440 pass
441 return parser.get_result()
442
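# Illustrative usage sketch (not from the original module; the HTML snippets
# are assumed examples) for the attribute/meta parsers defined above.
def _example_html_helpers():
    assert get_element_by_id('foo', '<div id="foo">Hello</div>') == 'Hello'
    assert get_meta_content(
        'description', '<meta name="description" content="Some text">') == 'Some text'
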
443
444 def clean_html(html):
445 """Clean an HTML snippet into a readable string"""
446 # Newline vs <br />
447 html = html.replace('\n', ' ')
448 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
449 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
450 # Strip html tags
451 html = re.sub('<.*?>', '', html)
452 # Replace html entities
453 html = unescapeHTML(html)
454 return html.strip()
455
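# Illustrative usage sketch (not from the original module; the snippet is an
# assumed example): <br /> becomes a newline and the remaining tags are stripped.
def _example_clean_html():
    assert clean_html(u'<p>Foo<br/>Bar</p>') == u'Foo\nBar'
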
456
457 def sanitize_open(filename, open_mode):
458 """Try to open the given filename, and slightly tweak it if this fails.
459
460 Attempts to open the given filename. If this fails, it tries to change
461 the filename slightly, step by step, until it's either able to open it
462 or it fails and raises a final exception, like the standard open()
463 function.
464
465 It returns the tuple (stream, definitive_file_name).
466 """
467 try:
468 if filename == u'-':
469 if sys.platform == 'win32':
470 import msvcrt
471 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
472 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
473 stream = open(encodeFilename(filename), open_mode)
474 return (stream, filename)
475 except (IOError, OSError) as err:
476 if err.errno in (errno.EACCES,):
477 raise
478
479 # In case of error, try to remove win32 forbidden chars
480 alt_filename = os.path.join(*[
481 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
482 for path_part in os.path.split(filename)
483 ])
484 if alt_filename == filename:
485 raise
486 else:
487 # An exception here should be caught in the caller
488 stream = open(encodeFilename(alt_filename), open_mode)
489 return (stream, alt_filename)
490
491
492 def timeconvert(timestr):
493 """Convert RFC 2822 defined time string into system timestamp"""
494 timestamp = None
495 timetuple = email.utils.parsedate_tz(timestr)
496 if timetuple is not None:
497 timestamp = email.utils.mktime_tz(timetuple)
498 return timestamp
499
500 def sanitize_filename(s, restricted=False, is_id=False):
501 """Sanitizes a string so it could be used as part of a filename.
502 If restricted is set, use a stricter subset of allowed characters.
503 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
504 """
505 def replace_insane(char):
506 if char == '?' or ord(char) < 32 or ord(char) == 127:
507 return ''
508 elif char == '"':
509 return '' if restricted else '\''
510 elif char == ':':
511 return '_-' if restricted else ' -'
512 elif char in '\\/|*<>':
513 return '_'
514 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
515 return '_'
516 if restricted and ord(char) > 127:
517 return '_'
518 return char
519
520 result = u''.join(map(replace_insane, s))
521 if not is_id:
522 while '__' in result:
523 result = result.replace('__', '_')
524 result = result.strip('_')
525 # Common case of "Foreign band name - English song title"
526 if restricted and result.startswith('-_'):
527 result = result[2:]
528 if not result:
529 result = '_'
530 return result
531
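# Illustrative usage sketch (not from the original module; example values are
# assumed): forbidden characters are replaced as described above.
def _example_sanitize_filename():
    assert sanitize_filename(u'AC/DC: Back in Black') == u'AC_DC - Back in Black'
    assert sanitize_filename(u'AC/DC: Back in Black', restricted=True) == \
        u'AC_DC_-_Back_in_Black'
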
532 def orderedSet(iterable):
533 """ Remove all duplicates from the input iterable """
534 res = []
535 for el in iterable:
536 if el not in res:
537 res.append(el)
538 return res
539
540
541 def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
546
547 mobj = re.match(r'#(x?[0-9]+)', entity)
548 if mobj is not None:
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
551 base = 16
552 numstr = u'0%s' % numstr
553 else:
554 base = 10
555 return compat_chr(int(numstr, base))
556
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
559
560
561 def unescapeHTML(s):
562 if s is None:
563 return None
564 assert type(s) == compat_str
565
566 return re.sub(
567 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
568
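# Illustrative usage sketch (not from the original module; example values are
# assumed): named, decimal and hexadecimal entities are all handled.
def _example_unescapeHTML():
    assert unescapeHTML(u'&amp; &#38; &#x26;') == u'& & &'
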
569
570 def encodeFilename(s, for_subprocess=False):
571 """
572 @param s The name of the file
573 """
574
575 assert type(s) == compat_str
576
577 # Python 3 has a Unicode API
578 if sys.version_info >= (3, 0):
579 return s
580
581 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
582 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
583 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
584 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
585 if not for_subprocess:
586 return s
587 else:
588 # For subprocess calls, encode with locale encoding
589 # Refer to http://stackoverflow.com/a/9951851/35070
590 encoding = preferredencoding()
591 else:
592 encoding = sys.getfilesystemencoding()
593 if encoding is None:
594 encoding = 'utf-8'
595 return s.encode(encoding, 'ignore')
596
597
598 def encodeArgument(s):
599 if not isinstance(s, compat_str):
600 # Legacy code that uses byte strings
601 # Uncomment the following line after fixing all post processors
602 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
603 s = s.decode('ascii')
604 return encodeFilename(s, True)
605
606
607 def decodeOption(optval):
608 if optval is None:
609 return optval
610 if isinstance(optval, bytes):
611 optval = optval.decode(preferredencoding())
612
613 assert isinstance(optval, compat_str)
614 return optval
615
616 def formatSeconds(secs):
617 if secs > 3600:
618 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
619 elif secs > 60:
620 return '%d:%02d' % (secs // 60, secs % 60)
621 else:
622 return '%d' % secs
623
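# Illustrative usage sketch (not from the original module; example values are
# assumed): durations are rendered as h:mm:ss, m:ss or plain seconds.
def _example_formatSeconds():
    assert formatSeconds(4000) == '1:06:40'
    assert formatSeconds(90) == '1:30'
    assert formatSeconds(30) == '30'
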
624
625 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
626 if sys.version_info < (3, 2):
627 import httplib
628
629 class HTTPSConnectionV3(httplib.HTTPSConnection):
630 def __init__(self, *args, **kwargs):
631 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
632
633 def connect(self):
634 sock = socket.create_connection((self.host, self.port), self.timeout)
635 if getattr(self, '_tunnel_host', False):
636 self.sock = sock
637 self._tunnel()
638 try:
639 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
640 except ssl.SSLError:
641 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
642
643 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
644 def https_open(self, req):
645 return self.do_open(HTTPSConnectionV3, req)
646 return HTTPSHandlerV3(**kwargs)
647 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
648 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
649 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
650 if opts_no_check_certificate:
651 context.verify_mode = ssl.CERT_NONE
652 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
653 else: # Python < 3.4
654 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
655 context.verify_mode = (ssl.CERT_NONE
656 if opts_no_check_certificate
657 else ssl.CERT_REQUIRED)
658 context.set_default_verify_paths()
659 try:
660 context.load_default_certs()
661 except AttributeError:
662 pass # Python < 3.4
663 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
664
665 class ExtractorError(Exception):
666 """Error during info extraction."""
667 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
668 """ tb, if given, is the original traceback (so that it can be printed out).
669 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
670 """
671
672 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
673 expected = True
674 if video_id is not None:
675 msg = video_id + ': ' + msg
676 if not expected:
677 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
678 super(ExtractorError, self).__init__(msg)
679
680 self.traceback = tb
681 self.exc_info = sys.exc_info() # preserve original exception
682 self.cause = cause
683 self.video_id = video_id
684
685 def format_traceback(self):
686 if self.traceback is None:
687 return None
688 return u''.join(traceback.format_tb(self.traceback))
689
690
691 class RegexNotFoundError(ExtractorError):
692 """Error when a regex didn't match"""
693 pass
694
695
696 class DownloadError(Exception):
697 """Download Error exception.
698
699 This exception may be thrown by FileDownloader objects if they are not
700 configured to continue on errors. They will contain the appropriate
701 error message.
702 """
703 def __init__(self, msg, exc_info=None):
704 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
705 super(DownloadError, self).__init__(msg)
706 self.exc_info = exc_info
707
708
709 class SameFileError(Exception):
710 """Same File exception.
711
712 This exception will be thrown by FileDownloader objects if they detect
713 multiple files would have to be downloaded to the same file on disk.
714 """
715 pass
716
717
718 class PostProcessingError(Exception):
719 """Post Processing exception.
720
721 This exception may be raised by PostProcessor's .run() method to
722 indicate an error in the postprocessing task.
723 """
724 def __init__(self, msg):
725 self.msg = msg
726
727 class MaxDownloadsReached(Exception):
728 """ --max-downloads limit has been reached. """
729 pass
730
731
732 class UnavailableVideoError(Exception):
733 """Unavailable Format exception.
734
735 This exception will be thrown when a video is requested
736 in a format that is not available for that video.
737 """
738 pass
739
740
741 class ContentTooShortError(Exception):
742 """Content Too Short exception.
743
744 This exception may be raised by FileDownloader objects when a file they
745 download is too small for what the server announced first, indicating
746 the connection was probably interrupted.
747 """
748 # Both in bytes
749 downloaded = None
750 expected = None
751
752 def __init__(self, downloaded, expected):
753 self.downloaded = downloaded
754 self.expected = expected
755
756 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
757 """Handler for HTTP requests and responses.
758
759 This class, when installed with an OpenerDirector, automatically adds
760 the standard headers to every HTTP request and handles gzipped and
761 deflated responses from web servers. If compression is to be avoided in
762 a particular request, the original request in the program code only has
763 to include the HTTP header "Youtubedl-No-Compression", which will be
764 removed before making the real request.
765
766 Part of this code was copied from:
767
768 http://techknack.net/python-urllib2-handlers/
769
770 Andrew Rowls, the author of that code, agreed to release it to the
771 public domain.
772 """
773
774 @staticmethod
775 def deflate(data):
776 try:
777 return zlib.decompress(data, -zlib.MAX_WBITS)
778 except zlib.error:
779 return zlib.decompress(data)
780
781 @staticmethod
782 def addinfourl_wrapper(stream, headers, url, code):
783 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
784 return compat_urllib_request.addinfourl(stream, headers, url, code)
785 ret = compat_urllib_request.addinfourl(stream, headers, url)
786 ret.code = code
787 return ret
788
789 def http_request(self, req):
790 for h, v in std_headers.items():
791 if h not in req.headers:
792 req.add_header(h, v)
793 if 'Youtubedl-no-compression' in req.headers:
794 if 'Accept-encoding' in req.headers:
795 del req.headers['Accept-encoding']
796 del req.headers['Youtubedl-no-compression']
797 if 'Youtubedl-user-agent' in req.headers:
798 if 'User-agent' in req.headers:
799 del req.headers['User-agent']
800 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
801 del req.headers['Youtubedl-user-agent']
802 return req
803
804 def http_response(self, req, resp):
805 old_resp = resp
806 # gzip
807 if resp.headers.get('Content-encoding', '') == 'gzip':
808 content = resp.read()
809 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
810 try:
811 uncompressed = io.BytesIO(gz.read())
812 except IOError as original_ioerror:
813 # There may be junk at the end of the file
814 # See http://stackoverflow.com/q/4928560/35070 for details
815 for i in range(1, 1024):
816 try:
817 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
818 uncompressed = io.BytesIO(gz.read())
819 except IOError:
820 continue
821 break
822 else:
823 raise original_ioerror
824 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
825 resp.msg = old_resp.msg
826 # deflate
827 if resp.headers.get('Content-encoding', '') == 'deflate':
828 gz = io.BytesIO(self.deflate(resp.read()))
829 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
830 resp.msg = old_resp.msg
831 return resp
832
833 https_request = http_request
834 https_response = http_response
835
836
837 def parse_iso8601(date_str, delimiter='T'):
838 """ Return a UNIX timestamp from the given date """
839
840 if date_str is None:
841 return None
842
843 m = re.search(
844 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
845 date_str)
846 if not m:
847 timezone = datetime.timedelta()
848 else:
849 date_str = date_str[:-len(m.group(0))]
850 if not m.group('sign'):
851 timezone = datetime.timedelta()
852 else:
853 sign = 1 if m.group('sign') == '+' else -1
854 timezone = datetime.timedelta(
855 hours=sign * int(m.group('hours')),
856 minutes=sign * int(m.group('minutes')))
857 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
858 dt = datetime.datetime.strptime(date_str, date_format) - timezone
859 return calendar.timegm(dt.timetuple())
860
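# Illustrative usage sketch (not from the original module; example values are
# assumed): the UTC offset is taken into account.
def _example_parse_iso8601():
    assert parse_iso8601('2014-05-14T10:00:00+02:00') == 1400054400
    assert parse_iso8601(None) is None
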
861
862 def unified_strdate(date_str):
863 """Return a string with the date in the format YYYYMMDD"""
864
865 if date_str is None:
866 return None
867
868 upload_date = None
869 #Replace commas
870 date_str = date_str.replace(',', ' ')
871 # %z (UTC offset) is only supported in python>=3.2
872 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
873 format_expressions = [
874 '%d %B %Y',
875 '%d %b %Y',
876 '%B %d %Y',
877 '%b %d %Y',
878 '%b %dst %Y %I:%M%p',
879 '%b %dnd %Y %I:%M%p',
880 '%b %dth %Y %I:%M%p',
881 '%Y-%m-%d',
882 '%Y/%m/%d',
883 '%d.%m.%Y',
884 '%d/%m/%Y',
885 '%d/%m/%y',
886 '%Y/%m/%d %H:%M:%S',
887 '%Y-%m-%d %H:%M:%S',
888 '%d.%m.%Y %H:%M',
889 '%d.%m.%Y %H.%M',
890 '%Y-%m-%dT%H:%M:%SZ',
891 '%Y-%m-%dT%H:%M:%S.%fZ',
892 '%Y-%m-%dT%H:%M:%S.%f0Z',
893 '%Y-%m-%dT%H:%M:%S',
894 '%Y-%m-%dT%H:%M:%S.%f',
895 '%Y-%m-%dT%H:%M',
896 ]
897 for expression in format_expressions:
898 try:
899 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
900 except ValueError:
901 pass
902 if upload_date is None:
903 timetuple = email.utils.parsedate_tz(date_str)
904 if timetuple:
905 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
906 return upload_date
907
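# Illustrative usage sketch (not from the original module; example values are
# assumed): various date spellings collapse to YYYYMMDD.
def _example_unified_strdate():
    assert unified_strdate('December 25, 2010') == '20101225'
    assert unified_strdate('2012/10/11') == '20121011'
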
908 def determine_ext(url, default_ext=u'unknown_video'):
909 if url is None:
910 return default_ext
911 guess = url.partition(u'?')[0].rpartition(u'.')[2]
912 if re.match(r'^[A-Za-z0-9]+$', guess):
913 return guess
914 else:
915 return default_ext
916
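# Illustrative usage sketch (not from the original module; the URLs are assumed
# examples): the extension is guessed from the URL path, ignoring the query.
def _example_determine_ext():
    assert determine_ext('http://example.com/video.mp4?dl=1') == 'mp4'
    assert determine_ext('http://example.com/download') == 'unknown_video'
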
917 def subtitles_filename(filename, sub_lang, sub_format):
918 return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
919
920 def date_from_str(date_str):
921 """
922 Return a datetime object from a string in the format YYYYMMDD or
923 (now|today)[+-][0-9](day|week|month|year)(s)?"""
924 today = datetime.date.today()
925 if date_str == 'now' or date_str == 'today':
926 return today
927 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
928 if match is not None:
929 sign = match.group('sign')
930 time = int(match.group('time'))
931 if sign == '-':
932 time = -time
933 unit = match.group('unit')
934 # A rough approximation: a month is treated as 30 days, a year as 365
935 if unit == 'month':
936 unit = 'day'
937 time *= 30
938 elif unit == 'year':
939 unit = 'day'
940 time *= 365
941 unit += 's'
942 delta = datetime.timedelta(**{unit: time})
943 return today + delta
944 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
945
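# Illustrative usage sketch (not from the original module; example values are
# assumed): relative dates are resolved against today.
def _example_date_from_str():
    assert date_from_str('19700101') == datetime.date(1970, 1, 1)
    assert date_from_str('now-1week') == \
        datetime.date.today() - datetime.timedelta(days=7)
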
946 def hyphenate_date(date_str):
947 """
948 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
949 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
950 if match is not None:
951 return '-'.join(match.groups())
952 else:
953 return date_str
954
955 class DateRange(object):
956 """Represents a time interval between two dates"""
957 def __init__(self, start=None, end=None):
958 """start and end must be strings in the format accepted by date"""
959 if start is not None:
960 self.start = date_from_str(start)
961 else:
962 self.start = datetime.datetime.min.date()
963 if end is not None:
964 self.end = date_from_str(end)
965 else:
966 self.end = datetime.datetime.max.date()
967 if self.start > self.end:
968 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
969 @classmethod
970 def day(cls, day):
971 """Returns a range that only contains the given day"""
972 return cls(day,day)
973 def __contains__(self, date):
974 """Check if the date is in the range"""
975 if not isinstance(date, datetime.date):
976 date = date_from_str(date)
977 return self.start <= date <= self.end
978 def __str__(self):
979 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
980
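# Illustrative usage sketch (not from the original module; example values are
# assumed): membership tests accept YYYYMMDD strings.
def _example_DateRange():
    may_2014 = DateRange('20140501', '20140531')
    assert '20140510' in may_2014
    assert '20140601' not in may_2014
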
981
982 def platform_name():
983 """ Returns the platform name as a compat_str """
984 res = platform.platform()
985 if isinstance(res, bytes):
986 res = res.decode(preferredencoding())
987
988 assert isinstance(res, compat_str)
989 return res
990
991
992 def _windows_write_string(s, out):
993 """ Returns True if the string was written using special methods,
994 False if it has yet to be written out."""
995 # Adapted from http://stackoverflow.com/a/3259271/35070
996
997 import ctypes
998 import ctypes.wintypes
999
1000 WIN_OUTPUT_IDS = {
1001 1: -11,
1002 2: -12,
1003 }
1004
1005 try:
1006 fileno = out.fileno()
1007 except AttributeError:
1008 # If the output stream doesn't have a fileno, it's virtual
1009 return False
1010 if fileno not in WIN_OUTPUT_IDS:
1011 return False
1012
1013 GetStdHandle = ctypes.WINFUNCTYPE(
1014 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1015 ("GetStdHandle", ctypes.windll.kernel32))
1016 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1017
1018 WriteConsoleW = ctypes.WINFUNCTYPE(
1019 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1020 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1021 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1022 written = ctypes.wintypes.DWORD(0)
1023
1024 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1025 FILE_TYPE_CHAR = 0x0002
1026 FILE_TYPE_REMOTE = 0x8000
1027 GetConsoleMode = ctypes.WINFUNCTYPE(
1028 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1029 ctypes.POINTER(ctypes.wintypes.DWORD))(
1030 ("GetConsoleMode", ctypes.windll.kernel32))
1031 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1032
1033 def not_a_console(handle):
1034 if handle == INVALID_HANDLE_VALUE or handle is None:
1035 return True
1036 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1037 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1038
1039 if not_a_console(h):
1040 return False
1041
1042 def next_nonbmp_pos(s):
1043 try:
1044 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1045 except StopIteration:
1046 return len(s)
1047
1048 while s:
1049 count = min(next_nonbmp_pos(s), 1024)
1050
1051 ret = WriteConsoleW(
1052 h, s, count if count else 2, ctypes.byref(written), None)
1053 if ret == 0:
1054 raise OSError('Failed to write string')
1055 if not count: # We just wrote a non-BMP character
1056 assert written.value == 2
1057 s = s[1:]
1058 else:
1059 assert written.value > 0
1060 s = s[written.value:]
1061 return True
1062
1063
1064 def write_string(s, out=None, encoding=None):
1065 if out is None:
1066 out = sys.stderr
1067 assert type(s) == compat_str
1068
1069 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1070 if _windows_write_string(s, out):
1071 return
1072
1073 if ('b' in getattr(out, 'mode', '') or
1074 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1075 byt = s.encode(encoding or preferredencoding(), 'ignore')
1076 out.write(byt)
1077 elif hasattr(out, 'buffer'):
1078 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1079 byt = s.encode(enc, 'ignore')
1080 out.buffer.write(byt)
1081 else:
1082 out.write(s)
1083 out.flush()
1084
1085
1086 def bytes_to_intlist(bs):
1087 if not bs:
1088 return []
1089 if isinstance(bs[0], int): # Python 3
1090 return list(bs)
1091 else:
1092 return [ord(c) for c in bs]
1093
1094
1095 def intlist_to_bytes(xs):
1096 if not xs:
1097 return b''
1098 if isinstance(chr(0), bytes): # Python 2
1099 return ''.join([chr(x) for x in xs])
1100 else:
1101 return bytes(xs)
1102
1103
1104 # Cross-platform file locking
1105 if sys.platform == 'win32':
1106 import ctypes.wintypes
1107 import msvcrt
1108
1109 class OVERLAPPED(ctypes.Structure):
1110 _fields_ = [
1111 ('Internal', ctypes.wintypes.LPVOID),
1112 ('InternalHigh', ctypes.wintypes.LPVOID),
1113 ('Offset', ctypes.wintypes.DWORD),
1114 ('OffsetHigh', ctypes.wintypes.DWORD),
1115 ('hEvent', ctypes.wintypes.HANDLE),
1116 ]
1117
1118 kernel32 = ctypes.windll.kernel32
1119 LockFileEx = kernel32.LockFileEx
1120 LockFileEx.argtypes = [
1121 ctypes.wintypes.HANDLE, # hFile
1122 ctypes.wintypes.DWORD, # dwFlags
1123 ctypes.wintypes.DWORD, # dwReserved
1124 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1125 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1126 ctypes.POINTER(OVERLAPPED) # Overlapped
1127 ]
1128 LockFileEx.restype = ctypes.wintypes.BOOL
1129 UnlockFileEx = kernel32.UnlockFileEx
1130 UnlockFileEx.argtypes = [
1131 ctypes.wintypes.HANDLE, # hFile
1132 ctypes.wintypes.DWORD, # dwReserved
1133 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1134 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1135 ctypes.POINTER(OVERLAPPED) # Overlapped
1136 ]
1137 UnlockFileEx.restype = ctypes.wintypes.BOOL
1138 whole_low = 0xffffffff
1139 whole_high = 0x7fffffff
1140
1141 def _lock_file(f, exclusive):
1142 overlapped = OVERLAPPED()
1143 overlapped.Offset = 0
1144 overlapped.OffsetHigh = 0
1145 overlapped.hEvent = 0
1146 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1147 handle = msvcrt.get_osfhandle(f.fileno())
1148 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1149 whole_low, whole_high, f._lock_file_overlapped_p):
1150 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1151
1152 def _unlock_file(f):
1153 assert f._lock_file_overlapped_p
1154 handle = msvcrt.get_osfhandle(f.fileno())
1155 if not UnlockFileEx(handle, 0,
1156 whole_low, whole_high, f._lock_file_overlapped_p):
1157 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1158
1159 else:
1160 import fcntl
1161
1162 def _lock_file(f, exclusive):
1163 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1164
1165 def _unlock_file(f):
1166 fcntl.flock(f, fcntl.LOCK_UN)
1167
1168
1169 class locked_file(object):
1170 def __init__(self, filename, mode, encoding=None):
1171 assert mode in ['r', 'a', 'w']
1172 self.f = io.open(filename, mode, encoding=encoding)
1173 self.mode = mode
1174
1175 def __enter__(self):
1176 exclusive = self.mode != 'r'
1177 try:
1178 _lock_file(self.f, exclusive)
1179 except IOError:
1180 self.f.close()
1181 raise
1182 return self
1183
1184 def __exit__(self, etype, value, traceback):
1185 try:
1186 _unlock_file(self.f)
1187 finally:
1188 self.f.close()
1189
1190 def __iter__(self):
1191 return iter(self.f)
1192
1193 def write(self, *args):
1194 return self.f.write(*args)
1195
1196 def read(self, *args):
1197 return self.f.read(*args)
1198
1199
1200 def shell_quote(args):
1201 quoted_args = []
1202 encoding = sys.getfilesystemencoding()
1203 if encoding is None:
1204 encoding = 'utf-8'
1205 for a in args:
1206 if isinstance(a, bytes):
1207 # We may get a filename encoded with 'encodeFilename'
1208 a = a.decode(encoding)
1209 quoted_args.append(pipes.quote(a))
1210 return u' '.join(quoted_args)
1211
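# Illustrative usage sketch (not from the original module; the arguments are
# assumed examples): only arguments that need quoting get quoted.
def _example_shell_quote():
    assert shell_quote(['ffmpeg', '-i', 'my file.mp4']) == u"ffmpeg -i 'my file.mp4'"
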
1212
1213 def takewhile_inclusive(pred, seq):
1214 """ Like itertools.takewhile, but include the latest evaluated element
1215 (the first element e for which pred(e) is false) """
1216 for e in seq:
1217 yield e
1218 if not pred(e):
1219 return
1220
1221
1222 def smuggle_url(url, data):
1223 """ Pass additional data in a URL for internal use. """
1224
1225 sdata = compat_urllib_parse.urlencode(
1226 {u'__youtubedl_smuggle': json.dumps(data)})
1227 return url + u'#' + sdata
1228
1229
1230 def unsmuggle_url(smug_url, default=None):
1231 if not '#__youtubedl_smuggle' in smug_url:
1232 return smug_url, default
1233 url, _, sdata = smug_url.rpartition(u'#')
1234 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1235 data = json.loads(jsond)
1236 return url, data
1237
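# Illustrative usage sketch (not from the original module; example values are
# assumed): smuggled data survives the round trip through the URL fragment.
def _example_smuggle_url():
    url, data = unsmuggle_url(smuggle_url('http://example.com/video', {'force': True}))
    assert url == 'http://example.com/video'
    assert data == {'force': True}
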
1238
1239 def format_bytes(bytes):
1240 if bytes is None:
1241 return u'N/A'
1242 if type(bytes) is str:
1243 bytes = float(bytes)
1244 if bytes == 0.0:
1245 exponent = 0
1246 else:
1247 exponent = int(math.log(bytes, 1024.0))
1248 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1249 converted = float(bytes) / float(1024 ** exponent)
1250 return u'%.2f%s' % (converted, suffix)
1251
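# Illustrative usage sketch (not from the original module; example values are
# assumed): sizes are rendered with binary (KiB/MiB/...) suffixes.
def _example_format_bytes():
    assert format_bytes(1536) == u'1.50KiB'
    assert format_bytes(None) == u'N/A'
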
1252
1253 def get_term_width():
1254 columns = os.environ.get('COLUMNS', None)
1255 if columns:
1256 return int(columns)
1257
1258 try:
1259 sp = subprocess.Popen(
1260 ['stty', 'size'],
1261 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1262 out, err = sp.communicate()
1263 return int(out.split()[1])
1264 except:
1265 pass
1266 return None
1267
1268
1269 def month_by_name(name):
1270 """ Return the number of a month by (locale-independently) English name """
1271
1272 ENGLISH_NAMES = [
1273 u'January', u'February', u'March', u'April', u'May', u'June',
1274 u'July', u'August', u'September', u'October', u'November', u'December']
1275 try:
1276 return ENGLISH_NAMES.index(name) + 1
1277 except ValueError:
1278 return None
1279
1280
1281 def fix_xml_ampersands(xml_str):
1282 """Replace all the '&' by '&amp;' in XML"""
1283 return re.sub(
1284 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1285 u'&amp;',
1286 xml_str)
1287
1288
1289 def setproctitle(title):
1290 assert isinstance(title, compat_str)
1291 try:
1292 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1293 except OSError:
1294 return
1295 title_bytes = title.encode('utf-8')
1296 buf = ctypes.create_string_buffer(len(title_bytes))
1297 buf.value = title_bytes
1298 try:
1299 libc.prctl(15, buf, 0, 0, 0)
1300 except AttributeError:
1301 return # Strange libc, just skip this
1302
1303
1304 def remove_start(s, start):
1305 if s.startswith(start):
1306 return s[len(start):]
1307 return s
1308
1309
1310 def remove_end(s, end):
1311 if s.endswith(end):
1312 return s[:-len(end)]
1313 return s
1314
1315
1316 def url_basename(url):
1317 path = compat_urlparse.urlparse(url).path
1318 return path.strip(u'/').split(u'/')[-1]
1319
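# Illustrative usage sketch (not from the original module; the URL is an
# assumed example): the query string does not leak into the basename.
def _example_url_basename():
    assert url_basename('http://example.com/some/path/video.mp4?dl=1') == u'video.mp4'
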
1320
1321 class HEADRequest(compat_urllib_request.Request):
1322 def get_method(self):
1323 return "HEAD"
1324
1325
1326 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1327 if get_attr:
1328 if v is not None:
1329 v = getattr(v, get_attr, None)
1330 if v == '':
1331 v = None
1332 return default if v is None else (int(v) * invscale // scale)
1333
1334
1335 def str_or_none(v, default=None):
1336 return default if v is None else compat_str(v)
1337
1338
1339 def str_to_int(int_str):
1340 """ A more relaxed version of int_or_none """
1341 if int_str is None:
1342 return None
1343 int_str = re.sub(r'[,\.\+]', u'', int_str)
1344 return int(int_str)
1345
1346
1347 def float_or_none(v, scale=1, invscale=1, default=None):
1348 return default if v is None else (float(v) * invscale / scale)
1349
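# Illustrative usage sketch (not from the original module; example values are
# assumed) for the numeric coercion helpers above.
def _example_numeric_helpers():
    assert int_or_none('42') == 42
    assert int_or_none(None, default=0) == 0
    assert str_to_int('1,000,000') == 1000000
    assert float_or_none('2.5', invscale=1000) == 2500.0
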
1350
1351 def parse_duration(s):
1352 if s is None:
1353 return None
1354
1355 s = s.strip()
1356
1357 m = re.match(
1358 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1359 if not m:
1360 return None
1361 res = int(m.group('secs'))
1362 if m.group('mins'):
1363 res += int(m.group('mins')) * 60
1364 if m.group('hours'):
1365 res += int(m.group('hours')) * 60 * 60
1366 if m.group('ms'):
1367 res += float(m.group('ms'))
1368 return res
1369
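# Illustrative usage sketch (not from the original module; example values are
# assumed): several human-readable duration formats are accepted.
def _example_parse_duration():
    assert parse_duration('1h 30m 5s') == 5405
    assert parse_duration('9:05') == 545
    assert parse_duration('123') == 123
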
1370
1371 def prepend_extension(filename, ext):
1372 name, real_ext = os.path.splitext(filename)
1373 return u'{0}.{1}{2}'.format(name, ext, real_ext)
1374
1375
1376 def check_executable(exe, args=[]):
1377 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1378 args can be a list of arguments for a short output (like -version) """
1379 try:
1380 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1381 except OSError:
1382 return False
1383 return exe
1384
1385
1386 class PagedList(object):
1387 def __init__(self, pagefunc, pagesize):
1388 self._pagefunc = pagefunc
1389 self._pagesize = pagesize
1390
1391 def __len__(self):
1392 # This is only useful for tests
1393 return len(self.getslice())
1394
1395 def getslice(self, start=0, end=None):
1396 res = []
1397 for pagenum in itertools.count(start // self._pagesize):
1398 firstid = pagenum * self._pagesize
1399 nextfirstid = pagenum * self._pagesize + self._pagesize
1400 if start >= nextfirstid:
1401 continue
1402
1403 page_results = list(self._pagefunc(pagenum))
1404
1405 startv = (
1406 start % self._pagesize
1407 if firstid <= start < nextfirstid
1408 else 0)
1409
1410 endv = (
1411 ((end - 1) % self._pagesize) + 1
1412 if (end is not None and firstid <= end <= nextfirstid)
1413 else None)
1414
1415 if startv != 0 or endv is not None:
1416 page_results = page_results[startv:endv]
1417 res.extend(page_results)
1418
1419 # A little optimization - if the current page is not "full", i.e. does
1420 # not contain page_size videos, then we can assume that this page
1421 # is the last one - there are no more ids on further pages -
1422 # i.e. no need to query again.
1423 if len(page_results) + startv < self._pagesize:
1424 break
1425
1426 # If we got the whole page, but the next page is not interesting,
1427 # break out early as well
1428 if end == nextfirstid:
1429 break
1430 return res
1431
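# Illustrative usage sketch (not from the original module; the page function
# below is an assumed example serving the numbers 0..9 in pages of three).
def _example_PagedList():
    def pagefunc(pagenum):
        first = pagenum * 3
        return range(first, min(first + 3, 10))
    pl = PagedList(pagefunc, 3)
    assert pl.getslice(2, 7) == [2, 3, 4, 5, 6]
    assert len(pl) == 10
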
1432
1433 def uppercase_escape(s):
1434 unicode_escape = codecs.getdecoder('unicode_escape')
1435 return re.sub(
1436 r'\\U[0-9a-fA-F]{8}',
1437 lambda m: unicode_escape(m.group(0))[0],
1438 s)
1439
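# Illustrative usage sketch (not from the original module; example values are
# assumed): only \U escape sequences are expanded, other text is untouched.
def _example_uppercase_escape():
    assert uppercase_escape(u'no escapes here') == u'no escapes here'
    assert uppercase_escape(u'\\U00000041') == u'A'
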
1440 try:
1441 struct.pack(u'!I', 0)
1442 except TypeError:
1443 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1444 def struct_pack(spec, *args):
1445 if isinstance(spec, compat_str):
1446 spec = spec.encode('ascii')
1447 return struct.pack(spec, *args)
1448
1449 def struct_unpack(spec, *args):
1450 if isinstance(spec, compat_str):
1451 spec = spec.encode('ascii')
1452 return struct.unpack(spec, *args)
1453 else:
1454 struct_pack = struct.pack
1455 struct_unpack = struct.unpack
1456
1457
1458 def read_batch_urls(batch_fd):
1459 def fixup(url):
1460 if not isinstance(url, compat_str):
1461 url = url.decode('utf-8', 'replace')
1462 BOM_UTF8 = u'\xef\xbb\xbf'
1463 if url.startswith(BOM_UTF8):
1464 url = url[len(BOM_UTF8):]
1465 url = url.strip()
1466 if url.startswith(('#', ';', ']')):
1467 return False
1468 return url
1469
1470 with contextlib.closing(batch_fd) as fd:
1471 return [url for url in map(fixup, fd) if url]
1472
1473
1474 def urlencode_postdata(*args, **kargs):
1475 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1476
1477
1478 try:
1479 etree_iter = xml.etree.ElementTree.Element.iter
1480 except AttributeError: # Python <=2.6
1481 etree_iter = lambda n: n.findall('.//*')
1482
1483
1484 def parse_xml(s):
1485 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1486 def doctype(self, name, pubid, system):
1487 pass # Ignore doctypes
1488
1489 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1490 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1491 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1492 # Fix up XML parser in Python 2.x
1493 if sys.version_info < (3, 0):
1494 for n in etree_iter(tree):
1495 if n.text is not None:
1496 if not isinstance(n.text, compat_str):
1497 n.text = n.text.decode('utf-8')
1498 return tree
1499
1500
1501 if sys.version_info < (3, 0) and sys.platform == 'win32':
1502 def compat_getpass(prompt, *args, **kwargs):
1503 if isinstance(prompt, compat_str):
1504 prompt = prompt.encode(preferredencoding())
1505 return getpass.getpass(prompt, *args, **kwargs)
1506 else:
1507 compat_getpass = getpass.getpass
1508
1509
1510 US_RATINGS = {
1511 'G': 0,
1512 'PG': 10,
1513 'PG-13': 13,
1514 'R': 16,
1515 'NC': 18,
1516 }
1517
1518
1519 def strip_jsonp(code):
1520 return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
1521
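# Illustrative usage sketch (not from the original module; the callback name is
# an assumed example): the JSONP wrapper is removed.
def _example_strip_jsonp():
    assert strip_jsonp('callback({"status": "ok"});') == '{"status": "ok"}'
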
1522
1523 def js_to_json(code):
1524 def fix_kv(m):
1525 key = m.group(2)
1526 if key.startswith("'"):
1527 assert key.endswith("'")
1528 assert '"' not in key
1529 key = '"%s"' % key[1:-1]
1530 elif not key.startswith('"'):
1531 key = '"%s"' % key
1532
1533 value = m.group(4)
1534 if value.startswith("'"):
1535 assert value.endswith("'")
1536 assert '"' not in value
1537 value = '"%s"' % value[1:-1]
1538
1539 return m.group(1) + key + m.group(3) + value
1540
1541 res = re.sub(r'''(?x)
1542 ([{,]\s*)
1543 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1544 (:\s*)
1545 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
1546 ''', fix_kv, code)
1547 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1548 return res
1549
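# Illustrative usage sketch (not from the original module; example values are
# assumed): unquoted and single-quoted keys/values become valid JSON.
def _example_js_to_json():
    assert js_to_json("{'key': true, abc: 'def'}") == '{"key": true, "abc": "def"}'
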
1550
1551 def qualities(quality_ids):
1552 """ Get a numeric quality value out of a list of possible values """
1553 def q(qid):
1554 try:
1555 return quality_ids.index(qid)
1556 except ValueError:
1557 return -1
1558 return q
1559
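# Illustrative usage sketch (not from the original module; the quality labels
# are assumed examples): unknown labels sort below all known ones.
def _example_qualities():
    q = qualities(['240p', '360p', '720p'])
    assert q('720p') == 2
    assert q('1080p') == -1
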
1560
1561 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1562
1563 try:
1564 subprocess_check_output = subprocess.check_output
1565 except AttributeError:
1566 def subprocess_check_output(*args, **kwargs):
1567 assert 'input' not in kwargs
1568 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1569 output, _ = p.communicate()
1570 ret = p.poll()
1571 if ret:
1572 raise subprocess.CalledProcessError(ret, p.args, output=output)
1573 return output
1574
1575
1576 def limit_length(s, length):
1577 """ Add ellipses to overly long strings """
1578 if s is None:
1579 return None
1580 ELLIPSES = '...'
1581 if len(s) > length:
1582 return s[:length - len(ELLIPSES)] + ELLIPSES
1583 return s
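
# Illustrative usage sketch (not from the original module; example values are
# assumed): long strings are shortened with a trailing ellipsis.
def _example_limit_length():
    assert limit_length(u'foo bar baz', 6) == u'foo...'
    assert limit_length(u'short', 10) == u'short'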