]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Keep video IDs verbatim if possible (Closes #571)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import gzip
03f9daab 5import io
d77c3dfd
FV
6import locale
7import os
8import re
9import sys
10import zlib
d77c3dfd 11import email.utils
921a1455 12import json
d77c3dfd 13
01ba00ca 14try:
59ae15a5 15 import urllib.request as compat_urllib_request
01ba00ca 16except ImportError: # Python 2
59ae15a5 17 import urllib2 as compat_urllib_request
01ba00ca
PH
18
19try:
59ae15a5 20 import urllib.error as compat_urllib_error
01ba00ca 21except ImportError: # Python 2
59ae15a5 22 import urllib2 as compat_urllib_error
01ba00ca
PH
23
24try:
59ae15a5 25 import urllib.parse as compat_urllib_parse
01ba00ca 26except ImportError: # Python 2
59ae15a5 27 import urllib as compat_urllib_parse
01ba00ca 28
799c0763
PH
29try:
30 from urllib.parse import urlparse as compat_urllib_parse_urlparse
31except ImportError: # Python 2
32 from urlparse import urlparse as compat_urllib_parse_urlparse
33
01ba00ca 34try:
59ae15a5 35 import http.cookiejar as compat_cookiejar
01ba00ca 36except ImportError: # Python 2
59ae15a5 37 import cookielib as compat_cookiejar
01ba00ca 38
3e669f36 39try:
59ae15a5 40 import html.entities as compat_html_entities
9f37a959 41except ImportError: # Python 2
59ae15a5 42 import htmlentitydefs as compat_html_entities
3e669f36 43
a8156c1d 44try:
59ae15a5 45 import html.parser as compat_html_parser
9f37a959 46except ImportError: # Python 2
59ae15a5 47 import HTMLParser as compat_html_parser
a8156c1d 48
348d0a7a 49try:
59ae15a5 50 import http.client as compat_http_client
9f37a959 51except ImportError: # Python 2
59ae15a5 52 import httplib as compat_http_client
348d0a7a 53
# compat_parse_qs: use the Python 3 stdlib parse_qs when it is importable,
# otherwise define a local re-implementation (Python 2 only: it relies on
# `unicode` and on the str 'hex' codec).
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in `string` with the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with `encoding`/`errors` (None means 'utf-8'/'replace').
        A '%' that is not followed by anything is kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to unquote
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): on Python 2 the 'hex' codec appears to raise
                # TypeError (not ValueError) for non-hex or odd-length input;
                # confirm that malformed escapes such as '%zz' are handled.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless `keep_blank_values` is true. With `strict_parsing`,
        a field without '=' raises ValueError instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space; it is translated before unquoting
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values for a repeated name are appended in the order they appear.
        All arguments are forwarded to _parse_qsl.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 132
# compat_str: the text (unicode) string type on both Python 2 and Python 3.
# `unicode` only exists on Python 2, so the NameError selects the Python 3 name.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string, on both Python versions.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 142
# Standard HTTP headers. YoutubeDLHandler.http_request sets each of these on
# every outgoing request, replacing any header of the same name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually be used to
    encode text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but not a bare
        # except, so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 163
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string `s` (Python 3: print handles unicode itself)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print `s` encoded with the preferred encoding (Python 2).

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd
FV
171
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities (e.g. 'amp') and numeric references in decimal ('#38')
    or hexadecimal ('#x26', '#X26') form are converted. Anything else,
    including a numeric reference outside the valid code point range, is
    returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. Hex digits a-f must be accepted and the
    # whole entity must match, otherwise '#x2F' would be read as just '#x2'.
    mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))\Z', entity)
    if mobj is not None:
        hex_digits, dec_digits = mobj.groups()
        try:
            if hex_digits is not None:
                return compat_chr(int(hex_digits, 16))
            return compat_chr(int(dec_digits, 10))
        except (ValueError, OverflowError):
            # Code point out of range: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 196
a8156c1d
PH
# Override the `locatestarttagend` pattern on the HTML parser module with the
# regular expression below, so that parsers created afterwards (such as
# IDParser) use it. Presumably this is the start-tag pattern from a newer
# CPython release -- TODO confirm which upstream fix is being backported.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Usage: construct with the wanted id, call loads(html), then get_result().

    State kept while parsing:
      result         -- None until the tag is found, then a list that grows to
                        [tag name, start position, end position]; positions
                        are the tuples returned by getpos()
      started        -- True while the parser is inside the wanted element
      depth          -- per tag name count of currently open tags, tracked
                        only while `started` is true
      watch_startpos -- True between the wanted start tag and the first
                        parser event after it
    """
    def __init__(self, id):
        self.id = id
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error by skipping input, or give up.

        Raises HTMLParseError once more than 10 errors have been skipped, or
        if the wanted element has already started. Otherwise parsing is
        restarted on the lines following the current position.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document `html`, keeping it for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is also the first event after the wanted start tag
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete when every tag with the wanted
            # element's name that was opened inside it has been closed
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos, so that the first
    # event after the wanted start tag records the content start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if the tag was not found or not completed."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # The first element of a position is used as a 1-based line number
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line, which has already been cut
            # at the start offset, so the end offset must be made relative
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
9e6dd238
FV
262
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document

    The result is whatever IDParser.get_result() reports after parsing;
    a parse error does not prevent the partial result from being returned.
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the error
        pass
    return id_parser.get_result()
9e6dd238
FV
271
272
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines in the source become spaces, each <br> tag (with the
    whitespace around it) becomes a newline, all remaining tags are
    removed and HTML entities are replaced via unescapeHTML.
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings: '\s' in a plain literal is an invalid escape sequence
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
9e6dd238
FV
283
284
def sanitize_open(filename, open_mode):
    """Open `filename` with `open_mode`, retrying once with a tweaked name.

    The name u'-' selects sys.stdout (switched to binary mode on win32).
    If opening fails with IOError/OSError, the characters forbidden in
    win32 file names are replaced by '#' and the open is attempted again;
    an error from that second attempt is left for the caller to handle,
    like with the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(encodeFilename(filename), open_mode), filename)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        safe_name = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(encodeFilename(safe_name), open_mode), safe_name)
d77c3dfd
FV
310
311
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed as a date.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 319
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters:
    whitespace, non-ASCII characters and a set of shell-sensitive
    punctuation are replaced by '_' as well.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible; the underscore clean-up below is then skipped.
    """
    def _clean(ch):
        code = ord(ch)
        # '?' and control characters are dropped entirely
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (code > 127 or ch.isspace()
                           or ch in '!&\'()[]{}$;`^,#'):
            return '_'
        return ch

    result = u''.join(_clean(ch) for ch in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never return an empty name
    return result if result else '_'
d77c3dfd
FV
351
def orderedSet(iterable):
    """Return a list of the distinct elements of `iterable`, keeping the
    order in which each element first appears."""
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
359
def unescapeHTML(s):
    """Replace every '&...;' sequence in `s` using htmlentity_transform.

    @param s a string (must be a text/unicode string)
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
368
def encodeFilename(s):
    """Return `s` in the form expected by the file system functions.

    @param s The name of the file (must be a text/unicode string)
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API. On Python 2, pass u'' directly to use
    # Unicode APIs on Windows 2000 and up (detecting Windows NT 4 is tricky
    # because 'major >= 4' would match Windows 9x series as well; besides,
    # NT 4 is obsolete).
    has_unicode_api = sys.version_info >= (3, 0) or (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if has_unicode_api:
        return s

    # Characters the file system encoding cannot represent are dropped
    return s.encode(sys.getfilesystemencoding(), 'ignore')
d77c3dfd
FV
387
class DownloadError(Exception):
    """Raised when a download fails.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
d77c3dfd
FV
396
397
class SameFileError(Exception):
    """Raised when several downloads would end up in the same file.

    FileDownloader objects throw this exception if they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
405
406
class PostProcessingError(Exception):
    """Raised when a postprocessing task fails.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
d77c3dfd
FV
414
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
d77c3dfd
FV
418
419
class UnavailableVideoError(Exception):
    """Raised when a requested format is unavailable.

    This exception is thrown when a video is requested in a format that
    is not available for that video.
    """
d77c3dfd
FV
427
428
class ContentTooShortError(Exception):
    """Raised when a download is shorter than announced.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Sizes in bytes: what was received and what the server announced
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd
FV
443
444
class Trouble(Exception):
    """Helper exception for reporting trouble.

    This exception is meant to be handled with FileDownloader.trouble
    """
0b8c922d 451
01ba00ca 452class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
453 """Handler for HTTP requests and responses.
454
455 This class, when installed with an OpenerDirector, automatically adds
456 the standard headers to every HTTP request and handles gzipped and
457 deflated responses from web servers. If compression is to be avoided in
458 a particular request, the original request in the program code only has
459 to include the HTTP header "Youtubedl-No-Compression", which will be
460 removed before making the real request.
461
462 Part of this code was copied from:
463
464 http://techknack.net/python-urllib2-handlers/
465
466 Andrew Rowls, the author of that code, agreed to release it to the
467 public domain.
468 """
469
470 @staticmethod
471 def deflate(data):
472 try:
473 return zlib.decompress(data, -zlib.MAX_WBITS)
474 except zlib.error:
475 return zlib.decompress(data)
476
477 @staticmethod
478 def addinfourl_wrapper(stream, headers, url, code):
479 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
480 return compat_urllib_request.addinfourl(stream, headers, url, code)
481 ret = compat_urllib_request.addinfourl(stream, headers, url)
482 ret.code = code
483 return ret
484
485 def http_request(self, req):
486 for h in std_headers:
487 if h in req.headers:
488 del req.headers[h]
489 req.add_header(h, std_headers[h])
490 if 'Youtubedl-no-compression' in req.headers:
491 if 'Accept-encoding' in req.headers:
492 del req.headers['Accept-encoding']
493 del req.headers['Youtubedl-no-compression']
494 return req
495
496 def http_response(self, req, resp):
497 old_resp = resp
498 # gzip
499 if resp.headers.get('Content-encoding', '') == 'gzip':
500 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
501 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
502 resp.msg = old_resp.msg
503 # deflate
504 if resp.headers.get('Content-encoding', '') == 'deflate':
505 gz = io.BytesIO(self.deflate(resp.read()))
506 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
507 resp.msg = old_resp.msg
508 return resp