#!/usr/bin/env python
# -*- coding: utf-8 -*-

import gzip
import io
import locale
import os
import re
import sys
import zlib
import email.utils
import json

try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError: # Python 2
    import httplib as compat_http_client

try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    from urlparse import parse_qs as compat_parse_qs
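
# Illustrative sketch (not part of the original file): whichever module it is
# imported from, compat_parse_qs behaves like urlparse.parse_qs, e.g. (query
# string below is a made-up example):
#   >>> compat_parse_qs('v=abc123&t=43s')
#   {'v': ['abc123'], 't': ['43s']}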

try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr


std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except:
        pref = 'UTF-8'

    return pref


def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""
    def __init__(self, id):
        self.id = id
        self.result = None
        self.started = False
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        else:
            lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()

def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return parser.get_result()
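
# Illustrative usage (not part of the original file), with a hypothetical
# snippet; only the content inside the element carrying the requested id is
# returned:
#   >>> get_element_by_id('player', u'<div id="player">content</div>')
#   u'content'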


def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
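
# Illustrative usage (not part of the original file): <br> tags turn into
# newlines, remaining markup is stripped and entities are decoded:
#   >>> clean_html(u'first line<br/>second &amp; last')
#   u'first line\nsecond & last'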


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
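
# Illustrative sketch (not part of the original file; the filename below is a
# made-up example): callers are expected to keep working with the returned
# name, since it may differ from the requested one if the first open() failed
# and win32-forbidden characters were replaced with '#':
#   stream, real_name = sanitize_open(u'clip: 1?.mp4', 'wb')
#   stream.close()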


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
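
# Illustrative usage (not part of the original file): a parseable RFC 2822
# date becomes a Unix timestamp, anything else yields None:
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0
#   >>> timeconvert('not a date') is None
#   True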

def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
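
# Illustrative usage (not part of the original file): the default mode keeps
# the name readable, while restricted mode maps to a conservative ASCII set:
#   >>> sanitize_filename(u'Foo: "bar"?')
#   u"Foo - 'bar'"
#   >>> sanitize_filename(u'Foo: "bar"?', restricted=True)
#   u'Foo_-_bar'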

def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
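
# Illustrative usage (not part of the original file): duplicates are dropped
# while the first-seen order is preserved:
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]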

def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
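
# Illustrative usage (not part of the original file): named and numeric
# entities are both decoded through htmlentity_transform():
#   >>> unescapeHTML(u'Tom &amp; Jerry &#38; friends')
#   u'Tom & Jerry & friends'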

def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')

class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected


class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
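

# Illustrative sketch (not part of the original file): the handler is meant to
# be installed on an OpenerDirector, and a request can opt out of compression
# with the Youtubedl-No-Compression header described in the docstring above.
# The URL is a placeholder:
#
#   opener = compat_urllib_request.build_opener(YoutubeDLHandler())
#   compat_urllib_request.install_opener(opener)
#   request = compat_urllib_request.Request(
#       'http://example.com/video',
#       headers={'Youtubedl-No-Compression': '1'})
#   page = compat_urllib_request.urlopen(request).read()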