youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import email.utils
  13 import json
  14
  15 try:
  16         import cStringIO as StringIO
  17 except ImportError:
  18         import StringIO
  19
# Default HTTP headers. YoutubeDLHandler.http_request() sets every one of
# these on each outgoing request.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # gzip and deflate bodies are decoded by YoutubeDLHandler.http_response()
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  27
# Text string type: `unicode` where that builtin exists (Python 2), otherwise
# the builtin `str`.
try:
        compat_str = unicode # Python 2
except NameError:
        compat_str = str
  32
  33 try:
  34         import urllib.request as compat_urllib_request
  35 except ImportError: # Python 2
  36         import urllib2 as compat_urllib_request
  37
  38 try:
  39         import urllib.error as compat_urllib_error
  40 except ImportError: # Python 2
  41         import urllib2 as compat_urllib_error
  42
  43 try:
  44         import urllib.parse as compat_urllib_parse
  45 except ImportError: # Python 2
  46         import urllib2 as compat_urllib_parse
  47
  48 try:
  49         import http.cookiejar as compat_cookiejar
  50 except ImportError: # Python 2
  51         import cookielib as compat_cookiejar
  52
  53 def preferredencoding():
  54         """Get preferred encoding.
  55
  56         Returns the best encoding scheme for the system, based on
  57         locale.getpreferredencoding() and some further tweaks.
  58         """
  59         try:
  60                 pref = locale.getpreferredencoding()
  61                 u'TEST'.encode(pref)
  62         except:
  63                 pref = 'UTF-8'
  64
  65         return pref
  66
  67
  68 def htmlentity_transform(matchobj):
  69         """Transforms an HTML entity to a character.
  70
  71         This function receives a match object and is intended to be used with
  72         the re.sub() function.
  73         """
  74         entity = matchobj.group(1)
  75
  76         # Known non-numeric HTML entity
  77         if entity in htmlentitydefs.name2codepoint:
  78                 return unichr(htmlentitydefs.name2codepoint[entity])
  79
  80         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  81         if mobj is not None:
  82                 numstr = mobj.group(1)
  83                 if numstr.startswith(u'x'):
  84                         base = 16
  85                         numstr = u'0%s' % numstr
  86                 else:
  87                         base = 10
  88                 return unichr(int(numstr, base))
  89
  90         # Unknown entity in name, return its literal representation
  91         return (u'&%s;' % entity)
  92
# Replace the module-level start-tag regex of the HTMLParser module with the
# pattern below (the original note calls this a backported bug fix).
# NOTE(review): presumably this is the pattern the base parser uses to find
# the end of a start tag - confirm against the targeted Python versions.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
        """Modified HTMLParser that isolates a tag with the specified id"""
        def __init__(self, id):
                """Create a parser looking for the element whose id attribute equals id."""
                self.id = id
                # None until the element is found; then [tag], later extended with
                # the content start position and the end position (getpos() values)
                self.result = None
                # True while the parser is inside the element with the requested id
                self.started = False
                # Per tag name: number of start tags minus end tags seen since
                # the requested element was entered
                self.depth = {}
                # Complete document text, stored by loads()
                self.html = None
                # True from the matching start tag until the next parser event,
                # which records the content start position
                self.watch_startpos = False
                # Number of parse errors skipped so far by error()
                self.error_count = 0
                HTMLParser.HTMLParser.__init__(self)

        def error(self, message):
                """Skip past a parse error, or give up by raising HTMLParseError.

                Raises once more than 10 errors were skipped or when the requested
                element is currently being read. Otherwise the pending input is
                replaced by the document lines that follow the current one and
                parsing resumes.
                NOTE(review): presumably called by the base parser on malformed
                markup - confirm.
                """
                if self.error_count > 10 or self.started:
                        raise HTMLParser.HTMLParseError(message, self.getpos())
                self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
                self.error_count += 1
                self.goahead(1)

        def loads(self, html):
                """Remember the document text and parse all of it."""
                self.html = html
                self.feed(html)
                self.close()

        def handle_starttag(self, tag, attrs):
                """Start tracking when the tag with the requested id opens; count nested tags."""
                attrs = dict(attrs)
                if self.started:
                        # A nested tag is also an event that may mark the content start
                        self.find_startpos(None)
                if 'id' in attrs and attrs['id'] == self.id:
                        self.result = [tag]
                        self.started = True
                        self.watch_startpos = True
                if self.started:
                        if not tag in self.depth: self.depth[tag] = 0
                        self.depth[tag] += 1

        def handle_endtag(self, tag):
                """Count closing tags; record the end position once the requested element closes."""
                if self.started:
                        if tag in self.depth: self.depth[tag] -= 1
                        # self.result[0] is the tag name of the requested element
                        if self.depth[self.result[0]] == 0:
                                self.started = False
                                self.result.append(self.getpos())

        def find_startpos(self, x):
                """Needed to put the start position of the result (self.result[1])
                after the opening tag with the requested id"""
                if self.watch_startpos:
                        self.watch_startpos = False
                        self.result.append(self.getpos())
        # All remaining parser callbacks only have to record the start position,
        # so they share find_startpos (its single argument is ignored).
        handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

        def get_result(self):
                """Return the stripped text between the recorded start and end positions.

                Returns None when the element was not found or when start and end
                positions were not both recorded (result must hold exactly
                tag, start and end).
                """
                if self.result is None:
                        return None
                if len(self.result) != 3:
                        return None
                lines = self.html.split('\n')
                # Positions are (line, column) pairs; the line part is used 1-based here
                lines = lines[self.result[1][0]-1:self.result[2][0]]
                lines[0] = lines[0][self.result[1][1]:]
                if len(lines) == 1:
                        # Start and end on the same line: the end column has to be
                        # shifted by the part already cut off at the front
                        lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
                lines[-1] = lines[-1][:self.result[2][1]]
                return '\n'.join(lines).strip()
 158
 159 def get_element_by_id(id, html):
 160         """Return the content of the tag with the specified id in the passed HTML document"""
 161         parser = IDParser(id)
 162         try:
 163                 parser.loads(html)
 164         except HTMLParser.HTMLParseError:
 165                 pass
 166         return parser.get_result()
 167
 168
 169 def clean_html(html):
 170         """Clean an HTML snippet into a readable string"""
 171         # Newline vs <br />
 172         html = html.replace('\n', ' ')
 173         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 174         # Strip html tags
 175         html = re.sub('<.*?>', '', html)
 176         # Replace html entities
 177         html = unescapeHTML(html)
 178         return html
 179
 180
 181 def sanitize_open(filename, open_mode):
 182         """Try to open the given filename, and slightly tweak it if this fails.
 183
 184         Attempts to open the given filename. If this fails, it tries to change
 185         the filename slightly, step by step, until it's either able to open it
 186         or it fails and raises a final exception, like the standard open()
 187         function.
 188
 189         It returns the tuple (stream, definitive_file_name).
 190         """
 191         try:
 192                 if filename == u'-':
 193                         if sys.platform == 'win32':
 194                                 import msvcrt
 195                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 196                         return (sys.stdout, filename)
 197                 stream = open(encodeFilename(filename), open_mode)
 198                 return (stream, filename)
 199         except (IOError, OSError) as err:
 200                 # In case of error, try to remove win32 forbidden chars
 201                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 202
 203                 # An exception here should be caught in the caller
 204                 stream = open(encodeFilename(filename), open_mode)
 205                 return (stream, filename)
 206
 207
 208 def timeconvert(timestr):
 209         """Convert RFC 2822 defined time string into system timestamp"""
 210         timestamp = None
 211         timetuple = email.utils.parsedate_tz(timestr)
 212         if timetuple is not None:
 213                 timestamp = email.utils.mktime_tz(timetuple)
 214         return timestamp
 215
 216 def sanitize_filename(s, restricted=False):
 217         """Sanitizes a string so it could be used as part of a filename.
 218         If restricted is set, use a stricter subset of allowed characters.
 219         """
 220         def replace_insane(char):
 221                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 222                         return ''
 223                 elif char == '"':
 224                         return '' if restricted else '\''
 225                 elif char == ':':
 226                         return '_-' if restricted else ' -'
 227                 elif char in '\\/|*<>':
 228                         return '_'
 229                 if restricted and (char in '!&\'' or char.isspace()):
 230                         return '_'
 231                 if restricted and ord(char) > 127:
 232                         return '_'
 233                 return char
 234
 235         result = u''.join(map(replace_insane, s))
 236         while '__' in result:
 237                 result = result.replace('__', '_')
 238         result = result.strip('_')
 239         # Common case of "Foreign band name - English song title"
 240         if restricted and result.startswith('-_'):
 241                 result = result[2:]
 242         if not result:
 243                 result = '_'
 244         return result
 245
 246 def orderedSet(iterable):
 247         """ Remove all duplicates from the input iterable """
 248         res = []
 249         for el in iterable:
 250                 if el not in res:
 251                         res.append(el)
 252         return res
 253
 254 def unescapeHTML(s):
 255         """
 256         @param s a string
 257         """
 258         assert type(s) == type(u'')
 259
 260         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 261         return result
 262
 263 def encodeFilename(s):
 264         """
 265         @param s The name of the file
 266         """
 267
 268         assert type(s) == type(u'')
 269
 270         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 271                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 272                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 273                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 274                 return s
 275         else:
 276                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 277
 278 class DownloadError(Exception):
 279         """Download Error exception.
 280
 281         This exception may be thrown by FileDownloader objects if they are not
 282         configured to continue on errors. They will contain the appropriate
 283         error message.
 284         """
 285         pass
 286
 287
 288 class SameFileError(Exception):
 289         """Same File exception.
 290
 291         This exception will be thrown by FileDownloader objects if they detect
 292         multiple files would have to be downloaded to the same file on disk.
 293         """
 294         pass
 295
 296
 297 class PostProcessingError(Exception):
 298         """Post Processing exception.
 299
 300         This exception may be raised by PostProcessor's .run() method to
 301         indicate an error in the postprocessing task.
 302         """
 303         pass
 304
 305 class MaxDownloadsReached(Exception):
 306         """ --max-downloads limit has been reached. """
 307         pass
 308
 309
 310 class UnavailableVideoError(Exception):
 311         """Unavailable Format exception.
 312
 313         This exception will be thrown when a video is requested
 314         in a format that is not available for that video.
 315         """
 316         pass
 317
 318
 319 class ContentTooShortError(Exception):
 320         """Content Too Short exception.
 321
 322         This exception may be raised by FileDownloader objects when a file they
 323         download is too small for what the server announced first, indicating
 324         the connection was probably interrupted.
 325         """
 326         # Both in bytes
 327         downloaded = None
 328         expected = None
 329
 330         def __init__(self, downloaded, expected):
 331                 self.downloaded = downloaded
 332                 self.expected = expected
 333
 334
 335 class Trouble(Exception):
 336         """Trouble helper exception
 337
 338         This is an exception to be handled with
 339         FileDownloader.trouble
 340         """
 341
 342 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 343         """Handler for HTTP requests and responses.
 344
 345         This class, when installed with an OpenerDirector, automatically adds
 346         the standard headers to every HTTP request and handles gzipped and
 347         deflated responses from web servers. If compression is to be avoided in
 348         a particular request, the original request in the program code only has
 349         to include the HTTP header "Youtubedl-No-Compression", which will be
 350         removed before making the real request.
 351
 352         Part of this code was copied from:
 353
 354         http://techknack.net/python-urllib2-handlers/
 355
 356         Andrew Rowls, the author of that code, agreed to release it to the
 357         public domain.
 358         """
 359
 360         @staticmethod
 361         def deflate(data):
 362                 try:
 363                         return zlib.decompress(data, -zlib.MAX_WBITS)
 364                 except zlib.error:
 365                         return zlib.decompress(data)
 366
 367         @staticmethod
 368         def addinfourl_wrapper(stream, headers, url, code):
 369                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 370                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 371                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 372                 ret.code = code
 373                 return ret
 374
 375         def http_request(self, req):
 376                 for h in std_headers:
 377                         if h in req.headers:
 378                                 del req.headers[h]
 379                         req.add_header(h, std_headers[h])
 380                 if 'Youtubedl-no-compression' in req.headers:
 381                         if 'Accept-encoding' in req.headers:
 382                                 del req.headers['Accept-encoding']
 383                         del req.headers['Youtubedl-no-compression']
 384                 return req
 385
 386         def http_response(self, req, resp):
 387                 old_resp = resp
 388                 # gzip
 389                 if resp.headers.get('Content-encoding', '') == 'gzip':
 390                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 391                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 392                         resp.msg = old_resp.msg
 393                 # deflate
 394                 if resp.headers.get('Content-encoding', '') == 'deflate':
 395                         gz = StringIO.StringIO(self.deflate(resp.read()))
 396                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 397                         resp.msg = old_resp.msg
 398                 return resp