---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.01**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.03*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with an outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.03**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.05.01
+[debug] youtube-dl version 2016.06.03
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
*.mp4
*.m4a
*.m4v
+*.mp3
*.part
*.swp
test/testdata
+test/local_parameters.json
.tox
youtube-dl.zsh
+
+# IntelliJ related files
.idea
-.idea/*
+*.iml
+
+tmp/
- "3.4"
- "3.5"
sudo: false
+install:
+ - bash ./devscripts/install_srelay.sh
+ - export PATH=$PATH:$(pwd)/tmp/srelay-0.4.8b6
script: nosetests test --verbose
notifications:
email:
- filippo.valsorda@gmail.com
- - phihag@phihag.de
- yasoob.khld@gmail.com
# irc:
# channels:
Kagami Hiiragi
Philip Huppert
blahgeek
+Kevin Deldycke
+inondle
+Tomáš Čech
+Déstin Reed
all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
clean:
- rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
+ rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
find . -name "*.pyc" -delete
find . -name "*.class" -delete
ot: offlinetest
offlinetest: codetest
- $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py
+ $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py --exclude test_socks.py
tar: youtube-dl.tar.gz
pandoc -f markdown -t plain README.md -o README.txt
youtube-dl.1: README.md
- $(PYTHON) devscripts/prepare_manpage.py >youtube-dl.1.temp.md
+ $(PYTHON) devscripts/prepare_manpage.py youtube-dl.1.temp.md
pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1
rm -f youtube-dl.1.temp.md
To install it right away for all UNIX users (Linux, OS X, etc.), type:
- sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+ sudo curl -L https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
sudo chmod a+rx /usr/local/bin/youtube-dl
If you do not have curl, you can alternatively use a recent wget:
sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
sudo chmod a+rx /usr/local/bin/youtube-dl
-Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
+Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put it in `C:\Windows\System32`).
-OS X users can install **youtube-dl** with [Homebrew](http://brew.sh/).
+You can also use pip:
+
+ sudo pip install --upgrade youtube-dl
+
+This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
+
+OS X users can install youtube-dl with [Homebrew](http://brew.sh/):
brew install youtube-dl
-You can also use pip:
+Or with [MacPorts](https://www.macports.org/):
- sudo pip install youtube-dl
+ sudo port install youtube-dl
Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html).
repairs broken URLs, but emits an error if
this is not possible instead of searching.
--ignore-config Do not read configuration files. When given
- in the global configuration file /etc
- /youtube-dl.conf: Do not read the user
+ in the global configuration file
+ /etc/youtube-dl.conf: Do not read the user
configuration in ~/.config/youtube-
dl/config (%APPDATA%/youtube-dl/config.txt
on Windows)
--no-color Do not emit color codes in output
## Network Options:
- --proxy URL Use the specified HTTP/HTTPS proxy. Pass in
- an empty string (--proxy "") for direct
- connection
+ --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy.
+ To enable experimental SOCKS proxy, specify
+ a proper scheme. For example
+ socks5://127.0.0.1:1080/. Pass in an empty
+ string (--proxy "") for direct connection
--socket-timeout SECONDS Time to wait before giving up, in seconds
--source-address IP Client-side IP address to bind to
(experimental)
(experimental)
## Download Options:
- -r, --rate-limit LIMIT Maximum download rate in bytes per second
+ -r, --limit-rate RATE Maximum download rate in bytes per second
(e.g. 50K or 4.2M)
-R, --retries RETRIES Number of retries (default is 10), or
"infinite".
jar in
--cache-dir DIR Location in the filesystem where youtube-dl
can store some downloaded information
- permanently. By default $XDG_CACHE_HOME
- /youtube-dl or ~/.cache/youtube-dl . At the
- moment, only YouTube player files (for
- videos with obfuscated signatures) are
- cached, but that may change.
+ permanently. By default
+ $XDG_CACHE_HOME/youtube-dl or
+ ~/.cache/youtube-dl . At the moment, only
+ YouTube player files (for videos with
+ obfuscated signatures) are cached, but that
+ may change.
--no-cache-dir Disable filesystem caching
--rm-cache-dir Delete all filesystem cache files
# CONFIGURATION
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`.
+You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`.
For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory:
```
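# A sketch of such a configuration (real option names from youtube-dl --help;
# the concrete values below are made-up examples, not the original README's):
-x
--no-mtime
--proxy 127.0.0.1:3128
-o ~/Movies/%(title)s.%(ext)s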
### Authentication with `.netrc` file
-You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. For that you will need to create a`.netrc` file in your `$HOME` and restrict permissions to read/write by you only:
+You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and to avoid tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per-extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by you only:
```
touch $HOME/.netrc
chmod a-rwx,u+rw $HOME/.netrc
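# Then add one line per extractor, in the standard netrc format
# "machine <name> login <login> password <password>", where <name> is the
# extractor's netrc machine name; a hypothetical entry for the youtube
# extractor might look like:
#   machine youtube login myaccount@example.com password my_password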
Again, from then on you'll be able to update with `sudo youtube-dl -U`.
+### youtube-dl is extremely slow to start on Windows
+
+Add a file exclusion for `youtube-dl.exe` in Windows Defender settings.
+
### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.
Since June 2012 ([#342](https://github.com/rg3/youtube-dl/issues/342)) youtube-dl is packed as an executable zipfile, simply unzip it (might need renaming to `youtube-dl.zip` first on some systems) or clone the git repository, as laid out above. If you modify the code, you can run it by executing the `__main__.py` file. To recompile the executable, run `make youtube-dl`.
-### The exe throws a *Runtime error from Visual C++*
+### The exe throws an error due to missing `MSVCR100.dll`
-To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29).
+To run the exe you need to first install the [Microsoft Visual C++ 2010 Redistributable Package (x86)](https://www.microsoft.com/en-US/download/details.aspx?id=5555).
### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files?
If you want to find out whether a given URL is supported, simply call youtube-dl with it. If you get no videos back, chances are the URL is either not referring to a video or unsupported. You can find out which by examining the output (if you run youtube-dl on the console) or catching an `UnsupportedError` exception if you run it from a Python program.
+# Why do I need to go through that much red tape when filing bugs?
+
+Before we had the issue template, despite our extensive [bug reporting instructions](#bugs), about 80% of the issue reports we got were useless, for instance because people used ancient versions hundreds of releases old, because of simple syntactic errors (not in youtube-dl but in general shell usage), because the problem was already reported multiple times before, because people did not actually read an error message, even if it said "please install ffmpeg", because people did not mention the URL they were trying to download, and many more simple, easy-to-avoid problems, many of which were totally unrelated to youtube-dl.
+
+youtube-dl is an open-source project manned by too few volunteers, so we'd rather spend time fixing bugs where we are certain none of those simple problems apply, and where we can be reasonably confident of being able to reproduce the issue without asking the reporter repeatedly. As such, the output of `youtube-dl -v YOUR_URL_HERE` is really all that's required to file an issue. The issue template also guides you through some basic steps you can do, such as checking that your version of youtube-dl is current.
+
# DEVELOPER INSTRUCTIONS
Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
#!/usr/bin/python3
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from socketserver import ThreadingMixIn
import argparse
import ctypes
import functools
+import shutil
+import subprocess
import sys
+import tempfile
import threading
import traceback
import os.path
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from youtube_dl.compat import (
+ compat_input,
+ compat_http_server,
+ compat_str,
+ compat_urlparse,
+)
+
+# These are not used outside of buildserver.py thus not in compat.py
+
+try:
+ import winreg as compat_winreg
+except ImportError: # Python 2
+ import _winreg as compat_winreg
-class BuildHTTPServer(ThreadingMixIn, HTTPServer):
+try:
+ import socketserver as compat_socketserver
+except ImportError: # Python 2
+ import SocketServer as compat_socketserver
+
+
+class BuildHTTPServer(compat_socketserver.ThreadingMixIn, compat_http_server.HTTPServer):
allow_reuse_address = True
action='store_const', dest='action', const='service',
help='Run as a Windows service')
parser.add_argument('-b', '--bind', metavar='<host:port>',
- action='store', default='localhost:8142',
+ action='store', default='0.0.0.0:8142',
help='Bind to host:port (default %default)')
options = parser.parse_args(args=args)
srv = BuildHTTPServer((host, port), BuildHTTPRequestHandler)
thr = threading.Thread(target=srv.serve_forever)
thr.start()
- input('Press ENTER to shut down')
+ compat_input('Press ENTER to shut down')
srv.shutdown()
thr.join()
os.remove(fname)
os.rmdir(path)
-#==============================================================================
-
class BuildError(Exception):
def __init__(self, output, code=500):
class PythonBuilder(object):
def __init__(self, **kwargs):
- pythonVersion = kwargs.pop('python', '2.7')
- try:
- key = _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\Python\PythonCore\%s\InstallPath' % pythonVersion)
+ python_version = kwargs.pop('python', '3.4')
+ python_path = None
+ for node in ('Wow6432Node\\', ''):
try:
- self.pythonPath, _ = _winreg.QueryValueEx(key, '')
- finally:
- _winreg.CloseKey(key)
- except Exception:
- raise BuildError('No such Python version: %s' % pythonVersion)
+ key = compat_winreg.OpenKey(
+ compat_winreg.HKEY_LOCAL_MACHINE,
+ r'SOFTWARE\%sPython\PythonCore\%s\InstallPath' % (node, python_version))
+ try:
+ python_path, _ = compat_winreg.QueryValueEx(key, '')
+ finally:
+ compat_winreg.CloseKey(key)
+ break
+ except Exception:
+ pass
+
+ if not python_path:
+ raise BuildError('No such Python version: %s' % python_version)
+
+ self.pythonPath = python_path
super(PythonBuilder, self).__init__(**kwargs)
def build(self):
try:
- subprocess.check_output([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'],
- cwd=self.buildPath)
+ proc = subprocess.Popen([os.path.join(self.pythonPath, 'python.exe'), 'setup.py', 'py2exe'], stdin=subprocess.PIPE, cwd=self.buildPath)
+ proc.wait()
except subprocess.CalledProcessError as e:
raise BuildError(e.output)
pass
-class BuildHTTPRequestHandler(BaseHTTPRequestHandler):
+class BuildHTTPRequestHandler(compat_http_server.BaseHTTPRequestHandler):
actionDict = {'build': Builder, 'download': Builder} # They're the same, no more caching.
def do_GET(self):
- path = urlparse.urlparse(self.path)
- paramDict = dict([(key, value[0]) for key, value in urlparse.parse_qs(path.query).items()])
+ path = compat_urlparse.urlparse(self.path)
+ paramDict = dict([(key, value[0]) for key, value in compat_urlparse.parse_qs(path.query).items()])
action, _, path = path.path.strip('/').partition('/')
if path:
path = path.split('/')
builder.close()
except BuildError as e:
self.send_response(e.code)
- msg = unicode(e).encode('UTF-8')
+ msg = compat_str(e).encode('UTF-8')
self.send_header('Content-Type', 'text/plain; charset=UTF-8')
self.send_header('Content-Length', len(msg))
self.end_headers()
else:
self.send_response(500, 'Malformed URL')
-#==============================================================================
-
if __name__ == '__main__':
main()
--- /dev/null
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+import base64
+import json
+import mimetypes
+import netrc
+import optparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.compat import (
+ compat_basestring,
+ compat_input,
+ compat_getpass,
+ compat_print,
+ compat_urllib_request,
+)
+from youtube_dl.utils import (
+ make_HTTPS_handler,
+ sanitized_Request,
+)
+
+
+class GitHubReleaser(object):
+ _API_URL = 'https://api.github.com/repos/rg3/youtube-dl/releases'
+ _UPLOADS_URL = 'https://uploads.github.com/repos/rg3/youtube-dl/releases/%s/assets?name=%s'
+ _NETRC_MACHINE = 'github.com'
+
+ def __init__(self, debuglevel=0):
+ self._init_github_account()
+ https_handler = make_HTTPS_handler({}, debuglevel=debuglevel)
+ self._opener = compat_urllib_request.build_opener(https_handler)
+
+ def _init_github_account(self):
+ try:
+ info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+ if info is not None:
+ self._username = info[0]
+ self._password = info[2]
+ compat_print('Using GitHub credentials found in .netrc...')
+ return
+ else:
+ compat_print('No GitHub credentials found in .netrc')
+ except (IOError, netrc.NetrcParseError):
+ compat_print('Unable to parse .netrc')
+ self._username = compat_input(
+ 'Type your GitHub username or email address and press [Return]: ')
+ self._password = compat_getpass(
+ 'Type your GitHub password and press [Return]: ')
+
+ def _call(self, req):
+ if isinstance(req, compat_basestring):
+ req = sanitized_Request(req)
+        # Authorizing manually since GitHub does not respond with a 401 with
+        # the WWW-Authenticate header set (see
+ # https://developer.github.com/v3/#basic-authentication)
+ b64 = base64.b64encode(
+ ('%s:%s' % (self._username, self._password)).encode('utf-8')).decode('ascii')
+ req.add_header('Authorization', 'Basic %s' % b64)
+ response = self._opener.open(req).read().decode('utf-8')
+ return json.loads(response)
+
+ def list_releases(self):
+ return self._call(self._API_URL)
+
+ def create_release(self, tag_name, name=None, body='', draft=False, prerelease=False):
+ data = {
+ 'tag_name': tag_name,
+ 'target_commitish': 'master',
+ 'name': name,
+ 'body': body,
+ 'draft': draft,
+ 'prerelease': prerelease,
+ }
+ req = sanitized_Request(self._API_URL, json.dumps(data).encode('utf-8'))
+ return self._call(req)
+
+ def create_asset(self, release_id, asset):
+ asset_name = os.path.basename(asset)
+ url = self._UPLOADS_URL % (release_id, asset_name)
+ # Our files are small enough to be loaded directly into memory.
+ data = open(asset, 'rb').read()
+ req = sanitized_Request(url, data)
+ mime_type, _ = mimetypes.guess_type(asset_name)
+ req.add_header('Content-Type', mime_type or 'application/octet-stream')
+ return self._call(req)
+
+
+def main():
+ parser = optparse.OptionParser(usage='%prog VERSION BUILDPATH')
+ options, args = parser.parse_args()
+ if len(args) != 2:
+ parser.error('Expected a version and a build directory')
+
+ version, build_path = args
+
+ releaser = GitHubReleaser()
+
+ new_release = releaser.create_release(version, name='youtube-dl %s' % version)
+ release_id = new_release['id']
+
+ for asset in os.listdir(build_path):
+ compat_print('Uploading %s...' % asset)
+ releaser.create_asset(release_id, os.path.join(build_path, asset))
+
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+#!/bin/bash
+
+mkdir -p tmp && cd tmp
+wget -N http://downloads.sourceforge.net/project/socks-relay/socks-relay/srelay-0.4.8/srelay-0.4.8b6.tar.gz
+tar zxvf srelay-0.4.8b6.tar.gz
+cd srelay-0.4.8b6
+./configure
+make
from __future__ import unicode_literals
import io
+import optparse
import os.path
-import sys
import re
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
README_FILE = os.path.join(ROOT_DIR, 'README.md')
+PREFIX = '''%YOUTUBE-DL(1)
+
+# NAME
+
+youtube\-dl \- download videos from youtube.com or other video platforms
+
+# SYNOPSIS
+
+**youtube-dl** \[OPTIONS\] URL [URL...]
+
+'''
+
+
+def main():
+ parser = optparse.OptionParser(usage='%prog OUTFILE.md')
+ options, args = parser.parse_args()
+ if len(args) != 1:
+ parser.error('Expected an output filename')
+
+ outfile, = args
+
+ with io.open(README_FILE, encoding='utf-8') as f:
+ readme = f.read()
+
+ readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
+ readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
+ readme = PREFIX + readme
+
+ readme = filter_options(readme)
+
+ with io.open(outfile, 'w', encoding='utf-8') as outf:
+ outf.write(readme)
+
def filter_options(readme):
ret = ''
return ret
-with io.open(README_FILE, encoding='utf-8') as f:
- readme = f.read()
-
-PREFIX = '''%YOUTUBE-DL(1)
-
-# NAME
-
-youtube\-dl \- download videos from youtube.com or other video platforms
-
-# SYNOPSIS
-
-**youtube-dl** \[OPTIONS\] URL [URL...]
-
-'''
-readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme)
-readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme)
-readme = PREFIX + readme
-
-readme = filter_options(readme)
-
-if sys.version_info < (3, 0):
- print(readme.encode('utf-8'))
-else:
- print(readme)
+if __name__ == '__main__':
+ main()
# * the git config user.signingkey is properly set
# You will need
-# pip install coverage nose rsa
+# pip install coverage nose rsa wheel
# TODO
# release notes
set -e
skip_tests=true
-if [ "$1" = '--run-tests' ]; then
- skip_tests=false
- shift
-fi
+buildserver='localhost:8142'
+
+while true
+do
+case "$1" in
+ --run-tests)
+ skip_tests=false
+ shift
+ ;;
+ --buildserver)
+ buildserver="$2"
+ shift 2
+ ;;
+ --*)
+ echo "ERROR: unknown option $1"
+ exit 1
+ ;;
+ *)
+ break
+ ;;
+esac
+done
if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
version="$1"
useless_files=$(find youtube_dl -type f -not -name '*.py')
if [ ! -z "$useless_files" ]; then echo "ERROR: Non-.py files in youtube_dl: $useless_files"; exit 1; fi
if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit 1; fi
+if ! type pandoc >/dev/null 2>/dev/null; then echo 'ERROR: pandoc is missing'; exit 1; fi
+if ! python3 -c 'import rsa' 2>/dev/null; then echo 'ERROR: python3-rsa is missing'; exit 1; fi
+if ! python3 -c 'import wheel' 2>/dev/null; then echo 'ERROR: wheel is missing'; exit 1; fi
/bin/echo -e "\n### First of all, testing..."
make clean
REV=$(git rev-parse HEAD)
make youtube-dl youtube-dl.tar.gz
read -p "VM running? (y/n) " -n 1
-wget "http://localhost:8142/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe
+wget "http://$buildserver/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe
mkdir -p "build/$version"
mv youtube-dl youtube-dl.exe "build/$version"
mv youtube-dl.tar.gz "build/$version/youtube-dl-$version.tar.gz"
(cd build/$version/ && sha256sum $RELEASE_FILES > SHA2-256SUMS)
(cd build/$version/ && sha512sum $RELEASE_FILES > SHA2-512SUMS)
-/bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..."
+/bin/echo -e "\n### Signing and uploading the new binaries to GitHub..."
for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done
-scp -r "build/$version" ytdl@yt-dl.org:html/tmp/
-ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/"
+
+ROOT=$(pwd)
+python devscripts/create-github-release.py $version "$ROOT/build/$version"
+
ssh ytdl@yt-dl.org "sh html/update_latest.sh $version"
/bin/echo -e "\n### Now switching to gh-pages..."
git clone --branch gh-pages --single-branch . build/gh-pages
-ROOT=$(pwd)
(
set -e
ORIGIN_URL=$(git config --get remote.origin.url)
- **22tracks:genre**
- **22tracks:track**
- **24video**
+ - **3qsdn**: 3Q SDN
- **3sat**
- **4tube**
- **56.com**
- **9gag**
- **abc.net.au**
- **Abc7News**
+ - **abcnews**
+ - **abcnews:video**
- **AcademicEarth:Course**
- **acast**
- **acast:channel**
- **arte.tv:future**
- **arte.tv:info**
- **arte.tv:magazine**
+ - **arte.tv:playlist**
- **AtresPlayer**
- **ATTTechChannel**
- **AudiMedia**
- **Bild**: Bild.de
- **BiliBili**
- **BioBioChileTV**
+ - **BIQLE**
- **BleacherReport**
- **BleacherReportCMS**
- **blinkx**
- **CBCPlayer**
- **CBS**
- **CBSInteractive**
+ - **CBSLocal**
- **CBSNews**: CBS News
- **CBSNewsLiveVideo**: CBS News Live Videos
- **CBSSports**
- **chirbit**
- **chirbit:profile**
- **Cinchcast**
- - **Cinemassacre**
- **Clipfish**
- **cliphunter**
- **ClipRs**
- **CNN**
- **CNNArticle**
- **CNNBlogs**
- - **CollegeHumor**
- **CollegeRama**
- **ComCarCoff**
- **ComedyCentral**
- **ComedyCentralShows**: The Daily Show / The Colbert Report
- **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED
+ - **Coub**
- **Cracked**
- **Crackle**
- **Criterion**
- **culturebox.francetvinfo.fr**
- **CultureUnplugged**
- **CWTV**
+ - **DailyMail**
- **dailymotion**
- **dailymotion:playlist**
- **dailymotion:user**
- **exfm**: ex.fm
- **ExpoTV**
- **ExtremeTube**
+ - **EyedoTV**
- **facebook**
- **faz.net**
- **fc2**
- **Flickr**
- **Folketinget**: Folketinget (ft.dk; Danish parliament)
- **FootyRoom**
+ - **Formula1**
- **FOX**
- **Foxgay**
- **FoxNews**: Fox News and Fox Business Video
- **la7.tv**
- **Laola1Tv**
- **Le**: 乐视网
+ - **Learnr**
- **Lecture2Go**
- **Lemonde**
- **LePlaylist**
- **LetvCloud**: 乐视云
- **Libsyn**
+ - **life**: Life.ru
- **life:embed**
- - **lifenews**: LIFE | NEWS
- **limelight**
- **limelight:channel**
- **limelight:channel_list**
+ - **LiTV**
- **LiveLeak**
- **livestream**
- **livestream:original**
- **LnkGo**
+ - **loc**: Library of Congress
+ - **LocalNews8**
- **LoveHomePorn**
- **lrt.lt**
- **lynda**: lynda.com videos
- **mtvservices:embedded**
- **MuenchenTV**: münchen.tv
- **MusicPlayOn**
+ - **mva**: Microsoft Virtual Academy videos
+ - **mva:course**: Microsoft Virtual Academy courses
- **Mwave**
- **MwaveMeetGreet**
- **MySpace**
 - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC)
- **pcmag**
- **People**
- - **Periscope**: Periscope
+ - **periscope**: Periscope
+ - **periscope:user**: Periscope user videos
- **PhilharmonieDeParis**: Philharmonie de Paris
- **phoenix.de**
- **Photobucket**
- **R7**
- **radio.de**
- **radiobremen**
+ - **radiocanada**
+ - **RadioCanadaAudioVideo**
- **radiofrance**
- **RadioJavan**
- **Rai**
- **RedTube**
- **RegioTV**
- **Restudy**
+ - **Reuters**
- **ReverbNation**
- - **Revision3**
+ - **revision**
+ - **revision3:embed**
- **RICE**
- **RingTV**
- **RottenTomatoes**
- **ScreencastOMatic**
- **ScreenJunkies**
- **ScreenwaveMedia**
+ - **Seeker**
- **SenateISVP**
+ - **SendtoNews**
- **ServingSys**
- **Sexu**
- **Shahid**
- **TVCArticle**
- **tvigle**: Интернет-телевидение Tvigle.ru
- **tvland.com**
- - **tvp.pl**
- - **tvp.pl:Series**
+ - **tvp**: Telewizja Polska
+ - **tvp:series**
- **TVPlay**: TV3Play and related services
- **Tweakers**
- - **twitch:bookmarks**
- **twitch:chapter**
- **twitch:past_broadcasts**
- **twitch:profile**
- **USAToday**
- **ustream**
- **ustream:channel**
- - **Ustudio**
+ - **ustudio**
+ - **ustudio:embed**
- **Varzesh3**
- **Vbox7**
- **VeeHD**
- **Vessel**
- **Vesti**: Вести.Ru
- **Vevo**
+ - **VevoPlaylist**
- **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet
- **vh1.com**
- **Vice**
- **VuClip**
- **vulture.com**
- **Walla**
- - **WashingtonPost**
+ - **washingtonpost**
+ - **washingtonpost:article**
- **wat.tv**
- **WatchIndianPorn**: Watch Indian Porn
- **WDR**
- **WSJ**: Wall Street Journal
- **XBef**
- **XboxClips**
- - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me
+ - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To
- **XHamster**
- **XHamsterEmbed**
- **xiami:album**: 虾米音乐 - 专辑
def get_params(override=None):
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"parameters.json")
+ LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ "local_parameters.json")
with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
parameters = json.load(pf)
+ if os.path.exists(LOCAL_PARAMETERS_FILE):
+ with io.open(LOCAL_PARAMETERS_FILE, encoding='utf-8') as pf:
+ parameters.update(json.load(pf))
if override:
parameters.update(override)
return parameters
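The override chain above (parameters.json, then an optional local_parameters.json, then call-site overrides) is plain dict layering. A minimal standalone sketch, with hypothetical parameter names:

```python
def layered_params(defaults, local=None, override=None):
    # Merge in increasing priority: defaults < local file < explicit
    # override, mirroring how get_params() applies local_parameters.json
    # on top of parameters.json (keys here are hypothetical).
    params = dict(defaults)
    for extra in (local, override):
        if extra:
            params.update(extra)
    return params

merged = layered_params(
    {'retries': 3, 'verbose': False},
    local={'verbose': True},
    override={'retries': 5})
```

Later layers win key-by-key, so a local file can flip one flag without restating the rest.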
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from youtube_dl.utils import get_filesystem_encoding
from youtube_dl.compat import (
compat_getenv,
+ compat_setenv,
compat_etree_fromstring,
compat_expanduser,
compat_shlex_split,
compat_str,
+ compat_struct_unpack,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
class TestCompat(unittest.TestCase):
def test_compat_getenv(self):
test_str = 'тест'
- os.environ['YOUTUBE-DL-TEST'] = (
- test_str if sys.version_info >= (3, 0)
- else test_str.encode(get_filesystem_encoding()))
+ compat_setenv('YOUTUBE-DL-TEST', test_str)
self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str)
+ def test_compat_setenv(self):
+ test_var = 'YOUTUBE-DL-TEST'
+ test_str = 'тест'
+ compat_setenv(test_var, test_str)
+ compat_getenv(test_var)
+ self.assertEqual(compat_getenv(test_var), test_str)
+
def test_compat_expanduser(self):
old_home = os.environ.get('HOME')
test_str = 'C:\Documents and Settings\тест\Application Data'
- os.environ['HOME'] = (
- test_str if sys.version_info >= (3, 0)
- else test_str.encode(get_filesystem_encoding()))
+ compat_setenv('HOME', test_str)
self.assertEqual(compat_expanduser('~'), test_str)
- os.environ['HOME'] = old_home
+ compat_setenv('HOME', old_home or '')
def test_all_present(self):
import youtube_dl.compat
self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
+ def test_compat_etree_fromstring_doctype(self):
+ xml = '''<?xml version="1.0"?>
+<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">
+<smil xmlns="http://www.w3.org/2001/SMIL20/Language"></smil>'''
+ compat_etree_fromstring(xml)
+
+ def test_struct_unpack(self):
+ self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,))
+
+
if __name__ == '__main__':
unittest.main()
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
+def http_server_port(httpd):
+ if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket):
+ # In Jython SSLSocket is not a subclass of socket.socket
+ sock = httpd.socket.sock
+ else:
+ sock = httpd.socket
+ return sock.getsockname()[1]
+
+
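The helper reads back the ephemeral port the OS picked when the server was bound to port 0; the same trick works on a bare socket:

```python
import socket

# Bind to port 0 so the OS assigns any free ephemeral port, then read
# the chosen port back via getsockname() - the same lookup that
# http_server_port() performs on an HTTPServer's socket.
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.bind(('127.0.0.1', 0))
port = sock.getsockname()[1]
sock.close()
```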
class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
def log_message(self, format, *args):
pass
self.send_header('Content-Type', 'video/mp4')
self.end_headers()
self.wfile.write(b'\x00\x00\x00\x00\x20\x66\x74[video]')
+ elif self.path == '/302':
+ if sys.version_info[0] == 3:
+ # XXX: Python 3 http server does not allow non-ASCII header values
+ self.send_response(404)
+ self.end_headers()
+ return
+
+ new_url = 'http://localhost:%d/中文.html' % http_server_port(self.server)
+ self.send_response(302)
+ self.send_header(b'Location', new_url.encode('utf-8'))
+ self.end_headers()
+ elif self.path == '/%E4%B8%AD%E6%96%87.html':
+ self.send_response(200)
+ self.send_header('Content-Type', 'text/html; charset=utf-8')
+ self.end_headers()
+ self.wfile.write(b'<html><video src="/vid.mp4" /></html>')
else:
assert False
class TestHTTP(unittest.TestCase):
+ def setUp(self):
+ self.httpd = compat_http_server.HTTPServer(
+ ('localhost', 0), HTTPTestRequestHandler)
+ self.port = http_server_port(self.httpd)
+ self.server_thread = threading.Thread(target=self.httpd.serve_forever)
+ self.server_thread.daemon = True
+ self.server_thread.start()
+
+ def test_unicode_path_redirection(self):
+ # XXX: Python 3 http server does not allow non-ASCII header values
+ if sys.version_info[0] == 3:
+ return
+
+ ydl = YoutubeDL({'logger': FakeLogger()})
+ r = ydl.extract_info('http://localhost:%d/302' % self.port)
+ self.assertEqual(r['url'], 'http://localhost:%d/vid.mp4' % self.port)
+
+
+class TestHTTPS(unittest.TestCase):
def setUp(self):
certfn = os.path.join(TEST_DIR, 'testcert.pem')
self.httpd = compat_http_server.HTTPServer(
('localhost', 0), HTTPTestRequestHandler)
self.httpd.socket = ssl.wrap_socket(
self.httpd.socket, certfile=certfn, server_side=True)
- if os.name == 'java':
- # In Jython SSLSocket is not a subclass of socket.socket
- sock = self.httpd.socket.sock
- else:
- sock = self.httpd.socket
- self.port = sock.getsockname()[1]
+ self.port = http_server_port(self.httpd)
self.server_thread = threading.Thread(target=self.httpd.serve_forever)
self.server_thread.daemon = True
self.server_thread.start()
def setUp(self):
self.proxy = compat_http_server.HTTPServer(
('localhost', 0), _build_proxy_handler('normal'))
- self.port = self.proxy.socket.getsockname()[1]
+ self.port = http_server_port(self.proxy)
self.proxy_thread = threading.Thread(target=self.proxy.serve_forever)
self.proxy_thread.daemon = True
self.proxy_thread.start()
self.cn_proxy = compat_http_server.HTTPServer(
('localhost', 0), _build_proxy_handler('cn'))
- self.cn_port = self.cn_proxy.socket.getsockname()[1]
+ self.cn_port = http_server_port(self.cn_proxy)
self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever)
self.cn_proxy_thread.daemon = True
self.cn_proxy_thread.start()
--- /dev/null
+#!/usr/bin/env python
+# coding: utf-8
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import random
+import subprocess
+
+from test.helper import (
+ FakeYDL,
+ get_params,
+)
+from youtube_dl.compat import (
+ compat_str,
+ compat_urllib_request,
+)
+
+
+class TestMultipleSocks(unittest.TestCase):
+ @staticmethod
+ def _check_params(attrs):
+ params = get_params()
+ for attr in attrs:
+ if attr not in params:
+ print('Missing %s. Skipping.' % attr)
+ return
+ return params
+
+ def test_proxy_http(self):
+ params = self._check_params(['primary_proxy', 'primary_server_ip'])
+ if params is None:
+ return
+ ydl = FakeYDL({
+ 'proxy': params['primary_proxy']
+ })
+ self.assertEqual(
+ ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8'),
+ params['primary_server_ip'])
+
+ def test_proxy_https(self):
+ params = self._check_params(['primary_proxy', 'primary_server_ip'])
+ if params is None:
+ return
+ ydl = FakeYDL({
+ 'proxy': params['primary_proxy']
+ })
+ self.assertEqual(
+ ydl.urlopen('https://yt-dl.org/ip').read().decode('utf-8'),
+ params['primary_server_ip'])
+
+ def test_secondary_proxy_http(self):
+ params = self._check_params(['secondary_proxy', 'secondary_server_ip'])
+ if params is None:
+ return
+ ydl = FakeYDL()
+ req = compat_urllib_request.Request('http://yt-dl.org/ip')
+ req.add_header('Ytdl-request-proxy', params['secondary_proxy'])
+ self.assertEqual(
+ ydl.urlopen(req).read().decode('utf-8'),
+ params['secondary_server_ip'])
+
+ def test_secondary_proxy_https(self):
+ params = self._check_params(['secondary_proxy', 'secondary_server_ip'])
+ if params is None:
+ return
+ ydl = FakeYDL()
+ req = compat_urllib_request.Request('https://yt-dl.org/ip')
+ req.add_header('Ytdl-request-proxy', params['secondary_proxy'])
+ self.assertEqual(
+ ydl.urlopen(req).read().decode('utf-8'),
+ params['secondary_server_ip'])
+
+
+class TestSocks(unittest.TestCase):
+ _SKIP_SOCKS_TEST = True
+
+ def setUp(self):
+ if self._SKIP_SOCKS_TEST:
+ return
+
+ self.port = random.randint(20000, 30000)
+ self.server_process = subprocess.Popen([
+ 'srelay', '-f', '-i', '127.0.0.1:%d' % self.port],
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ def tearDown(self):
+ if self._SKIP_SOCKS_TEST:
+ return
+
+ self.server_process.terminate()
+ self.server_process.communicate()
+
+ def _get_ip(self, protocol):
+ if self._SKIP_SOCKS_TEST:
+ return '127.0.0.1'
+
+ ydl = FakeYDL({
+ 'proxy': '%s://127.0.0.1:%d' % (protocol, self.port),
+ })
+ return ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8')
+
+ def test_socks4(self):
+ self.assertTrue(isinstance(self._get_ip('socks4'), compat_str))
+
+ def test_socks4a(self):
+ self.assertTrue(isinstance(self._get_ip('socks4a'), compat_str))
+
+ def test_socks5(self):
+ self.assertTrue(isinstance(self._get_ip('socks5'), compat_str))
+
+
+if __name__ == '__main__':
+ unittest.main()
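_get_ip() builds the proxy URL from the SOCKS scheme and the local srelay port; the construction itself is trivial:

```python
def proxy_url(protocol, host, port):
    # protocol is one of 'socks4', 'socks4a' or 'socks5', matching the
    # schemes exercised by TestSocks above.
    return '%s://%s:%d' % (protocol, host, port)

url = proxy_url('socks5', '127.0.0.1', 1080)
```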
sanitize_path,
prepend_extension,
replace_extension,
+ remove_start,
+ remove_end,
remove_quotes,
shell_quote,
smuggle_url,
str_to_int,
strip_jsonp,
- struct_unpack,
timeconvert,
unescapeHTML,
unified_strdate,
self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True))
self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True))
- tests = 'a\xe4b\u4e2d\u56fd\u7684c'
- self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c')
+ tests = 'aäb\u4e2d\u56fd\u7684c'
+ self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c')
self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename
forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#'
self.assertTrue(sanitize_filename('-', restricted=True) != '')
self.assertTrue(sanitize_filename(':', restricted=True) != '')
+ self.assertEqual(sanitize_filename(
+ 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', restricted=True),
+ 'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYPssaaaaaaaeceeeeiiiionooooooooeuuuuuypy')
+
def test_sanitize_ids(self):
self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')
self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp')
self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp')
+ def test_remove_start(self):
+ self.assertEqual(remove_start(None, 'A - '), None)
+ self.assertEqual(remove_start('A - B', 'A - '), 'B')
+ self.assertEqual(remove_start('B - A', 'A - '), 'B - A')
+
+ def test_remove_end(self):
+ self.assertEqual(remove_end(None, ' - B'), None)
+ self.assertEqual(remove_end('A - B', ' - B'), 'A')
+ self.assertEqual(remove_end('B - A', ' - B'), 'B - A')
+
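A minimal sketch of helpers consistent with the assertions above (the real implementations live in youtube_dl/utils.py):

```python
def remove_start(s, start):
    # None passes through unchanged; the prefix is stripped only when present.
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    # Symmetric helper: strip the suffix only when present.
    return s[:-len(end)] if s is not None and s.endswith(end) else s
```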
def test_remove_quotes(self):
self.assertEqual(remove_quotes(None), None)
self.assertEqual(remove_quotes('"'), '"')
testPL(5, 2, (2, 99), [2, 3, 4])
testPL(5, 2, (20, 99), [])
- def test_struct_unpack(self):
- self.assertEqual(struct_unpack('!B', b'\x00'), (0,))
-
def test_read_batch_urls(self):
f = io.StringIO('''\xef\xbb\xbf foo
bar\r
json_code = js_to_json(inp)
self.assertEqual(json.loads(json_code), json.loads(inp))
+ inp = '''{
+ 0:{src:'skipped', type: 'application/dash+xml'},
+ 1:{src:'skipped', type: 'application/vnd.apple.mpegURL'},
+ }'''
+ self.assertEqual(js_to_json(inp), '''{
+ "0":{"src":"skipped", "type": "application/dash+xml"},
+ "1":{"src":"skipped", "type": "application/vnd.apple.mpegURL"}
+ }''')
+
def test_js_to_json_edgecases(self):
on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
on = js_to_json('{"abc": "def",}')
self.assertEqual(json.loads(on), {'abc': 'def'})
+ on = js_to_json('{ 0: /* " \n */ ",]" , }')
+ self.assertEqual(json.loads(on), {'0': ',]'})
+
+ on = js_to_json(r'["<p>x<\/p>"]')
+ self.assertEqual(json.loads(on), ['<p>x</p>'])
+
+ on = js_to_json(r'["\xaa"]')
+ self.assertEqual(json.loads(on), ['\u00aa'])
+
+ on = js_to_json("['a\\\nb']")
+ self.assertEqual(json.loads(on), ['ab'])
+
+ on = js_to_json('{0xff:0xff}')
+ self.assertEqual(json.loads(on), {'255': 255})
+
+ on = js_to_json('{077:077}')
+ self.assertEqual(json.loads(on), {'63': 63})
+
+ on = js_to_json('{42:42}')
+ self.assertEqual(json.loads(on), {'42': 42})
+
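The new numeric cases above (hex 0xff, legacy octal 077, plain decimal 42) all normalize to JSON decimal integers. A standalone sketch of that conversion, separate from the full js_to_json tokenizer:

```python
import re

def js_int_to_json(tok):
    # JSON only allows decimal integers, so JS hex (0x...) and legacy
    # octal (leading-zero) literals are rewritten as their decimal value.
    if re.match(r'0[xX][0-9a-fA-F]+$', tok):
        return str(int(tok, 16))
    if re.match(r'0[0-7]+$', tok):
        return str(int(tok, 8))
    return str(int(tok, 10))
```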
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
defaultargs = test --exclude test_download.py --exclude test_age_restriction.py
--exclude test_subtitles.py --exclude test_write_annotations.py
--exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py
+ --exclude test_socks.py
commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html
# test.test_download:TestDownload.test_NowVideo
PostProcessingError,
preferredencoding,
prepend_extension,
+ register_socks_protocols,
render_table,
replace_extension,
SameFileError,
['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
self._output_channel = os.fdopen(master, 'rb')
except OSError as ose:
- if ose.errno == 2:
+ if ose.errno == errno.ENOENT:
self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
else:
raise
for ph in self.params.get('progress_hooks', []):
self.add_progress_hook(ph)
+ register_socks_protocols()
+
def warn_if_short_id(self, argv):
# short YouTube ID starting with dash?
idxs = [
result_type = ie_result.get('_type', 'video')
if result_type in ('url', 'url_transparent'):
+ ie_result['url'] = sanitize_url(ie_result['url'])
extract_flat = self.params.get('extract_flat', False)
if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
extract_flat is True):
if 'title' not in info_dict:
raise ExtractorError('Missing "title" field in extractor result')
+ if not isinstance(info_dict['id'], compat_str):
+ self.report_warning('"id" field is not a string - forcing string conversion')
+ info_dict['id'] = compat_str(info_dict['id'])
+
if 'playlist' not in info_dict:
# It isn't part of a playlist
info_dict['playlist'] = None
if opts_cookiefile is None:
self.cookiejar = compat_cookiejar.CookieJar()
else:
+ opts_cookiefile = compat_expanduser(opts_cookiefile)
self.cookiejar = compat_cookiejar.MozillaCookieJar(
opts_cookiefile)
if os.access(opts_cookiefile, os.R_OK):
from .compat import (
compat_expanduser,
compat_getpass,
- compat_print,
compat_shlex_split,
workaround_optparse_bug9161,
)
# Custom HTTP headers
if opts.headers is not None:
for h in opts.headers:
- if h.find(':', 1) < 0:
+ if ':' not in h:
parser.error('wrong header formatting, it should be key:value, not "%s"' % h)
- key, value = h.split(':', 2)
+ key, value = h.split(':', 1)
if opts.verbose:
write_string('[debug] Adding header from command line option %s:%s\n' % (key, value))
std_headers[key] = value
# Dump user agent
if opts.dump_user_agent:
- compat_print(std_headers['User-Agent'])
+ write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
sys.exit(0)
# Batch file verification
if opts.batchfile == '-':
batchfd = sys.stdin
else:
- batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+ batchfd = io.open(
+ compat_expanduser(opts.batchfile),
+ 'r', encoding='utf-8', errors='ignore')
batch_urls = read_batch_urls(batchfd)
if opts.verbose:
write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
if opts.list_extractors:
for ie in list_extractors(opts.age_limit):
- compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))
+ write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)
matchedUrls = [url for url in all_urls if ie.suitable(url)]
for mu in matchedUrls:
- compat_print(' ' + mu)
+ write_string(' ' + mu + '\n', out=sys.stdout)
sys.exit(0)
if opts.list_extractor_descriptions:
for ie in list_extractors(opts.age_limit):
_SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
_COUNTS = ('', '5', '10', 'all')
desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
- compat_print(desc)
+ write_string(desc + '\n', out=sys.stdout)
sys.exit(0)
# Conflicting, missing and erroneous options
try:
if opts.load_info_filename is not None:
- retcode = ydl.download_with_info_file(opts.load_info_filename)
+ retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename))
else:
retcode = ydl.download(all_urls)
except MaxDownloadsReached:
import shlex
import shutil
import socket
+import struct
import subprocess
import sys
import itertools
except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error
+
+etree = xml.etree.ElementTree
+
+
+class _TreeBuilder(etree.TreeBuilder):
+ def doctype(self, name, pubid, system):
+ pass
+
if sys.version_info[0] >= 3:
- compat_etree_fromstring = xml.etree.ElementTree.fromstring
+ def compat_etree_fromstring(text):
+ return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
else:
# python 2.x tries to encode unicode strings with ascii (see the
# XMLParser._fixtext method)
- etree = xml.etree.ElementTree
-
try:
_etree_iter = etree.Element.iter
except AttributeError: # Python <=2.6
# 2.7 source
def _XML(text, parser=None):
if not parser:
- parser = etree.XMLParser(target=etree.TreeBuilder())
+ parser = etree.XMLParser(target=_TreeBuilder())
parser.feed(text)
return parser.close()
return el
def compat_etree_fromstring(text):
- doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+ doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))
for el in _etree_iter(doc):
if el.text is not None and isinstance(el.text, bytes):
el.text = el.text.decode('utf-8')
return parsed_result
try:
- from shlex import quote as shlex_quote
+ from shlex import quote as compat_shlex_quote
except ImportError: # Python < 3.3
- def shlex_quote(s):
+ def compat_shlex_quote(s):
if re.match(r'^[-_\w./]+$', s):
return s
else:
if sys.version_info >= (3, 0):
compat_getenv = os.getenv
compat_expanduser = os.path.expanduser
+
+ def compat_setenv(key, value, env=os.environ):
+ env[key] = value
else:
# Environment variables should be decoded with filesystem encoding.
# Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)
env = env.decode(get_filesystem_encoding())
return env
+ def compat_setenv(key, value, env=os.environ):
+ def encode(v):
+ from .utils import get_filesystem_encoding
+ return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
+ env[encode(key)] = encode(value)
+
# HACK: The default implementations of os.path.expanduser from cpython do not decode
# environment variables with filesystem encoding. We will work around this by
# providing adjusted implementations.
print(s)
-try:
- subprocess_check_output = subprocess.check_output
-except AttributeError:
- def subprocess_check_output(*args, **kwargs):
- assert 'input' not in kwargs
- p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
- output, _ = p.communicate()
- ret = p.poll()
- if ret:
- raise subprocess.CalledProcessError(ret, p.args, output=output)
- return output
-
if sys.version_info < (3, 0) and sys.platform == 'win32':
def compat_getpass(prompt, *args, **kwargs):
if isinstance(prompt, compat_str):
else:
compat_getpass = getpass.getpass
+try:
+ compat_input = raw_input
+except NameError: # Python 3
+ compat_input = input
+
# Python < 2.6.5 require kwargs to be bytes
try:
def _testfunc(x):
else:
from tokenize import generate_tokens as compat_tokenize_tokenize
+
+try:
+ struct.pack('!I', 0)
+except TypeError:
+ # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
+ # See https://bugs.python.org/issue19099
+ def compat_struct_pack(spec, *args):
+ if isinstance(spec, compat_str):
+ spec = spec.encode('ascii')
+ return struct.pack(spec, *args)
+
+ def compat_struct_unpack(spec, *args):
+ if isinstance(spec, compat_str):
+ spec = spec.encode('ascii')
+ return struct.unpack(spec, *args)
+else:
+ compat_struct_pack = struct.pack
+ compat_struct_unpack = struct.unpack
+
+
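The format strings used throughout ('!I', '!B', ...) are network-order specs; a quick round trip shows what the compat wrappers delegate to:

```python
import struct

# '!I' packs an unsigned 32-bit integer in network (big-endian) order;
# '!B' a single unsigned byte. compat_struct_pack/unpack only add the
# bytes-format-string workaround for old Python 2 interpreters on top
# of these stdlib calls.
packed = struct.pack('!I', 0x01020304)
value, = struct.unpack('!I', packed)
byte, = struct.unpack('!B', b'\x00')
```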
__all__ = [
'compat_HTMLParser',
'compat_HTTPError',
'compat_html_entities',
'compat_http_client',
'compat_http_server',
+ 'compat_input',
'compat_itertools_count',
'compat_kwargs',
'compat_ord',
'compat_os_name',
'compat_parse_qs',
'compat_print',
+ 'compat_setenv',
+ 'compat_shlex_quote',
'compat_shlex_split',
'compat_socket_create_connection',
'compat_str',
+ 'compat_struct_pack',
+ 'compat_struct_unpack',
'compat_subprocess_get_DEVNULL',
'compat_tokenize_tokenize',
'compat_urllib_error',
'compat_urlretrieve',
'compat_xml_parse_error',
'compat_xpath',
- 'shlex_quote',
- 'subprocess_check_output',
'workaround_optparse_bug9161',
]
import re
from .common import FileDownloader
+from ..compat import compat_setenv
from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS
from ..utils import (
cli_option,
'-headers',
''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
+ env = None
+ proxy = self.params.get('proxy')
+ if proxy:
+ if not re.match(r'^[\da-zA-Z]+://', proxy):
+ proxy = 'http://%s' % proxy
+ # Since December 2015 ffmpeg supports -http_proxy option (see
+ # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd)
+ # We could switch to the following code if we are able to detect version properly
+ # args += ['-http_proxy', proxy]
+ env = os.environ.copy()
+ compat_setenv('HTTP_PROXY', proxy, env=env)
+ compat_setenv('http_proxy', proxy, env=env)
+
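The pattern above (copy os.environ, inject the proxy variables, hand the copy to Popen) keeps the parent environment untouched; a simplified sketch using a plain substring check instead of the scheme regex:

```python
import os

def env_with_proxy(proxy):
    # Copy the parent environment and add the proxy variables so that
    # only the child process (here, ffmpeg) sees them; a bare host:port
    # gets an http:// scheme prepended, as in the downloader above.
    if '://' not in proxy:
        proxy = 'http://%s' % proxy
    env = os.environ.copy()
    env['HTTP_PROXY'] = proxy
    env['http_proxy'] = proxy
    return env

child_env = env_with_proxy('127.0.0.1:3128')
```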
protocol = info_dict.get('protocol')
if protocol == 'rtmp':
args += ['-rtmp_live', 'live']
args += ['-i', url, '-c', 'copy']
- if protocol == 'm3u8':
+ if protocol in ('m3u8', 'm3u8_native'):
if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
args += ['-f', 'mpegts']
else:
self._debug_cmd(args)
- proc = subprocess.Popen(args, stdin=subprocess.PIPE)
+ proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env)
try:
retval = proc.wait()
except KeyboardInterrupt:
compat_urlparse,
compat_urllib_error,
compat_urllib_parse_urlparse,
+ compat_struct_pack,
+ compat_struct_unpack,
)
from ..utils import (
encodeFilename,
fix_xml_ampersands,
sanitize_open,
- struct_pack,
- struct_unpack,
xpath_text,
)
+class DataTruncatedError(Exception):
+ pass
+
+
class FlvReader(io.BytesIO):
"""
Reader for Flv files
The file format is documented in https://www.adobe.com/devnet/f4v.html
"""
+ def read_bytes(self, n):
+ data = self.read(n)
+ if len(data) < n:
+ raise DataTruncatedError(
+ 'FlvReader error: need %d bytes while only %d bytes got' % (
+ n, len(data)))
+ return data
+
# Utility functions for reading numbers and strings
def read_unsigned_long_long(self):
- return struct_unpack('!Q', self.read(8))[0]
+ return compat_struct_unpack('!Q', self.read_bytes(8))[0]
def read_unsigned_int(self):
- return struct_unpack('!I', self.read(4))[0]
+ return compat_struct_unpack('!I', self.read_bytes(4))[0]
def read_unsigned_char(self):
- return struct_unpack('!B', self.read(1))[0]
+ return compat_struct_unpack('!B', self.read_bytes(1))[0]
def read_string(self):
res = b''
while True:
- char = self.read(1)
+ char = self.read_bytes(1)
if char == b'\x00':
break
res += char
Read a box and return the info as a tuple: (box_size, box_type, box_data)
"""
real_size = size = self.read_unsigned_int()
- box_type = self.read(4)
+ box_type = self.read_bytes(4)
header_end = 8
if size == 1:
real_size = self.read_unsigned_long_long()
header_end = 16
- return real_size, box_type, self.read(real_size - header_end)
+ return real_size, box_type, self.read_bytes(real_size - header_end)
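Box parsing above follows the F4V/ISO-BMFF layout: a 4-byte big-endian size (which includes the 8-byte header), a 4-byte type, then the payload. A self-contained sketch of the common case (it omits the 64-bit extended-size path taken when size == 1):

```python
import io
import struct

def read_box(stream):
    # 4-byte big-endian size + 4-byte type, then (size - 8) payload bytes.
    size, box_type = struct.unpack('!I4s', stream.read(8))
    return box_type, stream.read(size - 8)

# A hypothetical 12-byte 'mdat' box with a 4-byte payload.
buf = io.BytesIO(struct.pack('!I4s', 12, b'mdat') + b'DATA')
box_type, payload = read_box(buf)
```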
def read_asrt(self):
# version
self.read_unsigned_char()
# flags
- self.read(3)
+ self.read_bytes(3)
quality_entry_count = self.read_unsigned_char()
# QualityEntryCount
for i in range(quality_entry_count):
# version
self.read_unsigned_char()
# flags
- self.read(3)
+ self.read_bytes(3)
# time scale
self.read_unsigned_int()
# version
self.read_unsigned_char()
# flags
- self.read(3)
+ self.read_bytes(3)
self.read_unsigned_int() # BootstrapinfoVersion
# Profile,Live,Update,Reserved
def write_unsigned_int(stream, val):
- stream.write(struct_pack('!I', val))
+ stream.write(compat_struct_pack('!I', val))
def write_unsigned_int_24(stream, val):
- stream.write(struct_pack('!I', val)[1:])
+ stream.write(compat_struct_pack('!I', val)[1:])
def write_flv_header(stream):
doc = compat_etree_fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f)
for f in self._get_unencrypted_media(doc)]
- if requested_bitrate is None:
+ if requested_bitrate is None or len(formats) == 1:
# get the best format
formats = sorted(formats, key=lambda f: f[0])
rate, media = formats[-1]
down.close()
reader = FlvReader(down_data)
while True:
- _, box_type, box_data = reader.read_box_info()
+ try:
+ _, box_type, box_data = reader.read_box_info()
+ except DataTruncatedError:
+ if test:
+ # In tests, segments may be truncated, and thus
+ # FlvReader may not be able to parse the whole
+ # chunk. If so, write the segment as is
+ # See https://github.com/rg3/youtube-dl/issues/9214
+ dest_stream.write(down_data)
+ break
+ raise
if box_type == b'mdat':
dest_stream.write(box_data)
break
import re
from .fragment import FragmentFD
+from .external import FFmpegFD
from ..compat import compat_urlparse
from ..utils import (
FD_NAME = 'hlsnative'
+ @staticmethod
+ def can_download(manifest):
+ UNSUPPORTED_FEATURES = (
+ r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1]
+ r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
+
+ # Live streams heuristic does not always work (e.g. geo restricted to Germany
+ # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
+ # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3]
+
+            # This heuristic is not always correct either, since segments may
+            # not actually be appended. Twitch VODs of finished streams carry
+            # EXT-X-PLAYLIST-TYPE:EVENT even though no further segments will
+            # be appended to the playlist.
+ # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of
+ # # event media playlists [4]
+
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
+ # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
+ # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
+ # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
+ )
+ return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)
+
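The feature check is a simple all-regexes-absent test on the manifest text; under these hypothetical manifests, an unencrypted playlist passes and an AES-128 one is rejected:

```python
import re

UNSUPPORTED_FEATURES = (
    r'#EXT-X-KEY:METHOD=(?!NONE)',  # encrypted streams
    r'#EXT-X-BYTERANGE',            # byte-range media segments
)

def native_hls_ok(manifest):
    # Same shape as can_download() above: hlsnative applies only when
    # no unsupported feature tag appears anywhere in the m3u8 text.
    return all(not re.search(feature, manifest)
               for feature in UNSUPPORTED_FEATURES)

plain = '#EXTM3U\n#EXT-X-KEY:METHOD=NONE\nseg1.ts\n'
encrypted = '#EXTM3U\n#EXT-X-KEY:METHOD=AES-128,URI="key"\nseg1.ts\n'
```

The negative lookahead lets METHOD=NONE through because an unencrypted key line is harmless.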
def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
manifest = self.ydl.urlopen(man_url).read()
s = manifest.decode('utf-8', 'ignore')
+
+ if not self.can_download(s):
+ self.report_warning(
+ 'hlsnative has detected features it does not support, '
+ 'extraction will be delegated to ffmpeg')
+ fd = FFmpegFD(self.ydl, self.params)
+ for ph in self._progress_hooks:
+ fd.add_progress_hook(ph)
+ return fd.real_download(filename, info_dict)
+
fragment_urls = []
for line in s.splitlines():
line = line.strip()
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import re
+import time
+
+from .amp import AMPIE
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+
+
+class AbcNewsVideoIE(AMPIE):
+ IE_NAME = 'abcnews:video'
+ _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
+ 'info_dict': {
+ 'id': '20411932',
+ 'ext': 'mp4',
+ 'display_id': 'week-exclusive-irans-foreign-minister-zarif',
+ 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
+ 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
+ 'duration': 180,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+ video_id = mobj.group('id')
+ info_dict = self._extract_feed_info(
+ 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
+ info_dict.update({
+ 'id': video_id,
+ 'display_id': display_id,
+ })
+ return info_dict
+
+
+class AbcNewsIE(InfoExtractor):
+ IE_NAME = 'abcnews'
+ _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
+ 'info_dict': {
+ 'id': '10498713',
+ 'ext': 'flv',
+ 'display_id': 'dramatic-video-rare-death-job-america',
+ 'title': 'Occupational Hazards',
+ 'description': 'Nightline investigates the dangers that lurk at various jobs.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20100428',
+ 'timestamp': 1272412800,
+ },
+ 'add_ie': ['AbcNewsVideo'],
+ }, {
+ 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
+ 'info_dict': {
+ 'id': '39125818',
+ 'ext': 'mp4',
+ 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
+ 'title': 'Justin Timberlake Drops Hints For Secret Single',
+ 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
+ 'upload_date': '20160515',
+ 'timestamp': 1463329500,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ # The embedded YouTube video is blocked due to copyright issues
+ 'playlist_items': '1',
+ },
+ 'add_ie': ['AbcNewsVideo'],
+ }, {
+ 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._search_regex(
+ r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
+ full_video_url = compat_urlparse.urljoin(url, video_url)
+
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
+ webpage, 'YouTube URL', default=None)
+
+ timestamp = None
+ date_str = self._html_search_regex(
+ r'<span[^>]+class="timestamp">([^<]+)</span>',
+ webpage, 'timestamp', fatal=False)
+ if date_str:
+ tz_offset = 0
+ if date_str.endswith(' ET'): # Eastern Time
+ tz_offset = -5
+ date_str = date_str[:-3]
+ date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
+ for date_format in date_formats:
+ try:
+ timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
+ except ValueError:
+ continue
+ if timestamp is not None:
+ timestamp -= tz_offset * 3600
+
+ entry = {
+ '_type': 'url_transparent',
+ 'ie_key': AbcNewsVideoIE.ie_key(),
+ 'url': full_video_url,
+ 'id': video_id,
+ 'display_id': display_id,
+ 'timestamp': timestamp,
+ }
+
+ if youtube_url:
+ entries = [entry, self.url_result(youtube_url, 'Youtube')]
+ return self.playlist_result(entries)
+
+ return entry
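The timestamp logic above tries each known site format and applies a fixed UTC-5 offset for an ' ET' suffix. Extracted as a standalone sketch (note the fixed offset ignores daylight saving, as in the extractor):

```python
import calendar
import time

def parse_et_timestamp(date_str):
    # Try each date format the site uses; ' ET' is treated as a fixed
    # UTC-5 offset (a simplification that ignores daylight saving).
    tz_offset = 0
    if date_str.endswith(' ET'):
        tz_offset = -5
        date_str = date_str[:-3]
    for fmt in ('%b. %d, %Y', '%b %d, %Y, %I:%M %p'):
        try:
            return calendar.timegm(
                time.strptime(date_str.strip(), fmt)) - tz_offset * 3600
        except ValueError:
            continue
    return None

ts = parse_et_timestamp('May 15, 2016, 11:45 AM ET')
```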
for media_data in media_content:
media = media_data['@attributes']
media_type = media['type']
- if media_type == 'video/f4m':
+ if media_type in ('video/f4m', 'application/f4m+xml'):
formats.extend(self._extract_f4m_formats(
media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
video_id, f4m_id='hds', fatal=False))
media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.append({
- 'format_id': media_data['media-category']['@attributes']['label'],
+ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media['url'],
'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')),
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import hashlib
+import json
+import random
+import time
+
+from .common import InfoExtractor
+from ..aes import aes_encrypt
+from ..compat import compat_str
+from ..utils import (
+ bytes_to_intlist,
+ determine_ext,
+ intlist_to_bytes,
+ int_or_none,
+ strip_jsonp,
+)
+
+
+def md5_text(s):
+ if not isinstance(s, compat_str):
+ s = compat_str(s)
+ return hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+class AnvatoIE(InfoExtractor):
+ # Copied from anvplayer.min.js
+ _ANVACK_TABLE = {
+ 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
+ 'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA',
+ 'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP',
+ 'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv',
+ 'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7',
+ 'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR',
+ 'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg',
+ 'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto',
+ 'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY',
+ 'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh',
+ 'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK',
+ 'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D',
+ 'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad',
+ 'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp',
+ 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih',
+ 'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR',
+ 'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW',
+ 'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su',
+ 'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q',
+ 'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5',
+ 'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3',
+ 'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI',
+ 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s',
+ 'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz',
+ 'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg',
+ 'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x',
+ 'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH',
+ 'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX',
+ 'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc',
+ 'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK',
+ 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7',
+ 'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C',
+ 'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e',
+ 'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1',
+ 'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re',
+ 'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51',
+ 'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho',
+ 'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9',
+ 'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH',
+ 'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F',
+ 'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo',
+ 'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR',
+ 'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa',
+ 'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk',
+ 'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ',
+ 'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ',
+ 'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m',
+ 'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b',
+ 'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3',
+ 'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK',
+ 'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+ 'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+ 'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F',
+ 'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx',
+ 'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ',
+ 'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH',
+ 'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm',
+ 'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt',
+ 'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl',
+ 'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b',
+ 'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV',
+ 'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg',
+ 'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk',
+ 'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT',
+ 'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa',
+ 'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv',
+ 'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k',
+ 'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI',
+ 'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr',
+ 'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw',
+ 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K',
+ 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH',
+ 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK',
+ 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu',
+ 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+ 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+ 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK',
+ 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n',
+ 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD',
+ 'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk',
+ 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
+ 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
+ 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
+ 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
+ }
+
+ _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
+
+ def __init__(self, *args, **kwargs):
+ super(AnvatoIE, self).__init__(*args, **kwargs)
+ self.__server_time = None
+
+ def _server_time(self, access_key, video_id):
+ if self.__server_time is not None:
+ return self.__server_time
+
+ self.__server_time = int(self._download_json(
+ self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
+ note='Fetching server time')['server_time'])
+
+ return self.__server_time
+
+ def _api_prefix(self, access_key):
+ return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
+
+ def _get_video_json(self, access_key, video_id):
+ # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
+ video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
+ server_time = self._server_time(access_key, video_id)
+ input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
+
+ auth_secret = intlist_to_bytes(aes_encrypt(
+ bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
+
+ video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
+ anvrid = md5_text(time.time() * 1000 * random.random())[:30]
+ payload = {
+ 'api': {
+ 'anvrid': anvrid,
+ 'anvstk': md5_text('%s|%s|%d|%s' % (
+ access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])),
+ 'anvts': server_time,
+ },
+ }
+
+ return self._download_json(
+ video_data_url, video_id, transform_source=strip_jsonp,
+ data=json.dumps(payload).encode('utf-8'))
+
+ def _extract_anvato_videos(self, webpage, video_id):
+ anvplayer_data = self._parse_json(self._html_search_regex(
+ r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
+ 'Anvato player data'), video_id)
+
+ video_id = anvplayer_data['video']
+ access_key = anvplayer_data['accessKey']
+
+ video_data = self._get_video_json(access_key, video_id)
+
+ formats = []
+ for published_url in video_data['published_urls']:
+ video_url = published_url['embed_url']
+ ext = determine_ext(video_url)
+
+ if ext == 'smil':
+ formats.extend(self._extract_smil_formats(video_url, video_id))
+ continue
+
+ tbr = int_or_none(published_url.get('kbps'))
+ a_format = {
+ 'url': video_url,
+ 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
+ 'tbr': tbr if tbr != 0 else None,
+ }
+
+ if ext == 'm3u8':
+ # Not using _extract_m3u8_formats here as individual media
+ # playlists are also included in published_urls.
+ if tbr is None:
+ formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
+ continue
+ else:
+ a_format.update({
+ 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+ 'ext': 'mp4',
+ })
+ elif ext == 'mp3':
+ a_format['vcodec'] = 'none'
+ else:
+ a_format.update({
+ 'width': int_or_none(published_url.get('width')),
+ 'height': int_or_none(published_url.get('height')),
+ })
+ formats.append(a_format)
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for caption in video_data.get('captions', []):
+ a_caption = {
+ 'url': caption['url'],
+ 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
+ }
+ subtitles.setdefault(caption['language'], []).append(a_caption)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': video_data.get('def_title'),
+ 'description': video_data.get('def_description'),
+ 'categories': video_data.get('categories'),
+ 'thumbnail': video_data.get('thumbnail'),
+ 'subtitles': subtitles,
+ }
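For reference, the `anvstk` token built in `_get_video_json` is just an MD5 hex digest over a pipe-joined string. A minimal Python-3-style sketch (the access key, secret and server time below are placeholders; real values come from `_ANVACK_TABLE` and the `server_time` endpoint):

```python
import hashlib

def md5_text(s):
    # Same idea as the extractor's helper: MD5 hex digest of the UTF-8 text
    if not isinstance(s, str):
        s = str(s)
    return hashlib.md5(s.encode('utf-8')).hexdigest()

# Placeholder values for illustration only
access_key = 'example_access_key'
anvrid = md5_text('example-seed')[:30]  # extractor derives this from time/random
server_time = 1464800000
secret = 'example_secret'

anvstk = md5_text('%s|%s|%d|%s' % (access_key, anvrid, server_time, secret))
```

The resulting `anvstk` is always a 32-character lowercase hex string, which is what the payload sends in its `api` dict.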
}
-class ArteTVPlus7IE(InfoExtractor):
- IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'
-
+class ArteTVBaseIE(InfoExtractor):
@classmethod
def _extract_url_info(cls, url):
mobj = re.match(cls._VALID_URL, url)
video_id = mobj.group('id')
return video_id, lang
- def _real_extract(self, url):
- video_id, lang = self._extract_url_info(url)
- webpage = self._download_webpage(url, video_id)
- return self._extract_from_webpage(webpage, video_id, lang)
-
- def _extract_from_webpage(self, webpage, video_id, lang):
- patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
- ids = (video_id, '')
- # some pages contain multiple videos (like
- # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
- # so we first try to look for json URLs that contain the video id from
- # the 'vid' parameter.
- patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
- json_url = self._html_search_regex(
- patterns, webpage, 'json vp url', default=None)
- if not json_url:
- def find_iframe_url(webpage, default=NO_DEFAULT):
- return self._html_search_regex(
- r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
- webpage, 'iframe url', group='url', default=default)
-
- iframe_url = find_iframe_url(webpage, None)
- if not iframe_url:
- embed_url = self._html_search_regex(
- r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
- if embed_url:
- player = self._download_json(
- embed_url, video_id, 'Downloading player page')
- iframe_url = find_iframe_url(player['html'])
- # en and es URLs produce react-based pages with different layout (e.g.
- # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
- if not iframe_url:
- program = self._search_regex(
- r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
- webpage, 'program', default=None)
- if program:
- embed_html = self._parse_json(program, video_id)
- if embed_html:
- iframe_url = find_iframe_url(embed_html['embed_html'])
- if iframe_url:
- json_url = compat_parse_qs(
- compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
- if json_url:
- title = self._search_regex(
- r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
- webpage, 'title', default=None, group='title')
- return self._extract_from_json_url(json_url, video_id, lang, title=title)
- # Different kind of embed URL (e.g.
- # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
- embed_url = self._search_regex(
- r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
- webpage, 'embed url', group='url')
- return self.url_result(embed_url)
-
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
'es': 'E[ESP]',
}
+ langcode = LANGS.get(lang, lang)
+
formats = []
for format_id, format_dict in player_info['VSR'].items():
f = dict(format_dict)
versionCode = f.get('versionCode')
- langcode = LANGS.get(lang, lang)
- lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)]
- lang_pref = None
- if versionCode:
- matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)]
- lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs)
- source_pref = 0
- if versionCode is not None:
- # The original version with subtitles has lower relevance
- if re.match(r'VO-ST(F|A|E)', versionCode):
- source_pref -= 10
- # The version with sourds/mal subtitles has also lower relevance
- elif re.match(r'VO?(F|A|E)-STM\1', versionCode):
- source_pref -= 9
+ l = re.escape(langcode)
+
+ # Language preference from most to least priority
+ # Reference: section 5.6.3 of
+ # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf
+ PREFERENCES = (
+ # original version in requested language, without subtitles
+ r'VO{0}$'.format(l),
+ # original version in requested language, with partial subtitles in requested language
+ r'VO{0}-ST{0}$'.format(l),
+ # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+ r'VO{0}-STM{0}$'.format(l),
+ # non-original (dubbed) version in requested language, without subtitles
+ r'V{0}$'.format(l),
+ # non-original (dubbed) version in requested language, with partial subtitles in requested language
+ r'V{0}-ST{0}$'.format(l),
+ # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+ r'V{0}-STM{0}$'.format(l),
+ # original version in requested language, with partial subtitles in different language
+ r'VO{0}-ST(?!{0}).+?$'.format(l),
+ # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
+ r'VO{0}-STM(?!{0}).+?$'.format(l),
+ # original version in different language, with partial subtitles in requested language
+ r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
+ # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
+ r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
+ # original version in different language, without subtitles
+ r'VO(?:(?!{0}))?$'.format(l),
+ # original version in different language, with partial subtitles in different language
+ r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
+ # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
+ r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
+ )
+
+ for pref, p in enumerate(PREFERENCES):
+ if versionCode and re.match(p, versionCode):
+ lang_pref = len(PREFERENCES) - pref
+ break
+ else:
+ lang_pref = -1
+
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')),
'quality': qfunc(f.get('quality')),
- 'source_preference': source_pref,
}
if f.get('mediaType') == 'rtmp':
return info_dict
+class ArteTVPlus7IE(ArteTVBaseIE):
+ IE_NAME = 'arte.tv:+7'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ video_id, lang = self._extract_url_info(url)
+ webpage = self._download_webpage(url, video_id)
+ return self._extract_from_webpage(webpage, video_id, lang)
+
+ def _extract_from_webpage(self, webpage, video_id, lang):
+ patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
+ ids = (video_id, '')
+ # some pages contain multiple videos (like
+ # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
+ # so we first try to look for json URLs that contain the video id from
+ # the 'vid' parameter.
+ patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
+ json_url = self._html_search_regex(
+ patterns, webpage, 'json vp url', default=None)
+ if not json_url:
+ def find_iframe_url(webpage, default=NO_DEFAULT):
+ return self._html_search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
+ webpage, 'iframe url', group='url', default=default)
+
+ iframe_url = find_iframe_url(webpage, None)
+ if not iframe_url:
+ embed_url = self._html_search_regex(
+ r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
+ if embed_url:
+ player = self._download_json(
+ embed_url, video_id, 'Downloading player page')
+ iframe_url = find_iframe_url(player['html'])
+ # en and es URLs produce react-based pages with different layout (e.g.
+ # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
+ if not iframe_url:
+ program = self._search_regex(
+ r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
+ webpage, 'program', default=None)
+ if program:
+ embed_html = self._parse_json(program, video_id)
+ if embed_html:
+ iframe_url = find_iframe_url(embed_html['embed_html'])
+ if iframe_url:
+ json_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
+ if json_url:
+ title = self._search_regex(
+ r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
+ webpage, 'title', default=None, group='title')
+ return self._extract_from_json_url(json_url, video_id, lang, title=title)
+ # Different kind of embed URL (e.g.
+ # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
+ webpage, 'embed url', group='url')
+ return self.url_result(embed_url)
+
+
# It also uses the arte_vp_url url from the webpage to extract the information
class ArteTVCreativeIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:creative'
IE_NAME = 'arte.tv:info'
_VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
'info_dict': {
'id': '067528-000-A',
'title': 'Service civique, un cache misère ?',
'upload_date': '20160403',
},
- }
+ }]
class ArteTVFutureIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:ddc'
_VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
+ _TESTS = []
+
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
if lang == 'folge':
IE_NAME = 'arte.tv:concert'
_VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
'md5': '9ea035b7bd69696b67aa2ccaaa218161',
'info_dict': {
'upload_date': '20140128',
'description': 'md5:486eb08f991552ade77439fe6d82c305',
},
- }
+ }]
class ArteTVCinemaIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:cinema'
_VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://cinema.arte.tv/de/node/38291',
'md5': '6b275511a5107c60bacbeeda368c3aa1',
'info_dict': {
'upload_date': '20160122',
'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
},
- }
+ }]
class ArteTVMagazineIE(ArteTVPlus7IE):
)
'''
+ _TESTS = []
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang')
json_url = mobj.group('json_url')
return self._extract_from_json_url(json_url, video_id, lang)
+
+
+class ArteTVPlaylistIE(ArteTVBaseIE):
+ IE_NAME = 'arte.tv:playlist'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)'
+
+ _TESTS = [{
+ 'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV',
+ 'info_dict': {
+ 'id': 'PL-013263',
+ 'title': 'Areva & Uramin',
+ },
+ 'playlist_mincount': 6,
+ }, {
+ 'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id, lang = self._extract_url_info(url)
+ collection = self._download_json(
+ 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
+ % (lang, playlist_id), playlist_id)
+ title = collection.get('title')
+ description = collection.get('shortDescription') or collection.get('teaserText')
+ entries = [
+ self._extract_from_json_url(
+ video['jsonUrl'], video.get('programId') or playlist_id, lang)
+ for video in collection['videos'] if video.get('jsonUrl')]
+ return self.playlist_result(entries, playlist_id, title, description)
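The ordered-preference matching introduced in `_extract_from_json_url` leans on the position of the first matching pattern: earlier entries in the tuple win, and a fall-through means no preference. A condensed sketch with a made-up three-pattern subset for a French (`F`) page:

```python
import re

def lang_preference(version_code, preferences):
    # Earlier patterns carry higher preference; -1 means no pattern
    # matched (the extractor expresses this with a for/else, the early
    # return here plays the same role).
    for pref, pattern in enumerate(preferences):
        if re.match(pattern, version_code):
            return len(preferences) - pref
    return -1

# Toy subset of PREFERENCES, not the full fourteen-pattern tuple
PREFS = (r'VOF$', r'VOF-STF$', r'VF$')
```

`re.match` anchors at the start and each pattern ends in `$`, so `'VOF-STF'` cannot accidentally satisfy the bare `VOF$` pattern.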
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '2b68e5851514c20efdff2afc5603b8b4',
+ 'md5': '73d0b3171568232574e45652f8720b5c',
'info_dict': {
'id': '2650410135',
'ext': 'mp3',
if m_trackinfo:
json_code = m_trackinfo.group(1)
data = json.loads(json_code)[0]
+ track_id = compat_str(data['id'])
+
+ if not data.get('file'):
+ raise ExtractorError('Not streamable', video_id=track_id, expected=True)
formats = []
for format_id, format_url in data['file'].items():
self._sort_formats(formats)
return {
- 'id': compat_str(data['id']),
+ 'id': track_id,
'title': data['title'],
'formats': formats,
'duration': float_or_none(data.get('duration')),
# coding: utf-8
from __future__ import unicode_literals
+import calendar
+import datetime
import re
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_etree_fromstring,
+ compat_str,
+ compat_parse_qs,
+ compat_xml_parse_error,
+)
from ..utils import (
- int_or_none,
- unescapeHTML,
ExtractorError,
+ int_or_none,
+ float_or_none,
xpath_text,
)
class BiliBiliIE(InfoExtractor):
- _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?'
+ _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.bilibili.tv/video/av1074402/',
- 'md5': '2c301e4dab317596e837c3e7633e7d86',
+ 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
'id': '1554319',
'ext': 'flv',
'title': '【金坷垃】金泡沫',
- 'duration': 308313,
+ 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
+ 'duration': 308.067,
+ 'timestamp': 1398012660,
'upload_date': '20140420',
'thumbnail': 're:^https?://.+\.jpg',
- 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
- 'timestamp': 1397983878,
'uploader': '菊子桑',
+ 'uploader_id': '156160',
},
}, {
'url': 'http://www.bilibili.com/video/av1041170/',
'id': '1041170',
'title': '【BD1080P】刀语【诸神&异域】',
'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~',
- 'uploader': '枫叶逝去',
- 'timestamp': 1396501299,
},
'playlist_count': 9,
+ }, {
+ 'url': 'http://www.bilibili.com/video/av4808130/',
+ 'info_dict': {
+ 'id': '4808130',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929',
+ },
+ 'playlist': [{
+ 'md5': '55cdadedf3254caaa0d5d27cf20a8f9c',
+ 'info_dict': {
+ 'id': '4808130_part1',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }, {
+ 'md5': '926f9f67d0c482091872fbd8eca7ea3d',
+ 'info_dict': {
+ 'id': '4808130_part2',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }, {
+ 'md5': '4b7b225b968402d7c32348c646f1fd83',
+ 'info_dict': {
+ 'id': '4808130_part3',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }, {
+ 'md5': '7b795e214166501e9141139eea236e91',
+ 'info_dict': {
+ 'id': '4808130_part4',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }],
+ }, {
+ # Missing upload time
+ 'url': 'http://www.bilibili.com/video/av1867637/',
+ 'info_dict': {
+ 'id': '2880301',
+ 'ext': 'flv',
+ 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】',
+ 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】',
+ 'uploader': '黑夜为猫',
+ 'uploader_id': '610729',
+ },
+ 'params': {
+ # Just to test metadata extraction
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['upload time'],
}]
+ # BiliBili blocks keys from time to time. The current key is extracted from
+ # the Android client
+ # TODO: find the sign algorithm used in the flash player
+ _APP_KEY = '86385cdc024c0f6c'
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- page_num = mobj.group('page_num') or '1'
- view_data = self._download_json(
- 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num),
- video_id)
- if 'error' in view_data:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True)
+ webpage = self._download_webpage(url, video_id)
- cid = view_data['cid']
- title = unescapeHTML(view_data['title'])
+ params = compat_parse_qs(self._search_regex(
+ [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
+ r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
+ webpage, 'player parameters'))
+ cid = params['cid'][0]
- doc = self._download_xml(
- 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid,
- cid,
- 'Downloading page %s/%s' % (page_num, view_data['pages'])
- )
+ info_xml_str = self._download_webpage(
+ 'http://interface.bilibili.com/v_cdn_play',
+ cid, query={'appkey': self._APP_KEY, 'cid': cid},
+ note='Downloading video info page')
+
+ err_msg = None
+ durls = None
+ info_xml = None
+ try:
+ info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8'))
+ except compat_xml_parse_error:
+ info_json = self._parse_json(info_xml_str, video_id, fatal=False)
+ err_msg = (info_json or {}).get('error_text')
+ else:
+ err_msg = xpath_text(info_xml, './message')
- if xpath_text(doc, './result') == 'error':
- raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True)
+ if info_xml is not None:
+ durls = info_xml.findall('./durl')
+ if not durls:
+ if err_msg:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True)
+ else:
+ raise ExtractorError('No videos found!')
entries = []
- for durl in doc.findall('./durl'):
+ for durl in durls:
size = xpath_text(durl, ['./filesize', './size'])
formats = [{
'url': durl.find('./url').text,
'filesize': int_or_none(size),
- 'ext': 'flv',
}]
- backup_urls = durl.find('./backup_url')
- if backup_urls is not None:
- for backup_url in backup_urls.findall('./url'):
- formats.append({'url': backup_url.text})
- formats.reverse()
+ for backup_url in durl.findall('./backup_url/url'):
+ formats.append({
+ 'url': backup_url.text,
+ # backup URLs have lower priority
+ 'preference': -2 if 'hd.mp4' in backup_url.text else -3,
+ })
+
+ self._sort_formats(formats)
entries.append({
'id': '%s_part%s' % (cid, xpath_text(durl, './order')),
- 'title': title,
'duration': int_or_none(xpath_text(durl, './length'), 1000),
'formats': formats,
})
+ title = self._html_search_regex(r'<h1[^>]+title="([^"]+)">', webpage, 'title')
+ description = self._html_search_meta('description', webpage)
+ datetime_str = self._html_search_regex(
+ r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False)
+ timestamp = None
+ if datetime_str:
+ timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple())
+
+ # TODO 'view_count' requires deobfuscating JavaScript
info = {
'id': compat_str(cid),
'title': title,
- 'description': view_data.get('description'),
- 'thumbnail': view_data.get('pic'),
- 'uploader': view_data.get('author'),
- 'timestamp': int_or_none(view_data.get('created')),
- 'view_count': int_or_none(view_data.get('play')),
- 'duration': int_or_none(xpath_text(doc, './timelength')),
+ 'description': description,
+ 'timestamp': timestamp,
+ 'thumbnail': self._html_search_meta('thumbnailUrl', webpage),
+ 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000),
}
+ uploader_mobj = re.search(
+ r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
+ webpage)
+ if uploader_mobj:
+ info.update({
+ 'uploader': uploader_mobj.group('name'),
+ 'uploader_id': uploader_mobj.group('id'),
+ })
+
+ for entry in entries:
+ entry.update(info)
+
if len(entries) == 1:
- entries[0].update(info)
return entries[0]
else:
- info.update({
+ for idx, entry in enumerate(entries):
+ entry['id'] = '%s_part%d' % (video_id, idx + 1)
+
+ return {
'_type': 'multi_video',
'id': video_id,
+ 'title': title,
+ 'description': description,
'entries': entries,
- })
- return info
+ }
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BIQLEIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
+ _TESTS = [{
+ 'url': 'http://www.biqle.ru/watch/847655_160197695',
+ 'md5': 'ad5f746a874ccded7b8f211aeea96637',
+ 'info_dict': {
+ 'id': '160197695',
+ 'ext': 'mp4',
+ 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)',
+ 'uploader': 'Andrey Rogozin',
+ 'upload_date': '20110605',
+ }
+ }, {
+ 'url': 'https://biqle.org/watch/-44781847_168547604',
+ 'md5': '7f24e72af1db0edf7c1aaba513174f97',
+ 'info_dict': {
+ 'id': '168547604',
+ 'ext': 'mp4',
+ 'title': 'Ребенок в шоке от автоматической мойки',
+ 'uploader': 'Dmitry Kotov',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ embed_url = self._proto_relative_url(self._search_regex(
+ r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url'))
+
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ }
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
'description': 'md5:a8ba0302912d03d246979735c17d2761',
},
+ 'params': {
+ 'format': 'best[format_id^=hds]',
+ },
}, {
'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
'only_matching': True,
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):
+ video_id = compat_str(video_info['id'])
publisher_id = video_info.get('publisherId')
info = {
- 'id': compat_str(video_info['id']),
+ 'id': video_id,
'title': video_info['displayName'].strip(),
'description': video_info.get('shortDescription'),
'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
url_comp = compat_urllib_parse_urlparse(url)
if url_comp.path.endswith('.m3u8'):
formats.extend(
- self._extract_m3u8_formats(url, info['id'], 'mp4'))
+ self._extract_m3u8_formats(
+ url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
continue
elif 'akamaihd.net' in url_comp.netloc:
# This type of renditions are served through
a_format.update({
'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''),
'ext': 'mp4',
- 'protocol': 'm3u8',
+ 'protocol': 'm3u8_native',
})
formats.append(a_format)
return ad_info
if 'url' not in info and not info.get('formats'):
- raise ExtractorError('Unable to extract video url for %s' % info['id'])
+ raise ExtractorError('Unable to extract video url for %s' % video_id)
return info
# non numeric ref: prefixed video id
'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
'only_matching': True,
+ }, {
+ # unavailable video without message but with error_code
+ 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
+ 'only_matching': True,
}]
@staticmethod
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- json_data = self._parse_json(e.cause.read().decode(), video_id)
- raise ExtractorError(json_data[0]['message'], expected=True)
+ json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
+ raise ExtractorError(
+ json_data.get('message') or json_data['error_code'], expected=True)
raise
title = json_data['name'].strip()
if not src:
continue
formats.extend(self._extract_m3u8_formats(
- src, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
elif source_type == 'application/dash+xml':
if not src:
continue
_VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
_TEST = {
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
+ 'md5': '05850eb8c749e2ee05ad5a1c34668493',
'info_dict': {
'id': 'studio-c-season-5-episode-5',
'ext': 'mp4',
},
'params': {
'skip_download': True,
- }
+ },
+ 'add_ie': ['Ooyala'],
}
def _real_extract(self, url):
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
HEADRequest,
unified_strdate,
- url_basename,
qualities,
int_or_none,
)
class CanalplusIE(InfoExtractor):
IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv'
- _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:
+ (?:(?:www|m)\.)?canalplus\.fr|
+ (?:www\.)?piwiplus\.fr|
+ (?:www\.)?d8\.tv|
+ (?:www\.)?d17\.tv|
+ (?:www\.)?itele\.fr
+ )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?|
+ player\.canalplus\.fr/#/(?P<id>\d+)
+ )
+ '''
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
_SITE_ID_MAP = {
- 'canalplus.fr': 'cplus',
- 'piwiplus.fr': 'teletoon',
- 'd8.tv': 'd8',
- 'itele.fr': 'itele',
+ 'canalplus': 'cplus',
+ 'piwiplus': 'teletoon',
+ 'd8': 'd8',
+ 'd17': 'd17',
+ 'itele': 'itele',
}
_TESTS = [{
- 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
- 'md5': '12164a6f14ff6df8bd628e8ba9b10b78',
+ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
+ 'md5': '41f438a4904f7664b91b4ed0dec969dc',
'info_dict': {
- 'id': '1263092',
+ 'id': '1192814',
'ext': 'mp4',
- 'title': 'Le Zapping - 13/05/15',
- 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
- 'upload_date': '20150513',
+ 'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014",
+ 'description': "Toute l'année 2014 dans un Zapping exceptionnel !",
+ 'upload_date': '20150105',
},
}, {
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
},
'skip': 'Only works from France',
}, {
- 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+ 'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231',
'info_dict': {
- 'id': '966289',
- 'ext': 'flv',
- 'title': 'Campagne intime - Documentaire exceptionnel',
- 'description': 'md5:d2643b799fb190846ae09c61e59a859f',
- 'upload_date': '20131108',
+ 'id': '1390231',
+ 'ext': 'mp4',
+ 'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité",
+ 'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6',
+ 'upload_date': '20160512',
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'skip': 'videos get deleted after a while',
}, {
- 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
- 'md5': '38b8f7934def74f0d6f3ba6c036a5f82',
+ 'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224',
'info_dict': {
- 'id': '1213714',
+ 'id': '1398334',
'ext': 'mp4',
- 'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45',
- 'description': 'md5:8216206ec53426ea6321321f3b3c16db',
- 'upload_date': '20150211',
+ 'title': "L'invité de Bruce Toussaint du 07/06/2016 - ",
+ 'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324',
+ 'upload_date': '20160607',
},
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://m.canalplus.fr/?vid=1398231',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061',
+ 'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.groupdict().get('id')
+ video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid')
- site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal']
+ site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]]
# Beware, some subclasses do not define an id group
- display_id = url_basename(mobj.group('path'))
+ display_id = mobj.group('display_id') or video_id
if video_id is None:
webpage = self._download_webpage(url, display_id)
import re
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+ js_to_json,
+ smuggle_url,
+)
class CBCIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
# with mediaId
'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
+ 'md5': '97e24d09672fc4cf56256d6faa6c25bc',
'info_dict': {
'id': '2682904050',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Don Cherry – All-Stars',
'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
- 'timestamp': 1454475540,
+ 'timestamp': 1454463000,
'upload_date': '20160203',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
+ 'uploader': 'CBCC-NEW',
},
}, {
# with clipId
'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
+ 'md5': '0274a90b51a9b4971fe005c63f592f12',
'info_dict': {
'id': '2487345465',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Robin Williams freestyles on 90 Minutes Live',
'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
- 'upload_date': '19700101',
+ 'upload_date': '19780210',
'uploader': 'CBCC-NEW',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
+ 'timestamp': 255977160,
},
}, {
# multiple iframes
'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
'playlist': [{
+ 'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
'info_dict': {
'id': '2680832926',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
- 'upload_date': '19700101',
+ 'upload_date': '20160201',
+ 'timestamp': 1454342820,
+ 'uploader': 'CBCC-NEW',
},
}, {
+ 'md5': '415a0e3f586113894174dfb31aa5bb1a',
'info_dict': {
'id': '2658915080',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Fly like an eagle!',
'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
- 'upload_date': '19700101',
+ 'upload_date': '20150315',
+ 'timestamp': 1426443984,
+ 'uploader': 'CBCC-NEW',
},
}],
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
}]
@classmethod
class CBCPlayerIE(InfoExtractor):
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
+ 'md5': '64d25f841ddf4ddb28a235338af32e2c',
'info_dict': {
'id': '2683190193',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Gerry Runs a Sweat Shop',
'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
- 'timestamp': 1455067800,
+ 'timestamp': 1455071400,
'upload_date': '20160210',
+ 'uploader': 'CBCC-NEW',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
+ }, {
+ # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
+ 'url': 'http://www.cbc.ca/player/play/2657631896',
+ 'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
+ 'info_dict': {
+ 'id': '2657631896',
+ 'ext': 'mp3',
+ 'title': 'CBC Montreal is organizing its first ever community hackathon!',
+ 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
+ 'timestamp': 1425704400,
+ 'upload_date': '20150307',
+ 'uploader': 'CBCC-NEW',
},
- }
+ }, {
+ # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
+ 'url': 'http://www.cbc.ca/player/play/2164402062',
+ 'md5': '17a61eb813539abea40618d6323a7f82',
+ 'info_dict': {
+ 'id': '2164402062',
+ 'ext': 'flv',
+ 'title': 'Cancer survivor four times over',
+ 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
+ 'timestamp': 1320410746,
+ 'upload_date': '20111104',
+ 'uploader': 'CBCC-NEW',
+ },
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- return self.url_result(
- 'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id,
- 'ThePlatformFeed', video_id)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(
+ 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, {
+ 'force_smil_url': True
+ }),
+ 'id': video_id,
+ }
from __future__ import unicode_literals
+import re
+
from .theplatform import ThePlatformIE
from ..utils import (
xpath_text,
class CBSIE(CBSBaseIE):
- _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
+ _VALID_URL = r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- content_id = self._search_regex(
- [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"],
- webpage, 'content id')
+ content_id, display_id = re.match(self._VALID_URL, url).groups()
+ if not content_id:
+ webpage = self._download_webpage(url, display_id)
+ content_id = self._search_regex(
+ [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"],
+ webpage, 'content id')
items_data = self._download_xml(
'http://can.cbs.com/thunder/player/videoPlayerService.php',
content_id, query={'partner': 'cbs', 'contentId': content_id})
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import datetime
+
+from .anvato import AnvatoIE
+from .sendtonews import SendtoNewsIE
+from ..compat import compat_urlparse
+
+
+class CBSLocalIE(AnvatoIE):
+ _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'
+
+ _TESTS = [{
+ # Anvato backend
+ 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
+ 'md5': 'f0ee3081e3843f575fccef901199b212',
+ 'info_dict': {
+ 'id': '3401037',
+ 'ext': 'mp4',
+ 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
+ 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
+ 'thumbnail': 're:^https?://.*',
+ 'timestamp': 1463440500,
+ 'upload_date': '20160516',
+ 'subtitles': {
+ 'en': 'mincount:5',
+ },
+ 'categories': [
+ 'Stations\\Spoken Word\\KCBSTV',
+ 'Syndication\\MSN',
+ 'Syndication\\NDN',
+ 'Syndication\\AOL',
+ 'Syndication\\Yahoo',
+ 'Syndication\\Tribune',
+ 'Syndication\\Curb.tv',
+ 'Content\\News'
+ ],
+ },
+ }, {
+ # SendtoNews embed
+ 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
+ 'info_dict': {
+ 'id': 'GxfCe0Zo7D-175909-5588',
+ 'ext': 'mp4',
+ 'title': 'Recap: CLE 15, CIN 6',
+ 'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
+ 'upload_date': '20160516',
+ 'timestamp': 1463433840,
+ 'duration': 49,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ sendtonews_url = SendtoNewsIE._extract_url(webpage)
+ if sendtonews_url:
+ info_dict = {
+ '_type': 'url_transparent',
+ 'url': compat_urlparse.urljoin(url, sendtonews_url),
+ }
+ else:
+ info_dict = self._extract_anvato_videos(webpage, display_id)
+
+ time_str = self._html_search_regex(
+ r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False)
+ timestamp = None
+ if time_str:
+ timestamp = calendar.timegm(datetime.datetime.strptime(
+ time_str, '%b %d, %Y %I:%M %p').timetuple())
+
+ info_dict.update({
+ 'display_id': display_id,
+ 'timestamp': timestamp,
+ })
+
+ return info_dict
'skip_download': True,
},
}, {
- 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
'info_dict': {
- 'id': '61924494876844374',
+ 'id': '61924494877028507',
'ext': 'mp4',
- 'title': 'První republika: Zpěvačka z Dupárny Bobina',
- 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+ 'title': 'Hyde Park Civilizace: Bonus 01 - En',
+ 'description': 'English Subtittles',
'thumbnail': 're:^https?://.*\.jpg',
- 'duration': 88.4,
+ 'duration': 81.3,
},
'params': {
# m3u8 download
'skip_download': True,
},
+ }, {
+ # live stream
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
+ 'info_dict': {
+ 'id': 402,
+ 'ext': 'mp4',
+ 'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'is_live': True,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'skip': 'Georestricted to Czech Republic',
}, {
# video with 18+ caution trailer
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
- playlist_title = self._og_search_title(webpage)
- playlist_description = self._og_search_description(webpage)
+ playlist_title = self._og_search_title(webpage, default=None)
+ playlist_description = self._og_search_description(webpage, default=None)
playlist = self._download_json(req, playlist_id)['playlist']
playlist_len = len(playlist)
entries = []
for item in playlist:
+ is_live = item.get('type') == 'LIVE'
formats = []
for format_id, stream_url in item['streamUrls'].items():
formats.extend(self._extract_m3u8_formats(
stream_url, playlist_id, 'mp4',
- entry_protocol='m3u8_native', fatal=False))
+ entry_protocol='m3u8' if is_live else 'm3u8_native',
+ fatal=False))
self._sort_formats(formats)
item_id = item.get('id') or item['assetId']
if subs:
subtitles = self.extract_subtitles(episode_id, subs)
+ if playlist_len == 1:
+ final_title = playlist_title or title
+ if is_live:
+ final_title = self._live_title(final_title)
+ else:
+ final_title = '%s (%s)' % (playlist_title, title)
+
entries.append({
'id': item_id,
- 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title),
+ 'title': final_title,
'description': playlist_description if playlist_len == 1 else None,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
+ 'is_live': is_live,
})
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
'''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
- _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
-
- _TESTS = [
- {
- 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
- 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
- 'info_dict': {
- 'id': 'Events/TechEd/Australia/2013/KOS002',
- 'ext': 'mp4',
- 'title': 'Developer Kick-Off Session: Stuff We Love',
- 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
- 'duration': 4576,
- 'thumbnail': 're:http://.*\.jpg',
- 'session_code': 'KOS002',
- 'session_day': 'Day 1',
- 'session_room': 'Arena 1A',
- 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
- },
+ _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+
+ _TESTS = [{
+ 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+ 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
+ 'info_dict': {
+ 'id': 'Events/TechEd/Australia/2013/KOS002',
+ 'ext': 'mp4',
+ 'title': 'Developer Kick-Off Session: Stuff We Love',
+ 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
+ 'duration': 4576,
+ 'thumbnail': 're:http://.*\.jpg',
+ 'session_code': 'KOS002',
+ 'session_day': 'Day 1',
+ 'session_room': 'Arena 1A',
+ 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
+ 'Mads Kristensen'],
},
- {
- 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
- 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
- 'info_dict': {
- 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
- 'ext': 'mp4',
- 'title': 'Self-service BI with Power BI - nuclear testing',
- 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
- 'duration': 1540,
- 'thumbnail': 're:http://.*\.jpg',
- 'authors': ['Mike Wilmot'],
- },
+ }, {
+ 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+ 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
+ 'info_dict': {
+ 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
+ 'ext': 'mp4',
+ 'title': 'Self-service BI with Power BI - nuclear testing',
+ 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+ 'duration': 1540,
+ 'thumbnail': 're:http://.*\.jpg',
+ 'authors': ['Mike Wilmot'],
},
- {
- # low quality mp4 is best
- 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
- 'info_dict': {
- 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
- 'ext': 'mp4',
- 'title': 'Ranges for the Standard Library',
- 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
- 'duration': 5646,
- 'thumbnail': 're:http://.*\.jpg',
- },
- 'params': {
- 'skip_download': True,
- },
- }
- ]
+ }, {
+ # low quality mp4 is best
+ 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'info_dict': {
+ 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'ext': 'mp4',
+ 'title': 'Ranges for the Standard Library',
+ 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+ 'duration': 5646,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
+ 'info_dict': {
+ 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
+ 'title': 'Channel 9',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
+ 'only_matching': True,
+ }]
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
return self.playlist_result(contents)
- def _extract_list(self, content_path):
- rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
+ def _extract_list(self, video_id, rss_url=None):
+ if not rss_url:
+ rss_url = self._RSS_URL % video_id
+ rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
entries = [self.url_result(session_url.text, 'Channel9')
for session_url in rss.findall('./channel/item/link')]
title_text = rss.find('./channel/title').text
- return self.playlist_result(entries, content_path, title_text)
+ return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
content_path = mobj.group('contentpath')
+ rss = mobj.group('rss')
+
+ if rss:
+ return self._extract_list(content_path, url)
- webpage = self._download_webpage(url, content_path, 'Downloading web page')
+ webpage = self._download_webpage(
+ url, content_path, 'Downloading web page')
- page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
- if page_type_m is not None:
- page_type = page_type_m.group('pagetype')
+ page_type = self._search_regex(
+ r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
+ webpage, 'page type', default=None, group='pagetype')
+ if page_type:
if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
return self._extract_entry_item(webpage, content_path)
elif page_type == 'Session': # Event session page, may contain downloadable content
return self._extract_list(content_path)
else:
raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
-
else: # Assuming list
return self._extract_list(content_path)
+++ /dev/null
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import ExtractorError
-from .screenwavemedia import ScreenwaveMediaIE
-
-
-class CinemassacreIE(InfoExtractor):
- _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
- _TESTS = [
- {
- 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- 'md5': 'fde81fbafaee331785f58cd6c0d46190',
- 'info_dict': {
- 'id': 'Cinemassacre-19911',
- 'ext': 'mp4',
- 'upload_date': '20121110',
- 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
- 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- 'md5': 'd72f10cd39eac4215048f62ab477a511',
- 'info_dict': {
- 'id': 'Cinemassacre-521be8ef82b16',
- 'ext': 'mp4',
- 'upload_date': '20131002',
- 'title': 'The Mummy’s Hand (1940)',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- # Youtube embedded video
- 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
- 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9',
- 'info_dict': {
- 'id': 'OEVzPCY2T-g',
- 'ext': 'webm',
- 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
- 'upload_date': '20061207',
- 'uploader': 'Cinemassacre',
- 'uploader_id': 'JamesNintendoNerd',
- 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6',
- }
- },
- {
- # Youtube embedded video
- 'url': 'http://cinemassacre.com/2006/09/01/mckids/',
- 'md5': '7393c4e0f54602ad110c793eb7a6513a',
- 'info_dict': {
- 'id': 'FnxsNhuikpo',
- 'ext': 'webm',
- 'upload_date': '20060901',
- 'uploader': 'Cinemassacre Extra',
- 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
- 'uploader_id': 'Cinemassacre',
- 'title': 'AVGN: McKids',
- }
- },
- {
- 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
- 'md5': '1376908e49572389e7b06251a53cdd08',
- 'info_dict': {
- 'id': 'Cinemassacre-555779690c440',
- 'ext': 'mp4',
- 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
- 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
- 'upload_date': '20150525',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
-
- webpage = self._download_webpage(url, display_id)
-
- playerdata_url = self._search_regex(
- [
- ScreenwaveMediaIE.EMBED_PATTERN,
- r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
- ],
- webpage, 'player data URL', default=None, group='url')
- if not playerdata_url:
- raise ExtractorError('Unable to find player data')
-
- video_title = self._html_search_regex(
- r'<title>(?P<title>.+?)\|', webpage, 'title')
- video_description = self._html_search_regex(
- r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, 'description', flags=re.DOTALL, fatal=False)
- video_thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- '_type': 'url_transparent',
- 'display_id': display_id,
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_date,
- 'thumbnail': video_thumbnail,
- 'url': playerdata_url,
- }
+++ /dev/null
-from __future__ import unicode_literals
-
-import json
-import re
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class CollegeHumorIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
-
- _TESTS = [
- {
- 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
- 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
- 'info_dict': {
- 'id': '6902724',
- 'ext': 'mp4',
- 'title': 'Comic-Con Cosplay Catastrophe',
- 'description': "Fans get creative this year at San Diego. Too creative. And yes, that's really Joss Whedon.",
- 'age_limit': 13,
- 'duration': 187,
- },
- }, {
- 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
- 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
- 'info_dict': {
- 'id': '3505939',
- 'ext': 'mp4',
- 'title': 'Font Conference',
- 'description': "This video wasn't long enough, so we made it double-spaced.",
- 'age_limit': 10,
- 'duration': 179,
- },
- }, {
- # embedded youtube video
- 'url': 'http://www.collegehumor.com/embed/6950306',
- 'info_dict': {
- 'id': 'Z-bao9fg6Yc',
- 'ext': 'mp4',
- 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
- 'uploader': 'Mark Dice',
- 'uploader_id': 'MarkDice',
- 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
- 'upload_date': '20140127',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Youtube'],
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
-
- jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json'
- data = json.loads(self._download_webpage(
- jsonUrl, video_id, 'Downloading info JSON'))
- vdata = data['video']
- if vdata.get('youtubeId') is not None:
- return {
- '_type': 'url',
- 'url': vdata['youtubeId'],
- 'ie_key': 'Youtube',
- }
-
- AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
- rating = vdata.get('rating')
- if rating:
- age_limit = AGE_LIMITS.get(rating.lower())
- else:
- age_limit = None # None = No idea
-
- PREFS = {'high_quality': 2, 'low_quality': 0}
- formats = []
- for format_key in ('mp4', 'webm'):
- for qname, qurl in vdata.get(format_key, {}).items():
- formats.append({
- 'format_id': format_key + '_' + qname,
- 'url': qurl,
- 'format': format_key,
- 'preference': PREFS.get(qname),
- })
- self._sort_formats(formats)
-
- duration = int_or_none(vdata.get('duration'), 1000)
- like_count = int_or_none(vdata.get('likes'))
-
- return {
- 'id': video_id,
- 'title': vdata['title'],
- 'description': vdata.get('description'),
- 'thumbnail': vdata.get('thumbnail'),
- 'formats': formats,
- 'age_limit': age_limit,
- 'duration': duration,
- 'like_count': like_count,
- }
# or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
_VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
|https?://(:www\.)?
- (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
+ (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/
((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip>
- (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
+ (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
|(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
)|
}, {
'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',
'only_matching': True,
+ }, {
+ 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
+ 'only_matching': True,
}]
_available_formats = ['3500', '2200', '1700', '1200', '750', '400']
unescapeHTML,
unified_strdate,
url_basename,
+ xpath_element,
xpath_text,
xpath_with_ns,
determine_protocol,
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True):
+ fatal=True, m3u8_id=None):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
return self._parse_f4m_formats(
manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
- transform_source=transform_source, fatal=fatal)
+ transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True):
+ fatal=True, m3u8_id=None):
# currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
if akamai_pv is not None and ';' in akamai_pv.text:
'base URL', default=None)
if base_url:
base_url = base_url.strip()
+
+ bootstrap_info = xpath_element(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
+ 'bootstrap info', default=None)
+
for i, media_el in enumerate(media_nodes):
- if manifest_version == '2.0':
- media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ tbr = int_or_none(media_el.attrib.get('bitrate'))
+ width = int_or_none(media_el.attrib.get('width'))
+ height = int_or_none(media_el.attrib.get('height'))
+ format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+ # If <bootstrapInfo> is present, the specified f4m is a
+ # stream-level manifest, and only set-level manifests may refer to
+ # external resources. See section 11.4 and section 4 of F4M spec
+ if bootstrap_info is None:
+ media_url = None
+ # @href is introduced in 2.0, see section 11.6 of F4M spec
+ if manifest_version == '2.0':
+ media_url = media_el.attrib.get('href')
+ if media_url is None:
+ media_url = media_el.attrib.get('url')
if not media_url:
continue
manifest_url = (
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
- if determine_ext(manifest_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
+ ext = determine_ext(manifest_url)
+ if ext == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
manifest_url, video_id, preference=preference, f4m_id=f4m_id,
- transform_source=transform_source, fatal=fatal))
+ transform_source=transform_source, fatal=fatal)
+                # Sometimes a stream-level manifest contains a single media entry that
+                # lacks any quality metadata (e.g. http://matchtv.ru/#live-player).
+                # At the same time the parent's media entry in the set-level manifest
+                # may contain it, so copy it from the parent in such cases.
+ if len(f4m_formats) == 1:
+ f = f4m_formats[0]
+ f.update({
+ 'tbr': f.get('tbr') or tbr,
+ 'width': f.get('width') or width,
+ 'height': f.get('height') or height,
+ 'format_id': f.get('format_id') if not tbr else format_id,
+ })
+ formats.extend(f4m_formats)
+ continue
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', preference=preference,
+ m3u8_id=m3u8_id, fatal=fatal))
continue
- tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
- 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
+ 'format_id': format_id,
'url': manifest_url,
- 'ext': 'flv',
+ 'ext': 'flv' if bootstrap_info is not None else None,
'tbr': tbr,
- 'width': int_or_none(media_el.attrib.get('width')),
- 'height': int_or_none(media_el.attrib.get('height')),
+ 'width': width,
+ 'height': height,
'preference': preference,
})
return formats
- def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
- entry_protocol='m3u8', preference=None,
- m3u8_id=None, note=None, errnote=None,
- fatal=True, live=False):
-
- formats = [{
+ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
+ return {
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
'url': m3u8_url,
'ext': ext,
'preference': preference - 1 if preference else -1,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
- }]
+ }
+
+ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True, live=False):
+
+ formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
format_url = lambda u: (
u
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
+            # Although the specification does not mention the NAME attribute
+            # for EXT-X-STREAM-INF, it may still sometimes be present
+ stream_name = last_info.get('NAME') or last_media_name
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
if not live:
- format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
+ format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
f = {
'format_id': '-'.join(format_id),
'url': format_url(line.strip()),
m3u8_count = 0
srcs = []
- videos = smil.findall(self._xpath_ns('.//video', namespace))
- for video in videos:
- src = video.get('src')
+ media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+ for medium in media:
+ src = medium.get('src')
if not src or src in srcs:
continue
srcs.append(src)
- bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
- filesize = int_or_none(video.get('size') or video.get('fileSize'))
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- proto = video.get('proto')
- ext = video.get('ext')
+ bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
+ filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
+ width = int_or_none(medium.get('width'))
+ height = int_or_none(medium.get('height'))
+ proto = medium.get('proto')
+ ext = medium.get('ext')
src_ext = determine_ext(src)
- streamer = video.get('streamer') or base
+ streamer = medium.get('streamer') or base
if proto == 'rtmp' or streamer.startswith('rtmp'):
rtmp_count += 1
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+ qualities,
+)
+
+
+class CoubIE(InfoExtractor):
+ _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)'
+
+ _TESTS = [{
+ 'url': 'http://coub.com/view/5u5n1',
+ 'info_dict': {
+ 'id': '5u5n1',
+ 'ext': 'mp4',
+ 'title': 'The Matrix Moonwalk',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 4.6,
+ 'timestamp': 1428527772,
+ 'upload_date': '20150408',
+ 'uploader': 'Артём Лоскутников',
+ 'uploader_id': 'artyom.loskutnikov',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4',
+ 'only_matching': True,
+ }, {
+ 'url': 'coub:5u5n1',
+ 'only_matching': True,
+ }, {
+ # longer video id
+ 'url': 'http://coub.com/view/237d5l5h',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ coub = self._download_json(
+ 'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id)
+
+ if coub.get('error'):
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, coub['error']), expected=True)
+
+ title = coub['title']
+
+ file_versions = coub['file_versions']
+
+ QUALITIES = ('low', 'med', 'high')
+
+ MOBILE = 'mobile'
+ IPHONE = 'iphone'
+ HTML5 = 'html5'
+
+ SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5)
+
+ quality_key = qualities(QUALITIES)
+ preference_key = qualities(SOURCE_PREFERENCE)
+
+ formats = []
+
+ for kind, items in file_versions.get(HTML5, {}).items():
+ if kind not in ('video', 'audio'):
+ continue
+ if not isinstance(items, dict):
+ continue
+ for quality, item in items.items():
+ if not isinstance(item, dict):
+ continue
+ item_url = item.get('url')
+ if not item_url:
+ continue
+ formats.append({
+ 'url': item_url,
+ 'format_id': '%s-%s-%s' % (HTML5, kind, quality),
+ 'filesize': int_or_none(item.get('size')),
+ 'vcodec': 'none' if kind == 'audio' else None,
+ 'quality': quality_key(quality),
+ 'preference': preference_key(HTML5),
+ })
+
+ iphone_url = file_versions.get(IPHONE, {}).get('url')
+ if iphone_url:
+ formats.append({
+ 'url': iphone_url,
+ 'format_id': IPHONE,
+ 'preference': preference_key(IPHONE),
+ })
+
+ mobile_url = file_versions.get(MOBILE, {}).get('audio_url')
+ if mobile_url:
+ formats.append({
+ 'url': mobile_url,
+ 'format_id': '%s-audio' % MOBILE,
+ 'preference': preference_key(MOBILE),
+ })
+
+ self._sort_formats(formats)
+
+ thumbnail = coub.get('picture')
+ duration = float_or_none(coub.get('duration'))
+ timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at'))
+ uploader = coub.get('channel', {}).get('title')
+ uploader_id = coub.get('channel', {}).get('permalink')
+
+ view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
+ like_count = int_or_none(coub.get('likes_count'))
+ repost_count = int_or_none(coub.get('recoubs_count'))
+ comment_count = int_or_none(coub.get('comments_count'))
+
+ age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
+ if age_restricted is not None:
+ age_limit = 18 if age_restricted is True else 0
+ else:
+ age_limit = None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'repost_count': repost_count,
+ 'comment_count': comment_count,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+ 'md5': '2f639d446394f53f3a33658b518b6615',
+ 'info_dict': {
+ 'id': '1288527',
+ 'ext': 'mp4',
+ 'title': 'Turn any video into an impressionist masterpiece',
+ 'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ video_data = self._parse_json(self._search_regex(
+ r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+ title = video_data['title']
+ video_sources = self._download_json(video_data.get(
+ 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+ formats = []
+ for rendition in video_sources['renditions']:
+ rendition_url = rendition.get('url')
+ if not rendition_url:
+ continue
+ tbr = int_or_none(rendition.get('encodingRate'), 1000)
+ container = rendition.get('videoContainer')
+ is_hls = container == 'M2TS'
+ protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+ formats.append({
+ 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+ 'url': rendition_url,
+ 'width': int_or_none(rendition.get('frameWidth')),
+ 'height': int_or_none(rendition.get('frameHeight')),
+ 'tbr': tbr,
+ 'vcodec': rendition.get('videoCodec'),
+ 'container': container,
+ 'protocol': protocol,
+ 'ext': 'mp4' if is_hls else None,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('descr'),
+ 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+ 'formats': formats,
+ }
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
from ..compat import compat_urlparse
class DWIE(InfoExtractor):
IE_NAME = 'dw'
- _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)'
_TESTS = [{
# video
'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
'upload_date': '20160311',
}
+ }, {
+ 'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798',
+ 'md5': '56b6214ef463bfb9a3b71aeb886f3cf1',
+ 'info_dict': {
+ 'id': '19274438',
+ 'ext': 'mp4',
+ 'title': 'Welcome to the 90s – Hip Hop',
+ 'description': 'Welcome to the 90s - The Golden Decade of Hip Hop',
+ 'upload_date': '20160521',
+ },
}]
def _real_extract(self, url):
webpage = self._download_webpage(url, media_id)
hidden_inputs = self._hidden_inputs(webpage)
title = hidden_inputs['media_title']
+ media_id = hidden_inputs.get('media_id') or media_id
if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
formats = self._extract_smil_formats(
else:
formats = [{'url': hidden_inputs['file_name']}]
+ upload_date = hidden_inputs.get('display_date')
+ if not upload_date:
+ upload_date = self._html_search_regex(
+ r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage,
+ 'upload date', default=None)
+ upload_date = unified_strdate(upload_date)
+
return {
'id': media_id,
'title': title,
'description': self._og_search_description(webpage),
'thumbnail': hidden_inputs.get('preview_image'),
'duration': int_or_none(hidden_inputs.get('file_duration')),
- 'upload_date': hidden_inputs.get('display_date'),
+ 'upload_date': upload_date,
'formats': formats,
}
class EpornerIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)'
+ _TESTS = [{
'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
'md5': '39d486f046212d8e1b911c52ab4691f8',
'info_dict': {
'duration': 1838,
'view_count': int,
'age_limit': 18,
- }
- }
+ },
+ }, {
+ # New (May 2016) URL layout
+ 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
_VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://espn.go.com/video/clip?id=10365079',
+ 'md5': '60e5d097a523e767d06479335d1bdc58',
'info_dict': {
'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
'ext': 'mp4',
'description': None,
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['OoyalaExternal'],
}, {
# intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
'url': 'http://espn.go.com/video/clip?id=2743663',
+ 'md5': 'f4ac89b59afc7e2d7dbb049523df6768',
'info_dict': {
'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg',
'ext': 'mp4',
'title': 'Must-See Moments: Best of the MLS season',
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['OoyalaExternal'],
}, {
'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
'only_matching': True,
from .abc import ABCIE
from .abc7news import Abc7NewsIE
+from .abcnews import (
+ AbcNewsIE,
+ AbcNewsVideoIE,
+)
from .academicearth import AcademicEarthCourseIE
from .acast import (
ACastIE,
ArteTVDDCIE,
ArteTVMagazineIE,
ArteTVEmbedIE,
+ ArteTVPlaylistIE,
)
from .atresplayer import AtresPlayerIE
from .atttechchannel import ATTTechChannelIE
from .bild import BildIE
from .bilibili import BiliBiliIE
from .biobiochiletv import BioBioChileTVIE
+from .biqle import BIQLEIE
from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
CBCPlayerIE,
)
from .cbs import CBSIE
+from .cbslocal import CBSLocalIE
from .cbsinteractive import CBSInteractiveIE
from .cbsnews import (
CBSNewsIE,
ChirbitProfileIE,
)
from .cinchcast import CinchcastIE
-from .cinemassacre import CinemassacreIE
from .cliprs import ClipRsIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
CNNBlogsIE,
CNNArticleIE,
)
-from .collegehumor import CollegeHumorIE
+from .coub import CoubIE
from .collegerama import CollegeRamaIE
from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .comcarcoff import ComCarCoffIE
from .ctsnews import CtsNewsIE
from .cultureunplugged import CultureUnpluggedIE
from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
from .exfm import ExfmIE
from .expotv import ExpoTVIE
from .extremetube import ExtremeTubeIE
+from .eyedotv import EyedoTVIE
from .facebook import FacebookIE
from .faz import FazIE
from .fc2 import FC2IE
from .flickr import FlickrIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
+from .formula1 import Formula1IE
from .fourtube import FourTubeIE
from .fox import FOXIE
from .foxgay import FoxgayIE
)
from .la7 import LA7IE
from .laola1tv import Laola1TvIE
+from .learnr import LearnrIE
from .lecture2go import Lecture2GoIE
from .lemonde import LemondeIE
from .leeco import (
LePlaylistIE,
LetvCloudIE,
)
+from .libraryofcongress import LibraryOfCongressIE
from .libsyn import LibsynIE
from .lifenews import (
LifeNewsIE,
LimelightChannelIE,
LimelightChannelListIE,
)
+from .litv import LiTVIE
from .liveleak import LiveLeakIE
from .livestream import (
LivestreamIE,
LivestreamShortenerIE,
)
from .lnkgo import LnkGoIE
+from .localnews8 import LocalNews8IE
from .lovehomeporn import LoveHomePornIE
from .lrt import LRTIE
from .lynda import (
from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .mgtv import MGTVIE
+from .microsoftvirtualacademy import (
+ MicrosoftVirtualAcademyIE,
+ MicrosoftVirtualAcademyCourseIE,
+)
from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
from .minoto import MinotoIE
from .patreon import PatreonIE
from .pbs import PBSIE
from .people import PeopleIE
-from .periscope import PeriscopeIE
+from .periscope import (
+ PeriscopeIE,
+ PeriscopeUserIE,
+)
from .philharmoniedeparis import PhilharmonieDeParisIE
from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE
QQMusicPlaylistIE,
)
from .r7 import R7IE
+from .radiocanada import (
+ RadioCanadaIE,
+ RadioCanadaAudioVideoIE,
+)
from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .redtube import RedTubeIE
from .regiotv import RegioTVIE
from .restudy import RestudyIE
+from .reuters import ReutersIE
from .reverbnation import ReverbNationIE
-from .revision3 import Revision3IE
+from .revision3 import (
+ Revision3EmbedIE,
+ Revision3IE,
+)
from .rice import RICEIE
from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .screencastomatic import ScreencastOMaticIE
from .screenjunkies import ScreenJunkiesIE
from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
+from .seeker import SeekerIE
from .senateisvp import SenateISVPIE
+from .sendtonews import SendtoNewsIE
from .servingsys import ServingSysIE
from .sexu import SexuIE
from .shahid import ShahidIE
from .thestar import TheStarIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
+from .threeqsdn import ThreeQSDNIE
from .tinypic import TinyPicIE
from .tlc import TlcDeIE
from .tmz import (
)
from .tvigle import TvigleIE
from .tvland import TVLandIE
-from .tvp import TvpIE, TvpSeriesIE
+from .tvp import (
+ TVPIE,
+ TVPSeriesIE,
+)
from .tvplay import TVPlayIE
from .tweakers import TweakersIE
from .twentyfourvideo import TwentyFourVideoIE
TwitchVodIE,
TwitchProfileIE,
TwitchPastBroadcastsIE,
- TwitchBookmarksIE,
TwitchStreamIE,
)
from .twitter import (
from .urort import UrortIE
from .usatoday import USATodayIE
from .ustream import UstreamIE, UstreamChannelIE
-from .ustudio import UstudioIE
+from .ustudio import (
+ UstudioIE,
+ UstudioEmbedIE,
+)
from .varzesh3 import Varzesh3IE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
)
from .videopremium import VideoPremiumIE
from .videott import VideoTtIE
+from .vidio import VidioIE
from .vidme import (
VidmeIE,
VidmeUserIE,
from .vuclip import VuClipIE
from .vulture import VultureIE
from .walla import WallaIE
-from .washingtonpost import WashingtonPostIE
+from .washingtonpost import (
+ WashingtonPostIE,
+ WashingtonPostArticleIE,
+)
from .wat import WatIE
from .watchindianporn import WatchIndianPornIE
from .wdr import (
WDRIE,
WDRMobileIE,
- WDRMausIE,
)
from .webofstories import (
WebOfStoriesIE,
from .yinyuetai import YinYueTaiIE
from .ynet import YnetIE
from .youjizz import YouJizzIE
-from .youku import YoukuIE
+from .youku import (
+ YoukuIE,
+ YoukuShowIE,
+)
from .youporn import YouPornIE
from .yourupload import YourUploadIE
from .youtube import (
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ parse_duration,
+ ExtractorError,
+)
+
+
+class EyedoTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?eyedo\.tv/[^/]+/(?:#!/)?Live/Detail/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.eyedo.tv/en-US/#!/Live/Detail/16301',
+ 'md5': 'ba14f17995cdfc20c36ba40e21bf73f7',
+ 'info_dict': {
+ 'id': '16301',
+ 'ext': 'mp4',
+ 'title': 'Journée du conseil scientifique de l\'Afnic 2015',
+ 'description': 'md5:4abe07293b2f73efc6e1c37028d58c98',
+ 'uploader': 'Afnic Live',
+ 'uploader_id': '8023',
+ }
+ }
+ _ROOT_URL = 'http://live.eyedo.net:1935/'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_xml('http://eyedo.tv/api/live/GetLive/%s' % video_id, video_id)
+
+ def _add_ns(path):
+ return self._xpath_ns(path, 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api')
+
+ title = xpath_text(video_data, _add_ns('Titre'), 'title', True)
+        state_live_code = xpath_text(video_data, _add_ns('StateLiveCode'), 'state live code', True)
+ if state_live_code == 'avenir':
+ raise ExtractorError(
+ '%s said: We\'re sorry, but this video is not yet available.' % self.IE_NAME,
+ expected=True)
+
+ is_live = state_live_code == 'live'
+ m3u8_url = None
+ # http://eyedo.tv/Content/Html5/Scripts/html5view.js
+ if is_live:
+ if xpath_text(video_data, 'Cdn') == 'true':
+ m3u8_url = 'http://rrr.sz.xlcdn.com/?account=eyedo&file=A%s&type=live&service=wowza&protocol=http&output=playlist.m3u8' % video_id
+ else:
+ m3u8_url = self._ROOT_URL + 'w/%s/eyedo_720p/playlist.m3u8' % video_id
+ else:
+ m3u8_url = self._ROOT_URL + 'replay-w/%s/mp4:%s.mp4/playlist.m3u8' % (video_id, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'),
+ 'description': xpath_text(video_data, _add_ns('Description')),
+ 'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),
+ 'uploader': xpath_text(video_data, _add_ns('Createur')),
+ 'uploader_id': xpath_text(video_data, _add_ns('CreateurId')),
+ 'chapter': xpath_text(video_data, _add_ns('ChapitreTitre')),
+ 'chapter_id': xpath_text(video_data, _add_ns('ChapitreId')),
+ }
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
class FczenitIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://fc-zenit.ru/video/gl6785/',
- 'md5': '458bacc24549173fe5a5aa29174a5606',
+ 'url': 'http://fc-zenit.ru/video/41044/',
+ 'md5': '0e3fab421b455e970fa1aa3891e57df0',
'info_dict': {
- 'id': '6785',
+ 'id': '41044',
'ext': 'mp4',
- 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+ 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
},
}
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+ video_title = self._html_search_regex(
+ r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+ video_items = self._parse_json(self._search_regex(
+ r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+ video_id)
- bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
- bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+ def merge_dicts(*dicts):
+ ret = {}
+ for a_dict in dicts:
+ ret.update(a_dict)
+ return ret
formats = [{
- 'url': furl,
- 'tbr': tbr,
- } for furl, tbr in bitrates]
+ 'url': compat_urlparse.urljoin(url, video_url),
+ 'tbr': int(tbr),
+ } for tbr, video_url in merge_dicts(*video_items).items()]
self._sort_formats(formats)
'upload_date': '20110423',
'uploader_id': '10922353@N03',
'uploader': 'Forest Wander',
+ 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/',
'comment_count': int,
'view_count': int,
'tags': list,
+ 'license': 'Attribution-ShareAlike',
}
}
-
_API_BASE_URL = 'https://api.flickr.com/services/rest?'
+ # https://help.yahoo.com/kb/flickr/SLN25525.html
+ _LICENSES = {
+ '0': 'All Rights Reserved',
+ '1': 'Attribution-NonCommercial-ShareAlike',
+ '2': 'Attribution-NonCommercial',
+ '3': 'Attribution-NonCommercial-NoDerivs',
+ '4': 'Attribution',
+ '5': 'Attribution-ShareAlike',
+ '6': 'Attribution-NoDerivs',
+ '7': 'No known copyright restrictions',
+ '8': 'United States government work',
+ '9': 'Public Domain Dedication (CC0)',
+ '10': 'Public Domain Work',
+ }
def _call_api(self, method, video_id, api_key, note, secret=None):
query = {
self._sort_formats(formats)
owner = video_info.get('owner', {})
+ uploader_id = owner.get('nsid')
+ uploader_path = owner.get('path_alias') or uploader_id
+ uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
return {
'id': video_id,
'formats': formats,
'timestamp': int_or_none(video_info.get('dateuploaded')),
'duration': int_or_none(video_info.get('video', {}).get('duration')),
- 'uploader_id': owner.get('nsid'),
+ 'uploader_id': uploader_id,
'uploader': owner.get('realname'),
+ 'uploader_url': uploader_url,
'comment_count': int_or_none(video_info.get('comments', {}).get('_content')),
'view_count': int_or_none(video_info.get('views')),
- 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])]
+ 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])],
+ 'license': self._LICENSES.get(video_info.get('license')),
}
else:
raise ExtractorError('not a video', expected=True)
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class Formula1IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
+ _TEST = {
+ 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
+ 'md5': '8c79e54be72078b26b89e0e111c0502b',
+ 'info_dict': {
+ 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+ 'ext': 'flv',
+ 'title': 'Race highlights - Spain 2016',
+ },
+ 'add_ie': ['Ooyala'],
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ ooyala_embed_code = self._search_regex(
+ r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+ return self.url_result(
+ 'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
from .digiteka import DigitekaIE
from .instagram import InstagramIE
from .liveleak import LiveLeakIE
+from .threeqsdn import ThreeQSDNIE
+from .theplatform import ThePlatformIE
+from .vessel import VesselIE
class GenericIE(InfoExtractor):
},
# Wistia embed
{
- 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
- 'md5': '8788b683c777a5cf25621eaf286d0c23',
+ 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+ 'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
'info_dict': {
- 'id': '1cfaf6b7ea',
+ 'id': '6e2wtrbdaf',
'ext': 'mov',
- 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
- 'duration': 643.0,
- 'filesize': 182808282,
- 'uploader': 'education-portal.com',
+ 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
+ 'description': 'a Paywall Videos video from Remilon',
+ 'duration': 644.072,
+ 'uploader': 'study.com',
+ 'timestamp': 1459678540,
+ 'upload_date': '20160403',
+ 'filesize': 24687186,
},
},
{
'info_dict': {
'id': 'uxjb0lwrcz',
'ext': 'mp4',
- 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+ 'title': 'Conversation about Hexagonal Rails Part 1',
'description': 'a Martin Fowler video from ThoughtWorks',
'duration': 1715.0,
'uploader': 'thoughtworks.wistia.com',
- 'upload_date': '20140603',
'timestamp': 1401832161,
+ 'upload_date': '20140603',
+ },
+ },
+ # Wistia standard embed (async)
+ {
+ 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
+ 'info_dict': {
+ 'id': '807fafadvk',
+ 'ext': 'mp4',
+ 'title': 'Drip Brennan Dunn Workshop',
+ 'description': 'a JV Webinars video from getdrip-1',
+ 'duration': 4986.95,
+ 'timestamp': 1463607249,
+ 'upload_date': '20160518',
},
+ 'params': {
+ 'skip_download': True,
+ }
},
# Soundcloud embed
{
'title': 'Rosetta #CometLanding webcast HL 10',
}
},
+ # Another Livestream embed, without 'new.' in URL
+ {
+ 'url': 'https://www.freespeech.org/',
+ 'info_dict': {
+ 'id': '123537347',
+ 'ext': 'mp4',
+ 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ },
+ 'params': {
+ # Live stream
+ 'skip_download': True,
+ },
+ },
# LazyYT
{
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
}
},
- # Kaltura embed
- {
- 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
- 'info_dict': {
- 'id': '1_eergr3h1',
- 'ext': 'mp4',
- 'upload_date': '20150226',
- 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
- 'timestamp': int,
- 'title': 'John Carlson Postgame 2/25/15',
- },
- },
# Kaltura embed (different embed code)
{
'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
'uploader_id': 'echojecka',
},
},
+ # Kaltura embed with single quotes
+ {
+ 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
+ 'info_dict': {
+ 'id': '0_izeg5utt',
+ 'ext': 'mp4',
+ 'title': '35871',
+ 'timestamp': 1355743100,
+ 'upload_date': '20121217',
+ 'uploader_id': 'batchUser',
+ },
+ 'add_ie': ['Kaltura'],
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
},
# UDN embed
{
- 'url': 'http://www.udn.com/news/story/7314/822787',
+ 'url': 'https://video.udn.com/news/300346',
'md5': 'fd2060e988c326991037b9aff9df21a6',
'info_dict': {
'id': '300346',
'ext': 'mp4',
'title': '中一中男師變性 全校師生力挺',
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
# Ooyala embed
{
'uploader': 'Lake8737',
}
},
+ # Duplicated embedded video URLs
+ {
+ 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
+ 'info_dict': {
+ 'id': '149298443_480_16c25b74_2',
+ 'ext': 'mp4',
+ 'title': 'vs. Blue Orange Spring Game',
+ 'uploader': 'www.hudl.com',
+ },
+ },
]
def report_following_redirect(self, new_url):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._html_search_regex(
+ video_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
r'(?s)<title>(.*?)</title>', webpage, 'video title',
default='video')
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+ video_description = self._og_search_description(webpage, default=None)
+ video_thumbnail = self._og_search_thumbnail(webpage, default=None)
+
# Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
if bc_urls:
return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+ # Look for ThePlatform embeds
+ tp_urls = ThePlatformIE._extract_urls(webpage)
+ if tp_urls:
+ return _playlist_from_matches(tp_urls, ie='ThePlatform')
+
+ # Look for Vessel embeds
+ vessel_urls = VesselIE._extract_urls(webpage)
+ if vessel_urls:
+ return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
'url': embed_url,
'ie_key': 'Wistia',
'uploader': video_uploader,
- 'title': video_title,
- 'id': video_id,
}
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
'_type': 'url_transparent',
- 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
+ 'url': 'wistia:%s' % match.group('id'),
'ie_key': 'Wistia',
'uploader': video_uploader,
- 'title': video_title,
- 'id': match.group('id')
}
+ match = re.search(
+ r'''(?sx)
+ <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ ''', webpage)
+ if match:
+ return self.url_result(self._proto_relative_url(
+ 'wistia:%s' % match.group('id')), 'Wistia')
+
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
mobj = re.search(
- r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+ r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Livestream')
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or
re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
if mobj is not None:
return self.url_result(smuggle_url(
if liveleak_url:
return self.url_result(liveleak_url, 'LiveLeak')
+ # Look for 3Q SDN embeds
+ threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
+ if threeqsdn_url:
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': ThreeQSDNIE.ie_key(),
+ 'url': self._proto_relative_url(threeqsdn_url),
+ 'title': video_title,
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'uploader': video_uploader,
+ }
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
raise UnsupportedError(url)
entries = []
- for video_url in found:
+ for video_url in orderedSet(found):
video_url = unescapeHTML(video_url)
video_url = video_url.replace('\\/', '/')
video_url = compat_urlparse.urljoin(url, video_url)
class GrouponIE(InfoExtractor):
- _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?groupon\.com/deals/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
},
'playlist': [{
+ 'md5': '42428ce8a00585f9bc36e49226eae7a1',
'info_dict': {
- 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
- 'ext': 'flv',
- 'title': 'Bikram Yoga Huntington Beach | Orange County',
+ 'id': 'fk6OhWpXgIQ',
+ 'ext': 'mp4',
+ 'title': 'Bikram Yoga Huntington Beach | Orange County !tubGNycTo@9Uxg82uESj4i61EYX8nyuf',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
- 'duration': 44.961,
+ 'duration': 45,
+ 'upload_date': '20160405',
+ 'uploader_id': 'groupon',
+ 'uploader': 'Groupon',
},
+ 'add_ie': ['Youtube'],
}],
'params': {
- 'skip_download': 'HDS',
- }
+ 'skip_download': True,
+ },
+ }
+
+ _PROVIDERS = {
+ 'ooyala': ('ooyala:%s', 'Ooyala'),
+ 'youtube': ('%s', 'Youtube'),
}
def _real_extract(self, url):
videos = payload['carousel'].get('dealVideos', [])
entries = []
for v in videos:
- if v.get('provider') != 'OOYALA':
+ provider = v.get('provider')
+ video_id = v.get('media') or v.get('id') or v.get('baseURL')
+ if not provider or not video_id:
+ continue
+            url_pattern, ie_key = self._PROVIDERS.get(provider.lower(), (None, None))
+ if not url_pattern:
self.report_warning(
'%s: Unsupported video provider %s, skipping video' %
- (playlist_id, v.get('provider')))
+ (playlist_id, provider))
continue
- entries.append(self.url_result('ooyala:%s' % v['media']))
+ entries.append(self.url_result(url_pattern % video_id, ie_key))
return {
'_type': 'playlist',
from ..compat import compat_urlparse
from ..utils import (
HEADRequest,
+ KNOWN_EXTENSIONS,
sanitized_Request,
str_to_int,
urlencode_postdata,
class HearThisAtIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
_PLAYLIST_URL = 'https://hearthis.at/playlist.php'
- _TEST = {
+ _TESTS = [{
'url': 'https://hearthis.at/moofi/dr-kreep',
'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
'info_dict': {
'title': 'Moofi - Dr. Kreep',
'thumbnail': 're:^https?://.*\.jpg$',
'timestamp': 1421564134,
- 'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
+ 'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP',
'upload_date': '20150118',
'comment_count': int,
'view_count': int,
'duration': 71,
'categories': ['Experimental'],
}
- }
+ }, {
+ # 'download' link redirects to the original webpage
+ 'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/',
+ 'md5': '5980ceb7c461605d30f1f039df160c6e',
+ 'info_dict': {
+ 'id': '811296',
+ 'ext': 'mp3',
+ 'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!',
+ 'description': 'Listen to DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance',
+ 'upload_date': '20160328',
+ 'timestamp': 1459186146,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'duration': 4360,
+ 'categories': ['Dance'],
+ },
+ }]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
ext_handle = self._request_webpage(
ext_req, display_id, note='Determining extension')
ext = urlhandle_detect_ext(ext_handle)
- formats.append({
- 'format_id': 'download',
- 'vcodec': 'none',
- 'ext': ext,
- 'url': download_url,
- 'preference': 2, # Usually better quality
- })
+ if ext in KNOWN_EXTENSIONS:
+ formats.append({
+ 'format_id': 'download',
+ 'vcodec': 'none',
+ 'ext': ext,
+ 'url': download_url,
+ 'preference': 2, # Usually better quality
+ })
self._sort_formats(formats)
return {
_VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
- 'md5': '8b743df908c42f60cf6496586c7f12c3',
+ 'md5': '7d45932269a288149483144f01b99789',
'info_dict': {
'id': '390161',
'ext': 'mp4',
'duration': 56.823,
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['Ooyala'],
}
def _real_extract(self, url):
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
+ mimetype2ext,
qualities,
)
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
- _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
'info_dict': {
'id': '2524815897',
'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
}
- }
+ }, {
+ 'url': 'http://www.imdb.com/video/_/vi2524815897',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, 'json data', flags=re.DOTALL)
- info = json.loads(json_data)
- format_info = info['videoPlayerObject']['video']
- f_id = format_info['ffname']
+ info = self._parse_json(json_data, video_id, fatal=False)
+ if not info:
+ continue
+ format_info = info.get('videoPlayerObject', {}).get('video', {})
+ if not format_info:
+ continue
+ video_info_list = format_info.get('videoInfoList')
+ if not video_info_list or not isinstance(video_info_list, list):
+ continue
+ video_info = video_info_list[0]
+ if not video_info or not isinstance(video_info, dict):
+ continue
+ video_url = video_info.get('videoUrl')
+ if not video_url:
+ continue
+ format_id = format_info.get('ffname')
formats.append({
- 'format_id': f_id,
- 'url': format_info['videoInfoList'][0]['videoUrl'],
- 'quality': quality(f_id),
+ 'format_id': format_id,
+ 'url': video_url,
+ 'ext': mimetype2ext(video_info.get('videoMimeType')),
+ 'quality': quality(format_id),
})
self._sort_formats(formats)
'enc': md5_text(enc_key + tail),
'qyid': _uuid,
'tn': random.random(),
- 'um': 0,
+            # In iQiyi's Flash player, um is set to 1 if there's a logged-in user.
+            # Some 1080P formats are only available to logged-in users,
+            # so force um=1 here to trick the iQiyi server
+ 'um': 1,
'authkey': md5_text(md5_text('') + tail),
'k_tag': 1,
}
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
float_or_none,
int_or_none,
)
class JWPlatformBaseIE(InfoExtractor):
- def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True):
+ def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None):
video_data = jwplayer_data['playlist'][0]
formats = []
for source in video_data['sources']:
source_url = self._proto_relative_url(source['file'])
source_type = source.get('type') or ''
- if source_type in ('application/vnd.apple.mpegurl', 'hls'):
+ if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- source_url, video_id, 'mp4', 'm3u8_native', fatal=False))
+ source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
elif source_type.startswith('audio'):
formats.append({
'url': source_url,
'vcodec': 'none',
})
else:
- formats.append({
+ a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
'height': int_or_none(source.get('height')),
- })
+ }
+ if source_url.startswith('rtmp'):
+                    a_format['ext'] = 'flv'
+
+ # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+ # of jwplayer.flash.swf
+ rtmp_url_parts = re.split(
+ r'((?:mp4|mp3|flv):)', source_url, 1)
+ if len(rtmp_url_parts) == 3:
+ rtmp_url, prefix, play_path = rtmp_url_parts
+ a_format.update({
+ 'url': rtmp_url,
+ 'play_path': prefix + play_path,
+ })
+ if rtmp_params:
+ a_format.update(rtmp_params)
+ formats.append(a_format)
self._sort_formats(formats)
subtitles = {}
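The RTMP splitting added above relies on `re.split` keeping the delimiter when the pattern contains a capturing group, so a successful split yields exactly three parts: base URL, prefix, and play path. A standalone check with a made-up RTMP URL:

```python
import re

source_url = 'rtmp://example.com/vod/mp4:videos/clip.mp4'  # hypothetical source
parts = re.split(r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
if len(parts) == 3:
    rtmp_url, prefix, play_path = parts
    # rtmp_url is the server/app URL; the play path re-attaches the prefix,
    # as expected by jwplayer's RTMPMediaProvider
    play_path = prefix + play_path
```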
'info_dict': {
'id': '86375',
'title': '八十年代精选',
+ 'description': '这些都是属于八十年代的回忆!',
},
'playlist_mincount': 24,
}
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LearnrIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript',
+ 'md5': '3719fdf0a68397f49899e82c308a89de',
+ 'info_dict': {
+ 'id': '51624',
+ 'ext': 'mp4',
+ 'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript',
+ 'description': 'md5:b36dbfa92350176cdf12b4d388485503',
+ 'uploader': 'LearnCode.academy',
+ 'uploader_id': 'learncodeacademy',
+ 'upload_date': '20131021',
+ },
+ 'add_ie': ['Youtube'],
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': self._search_regex(
+ r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'),
+ 'id': video_id,
+ }
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ parse_filesize,
+)
+
+
+class LibraryOfCongressIE(InfoExtractor):
+ IE_NAME = 'loc'
+ IE_DESC = 'Library of Congress'
+ _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)'
+ _TESTS = [{
+ # embedded via <div class="media-player"
+ 'url': 'http://loc.gov/item/90716351/',
+ 'md5': '353917ff7f0255aa6d4b80a034833de8',
+ 'info_dict': {
+ 'id': '90716351',
+ 'ext': 'mp4',
+ 'title': "Pa's trip to Mars",
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 0,
+ 'view_count': int,
+ },
+ }, {
+ # webcast embedded via mediaObjectId
+ 'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
+ 'info_dict': {
+ 'id': '5578',
+ 'ext': 'mp4',
+ 'title': 'Help! Preservation Training Needs Here, There & Everywhere',
+ 'duration': 3765,
+ 'view_count': int,
+ 'subtitles': 'mincount:1',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # with direct download links
+ 'url': 'https://www.loc.gov/item/78710669/',
+ 'info_dict': {
+ 'id': '78710669',
+ 'ext': 'mp4',
+ 'title': 'La vie et la passion de Jesus-Christ',
+ 'duration': 0,
+ 'view_count': int,
+ 'formats': 'mincount:4',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ media_id = self._search_regex(
+ (r'id=(["\'])media-player-(?P<id>.+?)\1',
+ r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
+ r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
+ r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'),
+ webpage, 'media id', group='id')
+
+ data = self._download_json(
+ 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
+ video_id)['mediaObject']
+
+ derivative = data['derivatives'][0]
+ media_url = derivative['derivativeUrl']
+
+ title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
+ webpage)
+
+        # The following algorithm was extracted from the setAVSource JS
+        # function found in the webpage
+ media_url = media_url.replace('rtmp', 'https')
+
+ is_video = data.get('mediaType', 'v').lower() == 'v'
+ ext = determine_ext(media_url)
+ if ext not in ('mp4', 'mp3'):
+ media_url += '.mp4' if is_video else '.mp3'
+
+
+        formats = []
+ if 'vod/mp4:' in media_url:
+ formats = [{
+ 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8',
+ 'format_id': 'hls',
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native',
+ 'quality': 1,
+ }]
+ elif 'vod/mp3:' in media_url:
+ formats = [{
+ 'url': media_url.replace('vod/mp3:', ''),
+ 'vcodec': 'none',
+ }]
+
+ download_urls = set()
+ for m in re.finditer(
+ r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage):
+ format_id = m.group('id').lower()
+ if format_id == 'gif':
+ continue
+ download_url = m.group('url')
+ if download_url in download_urls:
+ continue
+ download_urls.add(download_url)
+ formats.append({
+ 'url': download_url,
+ 'format_id': format_id,
+ 'filesize_approx': parse_filesize(m.group('size')),
+ })
+
+ self._sort_formats(formats)
+
+ duration = float_or_none(data.get('duration'))
+ view_count = int_or_none(data.get('viewCount'))
+
+ subtitles = {}
+ cc_url = data.get('ccUrl')
+ if cc_url:
+ subtitles.setdefault('en', []).append({
+ 'url': cc_url,
+ 'ext': 'ttml',
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
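The URL rewriting in `LibraryOfCongressIE` above can be traced with a made-up `derivativeUrl` in the shape the LoC backend returns (the item path here is invented):

```python
# Hypothetical derivativeUrl as returned by the media.loc.gov JSON API
media_url = 'rtmp://media.loc.gov/vod/mp4:item/clip'

# Mirror the setAVSource logic: swap the scheme, ensure an extension,
# then derive the HLS playlist URL from the RTMP-style path
media_url = media_url.replace('rtmp', 'https')
if not media_url.endswith(('.mp4', '.mp3')):
    media_url += '.mp4'
hls_url = media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8'
```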
from ..compat import compat_urlparse
from ..utils import (
determine_ext,
+ ExtractorError,
int_or_none,
+ parse_iso8601,
remove_end,
- unified_strdate,
- ExtractorError,
)
class LifeNewsIE(InfoExtractor):
- IE_NAME = 'lifenews'
- IE_DESC = 'LIFE | NEWS'
- _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
+ IE_NAME = 'life'
+ IE_DESC = 'Life.ru'
+ _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)'
_TESTS = [{
# single video embedded via video/source
- 'url': 'http://lifenews.ru/news/98736',
+ 'url': 'https://life.ru/t/новости/98736',
'md5': '77c95eaefaca216e32a76a343ad89d23',
'info_dict': {
'id': '98736',
'ext': 'mp4',
'title': 'Мужчина нашел дома архив оборонного завода',
'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+ 'timestamp': 1344154740,
'upload_date': '20120805',
+ 'view_count': int,
}
}, {
# single video embedded via iframe
- 'url': 'http://lifenews.ru/news/152125',
+ 'url': 'https://life.ru/t/новости/152125',
'md5': '77d19a6f0886cd76bdbf44b4d971a273',
'info_dict': {
'id': '152125',
'ext': 'mp4',
'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+ 'timestamp': 1427961840,
'upload_date': '20150402',
+ 'view_count': int,
}
}, {
# two videos embedded via iframe
- 'url': 'http://lifenews.ru/news/153461',
+ 'url': 'https://life.ru/t/новости/153461',
'info_dict': {
'id': '153461',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
- 'upload_date': '20150505',
+ 'timestamp': 1430825520,
+ 'view_count': int,
},
'playlist': [{
'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'timestamp': 1430825520,
'upload_date': '20150505',
},
}, {
'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'timestamp': 1430825520,
'upload_date': '20150505',
},
}],
}, {
- 'url': 'http://lifenews.ru/video/13035',
+ 'url': 'https://life.ru/t/новости/213035',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil',
'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- section = mobj.group('section')
+ video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'http://lifenews.ru/%s/%s' % (section, video_id),
- video_id, 'Downloading page')
+ webpage = self._download_webpage(url, video_id)
video_urls = re.findall(
r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
title = remove_end(
self._og_search_title(webpage),
- ' - Первый по срочным новостям — LIFE | NEWS')
+ ' - Life.ru')
description = self._og_search_description(webpage)
view_count = self._html_search_regex(
- r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
- comment_count = self._html_search_regex(
- r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
- webpage, 'comment count', fatal=False)
+ r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>',
+ webpage, 'view count', fatal=False, group='value')
- upload_date = self._html_search_regex(
- r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
- if upload_date is not None:
- upload_date = unified_strdate(upload_date)
+ timestamp = parse_iso8601(self._search_regex(
+ r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1',
+ webpage, 'upload date', fatal=False, group='value'))
common_info = {
'description': description,
'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
}
def make_entry(video_id, video_url, index=None):
ext = determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id='m3u8'))
+ video_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='m3u8'))
else:
formats.append({
'url': video_url,
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ smuggle_url,
+ unsmuggle_url,
+)
+
+
+class LiTVIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)'
+
+ _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
+
+ _TESTS = [{
+ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+ 'info_dict': {
+ 'id': 'VOD00041606',
+ 'title': '花千骨',
+ },
+ 'playlist_count': 50,
+ }, {
+ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+ 'info_dict': {
+ 'id': 'VOD00041610',
+ 'ext': 'mp4',
+ 'title': '花千骨第1集',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f',
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True, # m3u8 download
+ },
+ 'skip': 'Georestricted to Taiwan',
+ }]
+
+ def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True):
+ episode_title = view_data['title']
+ content_id = season_list['contentId']
+
+ if prompt:
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
+
+ all_episodes = [
+ self.url_result(smuggle_url(
+ self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']),
+ {'force_noplaylist': True})) # To prevent infinite recursion
+ for episode in season_list['episode']]
+
+ return self.playlist_result(all_episodes, content_id, episode_title)
+
+ def _real_extract(self, url):
+ url, data = unsmuggle_url(url, {})
+
+ video_id = self._match_id(url)
+
+ noplaylist = self._downloader.params.get('noplaylist')
+ noplaylist_prompt = True
+ if 'force_noplaylist' in data:
+ noplaylist = data['force_noplaylist']
+ noplaylist_prompt = False
+
+ webpage = self._download_webpage(url, video_id)
+
+ view_data = dict(map(lambda t: (t[0], t[2]), re.findall(
+ r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2',
+ webpage)))
+
+ vod_data = self._parse_json(self._search_regex(
+            r'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
+ video_id)
+
+ season_list = list(vod_data.get('seasonList', {}).values())
+ if season_list:
+ if not noplaylist:
+ return self._extract_playlist(
+ season_list[0], video_id, vod_data, view_data,
+ prompt=noplaylist_prompt)
+
+ if noplaylist_prompt:
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+        # In browsers the `getMainUrl` request is always issued. Usually this
+        # endpoint gives the same result as the data embedded in the webpage.
+        # If georestricted, there is no embedded data, so an extra request is
+        # necessary to get the error code
+ video_data = self._parse_json(self._search_regex(
+ r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
+ webpage, 'video data', default='{}'), video_id)
+ if not video_data:
+ payload = {
+ 'assetId': view_data['assetId'],
+ 'watchDevices': vod_data['watchDevices'],
+ 'contentType': view_data['contentType'],
+ }
+ video_data = self._download_json(
+ 'https://www.litv.tv/vod/getMainUrl', video_id,
+ data=json.dumps(payload).encode('utf-8'),
+ headers={'Content-Type': 'application/json'})
+
+ if not video_data.get('fullpath'):
+ error_msg = video_data.get('errorMessage')
+ if error_msg == 'vod.error.outsideregionerror':
+ self.raise_geo_restricted('This video is available in Taiwan only')
+ if error_msg:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+ raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
+
+ formats = self._extract_m3u8_formats(
+ video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls')
+ for a_format in formats:
+            # LiTV HLS segments don't like compression
+ a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
+
+ title = view_data['title'] + view_data.get('secondaryMark', '')
+ description = view_data.get('description')
+ thumbnail = view_data.get('imageFile')
+ categories = [item['name'] for item in vod_data.get('category', [])]
+ episode = int_or_none(view_data.get('episode'))
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'episode_number': episode,
+ }
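The `viewData` scraping in `LiTVIE` above turns inline `viewData.key = 'value'` assignments into a dict via `re.findall`; the backreference `\2` lets each assignment use either quote style. A standalone sketch with an invented page fragment:

```python
import re

# Invented fragment in the shape of LiTV's inline viewData assignments
webpage = """
viewData.title = 'Demo Title';
viewData.contentType = "drama";
"""

# Each match is (key, quote, value); keep key and value only
view_data = dict(map(lambda t: (t[0], t[2]), re.findall(
    r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', webpage)))
```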
'ext': 'flv',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
- 'title': 'Most unlucky car accident'
+ 'title': 'Most unlucky car accident',
+ 'thumbnail': 're:^https?://.*\.jpg$'
}
}, {
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
'uploader': 'ARD_Stinkt',
'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
+ 'thumbnail': 're:^https?://.*\.jpg$'
}
}, {
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
'ext': 'mp4',
'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
'uploader': 'bony333',
- 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+ 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia',
+ 'thumbnail': 're:^https?://.*\.jpg$'
}
}]
age_limit = int_or_none(self._search_regex(
r'you confirm that you are ([0-9]+) years and over.',
webpage, 'age limit', default=None))
+ video_thumbnail = self._og_search_thumbnail(webpage)
sources_raw = self._search_regex(
r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
'uploader': video_uploader,
'formats': formats,
'age_limit': age_limit,
+ 'thumbnail': video_thumbnail,
}
}
def _extract_stream_info(self, stream_info):
- broadcast_id = stream_info['broadcast_id']
+ broadcast_id = compat_str(stream_info['broadcast_id'])
is_live = stream_info.get('is_live')
formats = []
if not videos_info:
break
for v in videos_info:
+ v_id = compat_str(v['id'])
entries.append(self.url_result(
- 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']),
- 'Livestream', v['id'], v['caption']))
+ 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id),
+ 'Livestream', v_id, v.get('caption')))
last_video = videos_info[-1]['id']
return self.playlist_result(entries, event_id, event_data['full_name'])
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class LocalNews8IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304',
+ 'md5': 'be4d48aea61aa2bde7be2ee47691ad20',
+ 'info_dict': {
+ 'id': '35183304',
+ 'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
+ 'ext': 'mp4',
+ 'title': 'Rexburg business turns carbon fiber scraps into wedding ring',
+ 'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.',
+ 'duration': 153,
+ 'timestamp': 1441844822,
+ 'upload_date': '20150910',
+ 'uploader_id': 'api',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ partner_id = self._search_regex(
+ r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1',
+ webpage, 'partner id', group='id')
+ kaltura_id = self._search_regex(
+ r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
+        webpage, 'video id', group='id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
+ 'ie_key': 'Kaltura',
+ 'id': video_id,
+ 'display_id': display_id,
+ }
_TEST = {
'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
- 'md5': '',
+ 'md5': '1bdadcf760a0b90946ca68ee9a2db41a',
'info_dict': {
'id': '3116640',
'ext': 'mp4',
'duration': 7461,
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- 'skip_download': True, # m3u8 download
- },
- }
-
- _FORMAT_MAP = {
- '标清': ('Standard', 0),
- '高清': ('High', 1),
- '超清': ('SuperHigh', 2),
}
def _real_extract(self, url):
formats = []
for idx, stream in enumerate(api_data['stream']):
- format_name = stream.get('name')
- format_id, preference = self._FORMAT_MAP.get(format_name, (None, None))
- format_info = self._download_json(
- stream['url'], video_id,
- note='Download video info for format %s' % format_id or '#%d' % idx)
- formats.append({
- 'format_id': format_id,
- 'url': format_info['info'],
- 'ext': 'mp4', # These are m3u8 playlists
- 'preference': preference,
- })
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ tbr = int_or_none(self._search_regex(
+ r'(\d+)\.mp4', stream_url, 'tbr', default=None))
+
+ def extract_format(stream_url, format_id, idx, query={}):
+ format_info = self._download_json(
+ stream_url, video_id,
+                note='Download video info for format %s' % (format_id or '#%d' % idx), query=query)
+ return {
+ 'format_id': format_id,
+ 'url': format_info['info'],
+ 'ext': 'mp4',
+ 'tbr': tbr,
+ }
+
+ formats.append(extract_format(
+ stream_url, 'hls-%d' % tbr if tbr else None, idx * 2))
+ formats.append(extract_format(stream_url.replace(
+ '/playlist.m3u8', ''), 'http-%d' % tbr if tbr else None, idx * 2 + 1, {'pno': 1031}))
self._sort_formats(formats)
return {
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_xpath,
+)
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ smuggle_url,
+ unsmuggle_url,
+ xpath_text,
+)
+
+
+class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
+ def _extract_base_url(self, course_id, display_id):
+ return self._download_json(
+ 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
+ display_id, 'Downloading course base URL')
+
+ def _extract_chapter_and_title(self, title):
+ if not title:
+ return None, None
+ m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
+ return (int(m.group('chapter')), m.group('title')) if m else (None, title)
+
+
+class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
+ IE_NAME = 'mva'
+ IE_DESC = 'Microsoft Virtual Academy videos'
+ _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
+
+ _TESTS = [{
+ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
+ 'md5': '7826c44fc31678b12ad8db11f6b5abb9',
+ 'info_dict': {
+ 'id': 'gfVXISmEB_6804984382',
+ 'ext': 'mp4',
+ 'title': 'Course Introduction',
+ 'formats': 'mincount:3',
+ 'subtitles': {
+ 'en': [{
+ 'ext': 'ttml',
+ }],
+ },
+ }
+ }, {
+ 'url': 'mva:11788:gfVXISmEB_6804984382',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
+ mobj = re.match(self._VALID_URL, url)
+ course_id = mobj.group('course_id')
+ video_id = mobj.group('id')
+
+ base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)
+
+ settings = self._download_xml(
+ '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
+ video_id, 'Downloading video settings XML')
+
+ _, title = self._extract_chapter_and_title(xpath_text(
+ settings, './/Title', 'title', fatal=True))
+
+ formats = []
+
+ for sources in settings.findall(compat_xpath('.//MediaSources')):
+ if sources.get('videoType') == 'smoothstreaming':
+ continue
+ for source in sources.findall(compat_xpath('./MediaSource')):
+ video_url = source.text
+ if not video_url or not video_url.startswith('http'):
+ continue
+ video_mode = source.get('videoMode')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
+ codec = source.get('codec')
+ acodec, vcodec = [None] * 2
+ if codec:
+ codecs = codec.split(',')
+ if len(codecs) == 2:
+ acodec, vcodec = codecs
+ elif len(codecs) == 1:
+ vcodec = codecs[0]
+ formats.append({
+ 'url': video_url,
+ 'format_id': video_mode,
+ 'height': height,
+ 'acodec': acodec,
+ 'vcodec': vcodec,
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
+ subtitle_url = source.text
+ if not subtitle_url:
+ continue
+ subtitles.setdefault('en', []).append({
+ 'url': '%s/%s' % (base_url, subtitle_url),
+ 'ext': source.get('type'),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'subtitles': subtitles,
+ 'formats': formats
+ }
+
+
+class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
+ IE_NAME = 'mva:course'
+ IE_DESC = 'Microsoft Virtual Academy courses'
+ _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
+
+ _TESTS = [{
+ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+ 'info_dict': {
+ 'id': '11788',
+ 'title': 'Microsoft Azure Fundamentals: Virtual Machines',
+ },
+ 'playlist_count': 36,
+ }, {
+ # with emphasized chapters
+ 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
+ 'info_dict': {
+ 'id': '16335',
+ 'title': 'Developing Windows 10 Games with Construct 2',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+ 'only_matching': True,
+ }, {
+ 'url': 'mva:course:11788',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
+ MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ base_url = self._extract_base_url(course_id, display_id)
+
+ manifest = self._download_json(
+ '%s/imsmanifestlite.json' % base_url,
+ display_id, 'Downloading course manifest JSON')['manifest']
+
+ organization = manifest['organizations']['organization'][0]
+
+ entries = []
+ for chapter in organization['item']:
+ chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
+ chapter_id = chapter.get('@identifier')
+ for item in chapter.get('item', []):
+ item_id = item.get('@identifier')
+ if not item_id:
+ continue
+ metadata = item.get('resource', {}).get('metadata') or {}
+ if metadata.get('learningresourcetype') != 'Video':
+ continue
+ _, title = self._extract_chapter_and_title(item.get('title'))
+ duration = parse_duration(metadata.get('duration'))
+ description = metadata.get('description')
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': smuggle_url(
+ 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'chapter': chapter_title,
+ 'chapter_number': chapter_number,
+ 'chapter_id': chapter_id,
+ })
+
+ title = organization.get('title') or manifest.get('metadata', {}).get('title')
+
+ return self.playlist_result(entries, course_id, title)
# This video has expired but with an escaped embedURL
'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
'only_matching': True,
+ },
+ {
+        # HLS streams require the 'hdnea3' cookie
+ 'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
+ 'info_dict': {
+ 'id': 'n1806',
+ 'ext': 'mp4',
+ 'title': 'Goliath',
+ 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
+ 'timestamp': 1237100400,
+ 'upload_date': '20090315',
+ 'uploader': 'NBCU-COM',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only works from US',
}
]
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
- month_by_name,
int_or_none,
+ remove_end,
+ unified_strdate,
)
class NDTVIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710',
+ 'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710',
'md5': '39f992dbe5fb531c395d8bbedb1e5e88',
'info_dict': {
'id': '300710',
'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02',
'upload_date': '20131208',
'duration': 1327,
- 'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg',
+ 'thumbnail': 're:https?://.*\.jpg',
},
}
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ title = remove_end(self._og_search_title(webpage), ' - NDTV')
+
filename = self._search_regex(
r"__filename='([^']+)'", webpage, 'video filename')
- video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' %
- filename)
+ video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename
duration = int_or_none(self._search_regex(
r"__duration='([^']+)'", webpage, 'duration', fatal=False))
- date_m = re.search(r'''(?x)
- <p\s+class="vod_dateline">\s*
- Published\s+On:\s*
- (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
- ''', webpage)
- upload_date = None
-
- if date_m is not None:
- month = month_by_name(date_m.group('monthname'))
- if month is not None:
- upload_date = '%s%02d%02d' % (
- date_m.group('year'), month, int(date_m.group('day')))
-
- description = self._og_search_description(webpage)
- READ_MORE = ' (Read more)'
- if description.endswith(READ_MORE):
- description = description[:-len(READ_MORE)]
+ upload_date = unified_strdate(self._html_search_meta(
+ 'publish-date', webpage, 'upload date', fatal=False))
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' - NDTV'
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
+ description = remove_end(self._og_search_description(webpage), ' (Read more)')
return {
'id': video_id,
from .common import InfoExtractor
from ..utils import (
- sanitized_Request,
+ clean_html,
+ determine_ext,
+ int_or_none,
+ qualities,
urlencode_postdata,
+ xpath_text,
)
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': {
'id': 'qallunaat_why_white_people_are_funny',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Qallunaat! Why White People Are Funny ',
- 'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
+ 'description': 'md5:6b8e32dde3abf91e58857b174916620c',
'duration': 3128,
+ 'creator': 'Mark Sandiford',
'uploader': 'Mark Sandiford',
- 'uploader_id': 'mark-sandiford',
},
'params': {
# rtmp download
def _real_extract(self, url):
video_id = self._match_id(url)
- page = self._download_webpage(
- 'https://www.nfb.ca/film/%s' % video_id, video_id,
- 'Downloading film page')
- uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
- page, 'director id', fatal=False)
- uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
- page, 'director name', fatal=False)
-
- request = sanitized_Request(
+ config = self._download_xml(
'https://www.nfb.ca/film/%s/player_config' % video_id,
- urlencode_postdata({'getConfig': 'true'}))
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
-
- config = self._download_xml(request, video_id, 'Downloading player config XML')
+ video_id, 'Downloading player config XML',
+ data=urlencode_postdata({'getConfig': 'true'}),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
+ })
- title = None
- description = None
- thumbnail = None
- duration = None
- formats = []
-
- def extract_thumbnail(media):
- thumbnails = {}
- for asset in media.findall('assets/asset'):
- thumbnails[asset.get('quality')] = asset.find('default/url').text
- if not thumbnails:
- return None
- if 'high' in thumbnails:
- return thumbnails['high']
- return list(thumbnails.values())[0]
+ title, description, thumbnail, duration, uploader, author = [None] * 6
+        thumbnails, formats = [], []
+ subtitles = {}
for media in config.findall('./player/stream/media'):
if media.get('type') == 'posterImage':
- thumbnail = extract_thumbnail(media)
+ quality_key = qualities(('low', 'high'))
+ thumbnails = []
+ for asset in media.findall('assets/asset'):
+ asset_url = xpath_text(asset, 'default/url', default=None)
+ if not asset_url:
+ continue
+ quality = asset.get('quality')
+ thumbnails.append({
+ 'url': asset_url,
+ 'id': quality,
+ 'preference': quality_key(quality),
+ })
elif media.get('type') == 'video':
- duration = int(media.get('duration'))
- title = media.find('title').text
- description = media.find('description').text
- # It seems assets always go from lower to better quality, so no need to sort
+ title = xpath_text(media, 'title', fatal=True)
for asset in media.findall('assets/asset'):
- for x in asset:
+ quality = asset.get('quality')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', quality or '', 'height', default=None))
+ for node in asset:
+ streamer = xpath_text(node, 'streamerURI', default=None)
+ if not streamer:
+ continue
+ play_path = xpath_text(node, 'url', default=None)
+ if not play_path:
+ continue
formats.append({
- 'url': x.find('streamerURI').text,
- 'app': x.find('streamerURI').text.split('/', 3)[3],
- 'play_path': x.find('url').text,
+ 'url': streamer,
+ 'app': streamer.split('/', 3)[3],
+ 'play_path': play_path,
'rtmp_live': False,
- 'ext': 'mp4',
- 'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+ 'ext': 'flv',
+ 'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
+ 'height': height,
})
+ self._sort_formats(formats)
+ description = clean_html(xpath_text(media, 'description'))
+ uploader = xpath_text(media, 'author')
+ duration = int_or_none(media.get('duration'))
+ for subtitle in media.findall('./subtitles/subtitle'):
+ subtitle_url = xpath_text(subtitle, 'url', default=None)
+ if not subtitle_url:
+ continue
+ lang = xpath_text(subtitle, 'lang', default='en')
+ subtitles.setdefault(lang, []).append({
+ 'url': subtitle_url,
+ 'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
+ })
return {
'id': video_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'duration': duration,
+ 'creator': uploader,
'uploader': uploader,
- 'uploader_id': uploader_id,
'formats': formats,
+ 'subtitles': subtitles,
}
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
- compat_urllib_parse_unquote,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
- determine_ext,
ExtractorError,
- float_or_none,
+ int_or_none,
+ parse_age_limit,
parse_duration,
- unified_strdate,
)
-class NRKIE(InfoExtractor):
- _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
-
- _TESTS = [
- {
- 'url': 'http://www.nrk.no/video/PS*150533',
- # MD5 is unstable
- 'info_dict': {
- 'id': '150533',
- 'ext': 'flv',
- 'title': 'Dompap og andre fugler i Piip-Show',
- 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
- 'duration': 263,
- }
- },
- {
- 'url': 'http://www.nrk.no/video/PS*154915',
- # MD5 is unstable
- 'info_dict': {
- 'id': '154915',
- 'ext': 'flv',
- 'title': 'Slik høres internett ut når du er blind',
- 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
- 'duration': 20,
- }
- },
- ]
+class NRKBaseIE(InfoExtractor):
+ def _extract_formats(self, manifest_url, video_id, fatal=True):
+ formats = []
+ formats.extend(self._extract_f4m_formats(
+ manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81',
+ video_id, f4m_id='hds', fatal=fatal))
+ formats.extend(self._extract_m3u8_formats(manifest_url.replace(
+ 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'),
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal))
+ return formats
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
- 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
- video_id, 'Downloading media JSON')
+ 'http://%s/mediaelement/%s' % (self._API_HOST, video_id),
+ video_id, 'Downloading mediaelement JSON')
+
+ title = data.get('fullTitle') or data.get('mainTitle') or data['title']
+ video_id = data.get('id') or video_id
+
+ entries = []
+
+ media_assets = data.get('mediaAssets')
+ if media_assets and isinstance(media_assets, list):
+ def video_id_and_title(idx):
+ return ((video_id, title) if len(media_assets) == 1
+ else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
+ for num, asset in enumerate(media_assets, 1):
+ asset_url = asset.get('url')
+ if not asset_url:
+ continue
+ formats = self._extract_formats(asset_url, video_id, fatal=False)
+ if not formats:
+ continue
+ self._sort_formats(formats)
+ entry_id, entry_title = video_id_and_title(num)
+ duration = parse_duration(asset.get('duration'))
+ subtitles = {}
+ for subtitle in ('webVtt', 'timedText'):
+ subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
+ if subtitle_url:
+ subtitles.setdefault('no', []).append({
+ 'url': compat_urllib_parse_unquote(subtitle_url)
+ })
+ entries.append({
+ 'id': asset.get('carrierId') or entry_id,
+ 'title': entry_title,
+ 'duration': duration,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ })
- media_url = data.get('mediaUrl')
+ if not entries:
+ media_url = data.get('mediaUrl')
+ if media_url:
+ formats = self._extract_formats(media_url, video_id)
+ self._sort_formats(formats)
+ duration = parse_duration(data.get('duration'))
+ entries = [{
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'formats': formats,
+ }]
- if not media_url:
- if data['usageRights']['isGeoBlocked']:
+ if not entries:
+ if data.get('usageRights', {}).get('isGeoBlocked'):
raise ExtractorError(
'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
expected=True)
- if determine_ext(media_url) == 'f4m':
- formats = self._extract_f4m_formats(
- media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds')
- self._sort_formats(formats)
- else:
- formats = [{
- 'url': media_url,
- 'ext': 'flv',
- }]
-
- duration = parse_duration(data.get('duration'))
+ conviva = data.get('convivaStatistics') or {}
+ series = conviva.get('seriesName') or data.get('seriesTitle')
+ episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
+ thumbnails = None
images = data.get('images')
- if images:
- thumbnails = images['webImages']
- thumbnails.sort(key=lambda image: image['pixelWidth'])
- thumbnail = thumbnails[-1]['imageUrl']
- else:
- thumbnail = None
-
- return {
- 'id': video_id,
- 'title': data['title'],
- 'description': data['description'],
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'formats': formats,
+ if images and isinstance(images, dict):
+ web_images = images.get('webImages')
+ if isinstance(web_images, list):
+ thumbnails = [{
+ 'url': image['imageUrl'],
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in web_images if image.get('imageUrl')]
+
+ description = data.get('description')
+
+ common_info = {
+ 'description': description,
+ 'series': series,
+ 'episode': episode,
+ 'age_limit': parse_age_limit(data.get('legalAge')),
+ 'thumbnails': thumbnails,
}
+ vcodec = 'none' if data.get('mediaType') == 'Audio' else None
+
+ # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
+
+ for entry in entries:
+ entry.update(common_info)
+ for f in entry['formats']:
+ f['vcodec'] = vcodec
+
+ return self.playlist_result(entries, video_id, title, description)
+
+
+class NRKIE(NRKBaseIE):
+ _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
+ _API_HOST = 'v8.psapi.nrk.no'
+ _TESTS = [{
+ # video
+ 'url': 'http://www.nrk.no/video/PS*150533',
+ 'md5': '2f7f6eeb2aacdd99885f355428715cfa',
+ 'info_dict': {
+ 'id': '150533',
+ 'ext': 'mp4',
+ 'title': 'Dompap og andre fugler i Piip-Show',
+ 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+ 'duration': 263,
+ }
+ }, {
+ # audio
+ 'url': 'http://www.nrk.no/video/PS*154915',
+ # MD5 is unstable
+ 'info_dict': {
+ 'id': '154915',
+ 'ext': 'flv',
+ 'title': 'Slik høres internett ut når du er blind',
+ 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+ 'duration': 20,
+ }
+ }]
+
+
+class NRKTVIE(NRKBaseIE):
+ IE_DESC = 'NRK TV and NRK Radio'
+ _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+ _API_HOST = 'psapi-we.nrk.no'
+
+ _TESTS = [{
+ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+ 'md5': '4e9ca6629f09e588ed240fb11619922a',
+ 'info_dict': {
+ 'id': 'MUHH48000314AA',
+ 'ext': 'mp4',
+ 'title': '20 spørsmål 23.05.2014',
+ 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+ 'duration': 1741.52,
+ },
+ }, {
+ 'url': 'https://tv.nrk.no/program/mdfp15000514',
+ 'md5': '43d0be26663d380603a9cf0c24366531',
+ 'info_dict': {
+ 'id': 'MDFP15000514CA',
+ 'ext': 'mp4',
+ 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014',
+ 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db',
+ 'duration': 4605.08,
+ },
+ }, {
+ # single playlist video
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ },
+ 'skip': 'Only works from Norway',
+ }, {
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+ 'playlist': [{
+ 'md5': '9480285eff92d64f06e02a5367970a7a',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part1',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ },
+ }, {
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ },
+ }],
+ 'info_dict': {
+ 'id': 'MSPO40010515',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'duration': 6947.52,
+ },
+ 'skip': 'Only works from Norway',
+ }, {
+ 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+ 'only_matching': True,
+ }]
+
class NRKPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id')
return self.url_result('nrk:%s' % nrk_id)
-
-
-class NRKTVIE(InfoExtractor):
- IE_DESC = 'NRK TV and NRK Radio'
- _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
-
- _TESTS = [
- {
- 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
- 'info_dict': {
- 'id': 'MUHH48000314',
- 'ext': 'mp4',
- 'title': '20 spørsmål',
- 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
- 'upload_date': '20140523',
- 'duration': 1741.52,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'https://tv.nrk.no/program/mdfp15000514',
- 'info_dict': {
- 'id': 'mdfp15000514',
- 'ext': 'mp4',
- 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
- 'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
- 'upload_date': '20140524',
- 'duration': 4605.08,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- # single playlist video
- 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
- 'md5': 'adbd1dbd813edaf532b0a253780719c2',
- 'info_dict': {
- 'id': 'MSPO40010515-part2',
- 'ext': 'flv',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- },
- 'skip': 'Only works from Norway',
- },
- {
- 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
- 'playlist': [
- {
- 'md5': '9480285eff92d64f06e02a5367970a7a',
- 'info_dict': {
- 'id': 'MSPO40010515-part1',
- 'ext': 'flv',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- },
- },
- {
- 'md5': 'adbd1dbd813edaf532b0a253780719c2',
- 'info_dict': {
- 'id': 'MSPO40010515-part2',
- 'ext': 'flv',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- },
- },
- ],
- 'info_dict': {
- 'id': 'MSPO40010515',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- 'duration': 6947.5199999999995,
- },
- 'skip': 'Only works from Norway',
- },
- {
- 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
- 'only_matching': True,
- }
- ]
-
- def _extract_f4m(self, manifest_url, video_id):
- return self._extract_f4m_formats(
- manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- part_id = mobj.group('part_id')
- base_url = mobj.group('baseurl')
-
- webpage = self._download_webpage(url, video_id)
-
- title = self._html_search_meta(
- 'title', webpage, 'title')
- description = self._html_search_meta(
- 'description', webpage, 'description')
-
- thumbnail = self._html_search_regex(
- r'data-posterimage="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
- upload_date = unified_strdate(self._html_search_meta(
- 'rightsfrom', webpage, 'upload date', fatal=False))
- duration = float_or_none(self._html_search_regex(
- r'data-duration="([^"]+)"',
- webpage, 'duration', fatal=False))
-
- # playlist
- parts = re.findall(
- r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
- if parts:
- entries = []
- for current_part_id, stream_url, part_title in parts:
- if part_id and current_part_id != part_id:
- continue
- video_part_id = '%s-part%s' % (video_id, current_part_id)
- formats = self._extract_f4m(stream_url, video_part_id)
- entries.append({
- 'id': video_part_id,
- 'title': part_title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'formats': formats,
- })
- if part_id:
- if entries:
- return entries[0]
- else:
- playlist = self.playlist_result(entries, video_id, title, description)
- playlist.update({
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- })
- return playlist
-
- formats = []
-
- f4m_url = re.search(r'data-media="([^"]+)"', webpage)
- if f4m_url:
- formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
-
- m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
- self._sort_formats(formats)
-
- subtitles_url = self._html_search_regex(
- r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
- webpage, 'subtitle URL', default=None, group='url')
- subtitles = {}
- if subtitles_url:
- subtitles['no'] = [{
- 'ext': 'ttml',
- 'url': compat_urlparse.urljoin(base_url, subtitles_url),
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- 'formats': formats,
- 'subtitles': subtitles,
- }
float_or_none,
ExtractorError,
unsmuggle_url,
+ determine_ext,
)
from ..compat import compat_urllib_parse_urlencode
class OoyalaBaseIE(InfoExtractor):
_PLAYER_BASE = 'http://player.ooyala.com/'
_CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
- _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?'
+ _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?'
def _extract(self, content_tree_url, video_id, domain='example.org'):
content_tree = self._download_json(content_tree_url, video_id)['content_tree']
metadata = content_tree[list(content_tree)[0]]
embed_code = metadata['embed_code']
pcode = metadata.get('asset_pcode') or embed_code
- video_info = {
- 'id': embed_code,
- 'title': metadata['title'],
- 'description': metadata.get('description'),
- 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
- 'duration': float_or_none(metadata.get('duration'), 1000),
- }
+ title = metadata['title']
+
+ auth_data = self._download_json(
+ self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
+ compat_urllib_parse_urlencode({
+ 'domain': domain,
+ 'supportedFormats': 'mp4,rtmp,m3u8,hds',
+ }), video_id)
+
+ cur_auth_data = auth_data['authorization_data'][embed_code]
urls = []
formats = []
- for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'):
- auth_data = self._download_json(
- self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
- compat_urllib_parse_urlencode({
- 'domain': domain,
- 'supportedFormats': supported_format
- }),
- video_id, 'Downloading %s JSON' % supported_format)
-
- cur_auth_data = auth_data['authorization_data'][embed_code]
-
- if cur_auth_data['authorized']:
- for stream in cur_auth_data['streams']:
- url = base64.b64decode(
- stream['url']['data'].encode('ascii')).decode('utf-8')
- if url in urls:
- continue
- urls.append(url)
- delivery_type = stream['delivery_type']
- if delivery_type == 'hls' or '.m3u8' in url:
- formats.extend(self._extract_m3u8_formats(
- url, embed_code, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- elif delivery_type == 'hds' or '.f4m' in url:
- formats.extend(self._extract_f4m_formats(
- url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
- elif '.smil' in url:
- formats.extend(self._extract_smil_formats(
- url, embed_code, fatal=False))
- else:
- formats.append({
- 'url': url,
- 'ext': stream.get('delivery_type'),
- 'vcodec': stream.get('video_codec'),
- 'format_id': delivery_type,
- 'width': int_or_none(stream.get('width')),
- 'height': int_or_none(stream.get('height')),
- 'abr': int_or_none(stream.get('audio_bitrate')),
- 'vbr': int_or_none(stream.get('video_bitrate')),
- 'fps': float_or_none(stream.get('framerate')),
- })
- else:
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, cur_auth_data['message']), expected=True)
+ if cur_auth_data['authorized']:
+ for stream in cur_auth_data['streams']:
+ s_url = base64.b64decode(
+ stream['url']['data'].encode('ascii')).decode('utf-8')
+ if s_url in urls:
+ continue
+ urls.append(s_url)
+ ext = determine_ext(s_url, None)
+ delivery_type = stream['delivery_type']
+ if delivery_type == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ s_url, embed_code, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif delivery_type == 'hds' or ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ s_url, embed_code, fatal=False))
+ else:
+ formats.append({
+ 'url': s_url,
+ 'ext': ext or stream.get('delivery_type'),
+ 'vcodec': stream.get('video_codec'),
+ 'format_id': delivery_type,
+ 'width': int_or_none(stream.get('width')),
+ 'height': int_or_none(stream.get('height')),
+ 'abr': int_or_none(stream.get('audio_bitrate')),
+ 'vbr': int_or_none(stream.get('video_bitrate')),
+ 'fps': float_or_none(stream.get('framerate')),
+ })
+ else:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, cur_auth_data['message']), expected=True)
self._sort_formats(formats)
- video_info['formats'] = formats
- return video_info
+ subtitles = {}
+ for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ subtitles[lang] = [{
+ 'url': sub_url,
+ }]
+
+ return {
+ 'id': embed_code,
+ 'title': title,
+ 'description': metadata.get('description'),
+ 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
+ 'duration': float_or_none(metadata.get('duration'), 1000),
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
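
The hunk above replaces substring checks like `'.m3u8' in url` with `determine_ext(s_url, None)`. A minimal sketch of what such an extension-guessing helper does (a simplification for illustration, not youtube-dl's actual implementation, which also handles query strings and format templates):

```python
import re
from urllib.parse import urlparse  # Python 3; youtube-dl uses its own compat layer


def determine_ext_sketch(url, default_ext='unknown_video'):
    # Guess a file extension from the URL path; fall back to default_ext
    # when the candidate contains characters that cannot be an extension.
    if url is None:
        return default_ext
    guess = urlparse(url).path.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext
```

Matching on the parsed path rather than the raw URL avoids false positives from query parameters such as `?fmt=.m3u8`.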
class OoyalaIE(OoyalaBaseIE):
raise ExtractorError('File not found', expected=True)
code = self._search_regex(
- r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>',
+ r'</video>\s*</div>\s*<script[^>]+>([^<]+)</script>',
webpage, 'JS code')
decoded = self.openload_decode(code)
class OraTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'
+ _TESTS = [{
'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
'md5': 'fa33717591c631ec93b04b0e330df786',
'info_dict': {
'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
}
- }
+ }, {
+ 'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..utils import (
+ parse_iso8601,
+ unescapeHTML,
+)
class PeriscopeIE(InfoExtractor):
IE_DESC = 'Periscope'
+ IE_NAME = 'periscope'
_VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
# Alive example URLs can be found here http://onperiscope.com/
_TESTS = [{
broadcast = broadcast_data['broadcast']
status = broadcast['status']
- uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
- uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+ user = broadcast_data.get('user', {})
+
+ uploader = broadcast.get('user_display_name') or user.get('display_name')
+ uploader_id = (broadcast.get('username') or user.get('username') or
+ broadcast.get('user_id') or user.get('id'))
title = '%s - %s' % (uploader, status) if uploader else status
state = broadcast.get('state').lower()
'thumbnails': thumbnails,
'formats': formats,
}
+
+
+class PeriscopeUserIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$'
+ IE_DESC = 'Periscope user videos'
+ IE_NAME = 'periscope:user'
+
+ _TEST = {
+ 'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+ 'info_dict': {
+ 'id': 'LularoeHusbandMike',
+ 'title': 'LULAROE HUSBAND MIKE',
+ 'description': 'md5:6cf4ec8047768098da58e446e82c82f0',
+ },
+ # Periscope only shows videos in the last 24 hours, so it's possible to
+ # get 0 videos
+ 'playlist_mincount': 0,
+ }
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, user_id)
+
+ data_store = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'data-store=(["\'])(?P<data>.+?)\1',
+ webpage, 'data store', default='{}', group='data')),
+ user_id)
+
+ user = data_store.get('User', {}).get('user', {})
+ title = user.get('display_name') or user.get('username')
+ description = user.get('description')
+
+ entries = [
+ self.url_result(
+ 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
+ for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])]
+
+ return self.playlist_result(entries, user_id, title, description)
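
The new `PeriscopeUserIE` pulls an HTML-escaped JSON blob out of the page's `data-store` attribute. A self-contained sketch of that extraction step (the markup below is invented to mimic the shape of the page state, not captured output):

```python
import json
import re
from html import unescape  # Python 3 stand-in for youtube-dl's unescapeHTML

# Invented markup in the shape Periscope user pages embed their page state
html_page = ('<div id="page-container" data-store='
             '"{&quot;User&quot;:{&quot;user&quot;:{&quot;display_name&quot;:&quot;Alice&quot;}}}">')

m = re.search(r'data-store=(["\'])(?P<data>.+?)\1', html_page)
data_store = json.loads(unescape(m.group('data')))
```

The backreference `\1` lets the same pattern accept either quote style around the attribute value.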
from .common import InfoExtractor
from ..utils import (
- xpath_text,
+ dict_get,
float_or_none,
- int_or_none,
)
'duration': 145.94,
},
}, {
+ # m3u8 in f4m
+ 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json',
+ 'info_dict': {
+ 'id': '4840492',
+ 'ext': 'mp4',
+ 'title': 'ITV EL SHOW FULL',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+    # Multiple resolutions while bitrates are missing
'url': 'http://cdn.playwire.com/11625/embed/85228.html',
'only_matching': True,
}, {
thumbnail = content.get('poster')
src = content['media']['f4m']
- f4m = self._download_xml(src, video_id)
- base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True)
- formats = []
- for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'):
- media_url = media.get('url')
- if not media_url:
- continue
- tbr = int_or_none(media.get('bitrate'))
- width = int_or_none(media.get('width'))
- height = int_or_none(media.get('height'))
- f = {
- 'url': '%s/%s' % (base_url, media.attrib['url']),
- 'tbr': tbr,
- 'width': width,
- 'height': height,
- }
- if not (tbr or width or height):
- f['quality'] = 1 if '-hd.' in media_url else 0
- formats.append(f)
+ formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls')
+ for a_format in formats:
+ if not dict_get(a_format, ['tbr', 'width', 'height']):
+ a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0
self._sort_formats(formats)
return {
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ find_xpath_attr,
+ determine_ext,
+ int_or_none,
+ unified_strdate,
+ xpath_element,
+ ExtractorError,
+)
+
+
+class RadioCanadaIE(InfoExtractor):
+ IE_NAME = 'radiocanada'
+ _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+ 'info_dict': {
+ 'id': '7184272',
+ 'ext': 'flv',
+ 'title': 'Le parcours du tireur capté sur vidéo',
+ 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+ 'upload_date': '20141023',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ app_code, video_id = re.match(self._VALID_URL, url).groups()
+
+ formats = []
+ # TODO: extract m3u8 and f4m formats
+        # m3u8 formats can be extracted using the ipad device_type, but the server
+        # returns a 403 error code when ffmpeg tries to download the segments
+        # f4m formats can be extracted using the flashhd device_type, but they produce unplayable files
+ for device_type in ('flash',):
+ v_data = self._download_xml(
+ 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
+ video_id, note='Downloading %s XML' % device_type, query={
+ 'appCode': app_code,
+ 'idMedia': video_id,
+ 'connectionType': 'broadband',
+ 'multibitrate': 'true',
+ 'deviceType': device_type,
+ # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
+ 'paysJ391wsHjbOJwvCs26toz': 'CA',
+ 'bypasslock': 'NZt5K62gRqfc',
+ })
+ v_url = xpath_text(v_data, 'url')
+ if not v_url:
+ continue
+ if v_url == 'null':
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
+ ext = determine_ext(v_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False))
+ else:
+ ext = determine_ext(v_url)
+ bitrates = xpath_element(v_data, 'bitrates')
+ for url_e in bitrates.findall('url'):
+ tbr = int_or_none(url_e.get('bitrate'))
+ if not tbr:
+ continue
+ formats.append({
+ 'format_id': 'rtmp-%d' % tbr,
+ 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url),
+ 'ext': 'flv',
+ 'protocol': 'rtmp',
+ 'width': int_or_none(url_e.get('width')),
+ 'height': int_or_none(url_e.get('height')),
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
+
+ metadata = self._download_xml(
+ 'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
+ video_id, note='Downloading metadata XML', query={
+ 'appCode': app_code,
+ 'idMedia': video_id,
+ })
+
+ def get_meta(name):
+ el = find_xpath_attr(metadata, './/Meta', 'name', name)
+ return el.text if el is not None else None
+
+ return {
+ 'id': video_id,
+ 'title': get_meta('Title'),
+ 'description': get_meta('Description') or get_meta('ShortDescription'),
+ 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
+ 'duration': int_or_none(get_meta('length')),
+ 'series': get_meta('Emission'),
+            'season_number': int_or_none(get_meta('SrcSaison')),
+            'episode_number': int_or_none(get_meta('SrcEpisode')),
+ 'upload_date': unified_strdate(get_meta('Date')),
+ 'formats': formats,
+ }
+
+
+class RadioCanadaAudioVideoIE(InfoExtractor):
+    IE_NAME = 'radiocanada:audiovideo'
+ _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
+ 'info_dict': {
+ 'id': '7527184',
+ 'ext': 'flv',
+ 'title': 'Barack Obama au Vietnam',
+ 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+ 'upload_date': '20160523',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
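
The `get_meta` helper above wraps `find_xpath_attr`; with plain ElementTree the same lookup can be sketched like this (the XML fragment is made up to match the shape the metaMedia endpoint appears to return):

```python
import xml.etree.ElementTree as ET

metadata = ET.fromstring(
    '<Medias>'
    '<Meta name="Title">Le parcours du tireur</Meta>'
    '<Meta name="length">145</Meta>'
    '</Medias>'
)


def get_meta(name):
    # Equivalent of find_xpath_attr(metadata, './/Meta', 'name', name):
    # find the first Meta element whose name attribute matches
    el = metadata.find('.//Meta[@name="%s"]' % name)
    return el.text if el is not None else None
```

Returning `None` for missing keys is what allows the chained fallbacks like `get_meta('Description') or get_meta('ShortDescription')`.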
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ str_to_int,
+ unified_strdate,
+)
class RedTubeIE(InfoExtractor):
'id': '66418',
'ext': 'mp4',
'title': 'Sucked on a toilet',
+ 'upload_date': '20120831',
+ 'duration': 596,
+ 'view_count': int,
'age_limit': 18,
}
}
if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
raise ExtractorError('Video %s has been removed' % video_id, expected=True)
- video_url = self._html_search_regex(
- r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
- video_title = self._html_search_regex(
- r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
- webpage, 'title')
- video_thumbnail = self._og_search_thumbnail(webpage)
+ title = self._html_search_regex(
+ (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+         r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+ webpage, 'title', group='title')
+
+ formats = []
+ sources = self._parse_json(
+ self._search_regex(
+ r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+ video_id, fatal=False)
+ if sources and isinstance(sources, dict):
+ for format_id, format_url in sources.items():
+ if format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': int_or_none(format_id),
+ })
+ else:
+ video_url = self._html_search_regex(
+ r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+ formats.append({'url': video_url})
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._search_regex(
+ r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+ webpage, 'upload date', fatal=False))
+ duration = int_or_none(self._search_regex(
+ r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+ view_count = str_to_int(self._search_regex(
+ r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+ webpage, 'view count', fatal=False))
# No self-labeling, but they describe themselves as
# "Home of Videos Porno"
return {
'id': video_id,
- 'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': video_thumbnail,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
'age_limit': age_limit,
+ 'formats': formats,
}
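
The `sources` object picked out of the RedTube page maps quality labels to URLs; the conversion into format dicts can be sketched as follows (the page snippet is invented, and real pages may need `js_to_json` before parsing):

```python
import json
import re

# Invented snippet in the shape the extractor's regex expects
webpage = 'sources: {"480": "http://cdn.example.com/v_480p.mp4", "720": "http://cdn.example.com/v_720p.mp4"}'

sources = json.loads(re.search(r'sources\s*:\s*({.+?})', webpage).group(1))
formats = sorted(
    ({'url': u, 'format_id': fid, 'height': int(fid)} for fid, u in sources.items()),
    key=lambda f: f['height'])
```

The quality label doubles as the height, which is why the hunk above passes `int_or_none(format_id)` for `height`.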
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ int_or_none,
+ unescapeHTML,
+)
+
+
+class ReutersIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562',
+ 'md5': '8015113643a0b12838f160b0b81cc2ee',
+ 'info_dict': {
+ 'id': '368575562',
+ 'ext': 'mp4',
+ 'title': 'San Francisco police chief resigns',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(
+ 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id)
+ video_data = js_to_json(self._search_regex(
+ r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);',
+ webpage, 'video data'))
+
+ def get_json_value(key, fatal=False):
+        return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal)
+
+ title = unescapeHTML(get_json_value('title', fatal=True))
+ mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups()
+
+ mas_data = self._download_json(
+ 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid),
+ video_id, transform_source=js_to_json)
+ formats = []
+ for f in mas_data:
+ f_url = f.get('url')
+ if not f_url:
+ continue
+ method = f.get('method')
+ if method == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ container = f.get('container')
+ ext = '3gp' if method == 'mobile' else container
+ formats.append({
+ 'format_id': ext,
+ 'url': f_url,
+ 'ext': ext,
+ 'container': container if method != 'mobile' else None,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': get_json_value('thumb'),
+ 'duration': int_or_none(get_json_value('seconds')),
+ 'formats': formats,
+ }
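
`get_json_value` pulls string values straight out of the raw player blob with a regex instead of fully parsing it; a standalone sketch (the blob below is invented for illustration):

```python
import re

video_data = ('{"title": "San Francisco police chief resigns", '
              '"seconds": "83", "thumb": "http://example.com/t.jpg"}')


def get_json_value(key, fatal=False):
    # Grab a double-quoted string value for key without parsing the whole blob
    m = re.search(r'"%s"\s*:\s*"([^"]+)"' % key, video_data)
    if m:
        return m.group(1)
    if fatal:
        raise ValueError('Unable to extract %s' % key)
    return None
```

This works here because the values of interest are always quoted strings; a full JSON parse would be needed for nested or numeric values.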
)
+class Revision3EmbedIE(InfoExtractor):
+ IE_NAME = 'revision3:embed'
+ _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)'
+ _TEST = {
+ 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558',
+ 'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+ 'info_dict': {
+ 'id': '67558',
+ 'ext': 'mp4',
+ 'title': 'The Pros & Cons Of Zoos',
+ 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
+ 'uploader_id': 'dnews',
+ 'uploader': 'DNews',
+ }
+ }
+ _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('playlist_id')
+ playlist_type = mobj.group('playlist_type') or 'video_id'
+ video_data = self._download_json(
+ 'http://revision3.com/api/getPlaylist.json', playlist_id, query={
+ 'api_key': self._API_KEY,
+ 'codecs': 'h264,vp8,theora',
+ playlist_type: playlist_id,
+ })['items'][0]
+
+ formats = []
+ for vcodec, media in video_data['media'].items():
+ for quality_id, quality in media.items():
+ if quality_id == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ quality['url'], playlist_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': quality['url'],
+ 'format_id': '%s-%s' % (vcodec, quality_id),
+ 'tbr': int_or_none(quality.get('bitrate')),
+ 'vcodec': vcodec,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': playlist_id,
+ 'title': unescapeHTML(video_data['title']),
+ 'description': unescapeHTML(video_data.get('summary')),
+ 'uploader': video_data.get('show', {}).get('name'),
+ 'uploader_id': video_data.get('show', {}).get('slug'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'formats': formats,
+ }
+
+
class Revision3IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
+ IE_NAME = 'revision'
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
_TESTS = [{
'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
'md5': 'd94a72d85d0a829766de4deb8daaf7df',
}
}, {
# Show
- 'url': 'http://testtube.com/brainstuff',
- 'info_dict': {
- 'id': '251',
- 'title': 'BrainStuff',
- 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.',
- },
- 'playlist_mincount': 93,
- }, {
- 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
- 'info_dict': {
- 'id': '58227',
- 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
- 'duration': 275,
- 'ext': 'webm',
- 'title': '5 Weird Ways Plants Can Eat Animals',
- 'description': 'Why have some plants evolved to eat meat?',
- 'upload_date': '20150120',
- 'timestamp': 1421763300,
- 'uploader': 'DNews',
- 'uploader_id': 'dnews',
- },
- }, {
- 'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
- 'info_dict': {
- 'id': '71618',
- 'ext': 'mp4',
- 'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min',
- 'title': 'The Israel-Palestine Conflict Explained in Ten Minutes',
- 'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start',
- 'uploader': 'Editors\' Picks',
- 'uploader_id': 'tt-editors-picks',
- 'timestamp': 1453309200,
- 'upload_date': '20160120',
- },
- 'add_ie': ['Youtube'],
+ 'url': 'http://revision3.com/variant',
+ 'only_matching': True,
}, {
# Tag
- 'url': 'http://testtube.com/tech-news',
- 'info_dict': {
- 'id': '21018',
- 'title': 'tech news',
- },
- 'playlist_mincount': 9,
+ 'url': 'http://revision3.com/vr',
+ 'only_matching': True,
}]
_PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
- _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
def _real_extract(self, url):
domain, display_id = re.match(self._VALID_URL, url).groups()
})
return info
- video_data = self._download_json(
- 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
- video_id)['items'][0]
-
- formats = []
- for vcodec, media in video_data['media'].items():
- for quality_id, quality in media.items():
- if quality_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
- quality['url'], video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal=False))
- else:
- formats.append({
- 'url': quality['url'],
- 'format_id': '%s-%s' % (vcodec, quality_id),
- 'tbr': int_or_none(quality.get('bitrate')),
- 'vcodec': vcodec,
- })
- self._sort_formats(formats)
-
info.update({
- 'title': unescapeHTML(video_data['title']),
- 'description': unescapeHTML(video_data.get('summary')),
- 'uploader': video_data.get('show', {}).get('name'),
- 'uploader_id': video_data.get('show', {}).get('slug'),
- 'duration': int_or_none(video_data.get('duration')),
- 'formats': formats,
+ '_type': 'url_transparent',
+ 'url': 'revision3:%s' % video_id,
})
return info
else:
import time
from .common import InfoExtractor
+from ..compat import (
+ compat_struct_unpack,
+)
from ..utils import (
ExtractorError,
float_or_none,
remove_start,
sanitized_Request,
std_headers,
- struct_unpack,
)
encrypted_data = base64.b64decode(png.encode('utf-8'))
text_index = encrypted_data.find(b'tEXt')
text_chunk = encrypted_data[text_index - 4:]
- length = struct_unpack('!I', text_chunk[:4])[0]
+ length = compat_struct_unpack('!I', text_chunk[:4])[0]
# Use bytearray to get integers when iterating in both python 2.x and 3.x
data = bytearray(text_chunk[8:8 + length])
data = [chr(b) for b in data if b != 0]
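
The decoding above relies on PNG's chunk layout: a 4-byte big-endian length, the 4-byte type tag (`tEXt`), then the payload. A self-contained sketch of the slicing arithmetic:

```python
import struct

# Build a fake tEXt chunk: length (big-endian uint32) + type + payload + CRC
payload = b'secret'
chunk = struct.pack('!I', len(payload)) + b'tEXt' + payload + b'\x00\x00\x00\x00'

length = struct.unpack('!I', chunk[:4])[0]  # what compat_struct_unpack does here
data = chunk[8:8 + length]                  # skip 4 length bytes + 4 type bytes
```

The `text_index - 4` offset in the hunk above exists because the length field sits immediately *before* the `tEXt` tag that `find` locates.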
class RTVEALaCartaIE(InfoExtractor):
IE_NAME = 'rtve.es:alacarta'
IE_DESC = 'RTVE a la carta'
- _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
}, {
'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
'only_matching': True,
+ }, {
+ 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
+ 'only_matching': True,
}]
def _real_initialize(self):
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SeekerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html'
+ _TESTS = [{
+ # player.loadRevision3Item
+ 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html',
+ 'md5': '30c1dc4030cc715cf05b423d0947ac18',
+ 'info_dict': {
+ 'id': '76243',
+ 'ext': 'webm',
+ 'title': 'Should Trump Be Required To Release His Tax Returns?',
+ 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?',
+ 'uploader': 'Seeker Daily',
+ 'uploader_id': 'seekerdaily',
+ }
+ }, {
+ 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html',
+ 'playlist': [
+ {
+ 'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+ 'info_dict': {
+ 'id': '67558',
+ 'ext': 'mp4',
+ 'title': 'The Pros & Cons Of Zoos',
+ 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
+ 'uploader': 'DNews',
+ 'uploader_id': 'dnews',
+ },
+ }
+ ],
+ 'info_dict': {
+ 'id': '1834116536',
+ 'title': 'After Gorilla Killing, Changes Ahead for Zoos',
+ 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id, article_id = re.match(self._VALID_URL, url).groups()
+ webpage = self._download_webpage(url, display_id)
+ mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage)
+ if mobj:
+ playlist_type, playlist_id = mobj.groups()
+ return self.url_result(
+ 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id)
+ else:
+ entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall(
+ r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)]
+ return self.playlist_result(
+ entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage))
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .jwplatform import JWPlatformBaseIE
+from ..compat import compat_parse_qs
+from ..utils import (
+ ExtractorError,
+ parse_duration,
+)
+
+
+class SendtoNewsIE(JWPlatformBaseIE):
+ _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)'
+
+ _TEST = {
+ # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
+ 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes',
+ 'info_dict': {
+ 'id': 'GxfCe0Zo7D-175909-5588',
+ 'ext': 'mp4',
+ 'title': 'Recap: CLE 15, CIN 6',
+ 'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
+ 'duration': 49,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s'
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
+ (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
+ .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
+ \1>''', webpage)
+ if mobj:
+ sk, mk, pk = mobj.group('SC').split('-')
+ return cls._URL_TEMPLATE % (sk, mk, pk)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ params = compat_parse_qs(mobj.group('query'))
+
+ if 'SK' not in params or 'MK' not in params or 'PK' not in params:
+ raise ExtractorError('Invalid URL', expected=True)
+
+ video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]])
+
+ webpage = self._download_webpage(url, video_id)
+
+ jwplayer_data_str = self._search_regex(
+ r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data')
+ js_vars = {
+ 'w': 1024,
+ 'h': 768,
+ 'modeVar': 'html5',
+ }
+ for name, val in js_vars.items():
+ js_val = '%d' % val if isinstance(val, int) else '"%s"' % val
+ jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val)
+
+ info_dict = self._parse_jwplayer_data(
+ self._parse_json(jwplayer_data_str, video_id),
+ video_id, require_title=False, rtmp_params={'no_resume': True})
+
+ title = self._html_search_regex(
+ r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title')
+ description = self._html_search_regex(
+ r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage,
+ 'description', fatal=False)
+ duration = parse_duration(self._html_search_regex(
+ r'<div[^>]+class="embedDetails">([0-9:]+)', webpage,
+ 'duration', fatal=False))
+
+ info_dict.update({
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ })
+
+ return info_dict
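
The `js_vars` loop above textually substitutes bare JavaScript identifiers so the `jwplayer(...).setup(...)` blob becomes valid JSON; a sketch with an invented blob:

```python
import json

jwplayer_data_str = '{"width":w,"height":h,"modeVar":modeVar,"file":"video.m3u8"}'
js_vars = {
    'w': 1024,
    'h': 768,
    'modeVar': 'html5',
}
for name, val in js_vars.items():
    # Quote strings, leave integers bare, exactly as in the hunk above
    js_val = '%d' % val if isinstance(val, int) else '"%s"' % val
    jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val)

data = json.loads(jwplayer_data_str)
```

Anchoring the replacement on the surrounding `:` and `,` keeps it from touching identical substrings inside quoted values.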
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
-from ..utils import sanitized_Request
+from ..utils import (
+ HEADRequest,
+ ExtractorError,
+ int_or_none,
+ update_url_query,
+ qualities,
+ get_element_by_attribute,
+ clean_html,
+)
class SinaIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/
- (
- (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-))))
- |
+ _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
+ (?:
+ (?:view/|.*\#)(?P<video_id>\d+)|
+ .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
# This is used by external sites like Weibo
- (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf)
+ api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
)
'''
_TESTS = [
{
- 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898',
- 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f',
+ 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
+ 'md5': 'd38433e2fc886007729735650ae4b3e9',
'info_dict': {
- 'id': '110028898',
- 'ext': 'flv',
- 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员',
+ 'id': '250576622',
+ 'ext': 'mp4',
+ 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
}
},
{
'ext': 'flv',
'title': '军方提高对朝情报监视级别',
},
+ 'skip': 'the page does not exist or has been deleted',
+ },
+ {
+ 'url': 'http://video.sina.com.cn/view/250587748.html',
+ 'md5': '3d1807a25c775092aab3bc157fff49b4',
+ 'info_dict': {
+ 'id': '250587748',
+ 'ext': 'mp4',
+ 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光',
+ },
},
]
- def _extract_video(self, video_id):
- data = compat_urllib_parse_urlencode({'vid': video_id})
- url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
- video_id, 'Downloading video url')
- image_page = self._download_webpage(
- 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
- video_id, 'Downloading thumbnail info')
-
- return {'id': video_id,
- 'url': url_doc.find('./durl/url').text,
- 'ext': 'flv',
- 'title': url_doc.find('./vname').text,
- 'thumbnail': image_page.split('=')[1],
- }
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- if mobj.group('token') is not None:
- # The video id is in the redirected url
- self.to_screen('Getting video id')
- request = sanitized_Request(url)
- request.get_method = lambda: 'HEAD'
- (_, urlh) = self._download_webpage_handle(request, 'NA', False)
- return self._real_extract(urlh.geturl())
- elif video_id is None:
- pseudo_id = mobj.group('pseudo_id')
- webpage = self._download_webpage(url, pseudo_id)
- video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id')
- return self._extract_video(video_id)
+ video_id = mobj.group('video_id')
+ if not video_id:
+ if mobj.group('token') is not None:
+ # The video id is in the redirected url
+ self.to_screen('Getting video id')
+ request = HEADRequest(url)
+ (_, urlh) = self._download_webpage_handle(request, 'NA', False)
+ return self._real_extract(urlh.geturl())
+ else:
+ pseudo_id = mobj.group('pseudo_id')
+ webpage = self._download_webpage(url, pseudo_id)
+ error = get_element_by_attribute('class', 'errtitle', webpage)
+ if error:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, clean_html(error)), expected=True)
+ video_id = self._search_regex(
+ r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
+
+ video_data = self._download_json(
+ 'http://s.video.sina.com.cn/video/h5play',
+ video_id, query={'video_id': video_id})
+    if video_data['code'] != 1:
+        raise ExtractorError('%s said: %s' % (
+            self.IE_NAME, video_data['message']), expected=True)
+    video_data = video_data['data']
+ title = video_data['title']
+ description = video_data.get('description')
+ if description:
+ description = description.strip()
+
+ preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
+ formats = []
+ for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
+ file_api = quality.get('file_api')
+ file_id = quality.get('file_id')
+ if not file_api or not file_id:
+ continue
+ formats.append({
+ 'format_id': quality_id,
+ 'url': update_url_query(file_api, {'vid': file_id}),
+ 'preference': preference(quality_id),
+ 'ext': 'mp4',
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': video_data.get('image'),
+ 'duration': int_or_none(video_data.get('length')),
+ 'timestamp': int_or_none(video_data.get('create_time')),
+ 'formats': formats,
+ }
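
`qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])` returns a preference function ranking known quality ids by their position in the list; a sketch of such a helper (youtube-dl's real `qualities` behaves along these lines):

```python
def qualities(quality_ids):
    # Return a function mapping a quality id to its rank; unknown ids sort lowest
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q


preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
```

Feeding the result into each format's `preference` field lets `_sort_formats` order formats by named quality even when bitrate and resolution are absent.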
formats = []
for height, video_url in zip(heights, video_urls):
path = compat_urllib_parse_urlparse(video_url).path
- _, quality = path.split('/')[4].split('_')[:2]
- f = {
+ m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path)
+ if m:
+ tbr = int(m.group('tbr'))
+ height = int(m.group('height'))
+ else:
+ tbr = None
+ formats.append({
'url': video_url,
+ 'format_id': '%dp' % height,
'height': height,
- }
- tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None)
- if tbr:
- f.update({
- 'tbr': int(tbr),
- 'format_id': '%dp' % height,
- })
- else:
- f['format_id'] = quality
- formats.append(f)
+ 'tbr': tbr,
+ })
self._sort_formats(formats)
age_limit = self._rta_search(webpage)
_TEST = {
'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
+ 'md5': '3d6361864d7cac20b57c8784da17166f',
'info_dict': {
'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
'ext': 'mp4',
'duration': 422.255,
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['Ooyala'],
}
def _real_extract(self, url):
preload_codes = self._html_search_regex(
r'(function.+)setTimeout\(function\(\)\{playlist',
webpage, 'preload codes')
- base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
+ base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes)
base64_fragments.remove('init')
def _check_sequence(cur_fragments):
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import remove_end
+from ..utils import (
+ determine_ext,
+ remove_end,
+)
class TelegraafIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
_TEST = {
'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
- 'md5': '83245a9779bcc4a24454bfd53c65b6dc',
'info_dict': {
'id': '24353229',
'ext': 'mp4',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 33,
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- playlist_id = self._match_id(url)
+ video_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
+ webpage = self._download_webpage(url, video_id)
+ player_url = self._html_search_regex(
+            r'<iframe[^>]+src="([^"]+)"', webpage, 'player URL')
+ player_page = self._download_webpage(
+            player_url, video_id, note='Downloading player webpage')
playlist_url = self._search_regex(
- r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+ r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL')
+ playlist_data = self._download_json(playlist_url, video_id)
+
+ item = playlist_data['items'][0]
+ formats = []
+ locations = item['locations']
+ for location in locations.get('adaptive', []):
+ manifest_url = location['src']
+ ext = determine_ext(manifest_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif ext == 'mpd':
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ continue
+ else:
+ self.report_warning('Unknown adaptive format %s' % ext)
+ for location in locations.get('progressive', []):
+ formats.append({
+ 'url': location['sources'][0]['src'],
+ 'width': location.get('width'),
+ 'height': location.get('height'),
+ 'format_id': 'http-%s' % location['label'],
+ })
+
+ self._sort_formats(formats)
- entries = self._extract_xspf_playlist(playlist_url, playlist_id)
title = remove_end(self._og_search_title(webpage), ' - VIDEO')
description = self._og_search_description(webpage)
+ duration = item.get('duration')
+ thumbnail = item.get('poster')
- return self.playlist_result(entries, playlist_id, title, description)
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ }
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html'
+ _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'
_TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
wat_id = self._html_search_regex(
- r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1',
+ r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8}).*?\1',
webpage, 'wat id', group='id')
return self.url_result('wat:%s' % wat_id, 'Wat')
compat_urllib_parse_urlparse,
)
from ..utils import (
+ determine_ext,
ExtractorError,
float_or_none,
int_or_none,
sanitized_Request,
unsmuggle_url,
+ update_url_query,
xpath_with_ns,
mimetype2ext,
find_xpath_attr,
if OnceIE.suitable(_format['url']):
formats.extend(self._extract_once_formats(_format['url']))
else:
+ media_url = _format['url']
+ if determine_ext(media_url) == 'm3u8':
+ hdnea2 = self._get_cookies(media_url).get('hdnea2')
+ if hdnea2:
+ _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})
+
formats.append(_format)
subtitles = self._parse_smil_subtitles(meta, default_ns)
'only_matching': True,
}]
+ @classmethod
+ def _extract_urls(cls, webpage):
+ m = re.search(
+ r'''(?x)
+ <meta\s+
+ property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+ content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
+ ''', webpage)
+ if m:
+ return [m.group('url')]
+
+ matches = re.findall(
+ r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+ if matches:
+ return list(zip(*matches))[1]
+
@staticmethod
def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
flags = '10' if include_qs else '00'
s|
song/comments/list|
song
- )/(?P<id>[A-Za-z0-9]+)/?$'''
+ )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$'''
_SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
_SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
_THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',
'only_matching': True,
},
+ {
+ 'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/',
+ 'only_matching': True,
+ },
]
_DECODE_MAP = {
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ js_to_json,
+ mimetype2ext,
+)
+
+
+class ThreeQSDNIE(InfoExtractor):
+ IE_NAME = '3qsdn'
+ IE_DESC = '3Q SDN'
+ _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _TESTS = [{
+ # ondemand from http://www.philharmonie.tv/veranstaltung/26/
+ 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
+ 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd',
+ 'info_dict': {
+ 'id': '0280d6b9-1215-11e6-b427-0cc47a188158',
+ 'ext': 'mp4',
+ 'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
+ 'is_live': False,
+ },
+ 'expected_warnings': ['Failed to download MPD manifest'],
+ }, {
+ # live video stream
+ 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+ 'info_dict': {
+ 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+ 'ext': 'mp4',
+            'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+        },
+        'params': {
+            # live stream
+            'skip_download': True,
+        },
+ }, {
+ # live audio stream
+ 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # live audio stream with some 404 URLs
+ 'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # geo restricted with 'This content is not available in your country'
+ 'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # geo restricted with 'playout.3qsdn.com/forbidden'
+ 'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # live video with rtmp link
+ 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ js = self._download_webpage(
+ 'http://playout.3qsdn.com/%s' % video_id, video_id,
+ query={'js': 'true'})
+
+ if any(p in js for p in (
+ '>This content is not available in your country',
+ 'playout.3qsdn.com/forbidden')):
+ self.raise_geo_restricted()
+
+ stream_content = self._search_regex(
+ r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js,
+ 'stream content', default='demand', group='content')
+
+ live = stream_content == 'live'
+
+ stream_type = self._search_regex(
+ r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js,
+ 'stream type', default='video', group='type')
+
+ formats = []
+ urls = set()
+
+ def extract_formats(item_url, item={}):
+ if not item_url or item_url in urls:
+ return
+ urls.add(item_url)
+ type_ = item.get('type')
+ ext = determine_ext(item_url, default_ext=None)
+ if type_ == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ item_url, video_id, mpd_id='mpd', fatal=False))
+ elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ item_url, video_id, 'mp4',
+ entry_protocol='m3u8' if live else 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ item_url, video_id, f4m_id='hds', fatal=False))
+ else:
+ if not self._is_valid_url(item_url, video_id):
+ return
+ formats.append({
+ 'url': item_url,
+ 'format_id': item.get('quality'),
+ 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext,
+ 'vcodec': 'none' if stream_type == 'audio' else None,
+ })
+
+ for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js):
+ f = self._parse_json(
+ item_js, video_id, transform_source=js_to_json, fatal=False)
+ if not f:
+ continue
+ extract_formats(f.get('src'), f)
+
+        # A more relaxed version that collects additional URLs and acts
+        # as a future-proof fallback
+ for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
+ extract_formats(src)
+
+ self._sort_formats(formats)
+
+ title = self._live_title(video_id) if live else video_id
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'is_live': live,
+ 'formats': formats,
+ }
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
-class TvpIE(InfoExtractor):
- IE_NAME = 'tvp.pl'
- _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
+class TVPIE(InfoExtractor):
+ IE_NAME = 'tvp'
+ IE_DESC = 'Telewizja Polska'
+ _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
- 'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
- 'info_dict': {
- 'id': '4278035',
- 'ext': 'wmv',
- 'title': 'Ogniem i mieczem, odc. 2',
- },
- }, {
- 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+ 'url': 'http://vod.tvp.pl/194536/i-seria-odc-13',
'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
'info_dict': {
'id': '194536',
},
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
- 'md5': 'c3b15ed1af288131115ff17a17c19dda',
- 'info_dict': {
- 'id': '17834272',
- 'ext': 'mp4',
- 'title': 'Na sygnale, odc. 39',
- },
+ 'only_matching': True,
+ }, {
+ 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
+ 'only_matching': True,
}]
def _real_extract(self, url):
}
-class TvpSeriesIE(InfoExtractor):
- IE_NAME = 'tvp.pl:Series'
+class TVPSeriesIE(InfoExtractor):
+ IE_NAME = 'tvp:series'
_VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
_TESTS = [{
videos_paths = re.findall(
'(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
entries = [
- self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key())
+ self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
for v_path in videos_paths]
return {
title = self._og_search_title(webpage)
description = self._html_search_regex(
- r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False)
+ r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>',
+ webpage, 'description', fatal=False, group='description')
thumbnail = self._og_search_thumbnail(webpage)
duration = int_or_none(self._og_search_property(
'duration', webpage, 'duration', fatal=False))
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
+ 'skip': 'HTTP Error 404: Not Found',
}
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
+ 'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
'only_matching': True,
'nauth': access_token['token'],
'nauthsig': access_token['sig'],
})),
- item_id, 'mp4')
+ item_id, 'mp4', entry_protocol='m3u8_native')
self._prefer_source(formats)
info['formats'] = formats
}
-class TwitchBookmarksIE(TwitchPlaylistBaseIE):
- IE_NAME = 'twitch:bookmarks'
- _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
- _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
- _PLAYLIST_TYPE = 'bookmarks'
-
- _TEST = {
- 'url': 'http://www.twitch.tv/ognos/profile/bookmarks',
- 'info_dict': {
- 'id': 'ognos',
- 'title': 'Ognos',
- },
- 'playlist_mincount': 3,
- }
-
- def _extract_playlist_page(self, response):
- entries = []
- for bookmark in response.get('bookmarks', []):
- video = bookmark.get('video')
- if not video:
- continue
- entries.append(video['url'])
- return entries
-
-
class TwitchStreamIE(TwitchBaseIE):
IE_NAME = 'twitch:stream'
_VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
float_or_none,
xpath_text,
remove_end,
'id': 'dq4Oj5quskI',
'ext': 'mp4',
'title': 'Ubuntu 11.10 Overview',
- 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+ 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10...',
'upload_date': '20111013',
'uploader': 'OMG! Ubuntu!',
'uploader_id': 'omgubuntu',
video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
if video_url:
- f = {
- 'url': video_url,
- }
+ if determine_ext(video_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls'))
+ else:
+ f = {
+ 'url': video_url,
+ }
- _search_dimensions_in_video_url(f, video_url)
+ _search_dimensions_in_video_url(f, video_url)
- formats.append(f)
+ formats.append(f)
vmap_url = config.get('vmapUrl') or config.get('vmap_url')
if vmap_url:
'uploader_id': 'giphz',
},
'expected_warnings': ['height', 'width'],
+ 'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/starwars/status/665052190608723968',
'md5': '39b7199856dee6cd4432e72c74bc69d4',
'info_dict': {
'id': '700207533655363584',
'ext': 'mp4',
- 'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel',
- 'description': 'jay on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+ 'title': 'Donte The Dumbass - BEAT PROD: @suhmeduh #Damndaniel',
+ 'description': 'Donte The Dumbass on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'jay',
+ 'uploader': 'Donte The Dumbass',
'uploader_id': 'jaydingeer',
},
'params': {
'add_ie': ['Vine'],
}, {
'url': 'https://twitter.com/captainamerica/status/719944021058060289',
- # md5 constantly changes
'info_dict': {
'id': '719944021058060289',
'ext': 'mp4',
'uploader_id': 'captainamerica',
'uploader': 'Captain America',
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}]
def _real_extract(self, url):
user_id = mobj.group('user_id')
twid = mobj.group('id')
- webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+ webpage, urlh = self._download_webpage_handle(
+ self._TEMPLATE_URL % (user_id, twid), twid)
+
+ if 'twitter.com/account/suspended' in urlh.geturl():
+ raise ExtractorError('Account suspended by Twitter.', expected=True)
username = remove_end(self._og_search_title(webpage), ' on Twitter')
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
)
if enroll_url:
webpage = self._download_webpage(
combine_url(base_url, enroll_url),
- course_id, 'Enrolling in the course')
+ course_id, 'Enrolling in the course',
+ headers={'Referer': base_url})
if '>You have enrolled in' in webpage:
self.to_screen('%s: Successfully enrolled in the course' % course_id)
def _download_lecture(self, course_id, lecture_id):
return self._download_json(
- 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
- course_id, lecture_id, compat_urllib_parse_urlencode({
- 'fields[lecture]': 'title,description,view_html,asset',
- 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
- })),
- lecture_id, 'Downloading lecture JSON')
+ 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+ % (course_id, lecture_id),
+ lecture_id, 'Downloading lecture JSON', query={
+ 'fields[lecture]': 'title,description,view_html,asset',
+ 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+ })
def _handle_error(self, response):
if not isinstance(response, dict):
self._LOGIN_URL, None, 'Downloading login popup')
def is_logged(webpage):
- return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<'])
+ return any(re.search(p, webpage) for p in (
+ r'href=["\'](?:https://www\.udemy\.com)?/user/logout/',
+ r'>Logout<'))
# already logged in
if is_logged(login_popup):
'password': password,
})
- request = sanitized_Request(
- self._LOGIN_URL, urlencode_postdata(login_form))
- request.add_header('Referer', self._ORIGIN_URL)
- request.add_header('Origin', self._ORIGIN_URL)
-
response = self._download_webpage(
- request, None, 'Logging in as %s' % username)
+ self._LOGIN_URL, None, 'Logging in as %s' % username,
+ data=urlencode_postdata(login_form),
+ headers={
+ 'Referer': self._ORIGIN_URL,
+ 'Origin': self._ORIGIN_URL,
+ })
if not is_logged(response):
error = self._html_search_regex(
from __future__ import unicode_literals
import json
+import re
+
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
+ int_or_none,
js_to_json,
- ExtractorError,
)
from ..compat import compat_urlparse
_VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
_TESTS = [{
'url': 'http://video.udn.com/embed/news/300040',
- 'md5': 'de06b4c90b042c128395a88f0384817e',
'info_dict': {
'id': '300040',
'ext': 'mp4',
'title': '生物老師男變女 全校挺"做自己"',
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
'url': 'https://video.udn.com/embed/news/300040',
'only_matching': True,
page = self._download_webpage(url, video_id)
options = json.loads(js_to_json(self._html_search_regex(
- r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))
+ r'var\s+options\s*=\s*([^;]+);', page, 'video urls dictionary')))
video_urls = options['video']
if video_urls.get('youtube'):
return self.url_result(video_urls.get('youtube'), 'Youtube')
- try:
- del video_urls['youtube']
- except KeyError:
- pass
+ formats = []
+ for video_type, api_url in video_urls.items():
+ if not api_url:
+ continue
- formats = [{
- 'url': self._download_webpage(
+ video_url = self._download_webpage(
compat_urlparse.urljoin(url, api_url), video_id,
- 'retrieve url for %s video' % video_type),
- 'format_id': video_type,
- 'preference': 0 if video_type == 'mp4' else -1,
- } for video_type, api_url in video_urls.items() if api_url]
+                note='Retrieving URL for %s video' % video_type)
- if not formats:
- raise ExtractorError('No videos found', expected=True)
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds'))
+ else:
+                mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+)\.mp4', video_url)
+ a_format = {
+ 'url': video_url,
+ # video_type may be 'mp4', which confuses YoutubeDL
+ 'format_id': 'http-' + video_type,
+ }
+ if mobj:
+ a_format.update({
+ 'height': int_or_none(mobj.group('height')),
+ 'tbr': int_or_none(mobj.group('tbr')),
+ })
+ formats.append(a_format)
self._sort_formats(formats)
- thumbnail = None
-
- if options.get('gallery') and len(options['gallery']):
- thumbnail = options['gallery'][0].get('original')
+ thumbnails = [{
+ 'url': img_url,
+ 'id': img_type,
+        } for img_type, img_url in (options.get('gallery') or [{}])[0].items() if img_url]
return {
'id': video_id,
'formats': formats,
'title': options['title'],
- 'thumbnail': thumbnail
+ 'thumbnails': thumbnails,
}
from ..utils import (
int_or_none,
unified_strdate,
+ unescapeHTML,
)
class UstudioIE(InfoExtractor):
+ IE_NAME = 'ustudio'
_VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
_TEST = {
'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ video_id, display_id = re.match(self._VALID_URL, url).groups()
config = self._download_xml(
'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
def extract(kind):
return [{
- 'url': item.attrib['url'],
+ 'url': unescapeHTML(item.attrib['url']),
'width': int_or_none(item.get('width')),
'height': int_or_none(item.get('height')),
} for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
'uploader': uploader,
'formats': formats,
}
+
+
+class UstudioEmbedIE(InfoExtractor):
+ IE_NAME = 'ustudio:embed'
+ _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T',
+ 'md5': '47c0be52a09b23a7f40de9469cec58f4',
+ 'info_dict': {
+ 'id': 'Uw7G1kMCe65T',
+ 'ext': 'mp4',
+ 'title': '5 Things IT Should Know About Video',
+ 'description': 'md5:93d32650884b500115e158c5677d25ad',
+ 'uploader_id': 'DeN7VdYRDKhP',
+ }
+ }
+
+ def _real_extract(self, url):
+ uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+ video_data = self._download_json(
+ 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id),
+ video_id)['videos'][0]
+ title = video_data['name']
+
+ formats = []
+ for ext, qualities in video_data.get('transcodes', {}).items():
+ for quality in qualities:
+ quality_url = quality.get('url')
+ if not quality_url:
+ continue
+ height = int_or_none(quality.get('height'))
+ formats.append({
+ 'format_id': '%s-%dp' % (ext, height) if height else ext,
+ 'url': quality_url,
+ 'width': int_or_none(quality.get('width')),
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for image in video_data.get('images', []):
+ image_url = image.get('url')
+ if not image_url:
+ continue
+ thumbnails.append({
+ 'url': image_url,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'uploader_id': uploader_id,
+ 'tags': video_data.get('keywords'),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
'uploader': 'afp-news',
'duration': 123,
},
+ 'skip': 'This video has been deleted.',
},
{
'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
from __future__ import unicode_literals
import json
+import re
from .common import InfoExtractor
from ..utils import (
class VesselIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
_API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
_LOGIN_URL = 'https://www.vessel.com/api/account/login'
_NETRC_MACHINE = 'vessel'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.vessel.com/videos/HDN7G5UMs',
'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
'info_dict': {
'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
'timestamp': int,
},
- }
+ }, {
+ 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1',
+ webpage)]
@staticmethod
def make_json_request(url, data):
formats = []
for f in video_asset.get('sources', []):
- if f['name'] == 'hls-index':
+ location = f.get('location')
+ if not location:
+ continue
+ name = f.get('name')
+ if name == 'hls-index':
formats.extend(self._extract_m3u8_formats(
- f['location'], video_id, ext='mp4', m3u8_id='m3u8'))
+ location, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False))
+ elif name == 'dash-index':
+ formats.extend(self._extract_mpd_formats(
+ location, video_id, mpd_id='dash', fatal=False))
else:
formats.append({
- 'format_id': f['name'],
+ 'format_id': name,
'tbr': f.get('bitrate'),
'height': f.get('height'),
'width': f.get('width'),
- 'url': f['location'],
+ 'url': location,
})
self._sort_formats(formats)
json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
response = self._download_json(
- json_url, video_id, 'Downloading video info', 'Unable to download info')
+ json_url, video_id, 'Downloading video info',
+ 'Unable to download info', fatal=False) or {}
video_info = response.get('video') or {}
artist = None
featured_artist = None
formats = []
if not video_info:
- if response.get('statusCode') != 909:
+ try:
+ self._initialize_api(video_id)
+ except ExtractorError:
ytid = response.get('errorInfo', {}).get('ytid')
if ytid:
self.report_warning(
'Video is geoblocked, trying with the YouTube video %s' % ytid)
return self.url_result(ytid, 'Youtube', ytid)
- if 'statusMessage' in response:
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, response['statusMessage']), expected=True)
- raise ExtractorError('Unable to extract videos')
+ raise
- self._initialize_api(video_id)
video_info = self._call_api(
'video/%s' % video_id, video_id, 'Downloading api video info',
'Failed to download video info')
_TESTS = [{
'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+ 'md5': 'e9d77741f9e42ba583e683cd170660f7',
'info_dict': {
'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
'ext': 'flv',
'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
'duration': 725.983,
},
+ 'add_ie': ['Ooyala'],
}, {
'url': 'http://www.vice.com/video/how-to-hack-a-car',
'md5': '6fb2989a3fed069fb8eab3401fc2d3c9',
'uploader': 'Motherboard',
'upload_date': '20140529',
},
+ 'add_ie': ['Youtube'],
}, {
'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
'only_matching': True,
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VidioIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015',
+ 'md5': 'cd2801394afc164e9775db6a140b91fe',
+ 'info_dict': {
+ 'id': '165683',
+ 'display_id': 'dj_ambred-booyah-live-2015',
+ 'ext': 'mp4',
+ 'title': 'DJ_AMBRED - Booyah (Live 2015)',
+ 'description': 'md5:27dc15f819b6a78a626490881adbadf8',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 149,
+ 'like_count': int,
+ },
+ }, {
+ 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id, display_id = mobj.group('id', 'display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+
+ m3u8_url, duration, thumbnail = [None] * 3
+
+ clips = self._parse_json(
+ self._html_search_regex(
+ r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1',
+ webpage, 'video data', default='[]', group='data'),
+ display_id, fatal=False)
+ if clips:
+ clip = clips[0]
+ m3u8_url = clip.get('sources', [{}])[0].get('file')
+ duration = clip.get('clip_duration')
+ thumbnail = clip.get('image')
+
+ m3u8_url = m3u8_url or self._search_regex(
+            r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1',
+            webpage, 'hls url', group='url')
+ formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+
+ duration = int_or_none(duration or self._search_regex(
+        r'data-video-duration=(["\'])(?P<duration>\d+)\1',
+        webpage, 'duration', group='duration'))
+ thumbnail = thumbnail or self._og_search_thumbnail(webpage)
+
+ like_count = int_or_none(self._search_regex(
+ (r'<span[^>]+data-comment-vote-count=["\'](\d+)',
+ r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'),
+ webpage, 'like count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'like_count': like_count,
+ 'formats': formats,
+ }
}, {
'url': 'http://www.kesari.tv/news/video/1461919076414',
'only_matching': True,
+ }, {
+ # Was once Kaltura embed
+ 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',
+ 'only_matching': True,
}]
def _real_extract(self, url):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+ (?:
+ (?:m\.)?vk\.com/video_|
+                            (?:www\.)?daxab\.com/
+ )
+ ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?:
(?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
- (?:www\.)?biqle\.ru/watch/
+                            (?:www\.)?daxab\.com/embed/
)
- (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+ (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
)
'''
_NETRC_MACHINE = 'vk'
'duration': 101,
'upload_date': '20120730',
'view_count': int,
- }
+ },
+ 'skip': 'This video has been removed from public access.',
},
{
# VIDEO NOW REMOVED
'id': 'V3K4mi0SYkc',
'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
- 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+ 'description': 'md5:d9903938abdc74c738af77f527ca0596',
'duration': 178,
'upload_date': '20130116',
'uploader': "Children's Joy Foundation",
'url': 'https://vk.com/video205387401_164765225',
'only_matching': True,
},
- {
- # vk wrapper
- 'url': 'http://www.biqle.ru/watch/847655_160197695',
- 'only_matching': True,
- },
{
# pladform embed
'url': 'https://vk.com/video-76116461_171554880',
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- if not video_id:
+ if video_id:
+ info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+ # Some videos (removed?) can only be downloaded with list id specified
+ list_id = mobj.group('list_id')
+ if list_id:
+ info_url += '&list=%s' % list_id
+ else:
+ info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
- info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
-
- # Some videos (removed?) can only be downloaded with list id specified
- list_id = mobj.group('list_id')
- if list_id:
- info_url += '&list=%s' % list_id
-
info_page = self._download_webpage(info_url, video_id)
error_message = self._html_search_regex(
- r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None)
if error_message:
raise ExtractorError(error_message, expected=True)
view_count = None
views = self._html_search_regex(
r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
- info_page, 'view count', fatal=False)
+ info_page, 'view count', default=None)
if views:
view_count = str_to_int(self._search_regex(
r'([\d,.]+)', views, 'view count', fatal=False))
formats = []
for k, v in data.items():
- if not k.startswith('url') and k != 'extra_data' or not v:
+ if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v:
continue
height = int_or_none(self._search_regex(
- r'^url(\d+)', k, 'height', default=None))
+ r'^(?:url|cache)(\d+)', k, 'height', default=None))
formats.append({
'format_id': k,
'url': v,
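The widened key regex above now also accepts `cache`-prefixed format keys; a quick sketch with made-up keys shows the height extraction:

```python
import re

def height_from_key(key):
    # Format keys look like 'url240', 'url720', or (after this change) 'cache720'.
    m = re.match(r'^(?:url|cache)(\d+)', key)
    return int(m.group(1)) if m else None

print(height_from_key('url720'))      # 720
print(height_from_key('cache480'))    # 480
print(height_from_key('extra_data'))  # None
```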
# coding: utf-8
-from __future__ import division, unicode_literals
+from __future__ import unicode_literals
import re
-import time
from .common import InfoExtractor
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ remove_start,
)
from ..compat import compat_urllib_parse_urlencode
'info_dict': {
'id': '1326',
'ext': 'mp4',
- 'title': "[V] Girl's Day's Broadcast",
+ 'title': "[V LIVE] Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
},
webpage = self._download_webpage(
'http://www.vlive.tv/video/%s' % video_id, video_id)
- # UTC+x - UTC+9 (KST)
- tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone
- tz_offset = -tz // 60 - 9 * 60
- self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset)
-
- status_params = self._download_json(
- 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id,
- video_id, 'Downloading JSON status',
- headers={'Referer': url.encode('utf-8')})
- status = status_params.get('status')
- air_start = status_params.get('onAirStartAt', '')
- is_live = status_params.get('isLive')
-
video_params = self._search_regex(
- r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)',
+ r'\bvlive\.video\.init\(([^)]+)\)',
webpage, 'video params')
- live_params, long_video_id, key = re.split(
- r'"\s*,\s*"', video_params)[1:4]
+ status, _, _, live_params, long_video_id, key = re.split(
+ r'"\s*,\s*"', video_params)[2:8]
+ status = remove_start(status, 'PRODUCT_')
if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR':
live_params = self._parse_json('"%s"' % live_params, video_id)
elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO':
if long_video_id and key:
return self._replay(video_id, webpage, long_video_id, key)
- elif is_live:
- status = 'LIVE_END'
else:
status = 'COMING_SOON'
raise ExtractorError('Uploading for replay. Please wait...',
expected=True)
elif status == 'COMING_SOON':
- raise ExtractorError('Coming soon! %s' % air_start, expected=True)
+ raise ExtractorError('Coming soon!', expected=True)
elif status == 'CANCELED':
raise ExtractorError('We are sorry, '
'but the live broadcast has been canceled.',
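The new status parsing strips a `PRODUCT_` prefix using youtube-dl's `remove_start` helper before comparing against the known states. A simplified equivalent of that helper:

```python
def remove_start(s, start):
    # Simplified version of youtube-dl's utils.remove_start:
    # drop the prefix if present, otherwise return the string unchanged.
    if s and s.startswith(start):
        return s[len(start):]
    return s

print(remove_start('PRODUCT_LIVE_ON_AIR', 'PRODUCT_'))  # LIVE_ON_AIR
print(remove_start('COMING_SOON', 'PRODUCT_'))          # COMING_SOON
```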
'ext': 'mp4',
'title': 'Google\'s new material design direction',
'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2',
- }
+ },
+ 'add_ie': ['Ooyala'],
}, {
# data-ooyala-id
'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
'ext': 'mp4',
'title': 'The Nexus 6: hands-on with Google\'s phablet',
'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af',
- }
+ },
+ 'add_ie': ['Ooyala'],
}, {
# volume embed
'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
'ext': 'mp4',
'title': 'The new frontier of LGBTQ civil rights, explained',
'description': 'md5:0dc58e94a465cbe91d02950f770eb93f',
- }
+ },
+ 'add_ie': ['Ooyala'],
}, {
# youtube embed
'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance',
'upload_date': '20160324',
'uploader_id': 'voxdotcom',
'uploader': 'Vox',
- }
+ },
+ 'add_ie': ['Youtube'],
}, {
# SBN.VideoLinkset.entryGroup multiple ooyala embeds
'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
volume_webpage = self._download_webpage(
'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid)
video_data = self._parse_json(self._search_regex(
- r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)
+ r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)
for provider_video_type in ('ooyala', 'youtube'):
provider_video_id = video_data.get('%s_id' % provider_video_type)
if provider_video_id:
class WashingtonPostIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+ IE_NAME = 'washingtonpost'
+ _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _TEST = {
+ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+ 'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
+ 'info_dict': {
+ 'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+ 'ext': 'mp4',
+ 'title': 'Egypt finds belongings, debris from plane crash',
+ 'description': 'md5:a17ceee432f215a5371388c1f680bd86',
+ 'upload_date': '20160520',
+ 'uploader': 'Reuters',
+ 'timestamp': 1463778452,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id,
+ video_id, transform_source=strip_jsonp)[0]['contentConfig']
+ title = video_data['title']
+
+ urls = []
+ formats = []
+ for s in video_data.get('streams', []):
+ s_url = s.get('url')
+ if not s_url or s_url in urls:
+ continue
+ urls.append(s_url)
+ video_type = s.get('type')
+ if video_type == 'smil':
+ continue
+ elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url):
+ m3u8_formats = self._extract_m3u8_formats(
+ s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ for m3u8_format in m3u8_formats:
+ width = m3u8_format.get('width')
+ if not width:
+ continue
+ vbr = self._search_regex(
+ r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None)
+ if vbr:
+ m3u8_format.update({
+ 'vbr': int_or_none(vbr),
+ })
+ formats.extend(m3u8_formats)
+ else:
+ width = int_or_none(s.get('width'))
+ vbr = int_or_none(s.get('bitrate'))
+ has_width = width != 0
+ formats.append({
+ 'format_id': (
+ '%s-%d-%d' % (video_type, width, vbr)
+ if width
+ else video_type),
+ 'vbr': vbr if has_width else None,
+ 'width': width,
+ 'height': int_or_none(s.get('height')),
+ 'acodec': s.get('audioCodec'),
+ 'vcodec': s.get('videoCodec') if has_width else 'none',
+ 'filesize': int_or_none(s.get('fileSize')),
+ 'url': s_url,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None,
+ })
+ source_media_url = video_data.get('sourceMediaURL')
+ if source_media_url:
+ formats.append({
+ 'format_id': 'source_media',
+ 'url': source_media_url,
+ })
+ self._sort_formats(
+ formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('blurb'),
+ 'uploader': video_data.get('credits', {}).get('source'),
+ 'formats': formats,
+ 'duration': int_or_none(video_data.get('videoDuration'), 100),
+ 'timestamp': int_or_none(
+ video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000),
+ }
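The `vbr` lookup in the HLS branch above assumes the rendition URL embeds `width_height_bitrate`; a standalone sketch with a made-up URL:

```python
import re

def vbr_from_url(url, width, height):
    # Assumed rendition URL pattern: ..._<width>_<height>_<vbr>...
    m = re.search(r'%d_%d_(\d+)' % (width, height), url)
    return int(m.group(1)) if m else None

url = 'http://example.com/video/clip_640_360_800.m3u8'  # hypothetical URL
print(vbr_from_url(url, 640, 360))  # 800
```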
+
+
+class WashingtonPostArticleIE(InfoExtractor):
+ IE_NAME = 'washingtonpost:article'
+ _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
'info_dict': {
}]
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url)
+
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
<div\s+class="posttv-video-embed[^>]*?data-uuid=|
data-video-uuid=
)"([^"]+)"''', webpage)
- entries = []
- for i, uuid in enumerate(uuids, start=1):
- vinfo_all = self._download_json(
- 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
- page_id,
- transform_source=strip_jsonp,
- note='Downloading information of video %d/%d' % (i, len(uuids))
- )
- vinfo = vinfo_all[0]['contentConfig']
- uploader = vinfo.get('credits', {}).get('source')
- timestamp = int_or_none(
- vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
-
- formats = [{
- 'format_id': (
- '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
- if s.get('width')
- else s.get('type')),
- 'vbr': s.get('bitrate') if s.get('width') != 0 else None,
- 'width': s.get('width'),
- 'height': s.get('height'),
- 'acodec': s.get('audioCodec'),
- 'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
- 'filesize': s.get('fileSize'),
- 'url': s.get('url'),
- 'ext': 'mp4',
- 'preference': -100 if s.get('type') == 'smil' else None,
- 'protocol': {
- 'MP4': 'http',
- 'F4F': 'f4m',
- }.get(s.get('type')),
- } for s in vinfo.get('streams', [])]
- source_media_url = vinfo.get('sourceMediaURL')
- if source_media_url:
- formats.append({
- 'format_id': 'source_media',
- 'url': source_media_url,
- })
- self._sort_formats(formats)
- entries.append({
- 'id': uuid,
- 'title': vinfo['title'],
- 'description': vinfo.get('blurb'),
- 'uploader': uploader,
- 'formats': formats,
- 'duration': int_or_none(vinfo.get('videoDuration'), 100),
- 'timestamp': timestamp,
- })
+ entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
return {
'_type': 'playlist',
from __future__ import unicode_literals
import re
-import hashlib
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
unified_strdate,
+ HEADRequest,
+ float_or_none,
)
class WatIE(InfoExtractor):
- _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)'
+ _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
IE_NAME = 'wat.tv'
_TESTS = [
{
'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
- 'md5': 'ce70e9223945ed26a8056d413ca55dc9',
+ 'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
'info_dict': {
'id': '11713067',
- 'display_id': 'soupe-figues-l-orange-aux-epices',
'ext': 'mp4',
'title': 'Soupe de figues à l\'orange et aux épices',
'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
'md5': 'fbc84e4378165278e743956d9c1bf16b',
'info_dict': {
'id': '11713075',
- 'display_id': 'gregory-lemarchal-voix-ange',
'ext': 'mp4',
'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3',
},
]
- def download_video_info(self, real_id):
- # 'contentv4' is used in the website, but it also returns the related
- # videos, we don't need them
- info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
- return info['media']
-
def _real_extract(self, url):
- def real_id_for_chapter(chapter):
- return chapter['tc_start'].split('-')[0]
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- real_id = mobj.group('real_id')
- if not real_id:
- short_id = mobj.group('short_id')
- webpage = self._download_webpage(url, display_id or short_id)
- real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+ video_id = self._match_id(url)
+ video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
- video_info = self.download_video_info(real_id)
+ # 'contentv4' is used on the website, but it also returns the related
+ # videos, which we don't need

+ video_info = self._download_json(
+ 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media']
error_desc = video_info.get('error_desc')
if error_desc:
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, error_desc), expected=True)
- geo_list = video_info.get('geoList')
- country = geo_list[0] if geo_list else ''
-
chapters = video_info['chapters']
first_chapter = chapters[0]
- files = video_info['files']
- first_file = files[0]
- if real_id_for_chapter(first_chapter) != real_id:
- self.to_screen('Multipart video detected')
- chapter_urls = []
- for chapter in chapters:
- chapter_id = real_id_for_chapter(chapter)
- # Yes, when we this chapter is processed by WatIE,
- # it will download the info again
- chapter_info = self.download_video_info(chapter_id)
- chapter_urls.append(chapter_info['url'])
- entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
- return self.playlist_result(entries, real_id, video_info['title'])
+ def video_id_for_chapter(chapter):
+ return chapter['tc_start'].split('-')[0]
- upload_date = None
- if 'date_diffusion' in first_chapter:
- upload_date = unified_strdate(first_chapter['date_diffusion'])
+ if video_id_for_chapter(first_chapter) != video_id:
+ self.to_screen('Multipart video detected')
+ entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
+ return self.playlist_result(entries, video_id, video_info['title'])
# Otherwise we can continue and extract just one part, we have to use
- # the short id for getting the video url
-
- formats = [{
- 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
- 'format_id': 'Mobile',
- }]
-
- fmts = [('SD', 'web')]
- if first_file.get('hasHD'):
- fmts.append(('HD', 'webhd'))
-
- def compute_token(param):
- timestamp = '%08x' % int(self._download_webpage(
- 'http://www.wat.tv/servertime', real_id,
- 'Downloading server time').split('|')[0])
- magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
- return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
-
- for fmt in fmts:
- webid = '/%s/%s' % (fmt[1], real_id)
- video_url = self._download_webpage(
- 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country),
- real_id,
- 'Downloading %s video URL' % fmt[0],
- 'Failed to download %s video URL' % fmt[0],
- False)
- if not video_url:
+ # the video id for getting the video url
+
+ date_diffusion = first_chapter.get('date_diffusion')
+ upload_date = unified_strdate(date_diffusion) if date_diffusion else None
+
+ def extract_url(path_template, url_type):
+ req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
+ head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type)
+ red_url = head.geturl()
+ if req_url == red_url:
+ raise ExtractorError(
+ '%s said: Sorry, this video is not available from your country.' % self.IE_NAME,
+ expected=True)
+ return red_url
+
+ m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
+ http_url = extract_url('android5/%s.mp4', 'http')
+
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+ formats.extend(m3u8_formats)
+ formats.extend(self._extract_f4m_formats(
+ m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
+ video_id, f4m_id='hds', fatal=False))
+ for m3u8_format in m3u8_formats:
+ mobj = re.search(
+ r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url'])
+ if not mobj:
continue
- formats.append({
- 'url': video_url,
- 'ext': 'mp4',
- 'format_id': fmt[0],
+ abr, vbr = mobj.groups()
+ abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+ m3u8_format.update({
+ 'vbr': vbr,
+ 'abr': abr,
+ })
+ if not vbr or not abr:
+ continue
+ f = m3u8_format.copy()
+ f.update({
+ 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url),
+ 'format_id': f['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
})
+ formats.append(f)
+ self._sort_formats(formats)
return {
- 'id': real_id,
- 'display_id': display_id,
+ 'id': video_id,
'title': first_chapter['title'],
'thumbnail': first_chapter['preview'],
'description': first_chapter['description'],
'view_count': video_info['views'],
'upload_date': upload_date,
- 'duration': first_file['duration'],
+ 'duration': video_info['files'][0]['duration'],
'formats': formats,
}
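The rewritten `_VALID_URL` captures the short base-36 id from the URL slug and converts it to the numeric id expected by the `contentv3` API. Using the first test URL above, the slug `6z1uz` maps to the test's `id` of `11713067`:

```python
def wat_numeric_id(short_id):
    # Long all-digit ids are already numeric; short slugs are base 36.
    if short_id.isdigit() and len(short_id) > 6:
        return short_id
    return str(int(short_id, 36))

print(wat_numeric_id('6z1uz'))     # 11713067
print(wat_numeric_id('11713067'))  # 11713067
```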
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import itertools
import re
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urlparse,
-)
from ..utils import (
+ determine_ext,
+ ExtractorError,
+ js_to_json,
+ strip_jsonp,
unified_strdate,
- qualities,
+ update_url_query,
+ urlhandle_detect_ext,
)
class WDRIE(InfoExtractor):
- _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
- _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+ _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
+ _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html'
+ _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
_TESTS = [
{
- 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html',
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
+ # HDS download, MD5 is unstable
'info_dict': {
- 'id': 'mdb-362427',
+ 'id': 'mdb-1058683',
'ext': 'flv',
- 'title': 'Servicezeit',
- 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb',
- 'upload_date': '20140310',
- 'is_live': False
+ 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
+ 'title': 'Geheimnis Aachener Dom',
+ 'alt_title': 'Doku am Freitag',
+ 'upload_date': '20160304',
+ 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
+ 'is_live': False,
+ 'subtitles': {'de': [{
+ 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml'
+ }]},
},
- 'params': {
- 'skip_download': True,
+ },
+ {
+ 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
+ 'md5': 'f4c1f96d01cf285240f53ea4309663d8',
+ 'info_dict': {
+ 'id': 'mdb-1072000',
+ 'ext': 'mp3',
+ 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
+ 'title': 'Schriftstellerin Juli Zeh',
+ 'alt_title': 'WDR 3 Gespräch am Samstag',
+ 'upload_date': '20160312',
+ 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
+ 'is_live': False,
+ 'subtitles': {}
},
- 'skip': 'Page Not Found',
},
{
- 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html',
+ 'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': {
- 'id': 'mdb-363194',
- 'ext': 'flv',
- 'title': 'Marga Spiegel ist tot',
- 'description': 'md5:2309992a6716c347891c045be50992e4',
- 'upload_date': '20140311',
- 'is_live': False
+ 'id': 'mdb-103364',
+ 'ext': 'mp4',
+ 'display_id': 'index',
+ 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'alt_title': 'WDR Fernsehen Live',
+ 'upload_date': None,
+ 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+ 'is_live': True,
+ 'subtitles': {}
},
'params': {
- 'skip_download': True,
+ 'skip_download': True, # m3u8 download
},
- 'skip': 'Page Not Found',
},
{
- 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html',
- 'md5': '83e9e8fefad36f357278759870805898',
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
+ 'playlist_mincount': 8,
'info_dict': {
- 'id': 'mdb-194332',
- 'ext': 'mp3',
- 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
- 'description': 'md5:2309992a6716c347891c045be50992e4',
- 'upload_date': '20091129',
- 'is_live': False
+ 'id': 'aktuelle-stunde/aktuelle-stunde-120',
},
},
{
- 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html',
- 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa',
+ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': {
- 'id': 'mdb-478135',
- 'ext': 'mp3',
- 'title': 'Flavia Coelho: Amar é Amar',
- 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
- 'upload_date': '20140717',
- 'is_live': False
+ 'id': 'mdb-1096487',
+ 'ext': 'flv',
+ 'upload_date': 're:^[0-9]{8}$',
+ 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
+ 'description': '- Die Sendung mit der Maus -',
},
- 'skip': 'Page Not Found',
+ 'skip': 'The id changes from week to week because of the new episode'
},
{
- 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
- 'playlist_mincount': 146,
+ 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
+ # HDS download, MD5 is unstable
'info_dict': {
- 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100',
- }
+ 'id': 'mdb-186083',
+ 'ext': 'flv',
+ 'upload_date': '20130919',
+ 'title': 'Sachgeschichte - Achterbahn ',
+ 'description': '- Die Sendung mit der Maus -',
+ },
},
{
- 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html',
+ 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
+ # Live stream, MD5 unstable
'info_dict': {
- 'id': 'mdb-103364',
- 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+ 'id': 'mdb-869971',
'ext': 'flv',
- 'upload_date': '20150101',
- 'is_live': True
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'Funkhaus Europa Livestream',
+ 'description': 'md5:2309992a6716c347891c045be50992e4',
+ 'upload_date': '20160101',
},
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- page_url = mobj.group('url')
- page_id = mobj.group('id')
-
- webpage = self._download_webpage(url, page_id)
-
- if mobj.group('player') is None:
+ url_type = mobj.group('type')
+ page_url = mobj.group('page_url')
+ display_id = mobj.group('display_id')
+ webpage = self._download_webpage(url, display_id)
+
+ # for wdr.de the data-extension is in a tag with the class "mediaLink"
+ # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
+ # for wdrmaus, it's in a link to the page in a multiline "videoLink" tag
+ json_metadata = self._html_search_regex(
+ r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
+ webpage, 'media link', default=None, flags=re.MULTILINE)
+
+ if not json_metadata:
entries = [
- self.url_result(page_url + href, 'WDR')
+ self.url_result(page_url + href[0], 'WDR')
for href in re.findall(
- r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX,
+ r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX,
webpage)
]
if entries: # Playlist page
- return self.playlist_result(entries, page_id)
-
- # Overview page
- entries = []
- for page_num in itertools.count(2):
- hrefs = re.findall(
- r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
- webpage)
- entries.extend(
- self.url_result(page_url + href, 'WDR')
- for href in hrefs)
- next_url_m = re.search(
- r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
- if not next_url_m:
- break
- next_url = page_url + next_url_m.group(1)
- webpage = self._download_webpage(
- next_url, page_id,
- note='Downloading playlist page %d' % page_num)
- return self.playlist_result(entries, page_id)
+ return self.playlist_result(entries, playlist_id=display_id)
- flashvars = compat_parse_qs(self._html_search_regex(
- r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+ raise ExtractorError('No downloadable streams found', expected=True)
- page_id = flashvars['trackerClipId'][0]
- video_url = flashvars['dslSrc'][0]
- title = flashvars['trackerClipTitle'][0]
- thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
- is_live = flashvars.get('isLive', ['0'])[0] == '1'
+ media_link_obj = self._parse_json(json_metadata, display_id,
+ transform_source=js_to_json)
+ jsonp_url = media_link_obj['mediaObj']['url']
- if is_live:
- title = self._live_title(title)
-
- if 'trackerClipAirTime' in flashvars:
- upload_date = flashvars['trackerClipAirTime'][0]
- else:
- upload_date = self._html_search_meta(
- 'DC.Date', webpage, 'upload date')
+ metadata = self._download_json(
+ jsonp_url, 'metadata', transform_source=strip_jsonp)
- if upload_date:
- upload_date = unified_strdate(upload_date)
+ metadata_tracker_data = metadata['trackerData']
+ metadata_media_resource = metadata['mediaResource']
formats = []
- preference = qualities(['S', 'M', 'L', 'XL'])
- if video_url.endswith('.f4m'):
- formats.extend(self._extract_f4m_formats(
- video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id,
- f4m_id='hds', fatal=False))
- elif video_url.endswith('.smil'):
- formats.extend(self._extract_smil_formats(
- video_url, page_id, False, {
- 'hdcore': '3.3.0',
- 'plugin': 'aasp-3.3.0.99.43',
- }))
- else:
- formats.append({
- 'url': video_url,
- 'http_headers': {
- 'User-Agent': 'mobile',
- },
- })
+ # check if the metadata contains a direct URL to a file
+ for kind, media_resource in metadata_media_resource.items():
+ if kind not in ('dflt', 'alt'):
+ continue
+
+ for tag_name, medium_url in media_resource.items():
+ if tag_name not in ('videoURL', 'audioURL'):
+ continue
+
+ ext = determine_ext(medium_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ medium_url, display_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls'))
+ elif ext == 'f4m':
+ manifest_url = update_url_query(
+ medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
+ formats.extend(self._extract_f4m_formats(
+ manifest_url, display_id, f4m_id='hds', fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ medium_url, 'stream', fatal=False))
+ else:
+ a_format = {
+ 'url': medium_url
+ }
+ if ext == 'unknown_video':
+ urlh = self._request_webpage(
+ medium_url, display_id, note='Determining extension')
+ ext = urlhandle_detect_ext(urlh)
+ a_format['ext'] = ext
+ formats.append(a_format)
- m3u8_url = self._search_regex(
- r'rel="adaptiv"[^>]+href="([^"]+)"',
- webpage, 'm3u8 url', default=None)
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, page_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
- direct_urls = re.findall(
- r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage)
- if direct_urls:
- for quality, video_url in direct_urls:
- formats.append({
- 'url': video_url,
- 'preference': preference(quality),
- 'http_headers': {
- 'User-Agent': 'mobile',
- },
- })
+ subtitles = {}
+ caption_url = metadata_media_resource.get('captionURL')
+ if caption_url:
+ subtitles['de'] = [{
+ 'url': caption_url
+ }]
- self._sort_formats(formats)
+ title = metadata_tracker_data.get('trackerClipTitle')
+ is_live = url_type == 'live'
+
+ if is_live:
+ title = self._live_title(title)
+ upload_date = None
+ elif 'trackerClipAirTime' in metadata_tracker_data:
+ upload_date = metadata_tracker_data['trackerClipAirTime']
+ else:
+ upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
- description = self._html_search_meta('Description', webpage, 'description')
+ if upload_date:
+ upload_date = unified_strdate(upload_date)
return {
- 'id': page_id,
- 'formats': formats,
+ 'id': metadata_tracker_data.get('trackerClipId', display_id),
+ 'display_id': display_id,
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
+ 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
+ 'formats': formats,
'upload_date': upload_date,
- 'is_live': is_live
+ 'description': self._html_search_meta('Description', webpage),
+ 'is_live': is_live,
+ 'subtitles': subtitles,
}
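The format loop above dispatches on the URL's extension via `determine_ext`, falling back to a HEAD request when the extension is unknown. A simplified stand-in for that helper (youtube-dl's real one handles more edge cases):

```python
import re

def determine_ext(url, default_ext='unknown_video'):
    # Simplified: take everything after the last dot, ignoring the query string.
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext

print(determine_ext('http://example.com/master.m3u8?start=0'))  # m3u8
print(determine_ext('http://example.com/manifest.f4m'))         # f4m
print(determine_ext('http://example.com/stream'))               # unknown_video
```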
'User-Agent': 'mobile',
},
}
-
-
-class WDRMausIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
- IE_DESC = 'Sendung mit der Maus'
- _TESTS = [{
- 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
- 'info_dict': {
- 'id': 'aktuelle-sendung',
- 'ext': 'mp4',
- 'thumbnail': 're:^http://.+\.jpg',
- 'upload_date': 're:^[0-9]{8}$',
- 'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
- }
- }, {
- 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
- 'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
- 'info_dict': {
- 'id': '40_jahre_maus',
- 'ext': 'mp4',
- 'thumbnail': 're:^http://.+\.jpg',
- 'upload_date': '20131007',
- 'title': '12.03.2011 - 40 Jahre Maus',
- }
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
- param_code = self._html_search_regex(
- r'<a href="\?startVideo=1&([^"]+)"', webpage, 'parameters')
-
- title_date = self._search_regex(
- r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
- webpage, 'air date')
- title_str = self._html_search_regex(
- r'<h1>(.*?)</h1>', webpage, 'title')
- title = '%s - %s' % (title_date, title_str)
- upload_date = unified_strdate(
- self._html_search_meta('dc.date', webpage))
-
- fields = compat_parse_qs(param_code)
- video_url = fields['firstVideo'][0]
- thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])
-
- formats = [{
- 'format_id': 'rtmp',
- 'url': video_url,
- }]
-
- jscode = self._download_webpage(
- 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
- video_id, fatal=False,
- note='Downloading URL translation table',
- errnote='Could not download URL translation table')
- if jscode:
- for m in re.finditer(
- r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
- jscode):
- if video_url.startswith(m.group('stream')):
- http_url = video_url.replace(
- m.group('stream'), m.group('dl'))
- formats.append({
- 'format_id': 'http',
- 'url': http_url,
- })
- break
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- }
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- sanitized_Request,
int_or_none,
+ float_or_none,
)
class WistiaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
- _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json'
+ _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)'
+ _API_URL = 'http://fast.wistia.com/embed/medias/%s.json'
+ _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s'
- _TEST = {
+ _TESTS = [{
'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
'md5': 'cafeb56ec0c53c18c97405eecb3133df',
'info_dict': {
'timestamp': 1386185018,
'duration': 117,
},
- }
+ }, {
+ 'url': 'wistia:sh7fpupwlt',
+ 'only_matching': True,
+ }, {
+ # with hls video
+ 'url': 'wistia:807fafadvk',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- request = sanitized_Request(self._API_URL.format(video_id))
- request.add_header('Referer', url) # Some videos require this.
- data_json = self._download_json(request, video_id)
+ data_json = self._download_json(
+ self._API_URL % video_id, video_id,
+ # Some videos require this.
+ headers={
+ 'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id,
+ })
+
if data_json.get('error'):
- raise ExtractorError('Error while getting the playlist',
- expected=True)
+ raise ExtractorError(
+ 'Error while getting the playlist', expected=True)
+
data = data_json['media']
title = data['name']
formats = []
thumbnails = []
for a in data['assets']:
+ aurl = a.get('url')
+ if not aurl:
+ continue
astatus = a.get('status')
atype = a.get('type')
- if (astatus is not None and astatus != 2) or atype == 'preview':
+ if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
continue
elif atype in ('still', 'still_image'):
thumbnails.append({
- 'url': a['url'],
- 'resolution': '%dx%d' % (a['width'], a['height']),
+ 'url': aurl,
+ 'width': int_or_none(a.get('width')),
+ 'height': int_or_none(a.get('height')),
})
else:
+ aext = a.get('ext')
+ is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8'
formats.append({
'format_id': atype,
- 'url': a['url'],
+ 'url': aurl,
'tbr': int_or_none(a.get('bitrate')),
'vbr': int_or_none(a.get('opt_vbitrate')),
'width': int_or_none(a.get('width')),
'filesize': int_or_none(a.get('size')),
'vcodec': a.get('codec'),
'container': a.get('container'),
- 'ext': a.get('ext'),
+ 'ext': 'mp4' if is_m3u8 else aext,
+ 'protocol': 'm3u8' if is_m3u8 else None,
'preference': 1 if atype == 'original' else None,
})
'description': data.get('seoDescription'),
'formats': formats,
'thumbnails': thumbnails,
- 'duration': int_or_none(data.get('duration')),
+ 'duration': float_or_none(data.get('duration')),
'timestamp': int_or_none(data.get('createdAt')),
}
class XHamsterIE(InfoExtractor):
- _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
- _TESTS = [
- {
- 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
- 'info_dict': {
- 'id': '1509445',
- 'ext': 'mp4',
- 'title': 'FemaleAgent Shy beauty takes the bait',
- 'upload_date': '20121014',
- 'uploader': 'Ruseful2011',
- 'duration': 893.52,
- 'age_limit': 18,
- }
+ _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?'
+ _TESTS = [{
+ 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
+ 'md5': '8281348b8d3c53d39fffb377d24eac4e',
+ 'info_dict': {
+ 'id': '1509445',
+ 'ext': 'mp4',
+ 'title': 'FemaleAgent Shy beauty takes the bait',
+ 'upload_date': '20121014',
+ 'uploader': 'Ruseful2011',
+ 'duration': 893.52,
+ 'age_limit': 18,
},
- {
- 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
- 'info_dict': {
- 'id': '2221348',
- 'ext': 'mp4',
- 'title': 'Britney Spears Sexy Booty',
- 'upload_date': '20130914',
- 'uploader': 'jojo747400',
- 'duration': 200.48,
- 'age_limit': 18,
- }
+ }, {
+ 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
+ 'info_dict': {
+ 'id': '2221348',
+ 'ext': 'mp4',
+ 'title': 'Britney Spears Sexy Booty',
+ 'upload_date': '20130914',
+ 'uploader': 'jojo747400',
+ 'duration': 200.48,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # empty seo
+ 'url': 'http://xhamster.com/movies/5667973/.html',
+ 'info_dict': {
+ 'id': '5667973',
+ 'ext': 'mp4',
+ 'title': '....',
+ 'upload_date': '20160208',
+ 'uploader': 'parejafree',
+ 'duration': 72.0,
+ 'age_limit': 18,
},
- {
- 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
- 'only_matching': True,
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }, {
+ 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
def extract_video_url(webpage, name):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
+ r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id,
webpage, 'xhamster url', default=None)
if not video_url:
clean_html,
ExtractorError,
determine_ext,
- sanitized_Request,
)
}
}
- _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
-
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
if mobj:
raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
- video_url = compat_urllib_parse_unquote(
- self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False)
- formats = [{
- 'url': video_url,
- }]
+ formats = []
- android_req = sanitized_Request(url)
- android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
- android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+ video_url = compat_urllib_parse_unquote(self._search_regex(
+ r'flv_url=(.+?)&', webpage, 'video URL', default=''))
+ if video_url:
+ formats.append({'url': video_url})
- if android_webpage is not None:
- player_params_str = self._search_regex(
- 'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
- android_webpage, 'player parameters', default='')
- player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
- if player_params:
- formats.extend([{
- 'url': param,
- 'preference': -10,
- } for param in player_params if determine_ext(param) == 'mp4'])
+ player_args = self._search_regex(
+ r'(?s)new\s+HTML5Player\((.+?)\)', webpage, 'html5 player', default=None)
+ if player_args:
+ for arg in player_args.split(','):
+ format_url = self._search_regex(
+ r'(["\'])(?P<url>https?://.+?)\1', arg, 'url',
+ default=None, group='url')
+ if not format_url:
+ continue
+ ext = determine_ext(format_url)
+ if ext == 'mp4':
+ formats.append({'url': format_url})
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
self._sort_formats(formats)
'id': video_id,
'formats': formats,
'title': video_title,
- 'ext': 'flv',
'thumbnail': video_thumbnail,
'age_limit': 18,
}
ExtractorError,
int_or_none,
float_or_none,
- sanitized_Request,
- urlencode_postdata,
)
error = response.get('error')
if error:
raise ExtractorError(error, expected=True)
+ if response.get('type') == 'captcha' or 'captcha' in response:
+ YandexMusicBaseIE._raise_captcha()
+
+ @staticmethod
+ def _raise_captcha():
+ raise ExtractorError(
+ 'YandexMusic has considered youtube-dl requests automated and '
+ 'asks you to solve a CAPTCHA. You can either wait for some '
+ 'time until unblocked and optionally use --sleep-interval '
+ 'in the future, or alternatively you can go to https://music.yandex.ru/, '
+ 'solve the CAPTCHA, then export cookies and pass the cookie file to '
+ 'youtube-dl with --cookies',
+ expected=True)
def _download_webpage(self, *args, **kwargs):
webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs)
if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage:
- raise ExtractorError(
- 'YandexMusic has considered youtube-dl requests automated and '
- 'asks you to solve a CAPTCHA. You can either wait for some '
- 'time until unblocked and optionally use --sleep-interval '
- 'in future or alternatively you can go to https://music.yandex.ru/ '
- 'solve CAPTCHA, then export cookies and pass cookie file to '
- 'youtube-dl with --cookies',
- expected=True)
+ self._raise_captcha()
return webpage
def _download_json(self, *args, **kwargs):
class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
IE_NAME = 'yandexmusic:playlist'
IE_DESC = 'Яндекс.Музыка - Плейлист'
- _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+ _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
_TESTS = [{
'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
'id': '1036',
'title': 'Музыка 90-х',
},
- 'playlist_count': 310,
+ 'playlist_mincount': 300,
'skip': 'Travis CI servers blocked by YandexMusic',
}]
def _real_extract(self, url):
- playlist_id = self._match_id(url)
-
- webpage = self._download_webpage(url, playlist_id)
-
- mu = self._parse_json(
- self._search_regex(
- r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
- playlist_id)
-
- playlist = mu['pageData']['playlist']
- tracks, track_ids = playlist['tracks'], playlist['trackIds']
-
- # tracks dictionary shipped with webpage is limited to 150 tracks,
+ mobj = re.match(self._VALID_URL, url)
+ tld = mobj.group('tld')
+ user = mobj.group('user')
+ playlist_id = mobj.group('id')
+
+ playlist = self._download_json(
+ 'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+ playlist_id, 'Downloading playlist JSON',
+ fatal=False,
+ headers={
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'X-Retpath-Y': url,
+ },
+ query={
+ 'owner': user,
+ 'kinds': playlist_id,
+ 'light': 'true',
+ 'lang': tld,
+ 'external-domain': 'music.yandex.%s' % tld,
+ 'overembed': 'false',
+ })['playlist']
+
+ tracks, track_ids = playlist['tracks'], list(map(compat_str, playlist['trackIds']))
+
+ # tracks dictionary shipped with the playlist.jsx API is limited to 150 tracks,
# missing tracks should be retrieved manually.
if len(tracks) < len(track_ids):
- present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
- missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
- request = sanitized_Request(
- 'https://music.yandex.ru/handlers/track-entries.jsx',
- urlencode_postdata({
+ present_track_ids = set([
+ compat_str(track['id'])
+ for track in tracks if track.get('id')])
+ missing_track_ids = [
+ track_id for track_id in track_ids
+ if track_id not in present_track_ids]
+ missing_tracks = self._download_json(
+ 'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+ playlist_id, 'Downloading missing tracks JSON',
+ fatal=False,
+ headers={
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ },
+ query={
'entries': ','.join(missing_track_ids),
- 'lang': mu.get('settings', {}).get('lang', 'en'),
- 'external-domain': 'music.yandex.ru',
+ 'lang': tld,
+ 'external-domain': 'music.yandex.%s' % tld,
'overembed': 'false',
- 'sign': mu.get('authData', {}).get('user', {}).get('sign'),
'strict': 'true',
- }))
- request.add_header('Referer', url)
- request.add_header('X-Requested-With', 'XMLHttpRequest')
-
- missing_tracks = self._download_json(
- request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+ })
if missing_tracks:
tracks.extend(missing_tracks)
return self.playlist_result(
self._build_playlist(tracks),
compat_str(playlist_id),
- playlist['title'], playlist.get('description'))
+ playlist.get('title'), playlist.get('description'))
from __future__ import unicode_literals
import base64
+import itertools
import random
+import re
import string
import time
)
from ..utils import (
ExtractorError,
+ get_element_by_attribute,
sanitized_Request,
)
'format_id': self.get_format_name(fm),
'ext': self.parse_ext_l(fm),
'filesize': int(seg['size']),
+ 'width': stream.get('width'),
+ 'height': stream.get('height'),
})
return {
'title': title,
'entries': entries,
}
+
+
+class YoukuShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html'
+ IE_NAME = 'youku:show'
+
+ _TEST = {
+ 'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html',
+ 'info_dict': {
+ 'id': 'zc7c670be07ff11e48b3f',
+ 'title': '花千骨 未删减版',
+ 'description': 'md5:578d4f2145ae3f9128d9d4d863312910',
+ },
+ 'playlist_count': 50,
+ }
+
+ _PAGE_SIZE = 40
+
+ def _find_videos_in_page(self, webpage):
+ videos = re.findall(
+ r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage)
+ return [
+ self.url_result(video_url, YoukuIE.ie_key(), title)
+ for video_url, title in videos]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+
+ entries = self._find_videos_in_page(webpage)
+
+ playlist_title = self._html_search_regex(
+ r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)
+ detail_div = get_element_by_attribute('class', 'detail', webpage) or ''
+ playlist_description = self._html_search_regex(
+ r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
+ detail_div, 'playlist description', fatal=False)
+
+ for idx in itertools.count(1):
+ episodes_page = self._download_webpage(
+ 'http://www.youku.com/show_episode/id_%s.html' % show_id,
+ show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)},
+ note='Downloading episodes page %d' % idx)
+ new_entries = self._find_videos_in_page(episodes_page)
+ entries.extend(new_entries)
+ if len(new_entries) < self._PAGE_SIZE:
+ break
+
+ return self.playlist_result(entries, show_id, playlist_title, playlist_description)
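The pagination loop in `YoukuShowIE` above (request pages until one comes back with fewer than `_PAGE_SIZE` entries) is a generic pattern worth isolating. A minimal sketch, with `fake_fetch` as a hypothetical stand-in for the webpage download:

```python
import itertools

PAGE_SIZE = 40


def collect_entries(fetch_page, page_size=PAGE_SIZE):
    # Keep requesting pages; a page shorter than page_size is the last one.
    entries = []
    for idx in itertools.count(1):
        page = fetch_page(idx)
        entries.extend(page)
        if len(page) < page_size:
            break
    return entries


# Simulated backend: 95 episodes served 40 per page.
episodes = ['episode_%d' % i for i in range(95)]


def fake_fetch(idx):
    start = (idx - 1) * PAGE_SIZE
    return episodes[start:start + PAGE_SIZE]


print(len(collect_entries(fake_fetch)))  # 95
```

The extractor itself seeds `entries` from the show page first and then appends paginated episode pages, but the termination condition is the same.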
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
if video_description:
video_description = re.sub(r'''(?x)
<a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
[^<]+\.{3}\s*
</a>
network.add_option(
'--proxy', dest='proxy',
default=None, metavar='URL',
- help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
+ help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental '
+ 'SOCKS proxy, specify a proper scheme. For example '
+ 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
+ 'for direct connection')
network.add_option(
'--socket-timeout',
dest='socket_timeout', type=float, default=None, metavar='SECONDS',
downloader = optparse.OptionGroup(parser, 'Download Options')
downloader.add_option(
- '-r', '--rate-limit',
- dest='ratelimit', metavar='LIMIT',
+ '-r', '--limit-rate', '--rate-limit',
+ dest='ratelimit', metavar='RATE',
help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)')
downloader.add_option(
'-R', '--retries',
action='store_true', dest='writeannotations', default=False,
help='Write video annotations to a .annotations.xml file')
filesystem.add_option(
- '--load-info',
+ '--load-info-json', '--load-info',
dest='load_info_filename', metavar='FILE',
help='JSON file containing the video information (created with the "--write-info-json" option)')
filesystem.add_option(
import subprocess
from .common import PostProcessor
-from ..compat import shlex_quote
+from ..compat import compat_shlex_quote
from ..utils import PostProcessingError
if '{}' not in cmd:
cmd += ' {}'
- cmd = cmd.replace('{}', shlex_quote(information['filepath']))
+ cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))
self._downloader.to_screen('[exec] Executing command: %s' % cmd)
retCode = subprocess.call(cmd, shell=True)
--- /dev/null
+# Public Domain SOCKS proxy protocol implementation
+# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3
+
+from __future__ import unicode_literals
+
+# References:
+# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol
+# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol
+# SOCKS5 protocol https://tools.ietf.org/html/rfc1928
+# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929
+
+import collections
+import socket
+
+from .compat import (
+ compat_ord,
+ compat_struct_pack,
+ compat_struct_unpack,
+)
+
+__author__ = 'Timo Schmid <coding@timoschmid.de>'
+
+SOCKS4_VERSION = 4
+SOCKS4_REPLY_VERSION = 0x00
+# Excerpt from SOCKS4A protocol:
+# if the client cannot resolve the destination host's domain name to find its
+# IP address, it should set the first three bytes of DSTIP to NULL and the last
+# byte to a non-zero value.
+SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF)
+
+SOCKS5_VERSION = 5
+SOCKS5_USER_AUTH_VERSION = 0x01
+SOCKS5_USER_AUTH_SUCCESS = 0x00
+
+
+class Socks4Command(object):
+ CMD_CONNECT = 0x01
+ CMD_BIND = 0x02
+
+
+class Socks5Command(Socks4Command):
+ CMD_UDP_ASSOCIATE = 0x03
+
+
+class Socks5Auth(object):
+ AUTH_NONE = 0x00
+ AUTH_GSSAPI = 0x01
+ AUTH_USER_PASS = 0x02
+ AUTH_NO_ACCEPTABLE = 0xFF # For server response
+
+
+class Socks5AddressType(object):
+ ATYP_IPV4 = 0x01
+ ATYP_DOMAINNAME = 0x03
+ ATYP_IPV6 = 0x04
+
+
+class ProxyError(IOError):
+ ERR_SUCCESS = 0x00
+
+ def __init__(self, code=None, msg=None):
+ if code is not None and msg is None:
+ msg = self.CODES.get(code, 'unknown error')
+ super(ProxyError, self).__init__(code, msg)
+
+
+class InvalidVersionError(ProxyError):
+ def __init__(self, expected_version, got_version):
+ msg = ('Invalid response version from server. Expected {0:02x} got '
+ '{1:02x}'.format(expected_version, got_version))
+ super(InvalidVersionError, self).__init__(0, msg)
+
+
+class Socks4Error(ProxyError):
+ ERR_SUCCESS = 90
+
+ CODES = {
+ 91: 'request rejected or failed',
+ 92: 'request rejected because SOCKS server cannot connect to identd on the client',
+ 93: 'request rejected because the client program and identd report different user-ids'
+ }
+
+
+class Socks5Error(ProxyError):
+ ERR_GENERAL_FAILURE = 0x01
+
+ CODES = {
+ 0x01: 'general SOCKS server failure',
+ 0x02: 'connection not allowed by ruleset',
+ 0x03: 'Network unreachable',
+ 0x04: 'Host unreachable',
+ 0x05: 'Connection refused',
+ 0x06: 'TTL expired',
+ 0x07: 'Command not supported',
+ 0x08: 'Address type not supported',
+ 0xFE: 'unknown username or invalid password',
+ 0xFF: 'all offered authentication methods were rejected'
+ }
+
+
+class ProxyType(object):
+ SOCKS4 = 0
+ SOCKS4A = 1
+ SOCKS5 = 2
+
+Proxy = collections.namedtuple('Proxy', (
+ 'type', 'host', 'port', 'username', 'password', 'remote_dns'))
+
+
+class sockssocket(socket.socket):
+ def __init__(self, *args, **kwargs):
+ self._proxy = None
+ super(sockssocket, self).__init__(*args, **kwargs)
+
+ def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None):
+ assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5)
+
+ self._proxy = Proxy(proxytype, addr, port, username, password, rdns)
+
+ def recvall(self, cnt):
+ data = b''
+ while len(data) < cnt:
+ cur = self.recv(cnt - len(data))
+ if not cur:
+ raise IOError('{0} bytes missing'.format(cnt - len(data)))
+ data += cur
+ return data
+
+ def _recv_bytes(self, cnt):
+ data = self.recvall(cnt)
+ return compat_struct_unpack('!{0}B'.format(cnt), data)
+
+ @staticmethod
+ def _len_and_data(data):
+ return compat_struct_pack('!B', len(data)) + data
+
+ def _check_response_version(self, expected_version, got_version):
+ if got_version != expected_version:
+ self.close()
+ raise InvalidVersionError(expected_version, got_version)
+
+ def _resolve_address(self, destaddr, default, use_remote_dns):
+ try:
+ return socket.inet_aton(destaddr)
+ except socket.error:
+ if use_remote_dns and self._proxy.remote_dns:
+ return default
+ else:
+ return socket.inet_aton(socket.gethostbyname(destaddr))
+
+ def _setup_socks4(self, address, is_4a=False):
+ destaddr, port = address
+
+ ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
+
+ packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
+
+ username = (self._proxy.username or '').encode('utf-8')
+ packet += username + b'\x00'
+
+ if is_4a and self._proxy.remote_dns:
+ packet += destaddr.encode('utf-8') + b'\x00'
+
+ self.sendall(packet)
+
+ version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8))
+
+ self._check_response_version(SOCKS4_REPLY_VERSION, version)
+
+ if resp_code != Socks4Error.ERR_SUCCESS:
+ self.close()
+ raise Socks4Error(resp_code)
+
+ return (dsthost, dstport)
+
+ def _setup_socks4a(self, address):
+ return self._setup_socks4(address, is_4a=True)
+
+ def _socks5_auth(self):
+ packet = compat_struct_pack('!B', SOCKS5_VERSION)
+
+ auth_methods = [Socks5Auth.AUTH_NONE]
+ if self._proxy.username and self._proxy.password:
+ auth_methods.append(Socks5Auth.AUTH_USER_PASS)
+
+ packet += compat_struct_pack('!B', len(auth_methods))
+ packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods)
+
+ self.sendall(packet)
+
+ version, method = self._recv_bytes(2)
+
+ self._check_response_version(SOCKS5_VERSION, version)
+
+ if method == Socks5Auth.AUTH_NO_ACCEPTABLE:
+ self.close()
+ raise Socks5Error(method)
+
+ if method == Socks5Auth.AUTH_USER_PASS:
+ username = self._proxy.username.encode('utf-8')
+ password = self._proxy.password.encode('utf-8')
+ packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION)
+ packet += self._len_and_data(username) + self._len_and_data(password)
+ self.sendall(packet)
+
+ version, status = self._recv_bytes(2)
+
+ self._check_response_version(SOCKS5_USER_AUTH_VERSION, version)
+
+ if status != SOCKS5_USER_AUTH_SUCCESS:
+ self.close()
+ raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE)
+
+ def _setup_socks5(self, address):
+ destaddr, port = address
+
+ ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
+
+ self._socks5_auth()
+
+ reserved = 0
+ packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved)
+ if ipaddr is None:
+ destaddr = destaddr.encode('utf-8')
+ packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
+ packet += self._len_and_data(destaddr)
+ else:
+ packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
+ packet += compat_struct_pack('!H', port)
+
+ self.sendall(packet)
+
+ version, status, reserved, atype = self._recv_bytes(4)
+
+ self._check_response_version(SOCKS5_VERSION, version)
+
+ if status != Socks5Error.ERR_SUCCESS:
+ self.close()
+ raise Socks5Error(status)
+
+ if atype == Socks5AddressType.ATYP_IPV4:
+ destaddr = self.recvall(4)
+ elif atype == Socks5AddressType.ATYP_DOMAINNAME:
+ alen = compat_ord(self.recv(1))
+ destaddr = self.recvall(alen)
+ elif atype == Socks5AddressType.ATYP_IPV6:
+ destaddr = self.recvall(16)
+ destport = compat_struct_unpack('!H', self.recvall(2))[0]
+
+ return (destaddr, destport)
+
+ def _make_proxy(self, connect_func, address):
+ if not self._proxy:
+ return connect_func(self, address)
+
+ result = connect_func(self, (self._proxy.host, self._proxy.port))
+ if result != 0 and result is not None:
+ return result
+ setup_funcs = {
+ ProxyType.SOCKS4: self._setup_socks4,
+ ProxyType.SOCKS4A: self._setup_socks4a,
+ ProxyType.SOCKS5: self._setup_socks5,
+ }
+ setup_funcs[self._proxy.type](address)
+ return result
+
+ def connect(self, address):
+ self._make_proxy(socket.socket.connect, address)
+
+ def connect_ex(self, address):
+ return self._make_proxy(socket.socket.connect_ex, address)
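The wire formats assembled by `_socks5_auth` and `_setup_socks5` can be reproduced in isolation. This sketch builds the no-auth greeting and a CONNECT request for a domain name with the plain `struct` module (RFC 1928 layout; the helper names are illustrative, not part of `socks.py`):

```python
import struct

SOCKS5_VERSION = 5
AUTH_NONE = 0x00
CMD_CONNECT = 0x01
ATYP_DOMAINNAME = 0x03


def socks5_greeting(auth_methods=(AUTH_NONE,)):
    # VER | NMETHODS | METHODS...
    return (struct.pack('!BB', SOCKS5_VERSION, len(auth_methods)) +
            struct.pack('!%dB' % len(auth_methods), *auth_methods))


def socks5_connect_domain(host, port):
    # VER | CMD | RSV | ATYP | len(host) | host | port (network byte order)
    host = host.encode('utf-8')
    return (struct.pack('!BBBB', SOCKS5_VERSION, CMD_CONNECT, 0, ATYP_DOMAINNAME) +
            struct.pack('!B', len(host)) + host +
            struct.pack('!H', port))


print(socks5_greeting())                             # b'\x05\x01\x00'
print(socks5_connect_domain('example.com', 80)[:4])  # b'\x05\x01\x00\x03'
```

Sending the domain name (`ATYP_DOMAINNAME`) rather than a resolved IP is what makes remote DNS work, which is why `_setup_socks5` only falls back to `inet_aton` when the address already is an IP literal.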
import io
import zlib
-from .compat import compat_str
+from .compat import (
+ compat_str,
+ compat_struct_unpack,
+)
from .utils import (
ExtractorError,
- struct_unpack,
)
file_contents[:1])
# Determine number of bits in framesize rectangle
- framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3
+ framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3
framesize_len = (5 + 4 * framesize_nbits + 7) // 8
pos = framesize_len + 2 + 2
while pos < len(content):
- header16 = struct_unpack('<H', content[pos:pos + 2])[0]
+ header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0]
pos += 2
tag_code = header16 >> 6
tag_len = header16 & 0x3f
if tag_len == 0x3f:
- tag_len = struct_unpack('<I', content[pos:pos + 4])[0]
+ tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0]
pos += 4
assert pos + tag_len <= len(content), \
('Tag %d ends at %d+%d - that\'s longer than the file (%d)'
for _ in range(5):
buf = reader.read(1)
assert len(buf) == 1
- b = struct_unpack('<B', buf)[0]
+ b = compat_struct_unpack('<B', buf)[0]
res = res | ((b & 0x7f) << shift)
if b & 0x80 == 0:
break
bs = reader.read(3)
assert len(bs) == 3
last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00'
- return struct_unpack('<i', bs + last_byte)[0]
+ return compat_struct_unpack('<i', bs + last_byte)[0]
def _read_string(reader):
def _read_byte(reader):
resb = _read_bytes(1, reader=reader)
- res = struct_unpack('<B', resb)[0]
+ res = compat_struct_unpack('<B', resb)[0]
return res
print_notes(to_screen, versions_info['versions'])
- filename = sys.argv[0]
- # Py2EXE: Filename could be different
- if hasattr(sys, 'frozen') and not os.path.isfile(filename):
- if os.path.isfile(filename + '.exe'):
- filename += '.exe'
+ # sys.executable is set to the full pathname of the exe-file for py2exe
+ filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0]
if not os.access(filename, os.W_OK):
to_screen('ERROR: no write permissions on %s' % filename)
# Py2EXE
if hasattr(sys, 'frozen'):
- exe = os.path.abspath(filename)
+ exe = filename
directory = os.path.dirname(exe)
if not os.access(directory, os.W_OK):
to_screen('ERROR: no write permissions on %s' % directory)
import errno
import functools
import gzip
-import itertools
import io
+import itertools
import json
import locale
import math
import pipes
import platform
import re
-import ssl
import socket
-import struct
+import ssl
import subprocess
import sys
import tempfile
compat_http_client,
compat_kwargs,
compat_parse_qs,
+ compat_shlex_quote,
compat_socket_create_connection,
compat_str,
+ compat_struct_pack,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
+ compat_urllib_parse_unquote_plus,
compat_urllib_request,
compat_urlparse,
compat_xpath,
- shlex_quote,
)
+from .socks import (
+ ProxyType,
+ sockssocket,
+)
+
+
+def register_socks_protocols():
+ # "Register" SOCKS protocols
+ # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
+ # URLs with protocols not in urlparse.uses_netloc are not handled correctly
+ for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
+ if scheme not in compat_urlparse.uses_netloc:
+ compat_urlparse.uses_netloc.append(scheme)
+
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
'wav',
'f4f', 'f4m', 'm3u8', 'smil')
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+ itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
+ 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
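A table like `ACCENT_CHARS` reduces restricted-mode transliteration to a per-character lookup with pass-through for unmapped characters. A minimal sketch using a hypothetical, much smaller table:

```python
# Hypothetical miniature of the ACCENT_CHARS idea: map accented
# characters to ASCII replacements, pass everything else through.
MINI_ACCENTS = {'é': 'e', 'ä': 'a', 'ß': 'ss', 'ø': 'o'}


def transliterate(s, table=MINI_ACCENTS):
    # A replacement may be more than one character (e.g. 'ß' -> 'ss'),
    # which is why this joins strings rather than using str.translate
    # with single code points.
    return ''.join(table.get(ch, ch) for ch in s)


print(transliterate('Café straße'))  # Cafe strasse
```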
+
def preferredencoding():
"""Get preferred encoding.
m = re.search(r'''(?xs)
<([a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s+%s=['"]?%s['"]?
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s*>
(?P<content>.*?)
</\1>
Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
"""
def replace_insane(char):
+ if restricted and char in ACCENT_CHARS:
+ return ACCENT_CHARS[char]
if char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':
self._params = params
def http_open(self, req):
+ conn_class = compat_http_client.HTTPConnection
+
+ socks_proxy = req.headers.get('Ytdl-socks-proxy')
+ if socks_proxy:
+ conn_class = make_socks_conn_class(conn_class, socks_proxy)
+ del req.headers['Ytdl-socks-proxy']
+
return self.do_open(functools.partial(
- _create_http_connection, self, compat_http_client.HTTPConnection, False),
+ _create_http_connection, self, conn_class, False),
req)
@staticmethod
# As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
if sys.version_info >= (3, 0):
location = location.encode('iso-8859-1').decode('utf-8')
+ else:
+ location = location.decode('utf-8')
location_escaped = escape_url(location)
if location != location_escaped:
del resp.headers['Location']
+ if sys.version_info < (3, 0):
+ location_escaped = location_escaped.encode('utf-8')
resp.headers['Location'] = location_escaped
return resp
https_response = http_response
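The redirect handling above re-escapes non-ASCII `Location` headers before letting urllib follow them. The core transformation can be sketched with the stdlib, using `quote` as a rough stand-in for youtube-dl's `escape_url`:

```python
try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2


def escape_location(location):
    # Percent-escape non-ASCII bytes (UTF-8) while leaving the URL
    # structure (scheme separator, slashes, query syntax) intact.
    return quote(location, safe=":/?&=#%")


print(escape_location('http://example.com/pfad/ä'))
# http://example.com/pfad/%C3%A4
```

The real handler additionally round-trips the header through iso-8859-1 on Python 3, because that is the charset urllib assumes when it parses response headers.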
+def make_socks_conn_class(base_class, socks_proxy):
+ assert issubclass(base_class, (
+ compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
+
+ url_components = compat_urlparse.urlparse(socks_proxy)
+ if url_components.scheme.lower() == 'socks5':
+ socks_type = ProxyType.SOCKS5
+ elif url_components.scheme.lower() in ('socks', 'socks4'):
+ socks_type = ProxyType.SOCKS4
+ elif url_components.scheme.lower() == 'socks4a':
+ socks_type = ProxyType.SOCKS4A
+
+ def unquote_if_non_empty(s):
+ if not s:
+ return s
+ return compat_urllib_parse_unquote_plus(s)
+
+ proxy_args = (
+ socks_type,
+ url_components.hostname, url_components.port or 1080,
+ True, # Remote DNS
+ unquote_if_non_empty(url_components.username),
+ unquote_if_non_empty(url_components.password),
+ )
+
+ class SocksConnection(base_class):
+ def connect(self):
+ self.sock = sockssocket()
+ self.sock.setproxy(*proxy_args)
+ if type(self.timeout) in (int, float):
+ self.sock.settimeout(self.timeout)
+ self.sock.connect((self.host, self.port))
+
+ if isinstance(self, compat_http_client.HTTPSConnection):
+ if hasattr(self, '_context'): # Python > 2.6
+ self.sock = self._context.wrap_socket(
+ self.sock, server_hostname=self.host)
+ else:
+ self.sock = ssl.wrap_socket(self.sock)
+
+ return SocksConnection
+
+
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
def __init__(self, params, https_conn_class=None, *args, **kwargs):
compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
def https_open(self, req):
kwargs = {}
+ conn_class = self._https_conn_class
+
if hasattr(self, '_context'): # python > 2.6
kwargs['context'] = self._context
if hasattr(self, '_check_hostname'): # python 3.x
kwargs['check_hostname'] = self._check_hostname
+
+ socks_proxy = req.headers.get('Ytdl-socks-proxy')
+ if socks_proxy:
+ conn_class = make_socks_conn_class(conn_class, socks_proxy)
+ del req.headers['Ytdl-socks-proxy']
+
return self.do_open(functools.partial(
- _create_http_connection, self, self._https_conn_class, True),
+ _create_http_connection, self, conn_class, True),
req, **kwargs)
format_expressions.extend([
'%d-%m-%Y',
'%d.%m.%Y',
+ '%d.%m.%y',
'%d/%m/%Y',
'%d/%m/%y',
'%d/%m/%Y %H:%M:%S',
if upload_date is None:
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
- upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+ try:
+ upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+ except ValueError:
+ pass
if upload_date is not None:
return compat_str(upload_date)
def intlist_to_bytes(xs):
if not xs:
return b''
- return struct_pack('%dB' % len(xs), *xs)
+ return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
def remove_start(s, start):
- if s.startswith(start):
- return s[len(start):]
- return s
+ return s[len(start):] if s is not None and s.startswith(start) else s
def remove_end(s, end):
- if s.endswith(end):
- return s[:-len(end)]
- return s
+ return s[:-len(end)] if s is not None and s.endswith(end) else s
def remove_quotes(s):
fragment=escape_rfc3986(url_parsed.fragment)
).geturl()
-try:
- struct.pack('!I', 0)
-except TypeError:
- # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
- # See https://bugs.python.org/issue19099
- def struct_pack(spec, *args):
- if isinstance(spec, compat_str):
- spec = spec.encode('ascii')
- return struct.pack(spec, *args)
-
- def struct_unpack(spec, *args):
- if isinstance(spec, compat_str):
- spec = spec.encode('ascii')
- return struct.unpack(spec, *args)
-else:
- struct_pack = struct.pack
- struct_unpack = struct.unpack
-
def read_batch_urls(batch_fd):
def fixup(url):
def strip_jsonp(code):
return re.sub(
- r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+ r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
- if v.startswith('"'):
- v = re.sub(r"\\'", "'", v[1:-1])
- elif v.startswith("'"):
- v = v[1:-1]
- v = re.sub(r"\\\\|\\'|\"", lambda m: {
- '\\\\': '\\\\',
- "\\'": "'",
+ elif v.startswith('/*') or v == ',':
+ return ""
+
+ if v[0] in ("'", '"'):
+ v = re.sub(r'(?s)\\.|"', lambda m: {
'"': '\\"',
- }[m.group(0)], v)
+ "\\'": "'",
+ '\\\n': '',
+ '\\x': '\\u00',
+ }.get(m.group(0), m.group(0)), v[1:-1])
+
+ INTEGER_TABLE = (
+ (r'^0[xX][0-9a-fA-F]+', 16),
+ (r'^0+[0-7]+', 8),
+ )
+
+ for regex, base in INTEGER_TABLE:
+ im = re.match(regex, v)
+ if im:
+ i = int(im.group(0), base)
+ return '"%d":' % i if v.endswith(':') else '%d' % i
+
return '"%s"' % v
- res = re.sub(r'''(?x)
- "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
- '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
- [a-zA-Z_][.a-zA-Z_0-9]*
+ return re.sub(r'''(?sx)
+ "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
+ '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
+ /\*.*?\*/|,(?=\s*[\]}])|
+ [a-zA-Z_][.a-zA-Z_0-9]*|
+ (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
+ [0-9]+(?=\s*:)
''', fix_kv, code)
- res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
- return res
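The `INTEGER_TABLE` branch added to `fix_kv` normalizes JavaScript hex and legacy octal literals, which strict JSON does not allow. The idea in isolation (same regexes; `normalize_js_int` is a hypothetical helper name):

```python
import re

INTEGER_TABLE = (
    (r'^0[xX][0-9a-fA-F]+', 16),  # hex literal, e.g. 0x2A
    (r'^0+[0-7]+', 8),            # legacy octal literal, e.g. 010
)


def normalize_js_int(token):
    # Return the decimal value of a JS hex/octal literal, or None
    # if the token is neither.
    for regex, base in INTEGER_TABLE:
        m = re.match(regex, token)
        if m:
            return int(m.group(0), base)
    return None


print(normalize_js_int('0x2A'))  # 42
print(normalize_js_int('010'))   # 8
```

In `fix_kv` the converted value is additionally quoted (`'"%d":'`) when the literal appears in key position, since JSON object keys must be strings.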
def qualities(quality_ids):
def args_to_str(args):
# Get a short string representation for a subprocess command
- return ' '.join(shlex_quote(a) for a in args)
+ return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
ext = {
'audio/mp4': 'm4a',
+ # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
+ # it's the most popular one
+ 'audio/mpeg': 'mp3',
}.get(mt)
if ext is not None:
return ext
def urlhandle_detect_ext(url_handle):
- try:
- url_handle.headers
- getheader = lambda h: url_handle.headers[h]
- except AttributeError: # Python < 3
- getheader = url_handle.info().getheader
+ getheader = url_handle.headers.get
cd = getheader('Content-Disposition')
if cd:
if proxy == '__noproxy__':
return None # No Proxy
+ if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+ req.add_header('Ytdl-socks-proxy', proxy)
+ # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
+ return None
return compat_urllib_request.ProxyHandler.proxy_open(
self, req, proxy, type)
from __future__ import unicode_literals
-__version__ = '2016.05.01'
+__version__ = '2016.06.03'