jfr.im git - yt-dlp.git/commitdiff
[skip travis] adding automerge support
author Unknown <redacted>
Wed, 2 Sep 2020 21:33:41 +0000 (23:33 +0200)
committer Unknown <redacted>
Wed, 2 Sep 2020 21:33:41 +0000 (23:33 +0200)
basically copying the content of the youtube_dl folder to youtube_dlc and excluding the youtube_dl folder when compiling

804 files changed:
.travis.yml
setup.py
youtube-dlc.spec [new file with mode: 0644]
youtube_dl/YoutubeDL.py [new file with mode: 0644]
youtube_dl/__init__.py [new file with mode: 0644]
youtube_dl/__main__.py [new file with mode: 0644]
youtube_dl/aes.py [new file with mode: 0644]
youtube_dl/cache.py [new file with mode: 0644]
youtube_dl/compat.py [new file with mode: 0644]
youtube_dl/downloader/__init__.py [new file with mode: 0644]
youtube_dl/downloader/common.py [new file with mode: 0644]
youtube_dl/downloader/dash.py [new file with mode: 0644]
youtube_dl/downloader/external.py [new file with mode: 0644]
youtube_dl/downloader/f4m.py [new file with mode: 0644]
youtube_dl/downloader/fragment.py [new file with mode: 0644]
youtube_dl/downloader/hls.py [new file with mode: 0644]
youtube_dl/downloader/http.py [new file with mode: 0644]
youtube_dl/downloader/ism.py [new file with mode: 0644]
youtube_dl/downloader/rtmp.py [new file with mode: 0644]
youtube_dl/downloader/rtsp.py [new file with mode: 0644]
youtube_dl/downloader/youtube_live_chat.py [new file with mode: 0644]
youtube_dl/extractor/__init__.py [new file with mode: 0644]
youtube_dl/extractor/abc.py [new file with mode: 0644]
youtube_dl/extractor/abcnews.py [new file with mode: 0644]
youtube_dl/extractor/abcotvs.py [new file with mode: 0644]
youtube_dl/extractor/academicearth.py [new file with mode: 0644]
youtube_dl/extractor/acast.py [new file with mode: 0644]
youtube_dl/extractor/adn.py [new file with mode: 0644]
youtube_dl/extractor/adobeconnect.py [new file with mode: 0644]
youtube_dl/extractor/adobepass.py [new file with mode: 0644]
youtube_dl/extractor/adobetv.py [new file with mode: 0644]
youtube_dl/extractor/adultswim.py [new file with mode: 0644]
youtube_dl/extractor/aenetworks.py [new file with mode: 0644]
youtube_dl/extractor/afreecatv.py [new file with mode: 0644]
youtube_dl/extractor/airmozilla.py [new file with mode: 0644]
youtube_dl/extractor/aliexpress.py [new file with mode: 0644]
youtube_dl/extractor/aljazeera.py [new file with mode: 0644]
youtube_dl/extractor/allocine.py [new file with mode: 0644]
youtube_dl/extractor/alphaporno.py [new file with mode: 0644]
youtube_dl/extractor/amcnetworks.py [new file with mode: 0644]
youtube_dl/extractor/americastestkitchen.py [new file with mode: 0644]
youtube_dl/extractor/amp.py [new file with mode: 0644]
youtube_dl/extractor/animeondemand.py [new file with mode: 0644]
youtube_dl/extractor/anvato.py [new file with mode: 0644]
youtube_dl/extractor/aol.py [new file with mode: 0644]
youtube_dl/extractor/apa.py [new file with mode: 0644]
youtube_dl/extractor/aparat.py [new file with mode: 0644]
youtube_dl/extractor/appleconnect.py [new file with mode: 0644]
youtube_dl/extractor/appletrailers.py [new file with mode: 0644]
youtube_dl/extractor/archiveorg.py [new file with mode: 0644]
youtube_dl/extractor/ard.py [new file with mode: 0644]
youtube_dl/extractor/arkena.py [new file with mode: 0644]
youtube_dl/extractor/arte.py [new file with mode: 0644]
youtube_dl/extractor/asiancrush.py [new file with mode: 0644]
youtube_dl/extractor/atresplayer.py [new file with mode: 0644]
youtube_dl/extractor/atttechchannel.py [new file with mode: 0644]
youtube_dl/extractor/atvat.py [new file with mode: 0644]
youtube_dl/extractor/audimedia.py [new file with mode: 0644]
youtube_dl/extractor/audioboom.py [new file with mode: 0644]
youtube_dl/extractor/audiomack.py [new file with mode: 0644]
youtube_dl/extractor/awaan.py [new file with mode: 0644]
youtube_dl/extractor/aws.py [new file with mode: 0644]
youtube_dl/extractor/azmedien.py [new file with mode: 0644]
youtube_dl/extractor/baidu.py [new file with mode: 0644]
youtube_dl/extractor/bandcamp.py [new file with mode: 0644]
youtube_dl/extractor/bbc.py [new file with mode: 0644]
youtube_dl/extractor/beampro.py [new file with mode: 0644]
youtube_dl/extractor/beatport.py [new file with mode: 0644]
youtube_dl/extractor/beeg.py [new file with mode: 0644]
youtube_dl/extractor/behindkink.py [new file with mode: 0644]
youtube_dl/extractor/bellmedia.py [new file with mode: 0644]
youtube_dl/extractor/bet.py [new file with mode: 0644]
youtube_dl/extractor/bfi.py [new file with mode: 0644]
youtube_dl/extractor/bigflix.py [new file with mode: 0644]
youtube_dl/extractor/bild.py [new file with mode: 0644]
youtube_dl/extractor/bilibili.py [new file with mode: 0644]
youtube_dl/extractor/biobiochiletv.py [new file with mode: 0644]
youtube_dl/extractor/biqle.py [new file with mode: 0644]
youtube_dl/extractor/bitchute.py [new file with mode: 0644]
youtube_dl/extractor/bleacherreport.py [new file with mode: 0644]
youtube_dl/extractor/blinkx.py [new file with mode: 0644]
youtube_dl/extractor/bloomberg.py [new file with mode: 0644]
youtube_dl/extractor/bokecc.py [new file with mode: 0644]
youtube_dl/extractor/bostonglobe.py [new file with mode: 0644]
youtube_dl/extractor/bpb.py [new file with mode: 0644]
youtube_dl/extractor/br.py [new file with mode: 0644]
youtube_dl/extractor/bravotv.py [new file with mode: 0644]
youtube_dl/extractor/breakcom.py [new file with mode: 0644]
youtube_dl/extractor/brightcove.py [new file with mode: 0644]
youtube_dl/extractor/businessinsider.py [new file with mode: 0644]
youtube_dl/extractor/buzzfeed.py [new file with mode: 0644]
youtube_dl/extractor/byutv.py [new file with mode: 0644]
youtube_dl/extractor/c56.py [new file with mode: 0644]
youtube_dl/extractor/camdemy.py [new file with mode: 0644]
youtube_dl/extractor/cammodels.py [new file with mode: 0644]
youtube_dl/extractor/camtube.py [new file with mode: 0644]
youtube_dl/extractor/camwithher.py [new file with mode: 0644]
youtube_dl/extractor/canalc2.py [new file with mode: 0644]
youtube_dl/extractor/canalplus.py [new file with mode: 0644]
youtube_dl/extractor/canvas.py [new file with mode: 0644]
youtube_dl/extractor/carambatv.py [new file with mode: 0644]
youtube_dl/extractor/cartoonnetwork.py [new file with mode: 0644]
youtube_dl/extractor/cbc.py [new file with mode: 0644]
youtube_dl/extractor/cbs.py [new file with mode: 0644]
youtube_dl/extractor/cbsinteractive.py [new file with mode: 0644]
youtube_dl/extractor/cbslocal.py [new file with mode: 0644]
youtube_dl/extractor/cbsnews.py [new file with mode: 0644]
youtube_dl/extractor/cbssports.py [new file with mode: 0644]
youtube_dl/extractor/ccc.py [new file with mode: 0644]
youtube_dl/extractor/ccma.py [new file with mode: 0644]
youtube_dl/extractor/cctv.py [new file with mode: 0644]
youtube_dl/extractor/cda.py [new file with mode: 0644]
youtube_dl/extractor/ceskatelevize.py [new file with mode: 0644]
youtube_dl/extractor/channel9.py [new file with mode: 0644]
youtube_dl/extractor/charlierose.py [new file with mode: 0644]
youtube_dl/extractor/chaturbate.py [new file with mode: 0644]
youtube_dl/extractor/chilloutzone.py [new file with mode: 0644]
youtube_dl/extractor/chirbit.py [new file with mode: 0644]
youtube_dl/extractor/cinchcast.py [new file with mode: 0644]
youtube_dl/extractor/cinemax.py [new file with mode: 0644]
youtube_dl/extractor/ciscolive.py [new file with mode: 0644]
youtube_dl/extractor/cjsw.py [new file with mode: 0644]
youtube_dl/extractor/cliphunter.py [new file with mode: 0644]
youtube_dl/extractor/clippit.py [new file with mode: 0644]
youtube_dl/extractor/cliprs.py [new file with mode: 0644]
youtube_dl/extractor/clipsyndicate.py [new file with mode: 0644]
youtube_dl/extractor/closertotruth.py [new file with mode: 0644]
youtube_dl/extractor/cloudflarestream.py [new file with mode: 0644]
youtube_dl/extractor/cloudy.py [new file with mode: 0644]
youtube_dl/extractor/clubic.py [new file with mode: 0644]
youtube_dl/extractor/clyp.py [new file with mode: 0644]
youtube_dl/extractor/cmt.py [new file with mode: 0644]
youtube_dl/extractor/cnbc.py [new file with mode: 0644]
youtube_dl/extractor/cnn.py [new file with mode: 0644]
youtube_dl/extractor/comedycentral.py [new file with mode: 0644]
youtube_dl/extractor/common.py [new file with mode: 0644]
youtube_dl/extractor/commonmistakes.py [new file with mode: 0644]
youtube_dl/extractor/commonprotocols.py [new file with mode: 0644]
youtube_dl/extractor/condenast.py [new file with mode: 0644]
youtube_dl/extractor/contv.py [new file with mode: 0644]
youtube_dl/extractor/corus.py [new file with mode: 0644]
youtube_dl/extractor/coub.py [new file with mode: 0644]
youtube_dl/extractor/cracked.py [new file with mode: 0644]
youtube_dl/extractor/crackle.py [new file with mode: 0644]
youtube_dl/extractor/crooksandliars.py [new file with mode: 0644]
youtube_dl/extractor/crunchyroll.py [new file with mode: 0644]
youtube_dl/extractor/cspan.py [new file with mode: 0644]
youtube_dl/extractor/ctsnews.py [new file with mode: 0644]
youtube_dl/extractor/ctvnews.py [new file with mode: 0644]
youtube_dl/extractor/cultureunplugged.py [new file with mode: 0644]
youtube_dl/extractor/curiositystream.py [new file with mode: 0644]
youtube_dl/extractor/cwtv.py [new file with mode: 0644]
youtube_dl/extractor/dailymail.py [new file with mode: 0644]
youtube_dl/extractor/dailymotion.py [new file with mode: 0644]
youtube_dl/extractor/daum.py [new file with mode: 0644]
youtube_dl/extractor/dbtv.py [new file with mode: 0644]
youtube_dl/extractor/dctp.py [new file with mode: 0644]
youtube_dl/extractor/deezer.py [new file with mode: 0644]
youtube_dl/extractor/defense.py [new file with mode: 0644]
youtube_dl/extractor/democracynow.py [new file with mode: 0644]
youtube_dl/extractor/dfb.py [new file with mode: 0644]
youtube_dl/extractor/dhm.py [new file with mode: 0644]
youtube_dl/extractor/digg.py [new file with mode: 0644]
youtube_dl/extractor/digiteka.py [new file with mode: 0644]
youtube_dl/extractor/discovery.py [new file with mode: 0644]
youtube_dl/extractor/discoverygo.py [new file with mode: 0644]
youtube_dl/extractor/discoverynetworks.py [new file with mode: 0644]
youtube_dl/extractor/discoveryvr.py [new file with mode: 0644]
youtube_dl/extractor/disney.py [new file with mode: 0644]
youtube_dl/extractor/dispeak.py [new file with mode: 0644]
youtube_dl/extractor/dlive.py [new file with mode: 0644]
youtube_dl/extractor/doodstream.py [new file with mode: 0644]
youtube_dl/extractor/dotsub.py [new file with mode: 0644]
youtube_dl/extractor/douyutv.py [new file with mode: 0644]
youtube_dl/extractor/dplay.py [new file with mode: 0644]
youtube_dl/extractor/drbonanza.py [new file with mode: 0644]
youtube_dl/extractor/dropbox.py [new file with mode: 0644]
youtube_dl/extractor/drtuber.py [new file with mode: 0644]
youtube_dl/extractor/drtv.py [new file with mode: 0644]
youtube_dl/extractor/dtube.py [new file with mode: 0644]
youtube_dl/extractor/dumpert.py [new file with mode: 0644]
youtube_dl/extractor/dvtv.py [new file with mode: 0644]
youtube_dl/extractor/dw.py [new file with mode: 0644]
youtube_dl/extractor/eagleplatform.py [new file with mode: 0644]
youtube_dl/extractor/ebaumsworld.py [new file with mode: 0644]
youtube_dl/extractor/echomsk.py [new file with mode: 0644]
youtube_dl/extractor/egghead.py [new file with mode: 0644]
youtube_dl/extractor/ehow.py [new file with mode: 0644]
youtube_dl/extractor/eighttracks.py [new file with mode: 0644]
youtube_dl/extractor/einthusan.py [new file with mode: 0644]
youtube_dl/extractor/eitb.py [new file with mode: 0644]
youtube_dl/extractor/ellentube.py [new file with mode: 0644]
youtube_dl/extractor/elpais.py [new file with mode: 0644]
youtube_dl/extractor/embedly.py [new file with mode: 0644]
youtube_dl/extractor/engadget.py [new file with mode: 0644]
youtube_dl/extractor/eporner.py [new file with mode: 0644]
youtube_dl/extractor/eroprofile.py [new file with mode: 0644]
youtube_dl/extractor/escapist.py [new file with mode: 0644]
youtube_dl/extractor/espn.py [new file with mode: 0644]
youtube_dl/extractor/esri.py [new file with mode: 0644]
youtube_dl/extractor/europa.py [new file with mode: 0644]
youtube_dl/extractor/everyonesmixtape.py [new file with mode: 0644]
youtube_dl/extractor/expotv.py [new file with mode: 0644]
youtube_dl/extractor/expressen.py [new file with mode: 0644]
youtube_dl/extractor/extractors.py [new file with mode: 0644]
youtube_dl/extractor/extremetube.py [new file with mode: 0644]
youtube_dl/extractor/eyedotv.py [new file with mode: 0644]
youtube_dl/extractor/facebook.py [new file with mode: 0644]
youtube_dl/extractor/faz.py [new file with mode: 0644]
youtube_dl/extractor/fc2.py [new file with mode: 0644]
youtube_dl/extractor/fczenit.py [new file with mode: 0644]
youtube_dl/extractor/filmon.py [new file with mode: 0644]
youtube_dl/extractor/filmweb.py [new file with mode: 0644]
youtube_dl/extractor/firsttv.py [new file with mode: 0644]
youtube_dl/extractor/fivemin.py [new file with mode: 0644]
youtube_dl/extractor/fivetv.py [new file with mode: 0644]
youtube_dl/extractor/flickr.py [new file with mode: 0644]
youtube_dl/extractor/folketinget.py [new file with mode: 0644]
youtube_dl/extractor/footyroom.py [new file with mode: 0644]
youtube_dl/extractor/formula1.py [new file with mode: 0644]
youtube_dl/extractor/fourtube.py [new file with mode: 0644]
youtube_dl/extractor/fox.py [new file with mode: 0644]
youtube_dl/extractor/fox9.py [new file with mode: 0644]
youtube_dl/extractor/foxgay.py [new file with mode: 0644]
youtube_dl/extractor/foxnews.py [new file with mode: 0644]
youtube_dl/extractor/foxsports.py [new file with mode: 0644]
youtube_dl/extractor/franceculture.py [new file with mode: 0644]
youtube_dl/extractor/franceinter.py [new file with mode: 0644]
youtube_dl/extractor/francetv.py [new file with mode: 0644]
youtube_dl/extractor/freesound.py [new file with mode: 0644]
youtube_dl/extractor/freespeech.py [new file with mode: 0644]
youtube_dl/extractor/freshlive.py [new file with mode: 0644]
youtube_dl/extractor/frontendmasters.py [new file with mode: 0644]
youtube_dl/extractor/funimation.py [new file with mode: 0644]
youtube_dl/extractor/funk.py [new file with mode: 0644]
youtube_dl/extractor/fusion.py [new file with mode: 0644]
youtube_dl/extractor/fxnetworks.py [new file with mode: 0644]
youtube_dl/extractor/gaia.py [new file with mode: 0644]
youtube_dl/extractor/gameinformer.py [new file with mode: 0644]
youtube_dl/extractor/gamespot.py [new file with mode: 0644]
youtube_dl/extractor/gamestar.py [new file with mode: 0644]
youtube_dl/extractor/gaskrank.py [new file with mode: 0644]
youtube_dl/extractor/gazeta.py [new file with mode: 0644]
youtube_dl/extractor/gdcvault.py [new file with mode: 0644]
youtube_dl/extractor/generic.py [new file with mode: 0644]
youtube_dl/extractor/gfycat.py [new file with mode: 0644]
youtube_dl/extractor/giantbomb.py [new file with mode: 0644]
youtube_dl/extractor/giga.py [new file with mode: 0644]
youtube_dl/extractor/gigya.py [new file with mode: 0644]
youtube_dl/extractor/glide.py [new file with mode: 0644]
youtube_dl/extractor/globo.py [new file with mode: 0644]
youtube_dl/extractor/go.py [new file with mode: 0644]
youtube_dl/extractor/godtube.py [new file with mode: 0644]
youtube_dl/extractor/golem.py [new file with mode: 0644]
youtube_dl/extractor/googledrive.py [new file with mode: 0644]
youtube_dl/extractor/googleplus.py [new file with mode: 0644]
youtube_dl/extractor/googlesearch.py [new file with mode: 0644]
youtube_dl/extractor/goshgay.py [new file with mode: 0644]
youtube_dl/extractor/gputechconf.py [new file with mode: 0644]
youtube_dl/extractor/groupon.py [new file with mode: 0644]
youtube_dl/extractor/hbo.py [new file with mode: 0644]
youtube_dl/extractor/hearthisat.py [new file with mode: 0644]
youtube_dl/extractor/heise.py [new file with mode: 0644]
youtube_dl/extractor/hellporno.py [new file with mode: 0644]
youtube_dl/extractor/helsinki.py [new file with mode: 0644]
youtube_dl/extractor/hentaistigma.py [new file with mode: 0644]
youtube_dl/extractor/hgtv.py [new file with mode: 0644]
youtube_dl/extractor/hidive.py [new file with mode: 0644]
youtube_dl/extractor/historicfilms.py [new file with mode: 0644]
youtube_dl/extractor/hitbox.py [new file with mode: 0644]
youtube_dl/extractor/hitrecord.py [new file with mode: 0644]
youtube_dl/extractor/hketv.py [new file with mode: 0644]
youtube_dl/extractor/hornbunny.py [new file with mode: 0644]
youtube_dl/extractor/hotnewhiphop.py [new file with mode: 0644]
youtube_dl/extractor/hotstar.py [new file with mode: 0644]
youtube_dl/extractor/howcast.py [new file with mode: 0644]
youtube_dl/extractor/howstuffworks.py [new file with mode: 0644]
youtube_dl/extractor/hrfensehen.py [new file with mode: 0644]
youtube_dl/extractor/hrti.py [new file with mode: 0644]
youtube_dl/extractor/huajiao.py [new file with mode: 0644]
youtube_dl/extractor/huffpost.py [new file with mode: 0644]
youtube_dl/extractor/hungama.py [new file with mode: 0644]
youtube_dl/extractor/hypem.py [new file with mode: 0644]
youtube_dl/extractor/ign.py [new file with mode: 0644]
youtube_dl/extractor/imdb.py [new file with mode: 0644]
youtube_dl/extractor/imggaming.py [new file with mode: 0644]
youtube_dl/extractor/imgur.py [new file with mode: 0644]
youtube_dl/extractor/ina.py [new file with mode: 0644]
youtube_dl/extractor/inc.py [new file with mode: 0644]
youtube_dl/extractor/indavideo.py [new file with mode: 0644]
youtube_dl/extractor/infoq.py [new file with mode: 0644]
youtube_dl/extractor/instagram.py [new file with mode: 0644]
youtube_dl/extractor/internazionale.py [new file with mode: 0644]
youtube_dl/extractor/internetvideoarchive.py [new file with mode: 0644]
youtube_dl/extractor/iprima.py [new file with mode: 0644]
youtube_dl/extractor/iqiyi.py [new file with mode: 0644]
youtube_dl/extractor/ir90tv.py [new file with mode: 0644]
youtube_dl/extractor/itv.py [new file with mode: 0644]
youtube_dl/extractor/ivi.py [new file with mode: 0644]
youtube_dl/extractor/ivideon.py [new file with mode: 0644]
youtube_dl/extractor/iwara.py [new file with mode: 0644]
youtube_dl/extractor/izlesene.py [new file with mode: 0644]
youtube_dl/extractor/jamendo.py [new file with mode: 0644]
youtube_dl/extractor/jeuxvideo.py [new file with mode: 0644]
youtube_dl/extractor/joj.py [new file with mode: 0644]
youtube_dl/extractor/jove.py [new file with mode: 0644]
youtube_dl/extractor/jwplatform.py [new file with mode: 0644]
youtube_dl/extractor/kakao.py [new file with mode: 0644]
youtube_dl/extractor/kaltura.py [new file with mode: 0644]
youtube_dl/extractor/kanalplay.py [new file with mode: 0644]
youtube_dl/extractor/kankan.py [new file with mode: 0644]
youtube_dl/extractor/karaoketv.py [new file with mode: 0644]
youtube_dl/extractor/karrierevideos.py [new file with mode: 0644]
youtube_dl/extractor/keezmovies.py [new file with mode: 0644]
youtube_dl/extractor/ketnet.py [new file with mode: 0644]
youtube_dl/extractor/khanacademy.py [new file with mode: 0644]
youtube_dl/extractor/kickstarter.py [new file with mode: 0644]
youtube_dl/extractor/kinja.py [new file with mode: 0644]
youtube_dl/extractor/kinopoisk.py [new file with mode: 0644]
youtube_dl/extractor/konserthusetplay.py [new file with mode: 0644]
youtube_dl/extractor/krasview.py [new file with mode: 0644]
youtube_dl/extractor/ku6.py [new file with mode: 0644]
youtube_dl/extractor/kusi.py [new file with mode: 0644]
youtube_dl/extractor/kuwo.py [new file with mode: 0644]
youtube_dl/extractor/la7.py [new file with mode: 0644]
youtube_dl/extractor/laola1tv.py [new file with mode: 0644]
youtube_dl/extractor/lci.py [new file with mode: 0644]
youtube_dl/extractor/lcp.py [new file with mode: 0644]
youtube_dl/extractor/lecture2go.py [new file with mode: 0644]
youtube_dl/extractor/lecturio.py [new file with mode: 0644]
youtube_dl/extractor/leeco.py [new file with mode: 0644]
youtube_dl/extractor/lego.py [new file with mode: 0644]
youtube_dl/extractor/lemonde.py [new file with mode: 0644]
youtube_dl/extractor/lenta.py [new file with mode: 0644]
youtube_dl/extractor/libraryofcongress.py [new file with mode: 0644]
youtube_dl/extractor/libsyn.py [new file with mode: 0644]
youtube_dl/extractor/lifenews.py [new file with mode: 0644]
youtube_dl/extractor/limelight.py [new file with mode: 0644]
youtube_dl/extractor/line.py [new file with mode: 0644]
youtube_dl/extractor/linkedin.py [new file with mode: 0644]
youtube_dl/extractor/linuxacademy.py [new file with mode: 0644]
youtube_dl/extractor/litv.py [new file with mode: 0644]
youtube_dl/extractor/livejournal.py [new file with mode: 0644]
youtube_dl/extractor/liveleak.py [new file with mode: 0644]
youtube_dl/extractor/livestream.py [new file with mode: 0644]
youtube_dl/extractor/lnkgo.py [new file with mode: 0644]
youtube_dl/extractor/localnews8.py [new file with mode: 0644]
youtube_dl/extractor/lovehomeporn.py [new file with mode: 0644]
youtube_dl/extractor/lrt.py [new file with mode: 0644]
youtube_dl/extractor/lynda.py [new file with mode: 0644]
youtube_dl/extractor/m6.py [new file with mode: 0644]
youtube_dl/extractor/mailru.py [new file with mode: 0644]
youtube_dl/extractor/malltv.py [new file with mode: 0644]
youtube_dl/extractor/mangomolo.py [new file with mode: 0644]
youtube_dl/extractor/manyvids.py [new file with mode: 0644]
youtube_dl/extractor/markiza.py [new file with mode: 0644]
youtube_dl/extractor/massengeschmacktv.py [new file with mode: 0644]
youtube_dl/extractor/matchtv.py [new file with mode: 0644]
youtube_dl/extractor/mdr.py [new file with mode: 0644]
youtube_dl/extractor/medialaan.py [new file with mode: 0644]
youtube_dl/extractor/mediaset.py [new file with mode: 0644]
youtube_dl/extractor/mediasite.py [new file with mode: 0644]
youtube_dl/extractor/medici.py [new file with mode: 0644]
youtube_dl/extractor/megaphone.py [new file with mode: 0644]
youtube_dl/extractor/meipai.py [new file with mode: 0644]
youtube_dl/extractor/melonvod.py [new file with mode: 0644]
youtube_dl/extractor/meta.py [new file with mode: 0644]
youtube_dl/extractor/metacafe.py [new file with mode: 0644]
youtube_dl/extractor/metacritic.py [new file with mode: 0644]
youtube_dl/extractor/mgoon.py [new file with mode: 0644]
youtube_dl/extractor/mgtv.py [new file with mode: 0644]
youtube_dl/extractor/miaopai.py [new file with mode: 0644]
youtube_dl/extractor/microsoftvirtualacademy.py [new file with mode: 0644]
youtube_dl/extractor/ministrygrid.py [new file with mode: 0644]
youtube_dl/extractor/minoto.py [new file with mode: 0644]
youtube_dl/extractor/miomio.py [new file with mode: 0644]
youtube_dl/extractor/mit.py [new file with mode: 0644]
youtube_dl/extractor/mitele.py [new file with mode: 0644]
youtube_dl/extractor/mixcloud.py [new file with mode: 0644]
youtube_dl/extractor/mlb.py [new file with mode: 0644]
youtube_dl/extractor/mnet.py [new file with mode: 0644]
youtube_dl/extractor/moevideo.py [new file with mode: 0644]
youtube_dl/extractor/mofosex.py [new file with mode: 0644]
youtube_dl/extractor/mojvideo.py [new file with mode: 0644]
youtube_dl/extractor/morningstar.py [new file with mode: 0644]
youtube_dl/extractor/motherless.py [new file with mode: 0644]
youtube_dl/extractor/motorsport.py [new file with mode: 0644]
youtube_dl/extractor/movieclips.py [new file with mode: 0644]
youtube_dl/extractor/moviezine.py [new file with mode: 0644]
youtube_dl/extractor/movingimage.py [new file with mode: 0644]
youtube_dl/extractor/msn.py [new file with mode: 0644]
youtube_dl/extractor/mtv.py [new file with mode: 0644]
youtube_dl/extractor/muenchentv.py [new file with mode: 0644]
youtube_dl/extractor/mwave.py [new file with mode: 0644]
youtube_dl/extractor/mychannels.py [new file with mode: 0644]
youtube_dl/extractor/myspace.py [new file with mode: 0644]
youtube_dl/extractor/myspass.py [new file with mode: 0644]
youtube_dl/extractor/myvi.py [new file with mode: 0644]
youtube_dl/extractor/myvidster.py [new file with mode: 0644]
youtube_dl/extractor/nationalgeographic.py [new file with mode: 0644]
youtube_dl/extractor/naver.py [new file with mode: 0644]
youtube_dl/extractor/nba.py [new file with mode: 0644]
youtube_dl/extractor/nbc.py [new file with mode: 0644]
youtube_dl/extractor/ndr.py [new file with mode: 0644]
youtube_dl/extractor/ndtv.py [new file with mode: 0644]
youtube_dl/extractor/nerdcubed.py [new file with mode: 0644]
youtube_dl/extractor/neteasemusic.py [new file with mode: 0644]
youtube_dl/extractor/netzkino.py [new file with mode: 0644]
youtube_dl/extractor/newgrounds.py [new file with mode: 0644]
youtube_dl/extractor/newstube.py [new file with mode: 0644]
youtube_dl/extractor/nextmedia.py [new file with mode: 0644]
youtube_dl/extractor/nexx.py [new file with mode: 0644]
youtube_dl/extractor/nfl.py [new file with mode: 0644]
youtube_dl/extractor/nhk.py [new file with mode: 0644]
youtube_dl/extractor/nhl.py [new file with mode: 0644]
youtube_dl/extractor/nick.py [new file with mode: 0644]
youtube_dl/extractor/niconico.py [new file with mode: 0644]
youtube_dl/extractor/ninecninemedia.py [new file with mode: 0644]
youtube_dl/extractor/ninegag.py [new file with mode: 0644]
youtube_dl/extractor/ninenow.py [new file with mode: 0644]
youtube_dl/extractor/nintendo.py [new file with mode: 0644]
youtube_dl/extractor/njpwworld.py [new file with mode: 0644]
youtube_dl/extractor/nobelprize.py [new file with mode: 0644]
youtube_dl/extractor/noco.py [new file with mode: 0644]
youtube_dl/extractor/nonktube.py [new file with mode: 0644]
youtube_dl/extractor/noovo.py [new file with mode: 0644]
youtube_dl/extractor/normalboots.py [new file with mode: 0644]
youtube_dl/extractor/nosvideo.py [new file with mode: 0644]
youtube_dl/extractor/nova.py [new file with mode: 0644]
youtube_dl/extractor/nowness.py [new file with mode: 0644]
youtube_dl/extractor/noz.py [new file with mode: 0644]
youtube_dl/extractor/npo.py [new file with mode: 0644]
youtube_dl/extractor/npr.py [new file with mode: 0644]
youtube_dl/extractor/nrk.py [new file with mode: 0644]
youtube_dl/extractor/nrl.py [new file with mode: 0644]
youtube_dl/extractor/ntvcojp.py [new file with mode: 0644]
youtube_dl/extractor/ntvde.py [new file with mode: 0644]
youtube_dl/extractor/ntvru.py [new file with mode: 0644]
youtube_dl/extractor/nuevo.py [new file with mode: 0644]
youtube_dl/extractor/nuvid.py [new file with mode: 0644]
youtube_dl/extractor/nytimes.py [new file with mode: 0644]
youtube_dl/extractor/nzz.py [new file with mode: 0644]
youtube_dl/extractor/odatv.py [new file with mode: 0644]
youtube_dl/extractor/odnoklassniki.py [new file with mode: 0644]
youtube_dl/extractor/oktoberfesttv.py [new file with mode: 0644]
youtube_dl/extractor/once.py [new file with mode: 0644]
youtube_dl/extractor/ondemandkorea.py [new file with mode: 0644]
youtube_dl/extractor/onet.py [new file with mode: 0644]
youtube_dl/extractor/onionstudios.py [new file with mode: 0644]
youtube_dl/extractor/ooyala.py [new file with mode: 0644]
youtube_dl/extractor/openload.py [new file with mode: 0644]
youtube_dl/extractor/ora.py [new file with mode: 0644]
youtube_dl/extractor/orf.py [new file with mode: 0644]
youtube_dl/extractor/outsidetv.py [new file with mode: 0644]
youtube_dl/extractor/packtpub.py [new file with mode: 0644]
youtube_dl/extractor/pandoratv.py [new file with mode: 0644]
youtube_dl/extractor/parliamentliveuk.py [new file with mode: 0644]
youtube_dl/extractor/patreon.py [new file with mode: 0644]
youtube_dl/extractor/pbs.py [new file with mode: 0644]
youtube_dl/extractor/pearvideo.py [new file with mode: 0644]
youtube_dl/extractor/peertube.py [new file with mode: 0644]
youtube_dl/extractor/people.py [new file with mode: 0644]
youtube_dl/extractor/performgroup.py [new file with mode: 0644]
youtube_dl/extractor/periscope.py [new file with mode: 0644]
youtube_dl/extractor/philharmoniedeparis.py [new file with mode: 0644]
youtube_dl/extractor/phoenix.py [new file with mode: 0644]
youtube_dl/extractor/photobucket.py [new file with mode: 0644]
youtube_dl/extractor/picarto.py [new file with mode: 0644]
youtube_dl/extractor/piksel.py [new file with mode: 0644]
youtube_dl/extractor/pinkbike.py [new file with mode: 0644]
youtube_dl/extractor/pladform.py [new file with mode: 0644]
youtube_dl/extractor/platzi.py [new file with mode: 0644]
youtube_dl/extractor/playfm.py [new file with mode: 0644]
youtube_dl/extractor/playplustv.py [new file with mode: 0644]
youtube_dl/extractor/plays.py [new file with mode: 0644]
youtube_dl/extractor/playtvak.py [new file with mode: 0644]
youtube_dl/extractor/playvid.py [new file with mode: 0644]
youtube_dl/extractor/playwire.py [new file with mode: 0644]
youtube_dl/extractor/pluralsight.py [new file with mode: 0644]
youtube_dl/extractor/podomatic.py [new file with mode: 0644]
youtube_dl/extractor/pokemon.py [new file with mode: 0644]
youtube_dl/extractor/polskieradio.py [new file with mode: 0644]
youtube_dl/extractor/popcorntimes.py [new file with mode: 0644]
youtube_dl/extractor/popcorntv.py [new file with mode: 0644]
youtube_dl/extractor/porn91.py [new file with mode: 0644]
youtube_dl/extractor/porncom.py [new file with mode: 0644]
youtube_dl/extractor/pornhd.py [new file with mode: 0644]
youtube_dl/extractor/pornhub.py [new file with mode: 0644]
youtube_dl/extractor/pornotube.py [new file with mode: 0644]
youtube_dl/extractor/pornovoisines.py [new file with mode: 0644]
youtube_dl/extractor/pornoxo.py [new file with mode: 0644]
youtube_dl/extractor/presstv.py [new file with mode: 0644]
youtube_dl/extractor/prosiebensat1.py [new file with mode: 0644]
youtube_dl/extractor/puhutv.py [new file with mode: 0644]
youtube_dl/extractor/puls4.py [new file with mode: 0644]
youtube_dl/extractor/pyvideo.py [new file with mode: 0644]
youtube_dl/extractor/qqmusic.py [new file with mode: 0644]
youtube_dl/extractor/r7.py [new file with mode: 0644]
youtube_dl/extractor/radiobremen.py [new file with mode: 0644]
youtube_dl/extractor/radiocanada.py [new file with mode: 0644]
youtube_dl/extractor/radiode.py [new file with mode: 0644]
youtube_dl/extractor/radiofrance.py [new file with mode: 0644]
youtube_dl/extractor/radiojavan.py [new file with mode: 0644]
youtube_dl/extractor/rai.py [new file with mode: 0644]
youtube_dl/extractor/raywenderlich.py [new file with mode: 0644]
youtube_dl/extractor/rbmaradio.py [new file with mode: 0644]
youtube_dl/extractor/rds.py [new file with mode: 0644]
youtube_dl/extractor/redbulltv.py [new file with mode: 0644]
youtube_dl/extractor/reddit.py [new file with mode: 0644]
youtube_dl/extractor/redtube.py [new file with mode: 0644]
youtube_dl/extractor/regiotv.py [new file with mode: 0644]
youtube_dl/extractor/rentv.py [new file with mode: 0644]
youtube_dl/extractor/restudy.py [new file with mode: 0644]
youtube_dl/extractor/reuters.py [new file with mode: 0644]
youtube_dl/extractor/reverbnation.py [new file with mode: 0644]
youtube_dl/extractor/rice.py [new file with mode: 0644]
youtube_dl/extractor/rmcdecouverte.py [new file with mode: 0644]
youtube_dl/extractor/ro220.py [new file with mode: 0644]
youtube_dl/extractor/rockstargames.py [new file with mode: 0644]
youtube_dl/extractor/roosterteeth.py [new file with mode: 0644]
youtube_dl/extractor/rottentomatoes.py [new file with mode: 0644]
youtube_dl/extractor/roxwel.py [new file with mode: 0644]
youtube_dl/extractor/rozhlas.py [new file with mode: 0644]
youtube_dl/extractor/rtbf.py [new file with mode: 0644]
youtube_dl/extractor/rte.py [new file with mode: 0644]
youtube_dl/extractor/rtl2.py [new file with mode: 0644]
youtube_dl/extractor/rtlnl.py [new file with mode: 0644]
youtube_dl/extractor/rtp.py [new file with mode: 0644]
youtube_dl/extractor/rts.py [new file with mode: 0644]
youtube_dl/extractor/rtve.py [new file with mode: 0644]
youtube_dl/extractor/rtvnh.py [new file with mode: 0644]
youtube_dl/extractor/rtvs.py [new file with mode: 0644]
youtube_dl/extractor/ruhd.py [new file with mode: 0644]
youtube_dl/extractor/rutube.py [new file with mode: 0644]
youtube_dl/extractor/rutv.py [new file with mode: 0644]
youtube_dl/extractor/ruutu.py [new file with mode: 0644]
youtube_dl/extractor/ruv.py [new file with mode: 0644]
youtube_dl/extractor/safari.py [new file with mode: 0644]
youtube_dl/extractor/sapo.py [new file with mode: 0644]
youtube_dl/extractor/savefrom.py [new file with mode: 0644]
youtube_dl/extractor/sbs.py [new file with mode: 0644]
youtube_dl/extractor/screencast.py [new file with mode: 0644]
youtube_dl/extractor/screencastomatic.py [new file with mode: 0644]
youtube_dl/extractor/scrippsnetworks.py [new file with mode: 0644]
youtube_dl/extractor/scte.py [new file with mode: 0644]
youtube_dl/extractor/seeker.py [new file with mode: 0644]
youtube_dl/extractor/senateisvp.py [new file with mode: 0644]
youtube_dl/extractor/sendtonews.py [new file with mode: 0644]
youtube_dl/extractor/servus.py [new file with mode: 0644]
youtube_dl/extractor/sevenplus.py [new file with mode: 0644]
youtube_dl/extractor/sexu.py [new file with mode: 0644]
youtube_dl/extractor/seznamzpravy.py [new file with mode: 0644]
youtube_dl/extractor/shahid.py [new file with mode: 0644]
youtube_dl/extractor/shared.py [new file with mode: 0644]
youtube_dl/extractor/showroomlive.py [new file with mode: 0644]
youtube_dl/extractor/sina.py [new file with mode: 0644]
youtube_dl/extractor/sixplay.py [new file with mode: 0644]
youtube_dl/extractor/sky.py [new file with mode: 0644]
youtube_dl/extractor/skylinewebcams.py [new file with mode: 0644]
youtube_dl/extractor/skynewsarabia.py [new file with mode: 0644]
youtube_dl/extractor/slideshare.py [new file with mode: 0644]
youtube_dl/extractor/slideslive.py [new file with mode: 0644]
youtube_dl/extractor/slutload.py [new file with mode: 0644]
youtube_dl/extractor/smotri.py [new file with mode: 0644]
youtube_dl/extractor/snotr.py [new file with mode: 0644]
youtube_dl/extractor/sohu.py [new file with mode: 0644]
youtube_dl/extractor/sonyliv.py [new file with mode: 0644]
youtube_dl/extractor/soundcloud.py [new file with mode: 0644]
youtube_dl/extractor/soundgasm.py [new file with mode: 0644]
youtube_dl/extractor/southpark.py [new file with mode: 0644]
youtube_dl/extractor/spankbang.py [new file with mode: 0644]
youtube_dl/extractor/spankwire.py [new file with mode: 0644]
youtube_dl/extractor/spiegel.py [new file with mode: 0644]
youtube_dl/extractor/spiegeltv.py [new file with mode: 0644]
youtube_dl/extractor/spike.py [new file with mode: 0644]
youtube_dl/extractor/sport5.py [new file with mode: 0644]
youtube_dl/extractor/sportbox.py [new file with mode: 0644]
youtube_dl/extractor/sportdeutschland.py [new file with mode: 0644]
youtube_dl/extractor/springboardplatform.py [new file with mode: 0644]
youtube_dl/extractor/sprout.py [new file with mode: 0644]
youtube_dl/extractor/srgssr.py [new file with mode: 0644]
youtube_dl/extractor/srmediathek.py [new file with mode: 0644]
youtube_dl/extractor/stanfordoc.py [new file with mode: 0644]
youtube_dl/extractor/steam.py [new file with mode: 0644]
youtube_dl/extractor/stitcher.py [new file with mode: 0644]
youtube_dl/extractor/storyfire.py [new file with mode: 0644]
youtube_dl/extractor/streamable.py [new file with mode: 0644]
youtube_dl/extractor/streamcloud.py [new file with mode: 0644]
youtube_dl/extractor/streamcz.py [new file with mode: 0644]
youtube_dl/extractor/streetvoice.py [new file with mode: 0644]
youtube_dl/extractor/stretchinternet.py [new file with mode: 0644]
youtube_dl/extractor/stv.py [new file with mode: 0644]
youtube_dl/extractor/sunporno.py [new file with mode: 0644]
youtube_dl/extractor/sverigesradio.py [new file with mode: 0644]
youtube_dl/extractor/svt.py [new file with mode: 0644]
youtube_dl/extractor/swrmediathek.py [new file with mode: 0644]
youtube_dl/extractor/syfy.py [new file with mode: 0644]
youtube_dl/extractor/sztvhu.py [new file with mode: 0644]
youtube_dl/extractor/tagesschau.py [new file with mode: 0644]
youtube_dl/extractor/tass.py [new file with mode: 0644]
youtube_dl/extractor/tastytrade.py [new file with mode: 0644]
youtube_dl/extractor/tbs.py [new file with mode: 0644]
youtube_dl/extractor/tdslifeway.py [new file with mode: 0644]
youtube_dl/extractor/teachable.py [new file with mode: 0644]
youtube_dl/extractor/teachertube.py [new file with mode: 0644]
youtube_dl/extractor/teachingchannel.py [new file with mode: 0644]
youtube_dl/extractor/teamcoco.py [new file with mode: 0644]
youtube_dl/extractor/teamtreehouse.py [new file with mode: 0644]
youtube_dl/extractor/techtalks.py [new file with mode: 0644]
youtube_dl/extractor/ted.py [new file with mode: 0644]
youtube_dl/extractor/tele13.py [new file with mode: 0644]
youtube_dl/extractor/tele5.py [new file with mode: 0644]
youtube_dl/extractor/telebruxelles.py [new file with mode: 0644]
youtube_dl/extractor/telecinco.py [new file with mode: 0644]
youtube_dl/extractor/telegraaf.py [new file with mode: 0644]
youtube_dl/extractor/telemb.py [new file with mode: 0644]
youtube_dl/extractor/telequebec.py [new file with mode: 0644]
youtube_dl/extractor/teletask.py [new file with mode: 0644]
youtube_dl/extractor/telewebion.py [new file with mode: 0644]
youtube_dl/extractor/tennistv.py [new file with mode: 0644]
youtube_dl/extractor/tenplay.py [new file with mode: 0644]
youtube_dl/extractor/testurl.py [new file with mode: 0644]
youtube_dl/extractor/tf1.py [new file with mode: 0644]
youtube_dl/extractor/tfo.py [new file with mode: 0644]
youtube_dl/extractor/theintercept.py [new file with mode: 0644]
youtube_dl/extractor/theplatform.py [new file with mode: 0644]
youtube_dl/extractor/thescene.py [new file with mode: 0644]
youtube_dl/extractor/thestar.py [new file with mode: 0644]
youtube_dl/extractor/thesun.py [new file with mode: 0644]
youtube_dl/extractor/theweatherchannel.py [new file with mode: 0644]
youtube_dl/extractor/thisamericanlife.py [new file with mode: 0644]
youtube_dl/extractor/thisav.py [new file with mode: 0644]
youtube_dl/extractor/thisoldhouse.py [new file with mode: 0644]
youtube_dl/extractor/threeqsdn.py [new file with mode: 0644]
youtube_dl/extractor/tiktok.py [new file with mode: 0644]
youtube_dl/extractor/tinypic.py [new file with mode: 0644]
youtube_dl/extractor/tmz.py [new file with mode: 0644]
youtube_dl/extractor/tnaflix.py [new file with mode: 0644]
youtube_dl/extractor/toggle.py [new file with mode: 0644]
youtube_dl/extractor/tonline.py [new file with mode: 0644]
youtube_dl/extractor/toongoggles.py [new file with mode: 0644]
youtube_dl/extractor/toutv.py [new file with mode: 0644]
youtube_dl/extractor/toypics.py [new file with mode: 0644]
youtube_dl/extractor/traileraddict.py [new file with mode: 0644]
youtube_dl/extractor/trilulilu.py [new file with mode: 0644]
youtube_dl/extractor/trunews.py [new file with mode: 0644]
youtube_dl/extractor/trutv.py [new file with mode: 0644]
youtube_dl/extractor/tube8.py [new file with mode: 0644]
youtube_dl/extractor/tubitv.py [new file with mode: 0644]
youtube_dl/extractor/tudou.py [new file with mode: 0644]
youtube_dl/extractor/tumblr.py [new file with mode: 0644]
youtube_dl/extractor/tunein.py [new file with mode: 0644]
youtube_dl/extractor/tunepk.py [new file with mode: 0644]
youtube_dl/extractor/turbo.py [new file with mode: 0644]
youtube_dl/extractor/turner.py [new file with mode: 0644]
youtube_dl/extractor/tv2.py [new file with mode: 0644]
youtube_dl/extractor/tv2dk.py [new file with mode: 0644]
youtube_dl/extractor/tv2hu.py [new file with mode: 0644]
youtube_dl/extractor/tv4.py [new file with mode: 0644]
youtube_dl/extractor/tv5mondeplus.py [new file with mode: 0644]
youtube_dl/extractor/tva.py [new file with mode: 0644]
youtube_dl/extractor/tvanouvelles.py [new file with mode: 0644]
youtube_dl/extractor/tvc.py [new file with mode: 0644]
youtube_dl/extractor/tvigle.py [new file with mode: 0644]
youtube_dl/extractor/tvland.py [new file with mode: 0644]
youtube_dl/extractor/tvn24.py [new file with mode: 0644]
youtube_dl/extractor/tvnet.py [new file with mode: 0644]
youtube_dl/extractor/tvnoe.py [new file with mode: 0644]
youtube_dl/extractor/tvnow.py [new file with mode: 0644]
youtube_dl/extractor/tvp.py [new file with mode: 0644]
youtube_dl/extractor/tvplay.py [new file with mode: 0644]
youtube_dl/extractor/tvplayer.py [new file with mode: 0644]
youtube_dl/extractor/tweakers.py [new file with mode: 0644]
youtube_dl/extractor/twentyfourvideo.py [new file with mode: 0644]
youtube_dl/extractor/twentymin.py [new file with mode: 0644]
youtube_dl/extractor/twentythreevideo.py [new file with mode: 0644]
youtube_dl/extractor/twitcasting.py [new file with mode: 0644]
youtube_dl/extractor/twitch.py [new file with mode: 0644]
youtube_dl/extractor/twitter.py [new file with mode: 0644]
youtube_dl/extractor/udemy.py [new file with mode: 0644]
youtube_dl/extractor/udn.py [new file with mode: 0644]
youtube_dl/extractor/ufctv.py [new file with mode: 0644]
youtube_dl/extractor/uktvplay.py [new file with mode: 0644]
youtube_dl/extractor/umg.py [new file with mode: 0644]
youtube_dl/extractor/unistra.py [new file with mode: 0644]
youtube_dl/extractor/unity.py [new file with mode: 0644]
youtube_dl/extractor/uol.py [new file with mode: 0644]
youtube_dl/extractor/uplynk.py [new file with mode: 0644]
youtube_dl/extractor/urort.py [new file with mode: 0644]
youtube_dl/extractor/urplay.py [new file with mode: 0644]
youtube_dl/extractor/usanetwork.py [new file with mode: 0644]
youtube_dl/extractor/usatoday.py [new file with mode: 0644]
youtube_dl/extractor/ustream.py [new file with mode: 0644]
youtube_dl/extractor/ustudio.py [new file with mode: 0644]
youtube_dl/extractor/varzesh3.py [new file with mode: 0644]
youtube_dl/extractor/vbox7.py [new file with mode: 0644]
youtube_dl/extractor/veehd.py [new file with mode: 0644]
youtube_dl/extractor/veoh.py [new file with mode: 0644]
youtube_dl/extractor/vesti.py [new file with mode: 0644]
youtube_dl/extractor/vevo.py [new file with mode: 0644]
youtube_dl/extractor/vgtv.py [new file with mode: 0644]
youtube_dl/extractor/vh1.py [new file with mode: 0644]
youtube_dl/extractor/vice.py [new file with mode: 0644]
youtube_dl/extractor/vidbit.py [new file with mode: 0644]
youtube_dl/extractor/viddler.py [new file with mode: 0644]
youtube_dl/extractor/videa.py [new file with mode: 0644]
youtube_dl/extractor/videodetective.py [new file with mode: 0644]
youtube_dl/extractor/videofyme.py [new file with mode: 0644]
youtube_dl/extractor/videomore.py [new file with mode: 0644]
youtube_dl/extractor/videopress.py [new file with mode: 0644]
youtube_dl/extractor/vidio.py [new file with mode: 0644]
youtube_dl/extractor/vidlii.py [new file with mode: 0644]
youtube_dl/extractor/vidme.py [new file with mode: 0644]
youtube_dl/extractor/vidzi.py [new file with mode: 0644]
youtube_dl/extractor/vier.py [new file with mode: 0644]
youtube_dl/extractor/viewlift.py [new file with mode: 0644]
youtube_dl/extractor/viidea.py [new file with mode: 0644]
youtube_dl/extractor/viki.py [new file with mode: 0644]
youtube_dl/extractor/vimeo.py [new file with mode: 0644]
youtube_dl/extractor/vimple.py [new file with mode: 0644]
youtube_dl/extractor/vine.py [new file with mode: 0644]
youtube_dl/extractor/viqeo.py [new file with mode: 0644]
youtube_dl/extractor/viu.py [new file with mode: 0644]
youtube_dl/extractor/vk.py [new file with mode: 0644]
youtube_dl/extractor/vlive.py [new file with mode: 0644]
youtube_dl/extractor/vodlocker.py [new file with mode: 0644]
youtube_dl/extractor/vodpl.py [new file with mode: 0644]
youtube_dl/extractor/vodplatform.py [new file with mode: 0644]
youtube_dl/extractor/voicerepublic.py [new file with mode: 0644]
youtube_dl/extractor/voot.py [new file with mode: 0644]
youtube_dl/extractor/voxmedia.py [new file with mode: 0644]
youtube_dl/extractor/vrak.py [new file with mode: 0644]
youtube_dl/extractor/vrt.py [new file with mode: 0644]
youtube_dl/extractor/vrv.py [new file with mode: 0644]
youtube_dl/extractor/vshare.py [new file with mode: 0644]
youtube_dl/extractor/vube.py [new file with mode: 0644]
youtube_dl/extractor/vuclip.py [new file with mode: 0644]
youtube_dl/extractor/vvvvid.py [new file with mode: 0644]
youtube_dl/extractor/vyborymos.py [new file with mode: 0644]
youtube_dl/extractor/vzaar.py [new file with mode: 0644]
youtube_dl/extractor/wakanim.py [new file with mode: 0644]
youtube_dl/extractor/walla.py [new file with mode: 0644]
youtube_dl/extractor/washingtonpost.py [new file with mode: 0644]
youtube_dl/extractor/wat.py [new file with mode: 0644]
youtube_dl/extractor/watchbox.py [new file with mode: 0644]
youtube_dl/extractor/watchindianporn.py [new file with mode: 0644]
youtube_dl/extractor/wdr.py [new file with mode: 0644]
youtube_dl/extractor/webcaster.py [new file with mode: 0644]
youtube_dl/extractor/webofstories.py [new file with mode: 0644]
youtube_dl/extractor/weibo.py [new file with mode: 0644]
youtube_dl/extractor/weiqitv.py [new file with mode: 0644]
youtube_dl/extractor/wistia.py [new file with mode: 0644]
youtube_dl/extractor/worldstarhiphop.py [new file with mode: 0644]
youtube_dl/extractor/wsj.py [new file with mode: 0644]
youtube_dl/extractor/wwe.py [new file with mode: 0644]
youtube_dl/extractor/xbef.py [new file with mode: 0644]
youtube_dl/extractor/xboxclips.py [new file with mode: 0644]
youtube_dl/extractor/xfileshare.py [new file with mode: 0644]
youtube_dl/extractor/xhamster.py [new file with mode: 0644]
youtube_dl/extractor/xiami.py [new file with mode: 0644]
youtube_dl/extractor/ximalaya.py [new file with mode: 0644]
youtube_dl/extractor/xminus.py [new file with mode: 0644]
youtube_dl/extractor/xnxx.py [new file with mode: 0644]
youtube_dl/extractor/xstream.py [new file with mode: 0644]
youtube_dl/extractor/xtube.py [new file with mode: 0644]
youtube_dl/extractor/xuite.py [new file with mode: 0644]
youtube_dl/extractor/xvideos.py [new file with mode: 0644]
youtube_dl/extractor/xxxymovies.py [new file with mode: 0644]
youtube_dl/extractor/yahoo.py [new file with mode: 0644]
youtube_dl/extractor/yandexdisk.py [new file with mode: 0644]
youtube_dl/extractor/yandexmusic.py [new file with mode: 0644]
youtube_dl/extractor/yandexvideo.py [new file with mode: 0644]
youtube_dl/extractor/yapfiles.py [new file with mode: 0644]
youtube_dl/extractor/yesjapan.py [new file with mode: 0644]
youtube_dl/extractor/yinyuetai.py [new file with mode: 0644]
youtube_dl/extractor/ynet.py [new file with mode: 0644]
youtube_dl/extractor/youjizz.py [new file with mode: 0644]
youtube_dl/extractor/youku.py [new file with mode: 0644]
youtube_dl/extractor/younow.py [new file with mode: 0644]
youtube_dl/extractor/youporn.py [new file with mode: 0644]
youtube_dl/extractor/yourporn.py [new file with mode: 0644]
youtube_dl/extractor/yourupload.py [new file with mode: 0644]
youtube_dl/extractor/youtube.py [new file with mode: 0644]
youtube_dl/extractor/zapiks.py [new file with mode: 0644]
youtube_dl/extractor/zaq1.py [new file with mode: 0644]
youtube_dl/extractor/zattoo.py [new file with mode: 0644]
youtube_dl/extractor/zdf.py [new file with mode: 0644]
youtube_dl/extractor/zingmp3.py [new file with mode: 0644]
youtube_dl/extractor/zype.py [new file with mode: 0644]
youtube_dl/jsinterp.py [new file with mode: 0644]
youtube_dl/options.py [new file with mode: 0644]
youtube_dl/postprocessor/__init__.py [new file with mode: 0644]
youtube_dl/postprocessor/common.py [new file with mode: 0644]
youtube_dl/postprocessor/embedthumbnail.py [new file with mode: 0644]
youtube_dl/postprocessor/execafterdownload.py [new file with mode: 0644]
youtube_dl/postprocessor/ffmpeg.py [new file with mode: 0644]
youtube_dl/postprocessor/metadatafromtitle.py [new file with mode: 0644]
youtube_dl/postprocessor/xattrpp.py [new file with mode: 0644]
youtube_dl/socks.py [new file with mode: 0644]
youtube_dl/swfinterp.py [new file with mode: 0644]
youtube_dl/update.py [new file with mode: 0644]
youtube_dl/utils.py [new file with mode: 0644]
youtube_dl/version.py [new file with mode: 0644]
youtube_dlc/version.py

diff --git a/.travis.yml b/.travis.yml
index fb499845e4b652edf2661a724bb9f7e515d4f056..c53c77e07fae8546a57ef783f3b5a7a85c4f011e 100644 (file)
--- a/.travis.yml
+++ b/.travis.yml
@@ -35,4 +35,7 @@ jobs:
     - env: JYTHON=true; YTDL_TEST_SET=core
 before_install:
   - if [ "$JYTHON" == "true" ]; then ./devscripts/install_jython.sh; export PATH="$HOME/jython/bin:$PATH"; fi
+before_script:
+  - rm -rf /youtube_dlc/*
+  - cp /youtube_dl/* /youtube_dlc
 script: ./devscripts/run_tests.sh
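
The new before_script replaces the contents of youtube_dlc with those of
youtube_dl before the tests run. Two caveats, for what it's worth: the
absolute paths /youtube_dlc and /youtube_dl only resolve if the checkout
sits at the filesystem root (a relative ./youtube_dlc was presumably
meant), and cp without -r copies only top-level files, skipping the
extractor/downloader/postprocessor subdirectories. A rough Python
equivalent of the intended step, assuming paths relative to the
repository root:

    import shutil

    # drop the current youtube_dlc tree, then copy youtube_dl over it,
    # subdirectories included
    shutil.rmtree('youtube_dlc', ignore_errors=True)
    shutil.copytree('youtube_dl', 'youtube_dlc')
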
diff --git a/setup.py b/setup.py
index ac56e49954568c3f1514fce9da8a8864646072fe..f5f0bae62401cecaa63d26ff7c72982b9812f182 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -67,7 +67,7 @@ def run(self):
     long_description=LONG_DESCRIPTION,
     # long_description_content_type="text/markdown",
     url="https://github.com/blackjack4494/youtube-dlc",
-    packages=find_packages(),
+    packages=find_packages(exclude=("youtube_dl",)),
        #packages=[
     #    'youtube_dlc',
     #    'youtube_dlc.extractor', 'youtube_dlc.downloader',
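
One subtlety in the find_packages() call above: exclude patterns are
matched against full dotted package names (fnmatch-style), so
("youtube_dl",) excludes only the top-level package, while subpackages
such as youtube_dl.extractor would still be collected. A sketch of the
distinction:

    from setuptools import find_packages

    # Excludes only the exact name 'youtube_dl'; 'youtube_dl.extractor',
    # 'youtube_dl.downloader', etc. still match.
    find_packages(exclude=("youtube_dl",))

    # Excludes the package and every subpackage under it.
    find_packages(exclude=("youtube_dl", "youtube_dl.*"))
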
diff --git a/youtube-dlc.spec b/youtube-dlc.spec
new file mode 100644 (file)
index 0000000..d2f4ca4
--- /dev/null
+++ b/youtube-dlc.spec
@@ -0,0 +1,33 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+block_cipher = None
+
+
+a = Analysis(['youtube_dlc\\__main__.py'],
+             pathex=['D:\\gitkraken\\youtube-dl'],
+             binaries=[],
+             datas=[],
+             hiddenimports=[],
+             hookspath=[],
+             runtime_hooks=[],
+             excludes=[],
+             win_no_prefer_redirects=False,
+             win_private_assemblies=False,
+             cipher=block_cipher,
+             noarchive=False)
+pyz = PYZ(a.pure, a.zipped_data,
+             cipher=block_cipher)
+exe = EXE(pyz,
+          a.scripts,
+          a.binaries,
+          a.zipfiles,
+          a.datas,
+          [],
+          name='youtube-dlc',
+          debug=False,
+          bootloader_ignore_signals=False,
+          strip=False,
+          upx=True,
+          upx_exclude=[],
+          runtime_tmpdir=None,
+          console=True )
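
For reference, a spec file like this is consumed directly by PyInstaller
(pyinstaller youtube-dlc.spec). Since EXE receives a.binaries, a.zipfiles
and a.datas itself, with no COLLECT step, it should produce a single-file
console executable; note that the hard-coded pathex entry
('D:\\gitkraken\\youtube-dl') is specific to the packager's machine.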
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
new file mode 100644 (file)
index 0000000..f79d31d
--- /dev/null
+++ b/youtube_dl/YoutubeDL.py
@@ -0,0 +1,2417 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import absolute_import, unicode_literals
+
+import collections
+import contextlib
+import copy
+import datetime
+import errno
+import fileinput
+import io
+import itertools
+import json
+import locale
+import operator
+import os
+import platform
+import re
+import shutil
+import subprocess
+import socket
+import sys
+import time
+import tokenize
+import traceback
+import random
+
+from string import ascii_letters
+
+from .compat import (
+    compat_basestring,
+    compat_cookiejar,
+    compat_get_terminal_size,
+    compat_http_client,
+    compat_kwargs,
+    compat_numeric_types,
+    compat_os_name,
+    compat_str,
+    compat_tokenize_tokenize,
+    compat_urllib_error,
+    compat_urllib_request,
+    compat_urllib_request_DataHandler,
+)
+from .utils import (
+    age_restricted,
+    args_to_str,
+    ContentTooShortError,
+    date_from_str,
+    DateRange,
+    DEFAULT_OUTTMPL,
+    determine_ext,
+    determine_protocol,
+    DownloadError,
+    encode_compat_str,
+    encodeFilename,
+    error_to_compat_str,
+    expand_path,
+    ExtractorError,
+    format_bytes,
+    formatSeconds,
+    GeoRestrictedError,
+    int_or_none,
+    ISO3166Utils,
+    locked_file,
+    make_HTTPS_handler,
+    MaxDownloadsReached,
+    orderedSet,
+    PagedList,
+    parse_filesize,
+    PerRequestProxyHandler,
+    platform_name,
+    PostProcessingError,
+    preferredencoding,
+    prepend_extension,
+    register_socks_protocols,
+    render_table,
+    replace_extension,
+    SameFileError,
+    sanitize_filename,
+    sanitize_path,
+    sanitize_url,
+    sanitized_Request,
+    std_headers,
+    str_or_none,
+    subtitles_filename,
+    UnavailableVideoError,
+    url_basename,
+    version_tuple,
+    write_json_file,
+    write_string,
+    YoutubeDLCookieJar,
+    YoutubeDLCookieProcessor,
+    YoutubeDLHandler,
+    YoutubeDLRedirectHandler,
+)
+from .cache import Cache
+from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
+from .extractor.openload import PhantomJSwrapper
+from .downloader import get_suitable_downloader
+from .downloader.rtmp import rtmpdump_version
+from .postprocessor import (
+    FFmpegFixupM3u8PP,
+    FFmpegFixupM4aPP,
+    FFmpegFixupStretchedPP,
+    FFmpegMergerPP,
+    FFmpegPostProcessor,
+    get_postprocessor,
+)
+from .version import __version__
+
+if compat_os_name == 'nt':
+    import ctypes
+
+
+class YoutubeDL(object):
+    """YoutubeDL class.
+
+    YoutubeDL objects are responsible for downloading the
+    actual video file and writing it to disk if the user has requested
+    it, among other tasks. In most cases there should be one per
+    program. Since, given a video URL, the downloader doesn't know how to
+    extract all the needed information (that is the InfoExtractors' task),
+    it has to pass the URL to one of them.
+
+    For this, YoutubeDL objects have a method that allows
+    InfoExtractors to be registered in a given order. When passed
+    a URL, the YoutubeDL object hands it to the first InfoExtractor it
+    finds that reports being able to handle it. The InfoExtractor extracts
+    all the information about the video or videos the URL refers to, and
+    YoutubeDL processes the extracted information, possibly using a File
+    Downloader to download the video.
+
+    YoutubeDL objects accept a lot of parameters. In order not to saturate
+    the object constructor with arguments, it receives a dictionary of
+    options instead. These options are available through the params
+    attribute for the InfoExtractors to use. The YoutubeDL also
+    registers itself as the downloader in charge of the InfoExtractors
+    that are added to it, so this is a "mutual registration".
+
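+    For example, a minimal usage sketch (the options and URL here are
+    only illustrative):
+
+        from youtube_dl import YoutubeDL
+
+        ydl_opts = {'format': 'best', 'outtmpl': '%(title)s.%(ext)s'}
+        with YoutubeDL(ydl_opts) as ydl:
+            ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
+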
+    Available options:
+
+    username:          Username for authentication purposes.
+    password:          Password for authentication purposes.
+    videopassword:     Password for accessing a video.
+    ap_mso:            Adobe Pass multiple-system operator identifier.
+    ap_username:       Multiple-system operator account username.
+    ap_password:       Multiple-system operator account password.
+    usenetrc:          Use netrc for authentication instead.
+    verbose:           Print additional info to stdout.
+    quiet:             Do not print messages to stdout.
+    no_warnings:       Do not print out anything for warnings.
+    forceurl:          Force printing final URL.
+    forcetitle:        Force printing title.
+    forceid:           Force printing ID.
+    forcethumbnail:    Force printing thumbnail URL.
+    forcedescription:  Force printing description.
+    forcefilename:     Force printing final filename.
+    forceduration:     Force printing duration.
+    forcejson:         Force printing info_dict as JSON.
+    dump_single_json:  Force printing the info_dict of the whole playlist
+                       (or video) as a single JSON line.
+    simulate:          Do not download the video files.
+    format:            Video format code. See options.py for more information.
+    outtmpl:           Template for output names.
+    restrictfilenames: Do not allow "&" and spaces in file names
+    ignoreerrors:      Do not stop on download errors.
+    force_generic_extractor: Force downloader to use the generic extractor
+    nooverwrites:      Prevent overwriting files.
+    playliststart:     Playlist item to start at.
+    playlistend:       Playlist item to end at.
+    playlist_items:    Specific indices of playlist to download.
+    playlistreverse:   Download playlist items in reverse order.
+    playlistrandom:    Download playlist items in random order.
+    matchtitle:        Download only matching titles.
+    rejecttitle:       Reject downloads for matching titles.
+    logger:            Log messages to a logging.Logger instance.
+    logtostderr:       Log messages to stderr instead of stdout.
+    writedescription:  Write the video description to a .description file
+    writeinfojson:     Write the video metadata to a .info.json file
+    writeannotations:  Write the video annotations to a .annotations.xml file
+    writethumbnail:    Write the thumbnail image to a file
+    write_all_thumbnails:  Write all thumbnail formats to files
+    writesubtitles:    Write the video subtitles to a file
+    writeautomaticsub: Write the automatically generated subtitles to a file
+    allsubtitles:      Downloads all the subtitles of the video
+                       (requires writesubtitles or writeautomaticsub)
+    listsubtitles:     Lists all available subtitles for the video
+    subtitlesformat:   The format code for subtitles
+    subtitleslangs:    List of languages of the subtitles to download
+    keepvideo:         Keep the video file after post-processing
+    daterange:         A DateRange object, download only if the upload_date is in the range.
+    skip_download:     Skip the actual download of the video file
+    cachedir:          Location of the cache files in the filesystem.
+                       False to disable filesystem cache.
+    noplaylist:        Download single video instead of a playlist if in doubt.
+    age_limit:         An integer representing the user's age in years.
+                       Unsuitable videos for the given age are skipped.
+    min_views:         An integer representing the minimum view count the video
+                       must have in order to not be skipped.
+                       Videos without view count information are always
+                       downloaded. None for no limit.
+    max_views:         An integer representing the maximum view count.
+                       Videos that are more popular than that are not
+                       downloaded.
+                       Videos without view count information are always
+                       downloaded. None for no limit.
+    download_archive:  File name of a file where all downloads are recorded.
+                       Videos already present in the file are not downloaded
+                       again.
+    cookiefile:        File name where cookies should be read from and dumped to.
+    nocheckcertificate:Do not verify SSL certificates
+    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
+                       At the moment, this is only supported by YouTube.
+    proxy:             URL of the proxy server to use
+    geo_verification_proxy:  URL of the proxy to use for IP address verification
+                       on geo-restricted sites.
+    socket_timeout:    Time to wait for unresponsive hosts, in seconds
+    bidi_workaround:   Work around buggy terminals without bidirectional text
+                       support, using fribidi
+    debug_printtraffic:Print out sent and received HTTP traffic
+    include_ads:       Download ads as well
+    default_search:    Prepend this string if an input url is not valid.
+                       'auto' for elaborate guessing
+    encoding:          Use this encoding instead of the system-specified one.
+    extract_flat:      Do not resolve URLs; return the immediate result.
+                       Pass in 'in_playlist' to only show this behavior for
+                       playlist items.
+    postprocessors:    A list of dictionaries, each with an entry
+                       * key:  The name of the postprocessor. See
+                               youtube_dlc/postprocessor/__init__.py for a list.
+                       as well as any further keyword arguments for the
+                       postprocessor.
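+                       A sketch of such a list, extracting audio via ffmpeg
+                       (FFmpegExtractAudio is one of the keys listed there and
+                       preferredcodec one of its keyword arguments):
+
+                           'postprocessors': [{
+                               'key': 'FFmpegExtractAudio',
+                               'preferredcodec': 'mp3',
+                           }]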
+    progress_hooks:    A list of functions that get called on download
+                       progress, with a dictionary with the entries
+                       * status: One of "downloading", "error", or "finished".
+                                 Check this first and ignore unknown values.
+
+                       If status is one of "downloading", or "finished", the
+                       following properties may also be present:
+                       * filename: The final filename (always present)
+                       * tmpfilename: The filename we're currently writing to
+                       * downloaded_bytes: Bytes on disk
+                       * total_bytes: Size of the whole file, None if unknown
+                       * total_bytes_estimate: Guess of the eventual file size,
+                                               None if unavailable.
+                       * elapsed: The number of seconds since download started.
+                       * eta: The estimated time in seconds, None if unknown
+                       * speed: The download speed in bytes/second, None if
+                                unknown
+                       * fragment_index: The counter of the currently
+                                         downloaded video fragment.
+                       * fragment_count: The number of fragments (= individual
+                                         files that will be merged)
+
+                       Progress hooks are guaranteed to be called at least once
+                       (with status "finished") if the download is successful.
+    merge_output_format: Extension to use when merging formats.
+    fixup:             Automatically correct known faults of the file.
+                       One of:
+                       - "never": do nothing
+                       - "warn": only emit a warning
+                       - "detect_or_warn": check whether we can do anything
+                                           about it, warn otherwise (default)
+    source_address:    Client-side IP address to bind to.
+    call_home:         Boolean, true iff we are allowed to contact the
+                       youtube-dlc servers for debugging.
+    sleep_interval:    Number of seconds to sleep before each download when
+                       used alone or a lower bound of a range for randomized
+                       sleep before each download (minimum possible number
+                       of seconds to sleep) when used along with
+                       max_sleep_interval.
+    max_sleep_interval:Upper bound of a range for randomized sleep before each
+                       download (maximum possible number of seconds to sleep).
+                       Must only be used along with sleep_interval.
+                       Actual sleep time will be a random float from range
+                       [sleep_interval; max_sleep_interval].
+    listformats:       Print an overview of available video formats and exit.
+    list_thumbnails:   Print a table of all thumbnails and exit.
+    match_filter:      A function that gets called with the info_dict of
+                       every video.
+                       If it returns a message, the video is ignored.
+                       If it returns None, the video is downloaded.
+                       match_filter_func in utils.py is one example of this.
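+                       A sketch of such a function (the one-hour cutoff is
+                       arbitrary):
+
+                           def my_match_filter(info_dict):
+                               duration = info_dict.get('duration')
+                               if duration and duration > 3600:
+                                   return 'Skipping %s: longer than an hour' % info_dict.get('title')
+                               return None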
+    no_color:          Do not emit color codes in output.
+    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
+                       HTTP header
+    geo_bypass_country:
+                       Two-letter ISO 3166-2 country code that will be used for
+                       explicit geographic restriction bypassing via faking
+                       X-Forwarded-For HTTP header
+    geo_bypass_ip_block:
+                       IP range in CIDR notation that will be used similarly to
+                       geo_bypass_country
+
+    The following options determine which downloader is picked:
+    external_downloader: Executable of the external downloader to call.
+                       None or unset for standard (built-in) downloader.
+    hls_prefer_native: If True, use the native HLS downloader instead of
+                       ffmpeg/avconv; if False, use ffmpeg/avconv; if None,
+                       use the downloader suggested by the extractor.
+
+    The following parameters are not used by YoutubeDL itself, they are used by
+    the downloader (see youtube_dlc/downloader/common.py):
+    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
+    noresizebuffer, retries, continuedl, noprogress, consoletitle,
+    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
+    http_chunk_size.
+
+    The following options are used by the post processors:
+    prefer_ffmpeg:     If False and both ffmpeg and avconv are available,
+                       use avconv; otherwise prefer ffmpeg.
+    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
+                       to the binary or its containing directory.
+    postprocessor_args: A list of additional command-line arguments for the
+                        postprocessor.
+
+    The following options are used by the Youtube extractor:
+    youtube_include_dash_manifest: If True (default), DASH manifests and related
+                        data will be downloaded and processed by the extractor.
+                        You can reduce network I/O by disabling it if you don't
+                        care about DASH.
+    """
+
+    _NUMERIC_FIELDS = set((
+        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+        'timestamp', 'upload_year', 'upload_month', 'upload_day',
+        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+        'average_rating', 'comment_count', 'age_limit',
+        'start_time', 'end_time',
+        'chapter_number', 'season_number', 'episode_number',
+        'track_number', 'disc_number', 'release_year',
+        'playlist_index',
+    ))
+
+    params = None
+    _ies = []
+    _pps = []
+    _download_retcode = None
+    _num_downloads = None
+    _screen_file = None
+
+    def __init__(self, params=None, auto_init=True):
+        """Create a FileDownloader object with the given options."""
+        if params is None:
+            params = {}
+        self._ies = []
+        self._ies_instances = {}
+        self._pps = []
+        self._progress_hooks = []
+        self._download_retcode = 0
+        self._num_downloads = 0
+        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+        self._err_file = sys.stderr
+        self.params = {
+            # Default parameters
+            'nocheckcertificate': False,
+        }
+        self.params.update(params)
+        self.cache = Cache(self)
+
+        def check_deprecated(param, option, suggestion):
+            if self.params.get(param) is not None:
+                self.report_warning(
+                    '%s is deprecated. Use %s instead.' % (option, suggestion))
+                return True
+            return False
+
+        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
+            if self.params.get('geo_verification_proxy') is None:
+                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
+
+        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N is the number of digits')
+        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
+        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
+
+        if params.get('bidi_workaround', False):
+            try:
+                import pty
+                master, slave = pty.openpty()
+                width = compat_get_terminal_size().columns
+                if width is None:
+                    width_args = []
+                else:
+                    width_args = ['-w', str(width)]
+                sp_kwargs = dict(
+                    stdin=subprocess.PIPE,
+                    stdout=slave,
+                    stderr=self._err_file)
+                try:
+                    self._output_process = subprocess.Popen(
+                        ['bidiv'] + width_args, **sp_kwargs
+                    )
+                except OSError:
+                    self._output_process = subprocess.Popen(
+                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
+                self._output_channel = os.fdopen(master, 'rb')
+            except OSError as ose:
+                if ose.errno == errno.ENOENT:
+                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround. Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
+                else:
+                    raise
+
+        if (sys.platform != 'win32'
+                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
+                and not params.get('restrictfilenames', False)):
+            # Unicode filesystem API will throw errors (#1474, #13027)
+            self.report_warning(
+                'Assuming --restrict-filenames since file system encoding '
+                'cannot encode all characters. '
+                'Set the LC_ALL environment variable to fix this.')
+            self.params['restrictfilenames'] = True
+
+        if isinstance(params.get('outtmpl'), bytes):
+            self.report_warning(
+                'Parameter outtmpl is bytes, but should be a unicode string. '
+                'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
+
+        self._setup_opener()
+
+        if auto_init:
+            self.print_debug_header()
+            self.add_default_info_extractors()
+
+        for pp_def_raw in self.params.get('postprocessors', []):
+            pp_class = get_postprocessor(pp_def_raw['key'])
+            pp_def = dict(pp_def_raw)
+            del pp_def['key']
+            pp = pp_class(self, **compat_kwargs(pp_def))
+            self.add_post_processor(pp)
+
+        for ph in self.params.get('progress_hooks', []):
+            self.add_progress_hook(ph)
+
+        register_socks_protocols()
+
+    def warn_if_short_id(self, argv):
+        # short YouTube ID starting with dash?
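+        # e.g. an 11-character video ID such as '-abcdefghij' (hypothetical)
+        # would be parsed as an option; `youtube-dlc -- -abcdefghij` avoids that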
+        idxs = [
+            i for i, a in enumerate(argv)
+            if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
+        if idxs:
+            correct_argv = (
+                ['youtube-dlc']
+                + [a for i, a in enumerate(argv) if i not in idxs]
+                + ['--'] + [argv[i] for i in idxs]
+            )
+            self.report_warning(
+                'Long argument string detected. '
+                'Use -- to separate parameters and URLs, like this:\n%s\n' %
+                args_to_str(correct_argv))
+
+    def add_info_extractor(self, ie):
+        """Add an InfoExtractor object to the end of the list."""
+        self._ies.append(ie)
+        if not isinstance(ie, type):
+            self._ies_instances[ie.ie_key()] = ie
+            ie.set_downloader(self)
+
+    def get_info_extractor(self, ie_key):
+        """
+        Get an instance of an IE with name ie_key. It will try to get one from
+        the _ies list; if there is no instance, it will create a new one and
+        add it to the extractor list.
+        """
+        ie = self._ies_instances.get(ie_key)
+        if ie is None:
+            ie = get_info_extractor(ie_key)()
+            self.add_info_extractor(ie)
+        return ie
+
+    def add_default_info_extractors(self):
+        """
+        Add the InfoExtractors returned by gen_extractors to the end of the list
+        """
+        for ie in gen_extractor_classes():
+            self.add_info_extractor(ie)
+
+    def add_post_processor(self, pp):
+        """Add a PostProcessor object to the end of the chain."""
+        self._pps.append(pp)
+        pp.set_downloader(self)
+
+    def add_progress_hook(self, ph):
+        """Add the progress hook (currently only for the file downloader)"""
+        self._progress_hooks.append(ph)
+
+    def _bidi_workaround(self, message):
+        if not hasattr(self, '_output_channel'):
+            return message
+
+        assert hasattr(self, '_output_process')
+        assert isinstance(message, compat_str)
+        line_count = message.count('\n') + 1
+        self._output_process.stdin.write((message + '\n').encode('utf-8'))
+        self._output_process.stdin.flush()
+        res = ''.join(self._output_channel.readline().decode('utf-8')
+                      for _ in range(line_count))
+        return res[:-len('\n')]
+
+    def to_screen(self, message, skip_eol=False):
+        """Print message to stdout if not in quiet mode."""
+        return self.to_stdout(message, skip_eol, check_quiet=True)
+
+    def _write_string(self, s, out=None):
+        write_string(s, out=out, encoding=self.params.get('encoding'))
+
+    def to_stdout(self, message, skip_eol=False, check_quiet=False):
+        """Print message to stdout if not in quiet mode."""
+        if self.params.get('logger'):
+            self.params['logger'].debug(message)
+        elif not check_quiet or not self.params.get('quiet', False):
+            message = self._bidi_workaround(message)
+            terminator = ['\n', ''][skip_eol]
+            output = message + terminator
+
+            self._write_string(output, self._screen_file)
+
+    def to_stderr(self, message):
+        """Print message to stderr."""
+        assert isinstance(message, compat_str)
+        if self.params.get('logger'):
+            self.params['logger'].error(message)
+        else:
+            message = self._bidi_workaround(message)
+            output = message + '\n'
+            self._write_string(output, self._err_file)
+
+    def to_console_title(self, message):
+        if not self.params.get('consoletitle', False):
+            return
+        if compat_os_name == 'nt':
+            if ctypes.windll.kernel32.GetConsoleWindow():
+                # c_wchar_p() might not be necessary if `message` is
+                # already of type unicode()
+                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+        elif 'TERM' in os.environ:
+            self._write_string('\033]0;%s\007' % message, self._screen_file)
+
+    def save_console_title(self):
+        if not self.params.get('consoletitle', False):
+            return
+        if self.params.get('simulate', False):
+            return
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
+            # Save the title on stack
+            self._write_string('\033[22;0t', self._screen_file)
+
+    def restore_console_title(self):
+        if not self.params.get('consoletitle', False):
+            return
+        if self.params.get('simulate', False):
+            return
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
+            # Restore the title from stack
+            self._write_string('\033[23;0t', self._screen_file)
+
+    def __enter__(self):
+        self.save_console_title()
+        return self
+
+    def __exit__(self, *args):
+        self.restore_console_title()
+
+        if self.params.get('cookiefile') is not None:
+            self.cookiejar.save(ignore_discard=True, ignore_expires=True)
+
+    def trouble(self, message=None, tb=None):
+        """Determine action to take when a download problem appears.
+
+        Depending on whether the downloader has been configured to ignore
+        download errors, this method may raise an exception after printing
+        the message, or merely record the error in the return code.
+
+        tb, if given, is additional traceback information.
+        """
+        if message is not None:
+            self.to_stderr(message)
+        if self.params.get('verbose'):
+            if tb is None:
+                if sys.exc_info()[0]:  # if .trouble has been called from an except block
+                    tb = ''
+                    if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
+                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
+                    tb += encode_compat_str(traceback.format_exc())
+                else:
+                    tb_data = traceback.format_list(traceback.extract_stack())
+                    tb = ''.join(tb_data)
+            self.to_stderr(tb)
+        if not self.params.get('ignoreerrors', False):
+            if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
+                exc_info = sys.exc_info()[1].exc_info
+            else:
+                exc_info = sys.exc_info()
+            raise DownloadError(message, exc_info)
+        self._download_retcode = 1
+
+    def report_warning(self, message):
+        '''
+        Print the message to stderr, prefixed with 'WARNING:'.
+        If stderr is a tty, the 'WARNING:' prefix is colored.
+        '''
+        if self.params.get('logger') is not None:
+            self.params['logger'].warning(message)
+        else:
+            if self.params.get('no_warnings'):
+                return
+            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
+                _msg_header = '\033[0;33mWARNING:\033[0m'
+            else:
+                _msg_header = 'WARNING:'
+            warning_message = '%s %s' % (_msg_header, message)
+            self.to_stderr(warning_message)
+
+    def report_error(self, message, tb=None):
+        '''
+        Do the same as trouble, but prefix the message with 'ERROR:', colored
+        in red if stderr is a tty.
+        '''
+        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
+            _msg_header = '\033[0;31mERROR:\033[0m'
+        else:
+            _msg_header = 'ERROR:'
+        error_message = '%s %s' % (_msg_header, message)
+        self.trouble(error_message, tb)
+
+    def report_file_already_downloaded(self, file_name):
+        """Report file has already been fully downloaded."""
+        try:
+            self.to_screen('[download] %s has already been downloaded' % file_name)
+        except UnicodeEncodeError:
+            self.to_screen('[download] The file has already been downloaded')
+
+    def prepare_filename(self, info_dict):
+        """Generate the output filename."""
+        try:
+            template_dict = dict(info_dict)
+
+            template_dict['epoch'] = int(time.time())
+            autonumber_size = self.params.get('autonumber_size')
+            if autonumber_size is None:
+                autonumber_size = 5
+            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
+            if template_dict.get('resolution') is None:
+                if template_dict.get('width') and template_dict.get('height'):
+                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+                elif template_dict.get('height'):
+                    template_dict['resolution'] = '%sp' % template_dict['height']
+                elif template_dict.get('width'):
+                    template_dict['resolution'] = '%dx?' % template_dict['width']
+
+            sanitize = lambda k, v: sanitize_filename(
+                compat_str(v),
+                restricted=self.params.get('restrictfilenames'),
+                is_id=(k == 'id' or k.endswith('_id')))
+            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
+                                 for k, v in template_dict.items()
+                                 if v is not None and not isinstance(v, (list, tuple, dict)))
+            template_dict = collections.defaultdict(lambda: 'NA', template_dict)
+
+            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
+
+            # For fields playlist_index and autonumber convert all occurrences
+            # of %(field)s to %(field)0Nd for backward compatibility
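+            # (e.g. '%(autonumber)s' becomes '%(autonumber)05d' with the
+            # default autonumber_size of 5)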
+            field_size_compat_map = {
+                'playlist_index': len(str(template_dict['n_entries'])),
+                'autonumber': autonumber_size,
+            }
+            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
+            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
+            if mobj:
+                outtmpl = re.sub(
+                    FIELD_SIZE_COMPAT_RE,
+                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
+                    outtmpl)
+
+            # Missing numeric fields used together with integer presentation types
+            # in format specification will break the argument substitution since
+            # string 'NA' is returned for missing fields. We will patch output
+            # template for missing fields to meet string presentation type.
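+            # (e.g. '%(view_count)d' would raise a TypeError once 'NA' is
+            # substituted for a missing view_count, so it is rewritten to
+            # '%(view_count)s' below)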
+            for numeric_field in self._NUMERIC_FIELDS:
+                if numeric_field not in template_dict:
+                    # As of [1] format syntax is:
+                    #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
+                    # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
+                    FORMAT_RE = r'''(?x)
+                        (?<!%)
+                        %
+                        \({0}\)  # mapping key
+                        (?:[#0\-+ ]+)?  # conversion flags (optional)
+                        (?:\d+)?  # minimum field width (optional)
+                        (?:\.\d+)?  # precision (optional)
+                        [hlL]?  # length modifier (optional)
+                        [diouxXeEfFgGcrs%]  # conversion type
+                    '''
+                    outtmpl = re.sub(
+                        FORMAT_RE.format(numeric_field),
+                        r'%({0})s'.format(numeric_field), outtmpl)
+
+            # expand_path translates '%%' into '%' and '$$' into '$'
+            # correspondingly that is not what we want since we need to keep
+            # '%%' intact for template dict substitution step. Working around
+            # with boundary-alike separator hack.
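+            # (e.g. '%%' first becomes '%<sep>%', survives expand_path intact,
+            # and is turned back into '%%' by the replace below)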
+            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+            # outtmpl should be expand_path'ed before template dict substitution
+            # because meta fields may contain env variables we don't want to
+            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+            # title "Hello $PATH", we don't want `$PATH` to be expanded.
+            filename = expand_path(outtmpl).replace(sep, '') % template_dict
+
+            # Temporary fix for #4787
+            # 'Treat' all problem characters by passing filename through preferredencoding
+            # to work around encoding issues with subprocess on python2 @ Windows
+            if sys.version_info < (3, 0) and sys.platform == 'win32':
+                filename = encodeFilename(filename, True).decode(preferredencoding())
+            return sanitize_path(filename)
+        except ValueError as err:
+            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
+            return None
+
+    def _match_entry(self, info_dict, incomplete):
+        """ Returns None iff the file should be downloaded """
+
+        video_title = info_dict.get('title', info_dict.get('id', 'video'))
+        if 'title' in info_dict:
+            # The title can be missing when we are just evaluating the playlist
+            title = info_dict['title']
+            matchtitle = self.params.get('matchtitle', False)
+            if matchtitle:
+                if not re.search(matchtitle, title, re.IGNORECASE):
+                    return '"' + title + '" title did not match pattern "' + matchtitle + '"'
+            rejecttitle = self.params.get('rejecttitle', False)
+            if rejecttitle:
+                if re.search(rejecttitle, title, re.IGNORECASE):
+                    return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+        date = info_dict.get('upload_date')
+        if date is not None:
+            dateRange = self.params.get('daterange', DateRange())
+            if date not in dateRange:
+                return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+        view_count = info_dict.get('view_count')
+        if view_count is not None:
+            min_views = self.params.get('min_views')
+            if min_views is not None and view_count < min_views:
+                return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
+            max_views = self.params.get('max_views')
+            if max_views is not None and view_count > max_views:
+                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
+        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
+            return 'Skipping "%s" because it is age restricted' % video_title
+        if self.in_download_archive(info_dict):
+            return '%s has already been recorded in archive' % video_title
+
+        if not incomplete:
+            match_filter = self.params.get('match_filter')
+            if match_filter is not None:
+                ret = match_filter(info_dict)
+                if ret is not None:
+                    return ret
+
+        return None
+
+    @staticmethod
+    def add_extra_info(info_dict, extra_info):
+        '''Set the keys from extra_info in info dict if they are missing'''
+        for key, value in extra_info.items():
+            info_dict.setdefault(key, value)
+
+    def extract_info(self, url, download=True, ie_key=None, extra_info={},
+                     process=True, force_generic_extractor=False):
+        '''
+        Returns a list with a dictionary for each video we find.
+        If 'download', also downloads the videos.
+        extra_info is a dict containing the extra values to add to each result
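+
+        A usage sketch (ydl being a YoutubeDL instance and the URL a
+        placeholder):
+
+            info = ydl.extract_info('https://example.com/watch?v=xyz',
+                                    download=False)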
+        '''
+
+        if not ie_key and force_generic_extractor:
+            ie_key = 'Generic'
+
+        if ie_key:
+            ies = [self.get_info_extractor(ie_key)]
+        else:
+            ies = self._ies
+
+        for ie in ies:
+            if not ie.suitable(url):
+                continue
+
+            ie = self.get_info_extractor(ie.ie_key())
+            if not ie.working():
+                self.report_warning('The program functionality for this site has been marked as broken, '
+                                    'and will probably not work.')
+
+            try:
+                ie_result = ie.extract(url)
+                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
+                    break
+                if isinstance(ie_result, list):
+                    # Backwards compatibility: old IE result format
+                    ie_result = {
+                        '_type': 'compat_list',
+                        'entries': ie_result,
+                    }
+                self.add_default_extra_info(ie_result, ie, url)
+                if process:
+                    return self.process_ie_result(ie_result, download, extra_info)
+                else:
+                    return ie_result
+            except GeoRestrictedError as e:
+                msg = e.msg
+                if e.countries:
+                    msg += '\nThis video is available in %s.' % ', '.join(
+                        map(ISO3166Utils.short2full, e.countries))
+                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to work around this.'
+                self.report_error(msg)
+                break
+            except ExtractorError as e:  # An error we somewhat expected
+                self.report_error(compat_str(e), e.format_traceback())
+                break
+            except MaxDownloadsReached:
+                raise
+            except Exception as e:
+                if self.params.get('ignoreerrors', False):
+                    self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
+                    break
+                else:
+                    raise
+        else:
+            self.report_error('no suitable InfoExtractor for URL %s' % url)
+
+    def add_default_extra_info(self, ie_result, ie, url):
+        self.add_extra_info(ie_result, {
+            'extractor': ie.IE_NAME,
+            'webpage_url': url,
+            'webpage_url_basename': url_basename(url),
+            'extractor_key': ie.ie_key(),
+        })
+
+    def process_ie_result(self, ie_result, download=True, extra_info={}):
+        """
+        Take the result of the ie (may be modified) and resolve all unresolved
+        references (URLs, playlist items).
+
+        It will also download the videos if 'download' is true.
+        Returns the resolved ie_result.
+        """
+        result_type = ie_result.get('_type', 'video')
+
+        if result_type in ('url', 'url_transparent'):
+            ie_result['url'] = sanitize_url(ie_result['url'])
+            extract_flat = self.params.get('extract_flat', False)
+            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
+                    or extract_flat is True):
+                self.__forced_printings(
+                    ie_result, self.prepare_filename(ie_result),
+                    incomplete=True)
+                return ie_result
+
+        if result_type == 'video':
+            self.add_extra_info(ie_result, extra_info)
+            return self.process_video_result(ie_result, download=download)
+        elif result_type == 'url':
+            # We have to add extra_info to the results because it may be
+            # contained in a playlist
+            return self.extract_info(ie_result['url'],
+                                     download,
+                                     ie_key=ie_result.get('ie_key'),
+                                     extra_info=extra_info)
+        elif result_type == 'url_transparent':
+            # Use the information from the embedding page
+            info = self.extract_info(
+                ie_result['url'], ie_key=ie_result.get('ie_key'),
+                extra_info=extra_info, download=False, process=False)
+
+            # extract_info may return None when ignoreerrors is enabled and
+            # extraction failed with an error, don't crash and return early
+            # in this case
+            if not info:
+                return info
+
+            force_properties = dict(
+                (k, v) for k, v in ie_result.items() if v is not None)
+            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
+                if f in force_properties:
+                    del force_properties[f]
+            new_result = info.copy()
+            new_result.update(force_properties)
+
+            # Extracted info may not be a video result (i.e.
+            # info.get('_type', 'video') != 'video') but rather a url or
+            # url_transparent. In such cases outer metadata (from ie_result)
+            # should be propagated to inner one (info). For this to happen
+            # _type of info should be overridden with url_transparent. This
+            # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
+            if new_result.get('_type') == 'url':
+                new_result['_type'] = 'url_transparent'
+
+            return self.process_ie_result(
+                new_result, download=download, extra_info=extra_info)
+        elif result_type in ('playlist', 'multi_video'):
+            # We process each entry in the playlist
+            playlist = ie_result.get('title') or ie_result.get('id')
+            self.to_screen('[download] Downloading playlist: %s' % playlist)
+
+            playlist_results = []
+
+            playliststart = self.params.get('playliststart', 1) - 1
+            playlistend = self.params.get('playlistend')
+            # For backwards compatibility, interpret -1 as whole list
+            if playlistend == -1:
+                playlistend = None
+
+            playlistitems_str = self.params.get('playlist_items')
+            playlistitems = None
+            if playlistitems_str is not None:
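+                # e.g. '1-3,7' yields the indices 1, 2, 3 and 7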
+                def iter_playlistitems(format):
+                    for string_segment in format.split(','):
+                        if '-' in string_segment:
+                            start, end = string_segment.split('-')
+                            for item in range(int(start), int(end) + 1):
+                                yield int(item)
+                        else:
+                            yield int(string_segment)
+                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
+
+            ie_entries = ie_result['entries']
+
+            def make_playlistitems_entries(list_ie_entries):
+                num_entries = len(list_ie_entries)
+                return [
+                    list_ie_entries[i - 1] for i in playlistitems
+                    if -num_entries <= i - 1 < num_entries]
+
+            def report_download(num_entries):
+                self.to_screen(
+                    '[%s] playlist %s: Downloading %d videos' %
+                    (ie_result['extractor'], playlist, num_entries))
+
+            if isinstance(ie_entries, list):
+                n_all_entries = len(ie_entries)
+                if playlistitems:
+                    entries = make_playlistitems_entries(ie_entries)
+                else:
+                    entries = ie_entries[playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            elif isinstance(ie_entries, PagedList):
+                if playlistitems:
+                    entries = []
+                    for item in playlistitems:
+                        entries.extend(ie_entries.getslice(
+                            item - 1, item
+                        ))
+                else:
+                    entries = ie_entries.getslice(
+                        playliststart, playlistend)
+                n_entries = len(entries)
+                report_download(n_entries)
+            else:  # iterable
+                if playlistitems:
+                    entries = make_playlistitems_entries(list(itertools.islice(
+                        ie_entries, 0, max(playlistitems))))
+                else:
+                    entries = list(itertools.islice(
+                        ie_entries, playliststart, playlistend))
+                n_entries = len(entries)
+                report_download(n_entries)
+
+            if self.params.get('playlistreverse', False):
+                entries = entries[::-1]
+
+            if self.params.get('playlistrandom', False):
+                random.shuffle(entries)
+
+            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
+
+            for i, entry in enumerate(entries, 1):
+                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
+                # This __x_forwarded_for_ip thing is a bit ugly but requires
+                # minimal changes
+                if x_forwarded_for:
+                    entry['__x_forwarded_for_ip'] = x_forwarded_for
+                extra = {
+                    'n_entries': n_entries,
+                    'playlist': playlist,
+                    'playlist_id': ie_result.get('id'),
+                    'playlist_title': ie_result.get('title'),
+                    'playlist_uploader': ie_result.get('uploader'),
+                    'playlist_uploader_id': ie_result.get('uploader_id'),
+                    'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
+                    'extractor': ie_result['extractor'],
+                    'webpage_url': ie_result['webpage_url'],
+                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                    'extractor_key': ie_result['extractor_key'],
+                }
+
+                reason = self._match_entry(entry, incomplete=True)
+                if reason is not None:
+                    self.to_screen('[download] ' + reason)
+                    continue
+
+                entry_result = self.process_ie_result(entry,
+                                                      download=download,
+                                                      extra_info=extra)
+                playlist_results.append(entry_result)
+            ie_result['entries'] = playlist_results
+            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+            return ie_result
+        elif result_type == 'compat_list':
+            self.report_warning(
+                'Extractor %s returned a compat_list result. '
+                'It needs to be updated.' % ie_result.get('extractor'))
+
+            def _fixup(r):
+                self.add_extra_info(
+                    r,
+                    {
+                        'extractor': ie_result['extractor'],
+                        'webpage_url': ie_result['webpage_url'],
+                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                        'extractor_key': ie_result['extractor_key'],
+                    }
+                )
+                return r
+            ie_result['entries'] = [
+                self.process_ie_result(_fixup(r), download, extra_info)
+                for r in ie_result['entries']
+            ]
+            return ie_result
+        else:
+            raise Exception('Invalid result type: %s' % result_type)
+
+    def _build_format_filter(self, filter_spec):
+        " Returns a function to filter the formats according to the filter_spec "
+
+        OPERATORS = {
+            '<': operator.lt,
+            '<=': operator.le,
+            '>': operator.gt,
+            '>=': operator.ge,
+            '=': operator.eq,
+            '!=': operator.ne,
+        }
+        operator_rex = re.compile(r'''(?x)\s*
+            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
+            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
+            $
+            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
+        m = operator_rex.search(filter_spec)
+        if m:
+            try:
+                comparison_value = int(m.group('value'))
+            except ValueError:
+                comparison_value = parse_filesize(m.group('value'))
+                if comparison_value is None:
+                    comparison_value = parse_filesize(m.group('value') + 'B')
+                if comparison_value is None:
+                    raise ValueError(
+                        'Invalid value %r in format specification %r' % (
+                            m.group('value'), filter_spec))
+            op = OPERATORS[m.group('op')]
+
+        if not m:
+            STR_OPERATORS = {
+                '=': operator.eq,
+                '^=': lambda attr, value: attr.startswith(value),
+                '$=': lambda attr, value: attr.endswith(value),
+                '*=': lambda attr, value: value in attr,
+            }
+            str_operator_rex = re.compile(r'''(?x)
+                \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
+                \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
+                \s*(?P<value>[a-zA-Z0-9._-]+)
+                \s*$
+                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
+            m = str_operator_rex.search(filter_spec)
+            if m:
+                comparison_value = m.group('value')
+                str_op = STR_OPERATORS[m.group('op')]
+                if m.group('negation'):
+                    op = lambda attr, value: not str_op(attr, value)
+                else:
+                    op = str_op
+
+        if not m:
+            raise ValueError('Invalid filter specification %r' % filter_spec)
+
+        def _filter(f):
+            actual_value = f.get(m.group('key'))
+            if actual_value is None:
+                return m.group('none_inclusive')
+            return op(actual_value, comparison_value)
+        return _filter
+
+    def _default_format_spec(self, info_dict, download=True):
+
+        def can_merge():
+            merger = FFmpegMergerPP(self)
+            return merger.available and merger.can_merge()
+
+        def prefer_best():
+            if self.params.get('simulate', False):
+                return False
+            if not download:
+                return False
+            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+                return True
+            if info_dict.get('is_live'):
+                return True
+            if not can_merge():
+                return True
+            return False
+
+        req_format_list = ['bestvideo+bestaudio', 'best']
+        if prefer_best():
+            req_format_list.reverse()
+        return '/'.join(req_format_list)
+
+    def build_format_selector(self, format_spec):
+        def syntax_error(note, start):
+            message = (
+                'Invalid format specification: '
+                '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
+            return SyntaxError(message)
+
+        PICKFIRST = 'PICKFIRST'
+        MERGE = 'MERGE'
+        SINGLE = 'SINGLE'
+        GROUP = 'GROUP'
+        FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
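+        # e.g. 'bestvideo+bestaudio/best' roughly parses to a PICKFIRST whose
+        # first choice is a MERGE of two SINGLE selectors and whose fallback
+        # is the SINGLE selector 'best'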
+
+        def _parse_filter(tokens):
+            filter_parts = []
+            for type, string, start, _, _ in tokens:
+                if type == tokenize.OP and string == ']':
+                    return ''.join(filter_parts)
+                else:
+                    filter_parts.append(string)
+
+        def _remove_unused_ops(tokens):
+            # Remove operators that we don't use and join them with the surrounding strings
+            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
+            ALLOWED_OPS = ('/', '+', ',', '(', ')')
+            last_string, last_start, last_end, last_line = None, None, None, None
+            for type, string, start, end, line in tokens:
+                if type == tokenize.OP and string == '[':
+                    if last_string:
+                        yield tokenize.NAME, last_string, last_start, last_end, last_line
+                        last_string = None
+                    yield type, string, start, end, line
+                    # everything inside brackets will be handled by _parse_filter
+                    for type, string, start, end, line in tokens:
+                        yield type, string, start, end, line
+                        if type == tokenize.OP and string == ']':
+                            break
+                elif type == tokenize.OP and string in ALLOWED_OPS:
+                    if last_string:
+                        yield tokenize.NAME, last_string, last_start, last_end, last_line
+                        last_string = None
+                    yield type, string, start, end, line
+                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
+                    if not last_string:
+                        last_string = string
+                        last_start = start
+                        last_end = end
+                    else:
+                        last_string += string
+            if last_string:
+                yield tokenize.NAME, last_string, last_start, last_end, last_line
+
+        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
+            selectors = []
+            current_selector = None
+            for type, string, start, _, _ in tokens:
+                # ENCODING is only defined in python 3.x
+                if type == getattr(tokenize, 'ENCODING', None):
+                    continue
+                elif type in [tokenize.NAME, tokenize.NUMBER]:
+                    current_selector = FormatSelector(SINGLE, string, [])
+                elif type == tokenize.OP:
+                    if string == ')':
+                        if not inside_group:
+                            # ')' will be handled by the parentheses group
+                            tokens.restore_last_token()
+                        break
+                    elif inside_merge and string in ['/', ',']:
+                        tokens.restore_last_token()
+                        break
+                    elif inside_choice and string == ',':
+                        tokens.restore_last_token()
+                        break
+                    elif string == ',':
+                        if not current_selector:
+                            raise syntax_error('"," must follow a format selector', start)
+                        selectors.append(current_selector)
+                        current_selector = None
+                    elif string == '/':
+                        if not current_selector:
+                            raise syntax_error('"/" must follow a format selector', start)
+                        first_choice = current_selector
+                        second_choice = _parse_format_selection(tokens, inside_choice=True)
+                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
+                    elif string == '[':
+                        if not current_selector:
+                            current_selector = FormatSelector(SINGLE, 'best', [])
+                        format_filter = _parse_filter(tokens)
+                        current_selector.filters.append(format_filter)
+                    elif string == '(':
+                        if current_selector:
+                            raise syntax_error('Unexpected "("', start)
+                        group = _parse_format_selection(tokens, inside_group=True)
+                        current_selector = FormatSelector(GROUP, group, [])
+                    elif string == '+':
+                        video_selector = current_selector
+                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
+                        if not video_selector or not audio_selector:
+                            raise syntax_error('"+" must be between two format selectors', start)
+                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
+                    else:
+                        raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
+                elif type == tokenize.ENDMARKER:
+                    break
+            if current_selector:
+                selectors.append(current_selector)
+            return selectors
+
+        def _build_selector_function(selector):
+            if isinstance(selector, list):
+                fs = [_build_selector_function(s) for s in selector]
+
+                def selector_function(ctx):
+                    for f in fs:
+                        for format in f(ctx):
+                            yield format
+                return selector_function
+            elif selector.type == GROUP:
+                selector_function = _build_selector_function(selector.selector)
+            elif selector.type == PICKFIRST:
+                fs = [_build_selector_function(s) for s in selector.selector]
+
+                def selector_function(ctx):
+                    for f in fs:
+                        picked_formats = list(f(ctx))
+                        if picked_formats:
+                            return picked_formats
+                    return []
+            elif selector.type == SINGLE:
+                format_spec = selector.selector
+
+                def selector_function(ctx):
+                    formats = list(ctx['formats'])
+                    if not formats:
+                        return
+                    if format_spec == 'all':
+                        for f in formats:
+                            yield f
+                    elif format_spec in ['best', 'worst', None]:
+                        format_idx = 0 if format_spec == 'worst' else -1
+                        audiovideo_formats = [
+                            f for f in formats
+                            if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
+                        if audiovideo_formats:
+                            yield audiovideo_formats[format_idx]
+                        # for extractors with incomplete formats (audio only (soundcloud)
+                        # or video only (imgur)) we will fall back to the best/worst
+                        # {video,audio}-only format
+                        elif ctx['incomplete_formats']:
+                            yield formats[format_idx]
+                    elif format_spec == 'bestaudio':
+                        audio_formats = [
+                            f for f in formats
+                            if f.get('vcodec') == 'none']
+                        if audio_formats:
+                            yield audio_formats[-1]
+                    elif format_spec == 'worstaudio':
+                        audio_formats = [
+                            f for f in formats
+                            if f.get('vcodec') == 'none']
+                        if audio_formats:
+                            yield audio_formats[0]
+                    elif format_spec == 'bestvideo':
+                        video_formats = [
+                            f for f in formats
+                            if f.get('acodec') == 'none']
+                        if video_formats:
+                            yield video_formats[-1]
+                    elif format_spec == 'worstvideo':
+                        video_formats = [
+                            f for f in formats
+                            if f.get('acodec') == 'none']
+                        if video_formats:
+                            yield video_formats[0]
+                    else:
+                        extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
+                        if format_spec in extensions:
+                            filter_f = lambda f: f['ext'] == format_spec
+                        else:
+                            filter_f = lambda f: f['format_id'] == format_spec
+                        matches = list(filter(filter_f, formats))
+                        if matches:
+                            yield matches[-1]
+            elif selector.type == MERGE:
+                def _merge(formats_info):
+                    format_1, format_2 = [f['format_id'] for f in formats_info]
+                    # The first format must contain the video and the
+                    # second the audio
+                    if formats_info[0].get('vcodec') == 'none':
+                        self.report_error('The first format must '
+                                          'contain the video, try using '
+                                          '"-f %s+%s"' % (format_2, format_1))
+                        return
+                    # Formats must be opposite (video+audio)
+                    if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
+                        self.report_error(
+                            'Both formats %s and %s are video-only, you must specify "-f video+audio"'
+                            % (format_1, format_2))
+                        return
+                    output_ext = (
+                        formats_info[0]['ext']
+                        if self.params.get('merge_output_format') is None
+                        else self.params['merge_output_format'])
+                    return {
+                        'requested_formats': formats_info,
+                        'format': '%s+%s' % (formats_info[0].get('format'),
+                                             formats_info[1].get('format')),
+                        'format_id': '%s+%s' % (formats_info[0].get('format_id'),
+                                                formats_info[1].get('format_id')),
+                        'width': formats_info[0].get('width'),
+                        'height': formats_info[0].get('height'),
+                        'resolution': formats_info[0].get('resolution'),
+                        'fps': formats_info[0].get('fps'),
+                        'vcodec': formats_info[0].get('vcodec'),
+                        'vbr': formats_info[0].get('vbr'),
+                        'stretched_ratio': formats_info[0].get('stretched_ratio'),
+                        'acodec': formats_info[1].get('acodec'),
+                        'abr': formats_info[1].get('abr'),
+                        'ext': output_ext,
+                    }
+                video_selector, audio_selector = map(_build_selector_function, selector.selector)
+
+                def selector_function(ctx):
+                    for pair in itertools.product(
+                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
+                        yield _merge(pair)
+
+            filters = [self._build_format_filter(f) for f in selector.filters]
+
+            def final_selector(ctx):
+                ctx_copy = copy.deepcopy(ctx)
+                for _filter in filters:
+                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
+                return selector_function(ctx_copy)
+            return final_selector
+
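+        # The spec string is tokenized with Python's own tokenizer and then
+        # parsed by the recursive-descent helpers above; e.g.
+        # 'best[height<=480]' tokenizes roughly as
+        # NAME '[' NAME '<=' NUMBER ']'.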
+        stream = io.BytesIO(format_spec.encode('utf-8'))
+        try:
+            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
+        except tokenize.TokenError:
+            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
+
+        class TokenIterator(object):
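+            # Minimal iterator over the token list that also supports stepping
+            # back one token via restore_last_token(), which the parser uses to
+            # undo a lookahead.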
+            def __init__(self, tokens):
+                self.tokens = tokens
+                self.counter = 0
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                if self.counter >= len(self.tokens):
+                    raise StopIteration()
+                value = self.tokens[self.counter]
+                self.counter += 1
+                return value
+
+            next = __next__
+
+            def restore_last_token(self):
+                self.counter -= 1
+
+        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
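+        # The returned function is used roughly like this (illustrative sketch,
+        # not part of the original code):
+        #
+        #   selector = ydl.build_format_selector('bestvideo+bestaudio/best')
+        #   for f in selector({'formats': formats, 'incomplete_formats': False}):
+        #       ...
+        #
+        # 'bestvideo+bestaudio' yields a merged entry when both halves match;
+        # the '/best' alternative is tried only when the first one yields
+        # nothing.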
+        return _build_selector_function(parsed_selector)
+
+    def _calc_headers(self, info_dict):
+        res = std_headers.copy()
+
+        add_headers = info_dict.get('http_headers')
+        if add_headers:
+            res.update(add_headers)
+
+        cookies = self._calc_cookies(info_dict)
+        if cookies:
+            res['Cookie'] = cookies
+
+        if 'X-Forwarded-For' not in res:
+            x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
+            if x_forwarded_for_ip:
+                res['X-Forwarded-For'] = x_forwarded_for_ip
+
+        return res
+
+    def _calc_cookies(self, info_dict):
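+        # Build a throwaway request for the target URL so the cookiejar can
+        # work out which of its cookies apply, then read back the resulting
+        # Cookie header.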
+        pr = sanitized_Request(info_dict['url'])
+        self.cookiejar.add_cookie_header(pr)
+        return pr.get_header('Cookie')
+
+    def process_video_result(self, info_dict, download=True):
+        assert info_dict.get('_type', 'video') == 'video'
+
+        if 'id' not in info_dict:
+            raise ExtractorError('Missing "id" field in extractor result')
+        if 'title' not in info_dict:
+            raise ExtractorError('Missing "title" field in extractor result')
+
+        def report_force_conversion(field, field_not, conversion):
+            self.report_warning(
+                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
+                % (field, field_not, conversion))
+
+        def sanitize_string_field(info, string_field):
+            field = info.get(string_field)
+            if field is None or isinstance(field, compat_str):
+                return
+            report_force_conversion(string_field, 'a string', 'string')
+            info[string_field] = compat_str(field)
+
+        def sanitize_numeric_fields(info):
+            for numeric_field in self._NUMERIC_FIELDS:
+                field = info.get(numeric_field)
+                if field is None or isinstance(field, compat_numeric_types):
+                    continue
+                report_force_conversion(numeric_field, 'numeric', 'int')
+                info[numeric_field] = int_or_none(field)
+
+        sanitize_string_field(info_dict, 'id')
+        sanitize_numeric_fields(info_dict)
+
+        if 'playlist' not in info_dict:
+            # It isn't part of a playlist
+            info_dict['playlist'] = None
+            info_dict['playlist_index'] = None
+
+        thumbnails = info_dict.get('thumbnails')
+        if thumbnails is None:
+            thumbnail = info_dict.get('thumbnail')
+            if thumbnail:
+                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
+        if thumbnails:
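+            # Sort worst-to-best (explicit preference, then dimensions), with
+            # missing values treated as lowest, so thumbnails[-1] is the best.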
+            thumbnails.sort(key=lambda t: (
+                t.get('preference') if t.get('preference') is not None else -1,
+                t.get('width') if t.get('width') is not None else -1,
+                t.get('height') if t.get('height') is not None else -1,
+                t.get('id') if t.get('id') is not None else '', t.get('url')))
+            for i, t in enumerate(thumbnails):
+                t['url'] = sanitize_url(t['url'])
+                if t.get('width') and t.get('height'):
+                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
+                if t.get('id') is None:
+                    t['id'] = '%d' % i
+
+        if self.params.get('list_thumbnails'):
+            self.list_thumbnails(info_dict)
+            return
+
+        thumbnail = info_dict.get('thumbnail')
+        if thumbnail:
+            info_dict['thumbnail'] = sanitize_url(thumbnail)
+        elif thumbnails:
+            info_dict['thumbnail'] = thumbnails[-1]['url']
+
+        if 'display_id' not in info_dict and 'id' in info_dict:
+            info_dict['display_id'] = info_dict['id']
+
+        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
+            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+            # see http://bugs.python.org/issue1646728)
+            try:
+                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
+                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+            except (ValueError, OverflowError, OSError):
+                pass
+
+        # Auto-generate title fields corresponding to the *_number fields when
+        # missing in order to always have clean titles. This is very common for
+        # TV series.
+        for field in ('chapter', 'season', 'episode'):
+            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+
+        for cc_kind in ('subtitles', 'automatic_captions'):
+            cc = info_dict.get(cc_kind)
+            if cc:
+                for _, subtitle in cc.items():
+                    for subtitle_format in subtitle:
+                        if subtitle_format.get('url'):
+                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
+                        if subtitle_format.get('ext') is None:
+                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
+
+        automatic_captions = info_dict.get('automatic_captions')
+        subtitles = info_dict.get('subtitles')
+
+        if self.params.get('listsubtitles', False):
+            if 'automatic_captions' in info_dict:
+                self.list_subtitles(
+                    info_dict['id'], automatic_captions, 'automatic captions')
+            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
+            return
+
+        info_dict['requested_subtitles'] = self.process_subtitles(
+            info_dict['id'], subtitles, automatic_captions)
+
+        # We now pick which formats have to be downloaded
+        if info_dict.get('formats') is None:
+            # There's only one format available
+            formats = [info_dict]
+        else:
+            formats = info_dict['formats']
+
+        if not formats:
+            raise ExtractorError('No video formats found!')
+
+        def is_wellformed(f):
+            url = f.get('url')
+            if not url:
+                self.report_warning(
+                    '"url" field is missing or empty - skipping format, '
+                    'there is an error in extractor')
+                return False
+            if isinstance(url, bytes):
+                sanitize_string_field(f, 'url')
+            return True
+
+        # Filter out malformed formats for better extraction robustness
+        formats = list(filter(is_wellformed, formats))
+
+        formats_dict = {}
+
+        # We check that all the formats have the format and format_id fields
+        for i, format in enumerate(formats):
+            sanitize_string_field(format, 'format_id')
+            sanitize_numeric_fields(format)
+            format['url'] = sanitize_url(format['url'])
+            if not format.get('format_id'):
+                format['format_id'] = compat_str(i)
+            else:
+                # Sanitize format_id from characters used in format selector expression
+                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
+            format_id = format['format_id']
+            if format_id not in formats_dict:
+                formats_dict[format_id] = []
+            formats_dict[format_id].append(format)
+
+        # Make sure all formats have unique format_id
+        for format_id, ambiguous_formats in formats_dict.items():
+            if len(ambiguous_formats) > 1:
+                for i, format in enumerate(ambiguous_formats):
+                    format['format_id'] = '%s-%d' % (format_id, i)
+
+        for i, format in enumerate(formats):
+            if format.get('format') is None:
+                format['format'] = '{id} - {res}{note}'.format(
+                    id=format['format_id'],
+                    res=self.format_resolution(format),
+                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
+                )
+            # Automatically determine file extension if missing
+            if format.get('ext') is None:
+                format['ext'] = determine_ext(format['url']).lower()
+            # Automatically determine protocol if missing (useful for format
+            # selection purposes)
+            if format.get('protocol') is None:
+                format['protocol'] = determine_protocol(format)
+            # Add HTTP headers, so that external programs can use them from the
+            # json output
+            full_format_info = info_dict.copy()
+            full_format_info.update(format)
+            format['http_headers'] = self._calc_headers(full_format_info)
+        # Remove private housekeeping stuff
+        if '__x_forwarded_for_ip' in info_dict:
+            del info_dict['__x_forwarded_for_ip']
+
+        # TODO Central sorting goes here
+
+        if formats[0] is not info_dict:
+            # Only set the 'formats' field if the original info_dict lists
+            # formats separately; otherwise we would end up with a circular
+            # reference: the first (and only) element of the 'formats' field
+            # would be info_dict itself, which can't be exported to JSON.
+            info_dict['formats'] = formats
+        if self.params.get('listformats'):
+            self.list_formats(info_dict)
+            return
+
+        req_format = self.params.get('format')
+        if req_format is None:
+            req_format = self._default_format_spec(info_dict, download=download)
+            if self.params.get('verbose'):
+                self.to_stdout('[debug] Default format spec: %s' % req_format)
+
+        format_selector = self.build_format_selector(req_format)
+
+        # During format selection we may need access to the original set of
+        # formats in order to calculate metrics or do additional processing.
+        # For now we need to be able to guess whether the original formats
+        # provided by the extractor are incomplete (i.e. whether the extractor
+        # provides only video-only or audio-only formats) so that format
+        # selection works properly for such extractors (see
+        # https://github.com/ytdl-org/youtube-dl/pull/5556).
+        # Since formats may be filtered during format selection and may no
+        # longer match the original formats, the results could be incorrect.
+        # Thus the original formats, or metrics pre-calculated from them,
+        # should be passed to the format selection routines as well, so we pass
+        # a context object containing all the necessary additional data instead
+        # of just formats.
+        # This fixes an incorrect format selection issue (see
+        # https://github.com/ytdl-org/youtube-dl/issues/10083).
+        incomplete_formats = (
+            # All formats are video-only or
+            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
+            # all formats are audio-only
+            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
+
+        ctx = {
+            'formats': formats,
+            'incomplete_formats': incomplete_formats,
+        }
+
+        formats_to_download = list(format_selector(ctx))
+        if not formats_to_download:
+            raise ExtractorError('requested format not available',
+                                 expected=True)
+
+        if download:
+            if len(formats_to_download) > 1:
+                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
+            for format in formats_to_download:
+                new_info = dict(info_dict)
+                new_info.update(format)
+                self.process_info(new_info)
+        # We update the info dict with the best quality format (backwards compatibility)
+        info_dict.update(formats_to_download[-1])
+        return info_dict
+
+    def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
+        """Select the requested subtitles and their format"""
+        available_subs = {}
+        if normal_subtitles and self.params.get('writesubtitles'):
+            available_subs.update(normal_subtitles)
+        if automatic_captions and self.params.get('writeautomaticsub'):
+            for lang, cap_info in automatic_captions.items():
+                if lang not in available_subs:
+                    available_subs[lang] = cap_info
+
+        if ((not self.params.get('writesubtitles')
+                and not self.params.get('writeautomaticsub'))
+                or not available_subs):
+            return None
+
+        if self.params.get('allsubtitles', False):
+            requested_langs = available_subs.keys()
+        else:
+            if self.params.get('subtitleslangs', False):
+                requested_langs = self.params.get('subtitleslangs')
+            elif 'en' in available_subs:
+                requested_langs = ['en']
+            else:
+                requested_langs = [list(available_subs.keys())[0]]
+
+        formats_query = self.params.get('subtitlesformat', 'best')
+        formats_preference = formats_query.split('/') if formats_query else []
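+        # E.g. --sub-format "srt/best" gives ['srt', 'best']: try srt first,
+        # then fall back to the best available format.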
+        subs = {}
+        for lang in requested_langs:
+            formats = available_subs.get(lang)
+            if formats is None:
+                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
+                continue
+            for ext in formats_preference:
+                if ext == 'best':
+                    f = formats[-1]
+                    break
+                matches = list(filter(lambda f: f['ext'] == ext, formats))
+                if matches:
+                    f = matches[-1]
+                    break
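+            # for/else: the else branch runs only when no 'break' fired, i.e.
+            # none of the preferred formats matched.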
+            else:
+                f = formats[-1]
+                self.report_warning(
+                    'No subtitle format found matching "%s" for language %s, '
+                    'using %s' % (formats_query, lang, f['ext']))
+            subs[lang] = f
+        return subs
+
+    def __forced_printings(self, info_dict, filename, incomplete):
+        def print_mandatory(field):
+            if (self.params.get('force%s' % field, False)
+                    and (not incomplete or info_dict.get(field) is not None)):
+                self.to_stdout(info_dict[field])
+
+        def print_optional(field):
+            if (self.params.get('force%s' % field, False)
+                    and info_dict.get(field) is not None):
+                self.to_stdout(info_dict[field])
+
+        print_mandatory('title')
+        print_mandatory('id')
+        if self.params.get('forceurl', False) and not incomplete:
+            if info_dict.get('requested_formats') is not None:
+                for f in info_dict['requested_formats']:
+                    self.to_stdout(f['url'] + f.get('play_path', ''))
+            else:
+                # For RTMP URLs, also include the playpath
+                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
+        print_optional('thumbnail')
+        print_optional('description')
+        if self.params.get('forcefilename', False) and filename is not None:
+            self.to_stdout(filename)
+        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
+            self.to_stdout(formatSeconds(info_dict['duration']))
+        print_mandatory('format')
+        if self.params.get('forcejson', False):
+            self.to_stdout(json.dumps(info_dict))
+
+    def process_info(self, info_dict):
+        """Process a single resolved IE result."""
+
+        assert info_dict.get('_type', 'video') == 'video'
+
+        max_downloads = self.params.get('max_downloads')
+        if max_downloads is not None:
+            if self._num_downloads >= int(max_downloads):
+                raise MaxDownloadsReached()
+
+        # TODO: backward compatibility, to be removed
+        info_dict['fulltitle'] = info_dict['title']
+
+        if 'format' not in info_dict:
+            info_dict['format'] = info_dict['ext']
+
+        reason = self._match_entry(info_dict, incomplete=False)
+        if reason is not None:
+            self.to_screen('[download] ' + reason)
+            return
+
+        self._num_downloads += 1
+
+        info_dict['_filename'] = filename = self.prepare_filename(info_dict)
+
+        # Forced printings
+        self.__forced_printings(info_dict, filename, incomplete=False)
+
+        # Do nothing else if in simulate mode
+        if self.params.get('simulate', False):
+            return
+
+        if filename is None:
+            return
+
+        def ensure_dir_exists(path):
+            try:
+                dn = os.path.dirname(path)
+                if dn and not os.path.exists(dn):
+                    os.makedirs(dn)
+                return True
+            except (OSError, IOError) as err:
+                self.report_error('unable to create directory ' + error_to_compat_str(err))
+                return False
+
+        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
+            return
+
+        if self.params.get('writedescription', False):
+            descfn = replace_extension(filename, 'description', info_dict.get('ext'))
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
+                self.to_screen('[info] Video description is already present')
+            elif info_dict.get('description') is None:
+                self.report_warning('There\'s no description to write.')
+            else:
+                try:
+                    self.to_screen('[info] Writing video description to: ' + descfn)
+                    with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
+                        descfile.write(info_dict['description'])
+                except (OSError, IOError):
+                    self.report_error('Cannot write description file ' + descfn)
+                    return
+
+        if self.params.get('writeannotations', False):
+            annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
+                self.to_screen('[info] Video annotations are already present')
+            elif not info_dict.get('annotations'):
+                self.report_warning('There are no annotations to write.')
+            else:
+                try:
+                    self.to_screen('[info] Writing video annotations to: ' + annofn)
+                    with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+                        annofile.write(info_dict['annotations'])
+                except (KeyError, TypeError):
+                    self.report_warning('There are no annotations to write.')
+                except (OSError, IOError):
+                    self.report_error('Cannot write annotations file: ' + annofn)
+                    return
+
+        def dl(name, info):
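+            # Pick a suitable downloader (HTTP, HLS, DASH, external, ...) for
+            # this format and attach the configured progress hooks before
+            # starting the actual download.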
+            fd = get_suitable_downloader(info, self.params)(self, self.params)
+            for ph in self._progress_hooks:
+                fd.add_progress_hook(ph)
+            if self.params.get('verbose'):
+                self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
+            return fd.download(name, info)
+
+        subtitles_are_requested = any([self.params.get('writesubtitles', False),
+                                       self.params.get('writeautomaticsub')])
+
+        if subtitles_are_requested and info_dict.get('requested_subtitles'):
+            # Subtitle download errors are already handled as non-fatal in the
+            # relevant IE, so processing silently continues when the IE does
+            # not support subtitles.
+            subtitles = info_dict['requested_subtitles']
+            for sub_lang, sub_info in subtitles.items():
+                sub_format = sub_info['ext']
+                sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
+                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
+                else:
+                    if sub_info.get('data') is not None:
+                        try:
+                            # Use newline='' to prevent conversion of newline characters
+                            # See https://github.com/ytdl-org/youtube-dl/issues/10268
+                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
+                                subfile.write(sub_info['data'])
+                        except (OSError, IOError):
+                            self.report_error('Cannot write subtitles file ' + sub_filename)
+                            return
+                    else:
+                        try:
+                            dl(sub_filename, sub_info)
+                        except (ExtractorError, IOError, OSError, ValueError,
+                                compat_urllib_error.URLError,
+                                compat_http_client.HTTPException,
+                                socket.error) as err:
+                            self.report_warning('Unable to download subtitle for "%s": %s' %
+                                                (sub_lang, error_to_compat_str(err)))
+                            continue
+
+        if self.params.get('writeinfojson', False):
+            infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
+                self.to_screen('[info] Video description metadata is already present')
+            else:
+                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
+                try:
+                    write_json_file(self.filter_requested_info(info_dict), infofn)
+                except (OSError, IOError):
+                    self.report_error('Cannot write metadata to JSON file ' + infofn)
+                    return
+
+        self._write_thumbnails(info_dict, filename)
+
+        if not self.params.get('skip_download', False):
+            try:
+                if info_dict.get('requested_formats') is not None:
+                    downloaded = []
+                    success = True
+                    merger = FFmpegMergerPP(self)
+                    if not merger.available:
+                        postprocessors = []
+                        self.report_warning('You have requested multiple '
+                                            'formats but ffmpeg or avconv are not installed.'
+                                            ' The formats won\'t be merged.')
+                    else:
+                        postprocessors = [merger]
+
+                    def compatible_formats(formats):
+                        video, audio = formats
+                        # Check extension
+                        video_ext, audio_ext = video.get('ext'), audio.get('ext')
+                        if video_ext and audio_ext:
+                            COMPATIBLE_EXTS = (
+                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
+                                # Trailing comma required: ('webm') would be just
+                                # the string 'webm', and `ext in 'webm'` does a
+                                # substring check rather than an exact match.
+                                ('webm',),
+                            )
+                            for exts in COMPATIBLE_EXTS:
+                                if video_ext in exts and audio_ext in exts:
+                                    return True
+                        # TODO: Check acodec/vcodec
+                        return False
+
+                    filename_real_ext = os.path.splitext(filename)[1][1:]
+                    filename_wo_ext = (
+                        os.path.splitext(filename)[0]
+                        if filename_real_ext == info_dict['ext']
+                        else filename)
+                    requested_formats = info_dict['requested_formats']
+                    if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
+                        info_dict['ext'] = 'mkv'
+                        self.report_warning(
+                            'Requested formats are incompatible for merge and will be merged into mkv.')
+                    # Ensure filename always has a correct extension for successful merge
+                    filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
+                    if os.path.exists(encodeFilename(filename)):
+                        self.to_screen(
+                            '[download] %s has already been downloaded and '
+                            'merged' % filename)
+                    else:
+                        for f in requested_formats:
+                            new_info = dict(info_dict)
+                            new_info.update(f)
+                            fname = prepend_extension(
+                                self.prepare_filename(new_info),
+                                'f%s' % f['format_id'], new_info['ext'])
+                            if not ensure_dir_exists(fname):
+                                return
+                            downloaded.append(fname)
+                            partial_success = dl(fname, new_info)
+                            success = success and partial_success
+                        info_dict['__postprocessors'] = postprocessors
+                        info_dict['__files_to_merge'] = downloaded
+                else:
+                    # Just a single file
+                    success = dl(filename, info_dict)
+            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                self.report_error('unable to download video data: %s' % error_to_compat_str(err))
+                return
+            except (OSError, IOError) as err:
+                raise UnavailableVideoError(err)
+            except (ContentTooShortError, ) as err:
+                self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+                return
+
+            if success and filename != '-':
+                # Fixup content
+                fixup_policy = self.params.get('fixup')
+                if fixup_policy is None:
+                    fixup_policy = 'detect_or_warn'
+
+                INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
+
+                stretched_ratio = info_dict.get('stretched_ratio')
+                if stretched_ratio is not None and stretched_ratio != 1:
+                    if fixup_policy == 'warn':
+                        self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
+                            info_dict['id'], stretched_ratio))
+                    elif fixup_policy == 'detect_or_warn':
+                        stretched_pp = FFmpegFixupStretchedPP(self)
+                        if stretched_pp.available:
+                            info_dict.setdefault('__postprocessors', [])
+                            info_dict['__postprocessors'].append(stretched_pp)
+                        else:
+                            self.report_warning(
+                                '%s: Non-uniform pixel ratio (%s). %s'
+                                % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
+                    else:
+                        assert fixup_policy in ('ignore', 'never')
+
+                if (info_dict.get('requested_formats') is None
+                        and info_dict.get('container') == 'm4a_dash'):
+                    if fixup_policy == 'warn':
+                        self.report_warning(
+                            '%s: writing DASH m4a. '
+                            'Only some players support this container.'
+                            % info_dict['id'])
+                    elif fixup_policy == 'detect_or_warn':
+                        fixup_pp = FFmpegFixupM4aPP(self)
+                        if fixup_pp.available:
+                            info_dict.setdefault('__postprocessors', [])
+                            info_dict['__postprocessors'].append(fixup_pp)
+                        else:
+                            self.report_warning(
+                                '%s: writing DASH m4a. '
+                                'Only some players support this container. %s'
+                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
+                    else:
+                        assert fixup_policy in ('ignore', 'never')
+
+                if (info_dict.get('protocol') == 'm3u8_native'
+                        or (info_dict.get('protocol') == 'm3u8'
+                            and self.params.get('hls_prefer_native'))):
+                    if fixup_policy == 'warn':
+                        self.report_warning('%s: malformed AAC bitstream detected.' % (
+                            info_dict['id']))
+                    elif fixup_policy == 'detect_or_warn':
+                        fixup_pp = FFmpegFixupM3u8PP(self)
+                        if fixup_pp.available:
+                            info_dict.setdefault('__postprocessors', [])
+                            info_dict['__postprocessors'].append(fixup_pp)
+                        else:
+                            self.report_warning(
+                                '%s: malformed AAC bitstream detected. %s'
+                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
+                    else:
+                        assert fixup_policy in ('ignore', 'never')
+
+                try:
+                    self.post_process(filename, info_dict)
+                except (PostProcessingError) as err:
+                    self.report_error('postprocessing: %s' % str(err))
+                    return
+                self.record_download_archive(info_dict)
+
+    def download(self, url_list):
+        """Download a given list of URLs."""
+        outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
+        if (len(url_list) > 1
+                and outtmpl != '-'
+                and '%' not in outtmpl
+                and self.params.get('max_downloads') != 1):
+            raise SameFileError(outtmpl)
+
+        for url in url_list:
+            try:
+                # extract_info also downloads the videos
+                res = self.extract_info(
+                    url, force_generic_extractor=self.params.get('force_generic_extractor', False))
+            except UnavailableVideoError:
+                self.report_error('unable to download video')
+            except MaxDownloadsReached:
+                self.to_screen('[info] Maximum number of downloaded files reached.')
+                raise
+            else:
+                if self.params.get('dump_single_json', False):
+                    self.to_stdout(json.dumps(res))
+
+        return self._download_retcode
+
+    def download_with_info_file(self, info_filename):
+        with contextlib.closing(fileinput.FileInput(
+                [info_filename], mode='r',
+                openhook=fileinput.hook_encoded('utf-8'))) as f:
+            # FileInput doesn't have a read method, so we can't call json.load
+            info = self.filter_requested_info(json.loads('\n'.join(f)))
+        try:
+            self.process_ie_result(info, download=True)
+        except DownloadError:
+            webpage_url = info.get('webpage_url')
+            if webpage_url is not None:
+                self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
+                return self.download([webpage_url])
+            else:
+                raise
+        return self._download_retcode
+
+    @staticmethod
+    def filter_requested_info(info_dict):
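+        # Strip format/subtitle selection results; this is applied both before
+        # writing info JSON and after reloading it.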
+        return dict(
+            (k, v) for k, v in info_dict.items()
+            if k not in ['requested_formats', 'requested_subtitles'])
+
+    def post_process(self, filename, ie_info):
+        """Run all the postprocessors on the given file."""
+        info = dict(ie_info)
+        info['filepath'] = filename
+        pps_chain = []
+        if ie_info.get('__postprocessors') is not None:
+            pps_chain.extend(ie_info['__postprocessors'])
+        pps_chain.extend(self._pps)
+        for pp in pps_chain:
+            files_to_delete = []
+            try:
+                files_to_delete, info = pp.run(info)
+            except PostProcessingError as e:
+                self.report_error(e.msg)
+            if files_to_delete and not self.params.get('keepvideo', False):
+                for old_filename in files_to_delete:
+                    self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
+                    try:
+                        os.remove(encodeFilename(old_filename))
+                    except (IOError, OSError):
+                        self.report_warning('Unable to remove downloaded original file')
+
+    def _make_archive_id(self, info_dict):
+        video_id = info_dict.get('id')
+        if not video_id:
+            return
+        # The extractor key is lowercased below to future-proof against any
+        # change in case and for backwards compatibility with prior versions
+        extractor = info_dict.get('extractor_key') or info_dict.get('ie_key')  # key in a playlist
+        if extractor is None:
+            url = str_or_none(info_dict.get('url'))
+            if not url:
+                return
+            # Try to find matching extractor for the URL and take its ie_key
+            for ie in self._ies:
+                if ie.suitable(url):
+                    extractor = ie.ie_key()
+                    break
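+            # for/else: executed only when no matching extractor was found.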
+            else:
+                return
+        return extractor.lower() + ' ' + video_id
+
+    def in_download_archive(self, info_dict):
+        fn = self.params.get('download_archive')
+        if fn is None:
+            return False
+
+        vid_id = self._make_archive_id(info_dict)
+        if not vid_id:
+            return False  # Incomplete video information
+
+        try:
+            with locked_file(fn, 'r', encoding='utf-8') as archive_file:
+                for line in archive_file:
+                    if line.strip() == vid_id:
+                        return True
+        except IOError as ioe:
+            if ioe.errno != errno.ENOENT:
+                raise
+        return False
+
+    def record_download_archive(self, info_dict):
+        fn = self.params.get('download_archive')
+        if fn is None:
+            return
+        vid_id = self._make_archive_id(info_dict)
+        assert vid_id
+        with locked_file(fn, 'a', encoding='utf-8') as archive_file:
+            archive_file.write(vid_id + '\n')
+
+    @staticmethod
+    def format_resolution(format, default='unknown'):
+        if format.get('vcodec') == 'none':
+            return 'audio only'
+        if format.get('resolution') is not None:
+            return format['resolution']
+        if format.get('height') is not None:
+            if format.get('width') is not None:
+                res = '%sx%s' % (format['width'], format['height'])
+            else:
+                res = '%sp' % format['height']
+        elif format.get('width') is not None:
+            res = '%dx?' % format['width']
+        else:
+            res = default
+        return res
+
+    def _format_note(self, fdict):
+        res = ''
+        if fdict.get('ext') in ['f4f', 'f4m']:
+            res += '(unsupported) '
+        if fdict.get('language'):
+            if res:
+                res += ' '
+            res += '[%s] ' % fdict['language']
+        if fdict.get('format_note') is not None:
+            res += fdict['format_note'] + ' '
+        if fdict.get('tbr') is not None:
+            res += '%4dk ' % fdict['tbr']
+        if fdict.get('container') is not None:
+            if res:
+                res += ', '
+            res += '%s container' % fdict['container']
+        if (fdict.get('vcodec') is not None
+                and fdict.get('vcodec') != 'none'):
+            if res:
+                res += ', '
+            res += fdict['vcodec']
+            if fdict.get('vbr') is not None:
+                res += '@'
+        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
+            res += 'video@'
+        if fdict.get('vbr') is not None:
+            res += '%4dk' % fdict['vbr']
+        if fdict.get('fps') is not None:
+            if res:
+                res += ', '
+            res += '%sfps' % fdict['fps']
+        if fdict.get('acodec') is not None:
+            if res:
+                res += ', '
+            if fdict['acodec'] == 'none':
+                res += 'video only'
+            else:
+                res += '%-5s' % fdict['acodec']
+        elif fdict.get('abr') is not None:
+            if res:
+                res += ', '
+            res += 'audio'
+        if fdict.get('abr') is not None:
+            res += '@%3dk' % fdict['abr']
+        if fdict.get('asr') is not None:
+            res += ' (%5dHz)' % fdict['asr']
+        if fdict.get('filesize') is not None:
+            if res:
+                res += ', '
+            res += format_bytes(fdict['filesize'])
+        elif fdict.get('filesize_approx') is not None:
+            if res:
+                res += ', '
+            res += '~' + format_bytes(fdict['filesize_approx'])
+        return res
+
+    def list_formats(self, info_dict):
+        formats = info_dict.get('formats', [info_dict])
+        table = [
+            [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
+            for f in formats
+            if f.get('preference') is None or f['preference'] >= -1000]
+        if len(formats) > 1:
+            table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
+
+        header_line = ['format code', 'extension', 'resolution', 'note']
+        self.to_screen(
+            '[info] Available formats for %s:\n%s' %
+            (info_dict['id'], render_table(header_line, table)))
+
+    def list_thumbnails(self, info_dict):
+        thumbnails = info_dict.get('thumbnails')
+        if not thumbnails:
+            self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
+            return
+
+        self.to_screen(
+            '[info] Thumbnails for %s:' % info_dict['id'])
+        self.to_screen(render_table(
+            ['ID', 'width', 'height', 'URL'],
+            [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
+
+    def list_subtitles(self, video_id, subtitles, name='subtitles'):
+        if not subtitles:
+            self.to_screen('%s has no %s' % (video_id, name))
+            return
+        self.to_screen(
+            'Available %s for %s:' % (name, video_id))
+        self.to_screen(render_table(
+            ['Language', 'formats'],
+            [[lang, ', '.join(f['ext'] for f in reversed(formats))]
+                for lang, formats in subtitles.items()]))
+
+    def urlopen(self, req):
+        """ Start an HTTP download """
+        if isinstance(req, compat_basestring):
+            req = sanitized_Request(req)
+        return self._opener.open(req, timeout=self._socket_timeout)
+
+    def print_debug_header(self):
+        if not self.params.get('verbose'):
+            return
+
+        if type('') is not compat_str:
+            # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
+            self.report_warning(
+                'Your Python is broken! Update to a newer and supported version')
+
+        stdout_encoding = getattr(
+            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
+        encoding_str = (
+            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
+                locale.getpreferredencoding(),
+                sys.getfilesystemencoding(),
+                stdout_encoding,
+                self.get_encoding()))
+        write_string(encoding_str, encoding=None)
+
+        self._write_string('[debug] youtube-dlc version ' + __version__ + '\n')
+        if _LAZY_LOADER:
+            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
+        try:
+            sp = subprocess.Popen(
+                ['git', 'rev-parse', '--short', 'HEAD'],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cwd=os.path.dirname(os.path.abspath(__file__)))
+            out, err = sp.communicate()
+            out = out.decode().strip()
+            if re.match('[0-9a-f]+', out):
+                self._write_string('[debug] Git HEAD: ' + out + '\n')
+        except Exception:
+            try:
+                sys.exc_clear()
+            except Exception:
+                pass
+
+        def python_implementation():
+            impl_name = platform.python_implementation()
+            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
+                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
+            return impl_name
+
+        self._write_string('[debug] Python version %s (%s) - %s\n' % (
+            platform.python_version(), python_implementation(),
+            platform_name()))
+
+        exe_versions = FFmpegPostProcessor.get_versions(self)
+        exe_versions['rtmpdump'] = rtmpdump_version()
+        exe_versions['phantomjs'] = PhantomJSwrapper._version()
+        exe_str = ', '.join(
+            '%s %s' % (exe, v)
+            for exe, v in sorted(exe_versions.items())
+            if v
+        )
+        if not exe_str:
+            exe_str = 'none'
+        self._write_string('[debug] exe versions: %s\n' % exe_str)
+
+        proxy_map = {}
+        for handler in self._opener.handlers:
+            if hasattr(handler, 'proxies'):
+                proxy_map.update(handler.proxies)
+        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
+
+        if self.params.get('call_home', False):
+            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
+            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
+            latest_version = self.urlopen(
+                'https://yt-dl.org/latest/version').read().decode('utf-8')
+            if version_tuple(latest_version) > version_tuple(__version__):
+                self.report_warning(
+                    'You are using an outdated version (newest version: %s)! '
+                    'See https://yt-dl.org/update if you need help updating.' %
+                    latest_version)
+
+    def _setup_opener(self):
+        timeout_val = self.params.get('socket_timeout')
+        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
+
+        opts_cookiefile = self.params.get('cookiefile')
+        opts_proxy = self.params.get('proxy')
+
+        if opts_cookiefile is None:
+            self.cookiejar = compat_cookiejar.CookieJar()
+        else:
+            opts_cookiefile = expand_path(opts_cookiefile)
+            self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
+            if os.access(opts_cookiefile, os.R_OK):
+                self.cookiejar.load(ignore_discard=True, ignore_expires=True)
+
+        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
+        if opts_proxy is not None:
+            if opts_proxy == '':
+                proxies = {}
+            else:
+                proxies = {'http': opts_proxy, 'https': opts_proxy}
+        else:
+            proxies = compat_urllib_request.getproxies()
+            # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
+            if 'http' in proxies and 'https' not in proxies:
+                proxies['https'] = proxies['http']
+        proxy_handler = PerRequestProxyHandler(proxies)
+
+        debuglevel = 1 if self.params.get('debug_printtraffic') else 0
+        https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
+        ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
+        redirect_handler = YoutubeDLRedirectHandler()
+        data_handler = compat_urllib_request_DataHandler()
+
+        # When passing our own FileHandler instance, build_opener won't add the
+        # default FileHandler, which allows us to disable the file protocol; it
+        # could otherwise be used for malicious purposes (see
+        # https://github.com/ytdl-org/youtube-dl/issues/8227)
+        file_handler = compat_urllib_request.FileHandler()
+
+        def file_open(*args, **kwargs):
+            raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dlc for security reasons')
+        file_handler.file_open = file_open
+
+        opener = compat_urllib_request.build_opener(
+            proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
+
+        # Delete the default user-agent header, which would otherwise apply in
+        # cases where our custom HTTP handler doesn't come into play
+        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
+        opener.addheaders = []
+        self._opener = opener
+
+    def encode(self, s):
+        if isinstance(s, bytes):
+            return s  # Already encoded
+
+        try:
+            return s.encode(self.get_encoding())
+        except UnicodeEncodeError as err:
+            err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
+            raise
+
+    def get_encoding(self):
+        encoding = self.params.get('encoding')
+        if encoding is None:
+            encoding = preferredencoding()
+        return encoding
+
+    def _write_thumbnails(self, info_dict, filename):
+        if self.params.get('writethumbnail', False):
+            thumbnails = info_dict.get('thumbnails')
+            if thumbnails:
+                thumbnails = [thumbnails[-1]]
+        elif self.params.get('write_all_thumbnails', False):
+            thumbnails = info_dict.get('thumbnails')
+        else:
+            return
+
+        if not thumbnails:
+            # No thumbnails present, so return immediately
+            return
+
+        for t in thumbnails:
+            thumb_ext = determine_ext(t['url'], 'jpg')
+            suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
+            thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
+            t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
+                self.to_screen('[%s] %s: Thumbnail %sis already present' %
+                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
+            else:
+                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
+                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
+                try:
+                    uf = self.urlopen(t['url'])
+                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
+                        shutil.copyfileobj(uf, thumbf)
+                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
+                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
+                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                    self.report_warning('Unable to download thumbnail "%s": %s' %
+                                        (t['url'], error_to_compat_str(err)))
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
new file mode 100644 (file)
index 0000000..a663417
--- /dev/null
@@ -0,0 +1,483 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+__license__ = 'Public Domain'
+
+import codecs
+import io
+import os
+import random
+import sys
+
+
+from .options import (
+    parseOpts,
+)
+from .compat import (
+    compat_getpass,
+    compat_shlex_split,
+    workaround_optparse_bug9161,
+)
+from .utils import (
+    DateRange,
+    decodeOption,
+    DEFAULT_OUTTMPL,
+    DownloadError,
+    expand_path,
+    match_filter_func,
+    MaxDownloadsReached,
+    preferredencoding,
+    read_batch_urls,
+    SameFileError,
+    setproctitle,
+    std_headers,
+    write_string,
+    render_table,
+)
+from .update import update_self
+from .downloader import (
+    FileDownloader,
+)
+from .extractor import gen_extractors, list_extractors
+from .extractor.adobepass import MSO_INFO
+from .YoutubeDL import YoutubeDL
+
+
+def _real_main(argv=None):
+    # Compatibility fixes for Windows
+    if sys.platform == 'win32':
+        # https://github.com/ytdl-org/youtube-dl/issues/820
+        codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)
+
+    workaround_optparse_bug9161()
+
+    setproctitle('youtube-dlc')
+
+    parser, opts, args = parseOpts(argv)
+
+    # Set user agent
+    if opts.user_agent is not None:
+        std_headers['User-Agent'] = opts.user_agent
+
+    # Set referer
+    if opts.referer is not None:
+        std_headers['Referer'] = opts.referer
+
+    # Custom HTTP headers
+    if opts.headers is not None:
+        for h in opts.headers:
+            if ':' not in h:
+                parser.error('wrong header formatting, it should be key:value, not "%s"' % h)
+            key, value = h.split(':', 1)
+            if opts.verbose:
+                write_string('[debug] Adding header from command line option %s:%s\n' % (key, value))
+            std_headers[key] = value
+
+    # Dump user agent
+    if opts.dump_user_agent:
+        write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
+        sys.exit(0)
+
+    # Batch file verification
+    batch_urls = []
+    if opts.batchfile is not None:
+        try:
+            if opts.batchfile == '-':
+                batchfd = sys.stdin
+            else:
+                batchfd = io.open(
+                    expand_path(opts.batchfile),
+                    'r', encoding='utf-8', errors='ignore')
+            batch_urls = read_batch_urls(batchfd)
+            if opts.verbose:
+                write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
+        except IOError:
+            sys.exit('ERROR: batch file %s could not be read' % opts.batchfile)
+    all_urls = batch_urls + [url.strip() for url in args]  # batch_urls are already stripped in read_batch_urls
+    _enc = preferredencoding()
+    all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
+
+    if opts.list_extractors:
+        for ie in list_extractors(opts.age_limit):
+            write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)
+            matchedUrls = [url for url in all_urls if ie.suitable(url)]
+            for mu in matchedUrls:
+                write_string('  ' + mu + '\n', out=sys.stdout)
+        sys.exit(0)
+    if opts.list_extractor_descriptions:
+        for ie in list_extractors(opts.age_limit):
+            if not ie._WORKING:
+                continue
+            desc = getattr(ie, 'IE_DESC', ie.IE_NAME)
+            if desc is False:
+                continue
+            if hasattr(ie, 'SEARCH_KEY'):
+                _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
+                _COUNTS = ('', '5', '10', 'all')
+                desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
+            write_string(desc + '\n', out=sys.stdout)
+        sys.exit(0)
+    if opts.ap_list_mso:
+        table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()]
+        write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout)
+        sys.exit(0)
+
+    # Conflicting, missing and erroneous options
+    if opts.usenetrc and (opts.username is not None or opts.password is not None):
+        parser.error('using .netrc conflicts with giving username/password')
+    if opts.password is not None and opts.username is None:
+        parser.error('account username missing\n')
+    if opts.ap_password is not None and opts.ap_username is None:
+        parser.error('TV Provider account username missing\n')
+    if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid):
+        parser.error('using output template conflicts with using title, video ID or auto number')
+    if opts.autonumber_size is not None:
+        if opts.autonumber_size <= 0:
+            parser.error('auto number size must be positive')
+    if opts.autonumber_start is not None:
+        if opts.autonumber_start < 0:
+            parser.error('auto number start must be positive or 0')
+    if opts.usetitle and opts.useid:
+        parser.error('using title conflicts with using video ID')
+    if opts.username is not None and opts.password is None:
+        opts.password = compat_getpass('Type account password and press [Return]: ')
+    if opts.ap_username is not None and opts.ap_password is None:
+        opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ')
+    if opts.ratelimit is not None:
+        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
+        if numeric_limit is None:
+            parser.error('invalid rate limit specified')
+        opts.ratelimit = numeric_limit
+    if opts.min_filesize is not None:
+        numeric_limit = FileDownloader.parse_bytes(opts.min_filesize)
+        if numeric_limit is None:
+            parser.error('invalid min_filesize specified')
+        opts.min_filesize = numeric_limit
+    if opts.max_filesize is not None:
+        numeric_limit = FileDownloader.parse_bytes(opts.max_filesize)
+        if numeric_limit is None:
+            parser.error('invalid max_filesize specified')
+        opts.max_filesize = numeric_limit
+    if opts.sleep_interval is not None:
+        if opts.sleep_interval < 0:
+            parser.error('sleep interval must be positive or 0')
+    if opts.max_sleep_interval is not None:
+        if opts.max_sleep_interval < 0:
+            parser.error('max sleep interval must be positive or 0')
+        if opts.sleep_interval is None:
+            parser.error('min sleep interval must be specified, use --min-sleep-interval')
+        if opts.max_sleep_interval < opts.sleep_interval:
+            parser.error('max sleep interval must be greater than or equal to min sleep interval')
+    else:
+        opts.max_sleep_interval = opts.sleep_interval
+    if opts.ap_mso and opts.ap_mso not in MSO_INFO:
+        parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers')
+
+    def parse_retries(retries):
+        if retries in ('inf', 'infinite'):
+            parsed_retries = float('inf')
+        else:
+            try:
+                parsed_retries = int(retries)
+            except (TypeError, ValueError):
+                parser.error('invalid retry count specified')
+        return parsed_retries
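+    # For example, parse_retries('3') returns 3 and parse_retries('infinite')
+    # returns float('inf'); any other non-integer value aborts via parser.error.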
+    if opts.retries is not None:
+        opts.retries = parse_retries(opts.retries)
+    if opts.fragment_retries is not None:
+        opts.fragment_retries = parse_retries(opts.fragment_retries)
+    if opts.buffersize is not None:
+        numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
+        if numeric_buffersize is None:
+            parser.error('invalid buffer size specified')
+        opts.buffersize = numeric_buffersize
+    if opts.http_chunk_size is not None:
+        numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size)
+        if not numeric_chunksize:
+            parser.error('invalid http chunk size specified')
+        opts.http_chunk_size = numeric_chunksize
+    if opts.playliststart <= 0:
+        raise ValueError('Playlist start must be positive')
+    if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart:
+        raise ValueError('Playlist end must be greater than playlist start')
+    if opts.extractaudio:
+        if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
+            parser.error('invalid audio format specified')
+    if opts.audioquality:
+        opts.audioquality = opts.audioquality.strip('k').strip('K')
+        if not opts.audioquality.isdigit():
+            parser.error('invalid audio quality specified')
+    if opts.recodevideo is not None:
+        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']:
+            parser.error('invalid video recode format specified')
+    if opts.convertsubtitles is not None:
+        if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']:
+            parser.error('invalid subtitle format specified')
+
+    if opts.date is not None:
+        date = DateRange.day(opts.date)
+    else:
+        date = DateRange(opts.dateafter, opts.datebefore)
+
+    # When only audio is wanted and no format was given, prefer audio-only
+    # formats so the video is not downloaded unnecessarily
+    if opts.extractaudio and not opts.keepvideo and opts.format is None:
+        opts.format = 'bestaudio/best'
+
+    # --all-sub automatically sets --write-sub if --write-auto-sub is not given;
+    # this was the old behaviour when only --all-sub was given.
+    if opts.allsubtitles and not opts.writeautomaticsub:
+        opts.writesubtitles = True
+
+    outtmpl = ((opts.outtmpl is not None and opts.outtmpl)
+               or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s')
+               or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s')
+               or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s')
+               or (opts.usetitle and '%(title)s-%(id)s.%(ext)s')
+               or (opts.useid and '%(id)s.%(ext)s')
+               or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s')
+               or DEFAULT_OUTTMPL)
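+    # Illustration of the fallback chain above: with --title and --auto-number
+    # and no -o option, outtmpl resolves to '%(autonumber)s-%(title)s-%(id)s.%(ext)s';
+    # with none of the relevant flags it falls back to DEFAULT_OUTTMPL.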
+    if not os.path.splitext(outtmpl)[1] and opts.extractaudio:
+        parser.error('Cannot download a video and extract audio into the same'
+                     ' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
+                     ' template'.format(outtmpl))
+
+    any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
+    any_printing = opts.print_json
+    download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
+
+    # PostProcessors
+    postprocessors = []
+    if opts.metafromtitle:
+        postprocessors.append({
+            'key': 'MetadataFromTitle',
+            'titleformat': opts.metafromtitle
+        })
+    if opts.extractaudio:
+        postprocessors.append({
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': opts.audioformat,
+            'preferredquality': opts.audioquality,
+            'nopostoverwrites': opts.nopostoverwrites,
+        })
+    if opts.recodevideo:
+        postprocessors.append({
+            'key': 'FFmpegVideoConvertor',
+            'preferedformat': opts.recodevideo,
+        })
+    # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and
+    # FFmpegExtractAudioPP as containers before conversion may not support
+    # metadata (3gp, webm, etc.)
+    # And this post-processor should be placed before other metadata
+    # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of
+    # extra metadata. By default ffmpeg preserves metadata applicable for both
+    # source and target containers. From this point the container won't change,
+    # so metadata can be added here.
+    if opts.addmetadata:
+        postprocessors.append({'key': 'FFmpegMetadata'})
+    if opts.convertsubtitles:
+        postprocessors.append({
+            'key': 'FFmpegSubtitlesConvertor',
+            'format': opts.convertsubtitles,
+        })
+    if opts.embedsubtitles:
+        postprocessors.append({
+            'key': 'FFmpegEmbedSubtitle',
+        })
+    if opts.embedthumbnail:
+        already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails
+        postprocessors.append({
+            'key': 'EmbedThumbnail',
+            'already_have_thumbnail': already_have_thumbnail
+        })
+        if not already_have_thumbnail:
+            opts.writethumbnail = True
+    # XAttrMetadataPP should be run after post-processors that may change file
+    # contents
+    if opts.xattrs:
+        postprocessors.append({'key': 'XAttrMetadata'})
+    # Please keep ExecAfterDownload towards the bottom: it lets the user modify
+    # the final file in any way, so a post-processor placed after it could find
+    # the file changed or even removed before it runs.
+    if opts.exec_cmd:
+        postprocessors.append({
+            'key': 'ExecAfterDownload',
+            'exec_cmd': opts.exec_cmd,
+        })
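+    # For example, invoking with -x --add-metadata --embed-subs builds the list
+    # [FFmpegExtractAudio, FFmpegMetadata, FFmpegEmbedSubtitle], which satisfies
+    # the ordering constraints noted in the comments above.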
+    external_downloader_args = None
+    if opts.external_downloader_args:
+        external_downloader_args = compat_shlex_split(opts.external_downloader_args)
+    postprocessor_args = None
+    if opts.postprocessor_args:
+        postprocessor_args = compat_shlex_split(opts.postprocessor_args)
+    match_filter = (
+        None if opts.match_filter is None
+        else match_filter_func(opts.match_filter))
+
+    ydl_opts = {
+        'usenetrc': opts.usenetrc,
+        'username': opts.username,
+        'password': opts.password,
+        'twofactor': opts.twofactor,
+        'videopassword': opts.videopassword,
+        'ap_mso': opts.ap_mso,
+        'ap_username': opts.ap_username,
+        'ap_password': opts.ap_password,
+        'quiet': (opts.quiet or any_getting or any_printing),
+        'no_warnings': opts.no_warnings,
+        'forceurl': opts.geturl,
+        'forcetitle': opts.gettitle,
+        'forceid': opts.getid,
+        'forcethumbnail': opts.getthumbnail,
+        'forcedescription': opts.getdescription,
+        'forceduration': opts.getduration,
+        'forcefilename': opts.getfilename,
+        'forceformat': opts.getformat,
+        'forcejson': opts.dumpjson or opts.print_json,
+        'dump_single_json': opts.dump_single_json,
+        'simulate': opts.simulate or any_getting,
+        'skip_download': opts.skip_download,
+        'format': opts.format,
+        'listformats': opts.listformats,
+        'outtmpl': outtmpl,
+        'autonumber_size': opts.autonumber_size,
+        'autonumber_start': opts.autonumber_start,
+        'restrictfilenames': opts.restrictfilenames,
+        'ignoreerrors': opts.ignoreerrors,
+        'force_generic_extractor': opts.force_generic_extractor,
+        'ratelimit': opts.ratelimit,
+        'nooverwrites': opts.nooverwrites,
+        'retries': opts.retries,
+        'fragment_retries': opts.fragment_retries,
+        'skip_unavailable_fragments': opts.skip_unavailable_fragments,
+        'keep_fragments': opts.keep_fragments,
+        'buffersize': opts.buffersize,
+        'noresizebuffer': opts.noresizebuffer,
+        'http_chunk_size': opts.http_chunk_size,
+        'continuedl': opts.continue_dl,
+        'noprogress': opts.noprogress,
+        'progress_with_newline': opts.progress_with_newline,
+        'playliststart': opts.playliststart,
+        'playlistend': opts.playlistend,
+        'playlistreverse': opts.playlist_reverse,
+        'playlistrandom': opts.playlist_random,
+        'noplaylist': opts.noplaylist,
+        'logtostderr': opts.outtmpl == '-',
+        'consoletitle': opts.consoletitle,
+        'nopart': opts.nopart,
+        'updatetime': opts.updatetime,
+        'writedescription': opts.writedescription,
+        'writeannotations': opts.writeannotations,
+        'writeinfojson': opts.writeinfojson,
+        'writethumbnail': opts.writethumbnail,
+        'write_all_thumbnails': opts.write_all_thumbnails,
+        'writesubtitles': opts.writesubtitles,
+        'writeautomaticsub': opts.writeautomaticsub,
+        'allsubtitles': opts.allsubtitles,
+        'listsubtitles': opts.listsubtitles,
+        'subtitlesformat': opts.subtitlesformat,
+        'subtitleslangs': opts.subtitleslangs,
+        'matchtitle': decodeOption(opts.matchtitle),
+        'rejecttitle': decodeOption(opts.rejecttitle),
+        'max_downloads': opts.max_downloads,
+        'prefer_free_formats': opts.prefer_free_formats,
+        'verbose': opts.verbose,
+        'dump_intermediate_pages': opts.dump_intermediate_pages,
+        'write_pages': opts.write_pages,
+        'test': opts.test,
+        'keepvideo': opts.keepvideo,
+        'min_filesize': opts.min_filesize,
+        'max_filesize': opts.max_filesize,
+        'min_views': opts.min_views,
+        'max_views': opts.max_views,
+        'daterange': date,
+        'cachedir': opts.cachedir,
+        'youtube_print_sig_code': opts.youtube_print_sig_code,
+        'age_limit': opts.age_limit,
+        'download_archive': download_archive_fn,
+        'cookiefile': opts.cookiefile,
+        'nocheckcertificate': opts.no_check_certificate,
+        'prefer_insecure': opts.prefer_insecure,
+        'proxy': opts.proxy,
+        'socket_timeout': opts.socket_timeout,
+        'bidi_workaround': opts.bidi_workaround,
+        'debug_printtraffic': opts.debug_printtraffic,
+        'prefer_ffmpeg': opts.prefer_ffmpeg,
+        'include_ads': opts.include_ads,
+        'default_search': opts.default_search,
+        'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
+        'encoding': opts.encoding,
+        'extract_flat': opts.extract_flat,
+        'mark_watched': opts.mark_watched,
+        'merge_output_format': opts.merge_output_format,
+        'postprocessors': postprocessors,
+        'fixup': opts.fixup,
+        'source_address': opts.source_address,
+        'call_home': opts.call_home,
+        'sleep_interval': opts.sleep_interval,
+        'max_sleep_interval': opts.max_sleep_interval,
+        'external_downloader': opts.external_downloader,
+        'list_thumbnails': opts.list_thumbnails,
+        'playlist_items': opts.playlist_items,
+        'xattr_set_filesize': opts.xattr_set_filesize,
+        'match_filter': match_filter,
+        'no_color': opts.no_color,
+        'ffmpeg_location': opts.ffmpeg_location,
+        'hls_prefer_native': opts.hls_prefer_native,
+        'hls_use_mpegts': opts.hls_use_mpegts,
+        'external_downloader_args': external_downloader_args,
+        'postprocessor_args': postprocessor_args,
+        'cn_verification_proxy': opts.cn_verification_proxy,
+        'geo_verification_proxy': opts.geo_verification_proxy,
+        'config_location': opts.config_location,
+        'geo_bypass': opts.geo_bypass,
+        'geo_bypass_country': opts.geo_bypass_country,
+        'geo_bypass_ip_block': opts.geo_bypass_ip_block,
+        # just for deprecation check
+        'autonumber': opts.autonumber if opts.autonumber is True else None,
+        'usetitle': opts.usetitle if opts.usetitle is True else None,
+    }
+
+    with YoutubeDL(ydl_opts) as ydl:
+        # Update version
+        if opts.update_self:
+            update_self(ydl.to_screen, opts.verbose, ydl._opener)
+
+        # Remove cache dir
+        if opts.rm_cachedir:
+            ydl.cache.remove()
+
+        # Maybe do nothing
+        if (len(all_urls) < 1) and (opts.load_info_filename is None):
+            if opts.update_self or opts.rm_cachedir:
+                sys.exit()
+
+            ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv)
+            parser.error(
+                'You must provide at least one URL.\n'
+                'Type youtube-dlc --help to see a list of all options.')
+
+        try:
+            if opts.load_info_filename is not None:
+                retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename))
+            else:
+                retcode = ydl.download(all_urls)
+        except MaxDownloadsReached:
+            ydl.to_screen('--max-downloads limit reached, aborting.')
+            retcode = 101
+
+    sys.exit(retcode)
+
+
+def main(argv=None):
+    try:
+        _real_main(argv)
+    except DownloadError:
+        sys.exit(1)
+    except SameFileError:
+        sys.exit('ERROR: fixed output name but more than one file to download')
+    except KeyboardInterrupt:
+        sys.exit('\nERROR: Interrupted by user')
+
+
+__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors']
diff --git a/youtube_dl/__main__.py b/youtube_dl/__main__.py
new file mode 100644 (file)
index 0000000..0e76016
--- /dev/null
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+# Execute with
+# $ python youtube_dlc/__main__.py (2.6+)
+# $ python -m youtube_dlc          (2.7+)
+
+import sys
+
+if __package__ is None and not hasattr(sys, 'frozen'):
+    # direct call of __main__.py
+    import os.path
+    path = os.path.realpath(os.path.abspath(__file__))
+    sys.path.insert(0, os.path.dirname(os.path.dirname(path)))
+
+import youtube_dlc
+
+if __name__ == '__main__':
+    youtube_dlc.main()
diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py
new file mode 100644 (file)
index 0000000..461bb6d
--- /dev/null
@@ -0,0 +1,361 @@
+from __future__ import unicode_literals
+
+from math import ceil
+
+from .compat import compat_b64decode
+from .utils import bytes_to_intlist, intlist_to_bytes
+
+BLOCK_SIZE_BYTES = 16
+
+
+def aes_ctr_decrypt(data, key, counter):
+    """
+    Decrypt with aes in counter mode
+
+    @param {int[]} data        cipher
+    @param {int[]} key         16/24/32-Byte cipher key
+    @param {instance} counter  Instance whose next_value() method returns the
+                               next 16-Byte counter block ({int[]})
+    @returns {int[]}           decrypted data
+    """
+    expanded_key = key_expansion(key)
+    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+    decrypted_data = []
+    for i in range(block_count):
+        counter_block = counter.next_value()
+        block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+        block += [0] * (BLOCK_SIZE_BYTES - len(block))
+
+        cipher_counter_block = aes_encrypt(counter_block, expanded_key)
+        decrypted_data += xor(block, cipher_counter_block)
+    decrypted_data = decrypted_data[:len(data)]
+
+    return decrypted_data
+
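+# Note: any object with a next_value() method returning a fresh 16-int block
+# per call works as 'counter' (the Counter class in aes_decrypt_text below is
+# one such implementation); short final blocks are zero-padded before the
+# keystream XOR and trimmed again by decrypted_data[:len(data)].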
+
+def aes_cbc_decrypt(data, key, iv):
+    """
+    Decrypt with aes in CBC mode
+
+    @param {int[]} data        cipher
+    @param {int[]} key         16/24/32-Byte cipher key
+    @param {int[]} iv          16-Byte IV
+    @returns {int[]}           decrypted data
+    """
+    expanded_key = key_expansion(key)
+    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+    decrypted_data = []
+    previous_cipher_block = iv
+    for i in range(block_count):
+        block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+        block += [0] * (BLOCK_SIZE_BYTES - len(block))
+
+        decrypted_block = aes_decrypt(block, expanded_key)
+        decrypted_data += xor(decrypted_block, previous_cipher_block)
+        previous_cipher_block = block
+    decrypted_data = decrypted_data[:len(data)]
+
+    return decrypted_data
+
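+# Note: the result keeps exactly len(data) ints and no PKCS#7 padding is
+# stripped; callers of aes_cbc_decrypt must remove padding themselves.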
+
+def aes_cbc_encrypt(data, key, iv):
+    """
+    Encrypt with aes in CBC mode. Using PKCS#7 padding
+
+    @param {int[]} data        cleartext
+    @param {int[]} key         16/24/32-Byte cipher key
+    @param {int[]} iv          16-Byte IV
+    @returns {int[]}           encrypted data
+    """
+    expanded_key = key_expansion(key)
+    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+    encrypted_data = []
+    previous_cipher_block = iv
+    for i in range(block_count):
+        block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+        remaining_length = BLOCK_SIZE_BYTES - len(block)
+        block += [remaining_length] * remaining_length
+        mixed_block = xor(block, previous_cipher_block)
+
+        encrypted_block = aes_encrypt(mixed_block, expanded_key)
+        encrypted_data += encrypted_block
+
+        previous_cipher_block = encrypted_block
+
+    return encrypted_data
+
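+# Padding illustration: a 30-byte cleartext leaves 14 bytes in the final
+# block, so remaining_length is 2 and [0x02, 0x02] is appended (PKCS#7).
+# Inputs already a multiple of 16 bytes receive no padding block here,
+# which diverges from strict PKCS#7 (a full 16-byte padding block).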
+
+def key_expansion(data):
+    """
+    Generate key schedule
+
+    @param {int[]} data  16/24/32-Byte cipher key
+    @returns {int[]}     176/208/240-Byte expanded key
+    """
+    data = data[:]  # copy
+    rcon_iteration = 1
+    key_size_bytes = len(data)
+    expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES
+
+    while len(data) < expanded_key_size_bytes:
+        temp = data[-4:]
+        temp = key_schedule_core(temp, rcon_iteration)
+        rcon_iteration += 1
+        data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+        for _ in range(3):
+            temp = data[-4:]
+            data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+        if key_size_bytes == 32:
+            temp = data[-4:]
+            temp = sub_bytes(temp)
+            data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+        for _ in range(3 if key_size_bytes == 32 else 2 if key_size_bytes == 24 else 0):
+            temp = data[-4:]
+            data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+    data = data[:expanded_key_size_bytes]
+
+    return data
+
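+# For a 16-byte key this yields (16 // 4 + 7) * 16 = 176 bytes: the eleven
+# 16-byte round keys of AES-128 (initial AddRoundKey plus ten rounds).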
+
+def aes_encrypt(data, expanded_key):
+    """
+    Encrypt one block with aes
+
+    @param {int[]} data          16-Byte state
+    @param {int[]} expanded_key  176/208/240-Byte expanded key
+    @returns {int[]}             16-Byte cipher
+    """
+    rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+    data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
+    for i in range(1, rounds + 1):
+        data = sub_bytes(data)
+        data = shift_rows(data)
+        if i != rounds:
+            data = mix_columns(data)
+        data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES])
+
+    return data
+
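+# With a 176-byte AES-128 schedule, rounds = 176 // 16 - 1 = 10; as in the
+# AES specification, mix_columns is skipped in the final round.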
+
+def aes_decrypt(data, expanded_key):
+    """
+    Decrypt one block with aes
+
+    @param {int[]} data          16-Byte cipher
+    @param {int[]} expanded_key  176/208/240-Byte expanded key
+    @returns {int[]}             16-Byte state
+    """
+    rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+    for i in range(rounds, 0, -1):
+        data = xor(data, expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES])
+        if i != rounds:
+            data = mix_columns_inv(data)
+        data = shift_rows_inv(data)
+        data = sub_bytes_inv(data)
+    data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
+
+    return data
+
+
+def aes_decrypt_text(data, password, key_size_bytes):
+    """
+    Decrypt text
+    - The first 8 Bytes of decoded 'data' are the 8 high Bytes of the counter
+    - The cipher key is derived by encrypting the first 16 Bytes of 'password'
+      with the first 'key_size_bytes' Bytes of 'password' (zero-padded if necessary)
+    - Mode of operation is 'counter'
+
+    @param {str} data                    Base64 encoded string
+    @param {str,unicode} password        Password (will be encoded with utf-8)
+    @param {int} key_size_bytes          Possible values: 16 for 128-Bit, 24 for 192-Bit or 32 for 256-Bit
+    @returns {str}                       Decrypted data
+    """
+    NONCE_LENGTH_BYTES = 8
+
+    data = bytes_to_intlist(compat_b64decode(data))
+    password = bytes_to_intlist(password.encode('utf-8'))
+
+    key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password))
+    key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES)
+
+    nonce = data[:NONCE_LENGTH_BYTES]
+    cipher = data[NONCE_LENGTH_BYTES:]
+
+    class Counter(object):
+        __value = nonce + [0] * (BLOCK_SIZE_BYTES - NONCE_LENGTH_BYTES)
+
+        def next_value(self):
+            temp = self.__value
+            self.__value = inc(self.__value)
+            return temp
+
+    decrypted_data = aes_ctr_decrypt(cipher, key, Counter())
+    plaintext = intlist_to_bytes(decrypted_data)
+
+    return plaintext
+
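+# Usage sketch (values are hypothetical): for a base64 payload whose first 8
+# decoded bytes are the counter nonce,
+#
+#     plaintext = aes_decrypt_text(encrypted_b64, 'some password', 16)
+#
+# decrypts with AES-128 in counter mode and returns the plaintext bytes.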
+
+RCON = (0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36)
+SBOX = (0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+        0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+        0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+        0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+        0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+        0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+        0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+        0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+        0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+        0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+        0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+        0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+        0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+        0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+        0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+        0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16)
+SBOX_INV = (0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+            0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+            0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+            0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+            0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+            0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+            0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+            0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+            0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+            0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+            0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+            0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+            0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+            0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+            0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+            0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d)
+MIX_COLUMN_MATRIX = ((0x2, 0x3, 0x1, 0x1),
+                     (0x1, 0x2, 0x3, 0x1),
+                     (0x1, 0x1, 0x2, 0x3),
+                     (0x3, 0x1, 0x1, 0x2))
+MIX_COLUMN_MATRIX_INV = ((0xE, 0xB, 0xD, 0x9),
+                         (0x9, 0xE, 0xB, 0xD),
+                         (0xD, 0x9, 0xE, 0xB),
+                         (0xB, 0xD, 0x9, 0xE))
+RIJNDAEL_EXP_TABLE = (0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35,
+                      0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA,
+                      0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31,
+                      0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD,
+                      0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88,
+                      0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F, 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A,
+                      0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3,
+                      0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0,
+                      0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41,
+                      0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75,
+                      0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80,
+                      0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54,
+                      0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA,
+                      0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E,
+                      0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17,
+                      0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01)
+RIJNDAEL_LOG_TABLE = (0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
+                      0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
+                      0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
+                      0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
+                      0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
+                      0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
+                      0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
+                      0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
+                      0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
+                      0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
+                      0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
+                      0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
+                      0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
+                      0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
+                      0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
+                      0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07)
+
+
+def sub_bytes(data):
+    return [SBOX[x] for x in data]
+
+
+def sub_bytes_inv(data):
+    return [SBOX_INV[x] for x in data]
+
+
+def rotate(data):
+    return data[1:] + [data[0]]
+
+
+def key_schedule_core(data, rcon_iteration):
+    data = rotate(data)
+    data = sub_bytes(data)
+    data[0] = data[0] ^ RCON[rcon_iteration]
+
+    return data
+
+
+def xor(data1, data2):
+    return [x ^ y for x, y in zip(data1, data2)]
+
+
+def rijndael_mul(a, b):
+    if a == 0 or b == 0:
+        return 0
+    return RIJNDAEL_EXP_TABLE[(RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF]
+
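+# Worked example: rijndael_mul(0x02, 0x03) looks up RIJNDAEL_LOG_TABLE[2]
+# + RIJNDAEL_LOG_TABLE[3] = 0x19 + 0x01 = 0x1a, and RIJNDAEL_EXP_TABLE[0x1a]
+# is 0x06, matching x * (x + 1) = x^2 + x in GF(2^8).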
+
+def mix_column(data, matrix):
+    data_mixed = []
+    for row in range(4):
+        mixed = 0
+        for column in range(4):
+            # in GF(2^8), XOR acts as both addition and subtraction
+            mixed ^= rijndael_mul(data[column], matrix[row][column])
+        data_mixed.append(mixed)
+    return data_mixed
+
+
+def mix_columns(data, matrix=MIX_COLUMN_MATRIX):
+    data_mixed = []
+    for i in range(4):
+        column = data[i * 4: (i + 1) * 4]
+        data_mixed += mix_column(column, matrix)
+    return data_mixed
+
+
+def mix_columns_inv(data):
+    return mix_columns(data, MIX_COLUMN_MATRIX_INV)
+
+
+def shift_rows(data):
+    data_shifted = []
+    for column in range(4):
+        for row in range(4):
+            data_shifted.append(data[((column + row) & 0b11) * 4 + row])
+    return data_shifted
+
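+# The state is stored column-major (data[column * 4 + row]), so this rotates
+# row r left by r columns; e.g. the byte at (row 1, column 1) moves to
+# (row 1, column 0).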
+
+def shift_rows_inv(data):
+    data_shifted = []
+    for column in range(4):
+        for row in range(4):
+            data_shifted.append(data[((column - row) & 0b11) * 4 + row])
+    return data_shifted
+
+
+def inc(data):
+    data = data[:]  # copy
+    for i in range(len(data) - 1, -1, -1):
+        if data[i] == 255:
+            data[i] = 0
+        else:
+            data[i] = data[i] + 1
+            break
+    return data
+
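+# Example: inc([0, 255]) -> [1, 0]; trailing 0xff bytes wrap to 0x00 and the
+# carry propagates leftwards, i.e. a big-endian counter increment.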
+
+__all__ = ['aes_encrypt', 'key_expansion', 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_decrypt_text']
diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py
new file mode 100644 (file)
index 0000000..ada6aa1
--- /dev/null
@@ -0,0 +1,96 @@
+from __future__ import unicode_literals
+
+import errno
+import io
+import json
+import os
+import re
+import shutil
+import traceback
+
+from .compat import compat_getenv
+from .utils import (
+    expand_path,
+    write_json_file,
+)
+
+
+class Cache(object):
+    def __init__(self, ydl):
+        self._ydl = ydl
+
+    def _get_root_dir(self):
+        res = self._ydl.params.get('cachedir')
+        if res is None:
+            cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache')
+            res = os.path.join(cache_root, 'youtube-dlc')
+        return expand_path(res)
+
+    def _get_cache_fn(self, section, key, dtype):
+        assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \
+            'invalid section %r' % section
+        assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key
+        return os.path.join(
+            self._get_root_dir(), section, '%s.%s' % (key, dtype))
+
+    @property
+    def enabled(self):
+        return self._ydl.params.get('cachedir') is not False
+
+    def store(self, section, key, data, dtype='json'):
+        assert dtype in ('json',)
+
+        if not self.enabled:
+            return
+
+        fn = self._get_cache_fn(section, key, dtype)
+        try:
+            try:
+                os.makedirs(os.path.dirname(fn))
+            except OSError as ose:
+                if ose.errno != errno.EEXIST:
+                    raise
+            write_json_file(data, fn)
+        except Exception:
+            tb = traceback.format_exc()
+            self._ydl.report_warning(
+                'Writing cache to %r failed: %s' % (fn, tb))
+
+    def load(self, section, key, dtype='json', default=None):
+        assert dtype in ('json',)
+
+        if not self.enabled:
+            return default
+
+        cache_fn = self._get_cache_fn(section, key, dtype)
+        try:
+            try:
+                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
+                    return json.load(cachef)
+            except ValueError:
+                try:
+                    file_size = os.path.getsize(cache_fn)
+                except (OSError, IOError) as oe:
+                    file_size = str(oe)
+                self._ydl.report_warning(
+                    'Cache retrieval from %s failed (%s)' % (cache_fn, file_size))
+        except IOError:
+            pass  # No cache available
+
+        return default
+
+    def remove(self):
+        if not self.enabled:
+            self._ydl.to_screen('Cache is disabled (Did you combine --no-cache-dir and --rm-cache-dir?)')
+            return
+
+        cachedir = self._get_root_dir()
+        if not any((term in cachedir) for term in ('cache', 'tmp')):
+            raise Exception('Not removing directory %s - this does not look like a cache dir' % cachedir)
+
+        self._ydl.to_screen(
+            'Removing cache dir %s .' % cachedir, skip_eol=True)
+        if os.path.exists(cachedir):
+            self._ydl.to_screen('.', skip_eol=True)
+            shutil.rmtree(cachedir)
+        self._ydl.to_screen('.')
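+
+
+# Usage sketch (illustrative; 'ydl' is an existing YoutubeDL instance and the
+# section/key names are made up):
+#
+#     cache = Cache(ydl)
+#     cache.store('some-section', 'some-key', {'token': 'abc'})
+#     data = cache.load('some-section', 'some-key', default={})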
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
new file mode 100644 (file)
index 0000000..1cf7efe
--- /dev/null
@@ -0,0 +1,3050 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import binascii
+import collections
+import ctypes
+import email
+import getpass
+import io
+import itertools
+import optparse
+import os
+import platform
+import re
+import shlex
+import shutil
+import socket
+import struct
+import subprocess
+import sys
+import xml.etree.ElementTree
+
+
+try:
+    import urllib.request as compat_urllib_request
+except ImportError:  # Python 2
+    import urllib2 as compat_urllib_request
+
+try:
+    import urllib.error as compat_urllib_error
+except ImportError:  # Python 2
+    import urllib2 as compat_urllib_error
+
+try:
+    import urllib.parse as compat_urllib_parse
+except ImportError:  # Python 2
+    import urllib as compat_urllib_parse
+
+try:
+    from urllib.parse import urlparse as compat_urllib_parse_urlparse
+except ImportError:  # Python 2
+    from urlparse import urlparse as compat_urllib_parse_urlparse
+
+try:
+    import urllib.parse as compat_urlparse
+except ImportError:  # Python 2
+    import urlparse as compat_urlparse
+
+try:
+    import urllib.response as compat_urllib_response
+except ImportError:  # Python 2
+    import urllib as compat_urllib_response
+
+try:
+    import http.cookiejar as compat_cookiejar
+except ImportError:  # Python 2
+    import cookielib as compat_cookiejar
+
+if sys.version_info[0] == 2:
+    class compat_cookiejar_Cookie(compat_cookiejar.Cookie):
+        def __init__(self, version, name, value, *args, **kwargs):
+            if isinstance(name, compat_str):
+                name = name.encode()
+            if isinstance(value, compat_str):
+                value = value.encode()
+            compat_cookiejar.Cookie.__init__(self, version, name, value, *args, **kwargs)
+else:
+    compat_cookiejar_Cookie = compat_cookiejar.Cookie
+
+try:
+    import http.cookies as compat_cookies
+except ImportError:  # Python 2
+    import Cookie as compat_cookies
+
+try:
+    import html.entities as compat_html_entities
+except ImportError:  # Python 2
+    import htmlentitydefs as compat_html_entities
+
+try:  # Python >= 3.3
+    compat_html_entities_html5 = compat_html_entities.html5
+except AttributeError:
+    # Copied from CPython 3.5.1 html/entities.py
+    compat_html_entities_html5 = {
+        'Aacute': '\xc1',
+        'aacute': '\xe1',
+        'Aacute;': '\xc1',
+        'aacute;': '\xe1',
+        'Abreve;': '\u0102',
+        'abreve;': '\u0103',
+        'ac;': '\u223e',
+        'acd;': '\u223f',
+        'acE;': '\u223e\u0333',
+        'Acirc': '\xc2',
+        'acirc': '\xe2',
+        'Acirc;': '\xc2',
+        'acirc;': '\xe2',
+        'acute': '\xb4',
+        'acute;': '\xb4',
+        'Acy;': '\u0410',
+        'acy;': '\u0430',
+        'AElig': '\xc6',
+        'aelig': '\xe6',
+        'AElig;': '\xc6',
+        'aelig;': '\xe6',
+        'af;': '\u2061',
+        'Afr;': '\U0001d504',
+        'afr;': '\U0001d51e',
+        'Agrave': '\xc0',
+        'agrave': '\xe0',
+        'Agrave;': '\xc0',
+        'agrave;': '\xe0',
+        'alefsym;': '\u2135',
+        'aleph;': '\u2135',
+        'Alpha;': '\u0391',
+        'alpha;': '\u03b1',
+        'Amacr;': '\u0100',
+        'amacr;': '\u0101',
+        'amalg;': '\u2a3f',
+        'AMP': '&',
+        'amp': '&',
+        'AMP;': '&',
+        'amp;': '&',
+        'And;': '\u2a53',
+        'and;': '\u2227',
+        'andand;': '\u2a55',
+        'andd;': '\u2a5c',
+        'andslope;': '\u2a58',
+        'andv;': '\u2a5a',
+        'ang;': '\u2220',
+        'ange;': '\u29a4',
+        'angle;': '\u2220',
+        'angmsd;': '\u2221',
+        'angmsdaa;': '\u29a8',
+        'angmsdab;': '\u29a9',
+        'angmsdac;': '\u29aa',
+        'angmsdad;': '\u29ab',
+        'angmsdae;': '\u29ac',
+        'angmsdaf;': '\u29ad',
+        'angmsdag;': '\u29ae',
+        'angmsdah;': '\u29af',
+        'angrt;': '\u221f',
+        'angrtvb;': '\u22be',
+        'angrtvbd;': '\u299d',
+        'angsph;': '\u2222',
+        'angst;': '\xc5',
+        'angzarr;': '\u237c',
+        'Aogon;': '\u0104',
+        'aogon;': '\u0105',
+        'Aopf;': '\U0001d538',
+        'aopf;': '\U0001d552',
+        'ap;': '\u2248',
+        'apacir;': '\u2a6f',
+        'apE;': '\u2a70',
+        'ape;': '\u224a',
+        'apid;': '\u224b',
+        'apos;': "'",
+        'ApplyFunction;': '\u2061',
+        'approx;': '\u2248',
+        'approxeq;': '\u224a',
+        'Aring': '\xc5',
+        'aring': '\xe5',
+        'Aring;': '\xc5',
+        'aring;': '\xe5',
+        'Ascr;': '\U0001d49c',
+        'ascr;': '\U0001d4b6',
+        'Assign;': '\u2254',
+        'ast;': '*',
+        'asymp;': '\u2248',
+        'asympeq;': '\u224d',
+        'Atilde': '\xc3',
+        'atilde': '\xe3',
+        'Atilde;': '\xc3',
+        'atilde;': '\xe3',
+        'Auml': '\xc4',
+        'auml': '\xe4',
+        'Auml;': '\xc4',
+        'auml;': '\xe4',
+        'awconint;': '\u2233',
+        'awint;': '\u2a11',
+        'backcong;': '\u224c',
+        'backepsilon;': '\u03f6',
+        'backprime;': '\u2035',
+        'backsim;': '\u223d',
+        'backsimeq;': '\u22cd',
+        'Backslash;': '\u2216',
+        'Barv;': '\u2ae7',
+        'barvee;': '\u22bd',
+        'Barwed;': '\u2306',
+        'barwed;': '\u2305',
+        'barwedge;': '\u2305',
+        'bbrk;': '\u23b5',
+        'bbrktbrk;': '\u23b6',
+        'bcong;': '\u224c',
+        'Bcy;': '\u0411',
+        'bcy;': '\u0431',
+        'bdquo;': '\u201e',
+        'becaus;': '\u2235',
+        'Because;': '\u2235',
+        'because;': '\u2235',
+        'bemptyv;': '\u29b0',
+        'bepsi;': '\u03f6',
+        'bernou;': '\u212c',
+        'Bernoullis;': '\u212c',
+        'Beta;': '\u0392',
+        'beta;': '\u03b2',
+        'beth;': '\u2136',
+        'between;': '\u226c',
+        'Bfr;': '\U0001d505',
+        'bfr;': '\U0001d51f',
+        'bigcap;': '\u22c2',
+        'bigcirc;': '\u25ef',
+        'bigcup;': '\u22c3',
+        'bigodot;': '\u2a00',
+        'bigoplus;': '\u2a01',
+        'bigotimes;': '\u2a02',
+        'bigsqcup;': '\u2a06',
+        'bigstar;': '\u2605',
+        'bigtriangledown;': '\u25bd',
+        'bigtriangleup;': '\u25b3',
+        'biguplus;': '\u2a04',
+        'bigvee;': '\u22c1',
+        'bigwedge;': '\u22c0',
+        'bkarow;': '\u290d',
+        'blacklozenge;': '\u29eb',
+        'blacksquare;': '\u25aa',
+        'blacktriangle;': '\u25b4',
+        'blacktriangledown;': '\u25be',
+        'blacktriangleleft;': '\u25c2',
+        'blacktriangleright;': '\u25b8',
+        'blank;': '\u2423',
+        'blk12;': '\u2592',
+        'blk14;': '\u2591',
+        'blk34;': '\u2593',
+        'block;': '\u2588',
+        'bne;': '=\u20e5',
+        'bnequiv;': '\u2261\u20e5',
+        'bNot;': '\u2aed',
+        'bnot;': '\u2310',
+        'Bopf;': '\U0001d539',
+        'bopf;': '\U0001d553',
+        'bot;': '\u22a5',
+        'bottom;': '\u22a5',
+        'bowtie;': '\u22c8',
+        'boxbox;': '\u29c9',
+        'boxDL;': '\u2557',
+        'boxDl;': '\u2556',
+        'boxdL;': '\u2555',
+        'boxdl;': '\u2510',
+        'boxDR;': '\u2554',
+        'boxDr;': '\u2553',
+        'boxdR;': '\u2552',
+        'boxdr;': '\u250c',
+        'boxH;': '\u2550',
+        'boxh;': '\u2500',
+        'boxHD;': '\u2566',
+        'boxHd;': '\u2564',
+        'boxhD;': '\u2565',
+        'boxhd;': '\u252c',
+        'boxHU;': '\u2569',
+        'boxHu;': '\u2567',
+        'boxhU;': '\u2568',
+        'boxhu;': '\u2534',
+        'boxminus;': '\u229f',
+        'boxplus;': '\u229e',
+        'boxtimes;': '\u22a0',
+        'boxUL;': '\u255d',
+        'boxUl;': '\u255c',
+        'boxuL;': '\u255b',
+        'boxul;': '\u2518',
+        'boxUR;': '\u255a',
+        'boxUr;': '\u2559',
+        'boxuR;': '\u2558',
+        'boxur;': '\u2514',
+        'boxV;': '\u2551',
+        'boxv;': '\u2502',
+        'boxVH;': '\u256c',
+        'boxVh;': '\u256b',
+        'boxvH;': '\u256a',
+        'boxvh;': '\u253c',
+        'boxVL;': '\u2563',
+        'boxVl;': '\u2562',
+        'boxvL;': '\u2561',
+        'boxvl;': '\u2524',
+        'boxVR;': '\u2560',
+        'boxVr;': '\u255f',
+        'boxvR;': '\u255e',
+        'boxvr;': '\u251c',
+        'bprime;': '\u2035',
+        'Breve;': '\u02d8',
+        'breve;': '\u02d8',
+        'brvbar': '\xa6',
+        'brvbar;': '\xa6',
+        'Bscr;': '\u212c',
+        'bscr;': '\U0001d4b7',
+        'bsemi;': '\u204f',
+        'bsim;': '\u223d',
+        'bsime;': '\u22cd',
+        'bsol;': '\\',
+        'bsolb;': '\u29c5',
+        'bsolhsub;': '\u27c8',
+        'bull;': '\u2022',
+        'bullet;': '\u2022',
+        'bump;': '\u224e',
+        'bumpE;': '\u2aae',
+        'bumpe;': '\u224f',
+        'Bumpeq;': '\u224e',
+        'bumpeq;': '\u224f',
+        'Cacute;': '\u0106',
+        'cacute;': '\u0107',
+        'Cap;': '\u22d2',
+        'cap;': '\u2229',
+        'capand;': '\u2a44',
+        'capbrcup;': '\u2a49',
+        'capcap;': '\u2a4b',
+        'capcup;': '\u2a47',
+        'capdot;': '\u2a40',
+        'CapitalDifferentialD;': '\u2145',
+        'caps;': '\u2229\ufe00',
+        'caret;': '\u2041',
+        'caron;': '\u02c7',
+        'Cayleys;': '\u212d',
+        'ccaps;': '\u2a4d',
+        'Ccaron;': '\u010c',
+        'ccaron;': '\u010d',
+        'Ccedil': '\xc7',
+        'ccedil': '\xe7',
+        'Ccedil;': '\xc7',
+        'ccedil;': '\xe7',
+        'Ccirc;': '\u0108',
+        'ccirc;': '\u0109',
+        'Cconint;': '\u2230',
+        'ccups;': '\u2a4c',
+        'ccupssm;': '\u2a50',
+        'Cdot;': '\u010a',
+        'cdot;': '\u010b',
+        'cedil': '\xb8',
+        'cedil;': '\xb8',
+        'Cedilla;': '\xb8',
+        'cemptyv;': '\u29b2',
+        'cent': '\xa2',
+        'cent;': '\xa2',
+        'CenterDot;': '\xb7',
+        'centerdot;': '\xb7',
+        'Cfr;': '\u212d',
+        'cfr;': '\U0001d520',
+        'CHcy;': '\u0427',
+        'chcy;': '\u0447',
+        'check;': '\u2713',
+        'checkmark;': '\u2713',
+        'Chi;': '\u03a7',
+        'chi;': '\u03c7',
+        'cir;': '\u25cb',
+        'circ;': '\u02c6',
+        'circeq;': '\u2257',
+        'circlearrowleft;': '\u21ba',
+        'circlearrowright;': '\u21bb',
+        'circledast;': '\u229b',
+        'circledcirc;': '\u229a',
+        'circleddash;': '\u229d',
+        'CircleDot;': '\u2299',
+        'circledR;': '\xae',
+        'circledS;': '\u24c8',
+        'CircleMinus;': '\u2296',
+        'CirclePlus;': '\u2295',
+        'CircleTimes;': '\u2297',
+        'cirE;': '\u29c3',
+        'cire;': '\u2257',
+        'cirfnint;': '\u2a10',
+        'cirmid;': '\u2aef',
+        'cirscir;': '\u29c2',
+        'ClockwiseContourIntegral;': '\u2232',
+        'CloseCurlyDoubleQuote;': '\u201d',
+        'CloseCurlyQuote;': '\u2019',
+        'clubs;': '\u2663',
+        'clubsuit;': '\u2663',
+        'Colon;': '\u2237',
+        'colon;': ':',
+        'Colone;': '\u2a74',
+        'colone;': '\u2254',
+        'coloneq;': '\u2254',
+        'comma;': ',',
+        'commat;': '@',
+        'comp;': '\u2201',
+        'compfn;': '\u2218',
+        'complement;': '\u2201',
+        'complexes;': '\u2102',
+        'cong;': '\u2245',
+        'congdot;': '\u2a6d',
+        'Congruent;': '\u2261',
+        'Conint;': '\u222f',
+        'conint;': '\u222e',
+        'ContourIntegral;': '\u222e',
+        'Copf;': '\u2102',
+        'copf;': '\U0001d554',
+        'coprod;': '\u2210',
+        'Coproduct;': '\u2210',
+        'COPY': '\xa9',
+        'copy': '\xa9',
+        'COPY;': '\xa9',
+        'copy;': '\xa9',
+        'copysr;': '\u2117',
+        'CounterClockwiseContourIntegral;': '\u2233',
+        'crarr;': '\u21b5',
+        'Cross;': '\u2a2f',
+        'cross;': '\u2717',
+        'Cscr;': '\U0001d49e',
+        'cscr;': '\U0001d4b8',
+        'csub;': '\u2acf',
+        'csube;': '\u2ad1',
+        'csup;': '\u2ad0',
+        'csupe;': '\u2ad2',
+        'ctdot;': '\u22ef',
+        'cudarrl;': '\u2938',
+        'cudarrr;': '\u2935',
+        'cuepr;': '\u22de',
+        'cuesc;': '\u22df',
+        'cularr;': '\u21b6',
+        'cularrp;': '\u293d',
+        'Cup;': '\u22d3',
+        'cup;': '\u222a',
+        'cupbrcap;': '\u2a48',
+        'CupCap;': '\u224d',
+        'cupcap;': '\u2a46',
+        'cupcup;': '\u2a4a',
+        'cupdot;': '\u228d',
+        'cupor;': '\u2a45',
+        'cups;': '\u222a\ufe00',
+        'curarr;': '\u21b7',
+        'curarrm;': '\u293c',
+        'curlyeqprec;': '\u22de',
+        'curlyeqsucc;': '\u22df',
+        'curlyvee;': '\u22ce',
+        'curlywedge;': '\u22cf',
+        'curren': '\xa4',
+        'curren;': '\xa4',
+        'curvearrowleft;': '\u21b6',
+        'curvearrowright;': '\u21b7',
+        'cuvee;': '\u22ce',
+        'cuwed;': '\u22cf',
+        'cwconint;': '\u2232',
+        'cwint;': '\u2231',
+        'cylcty;': '\u232d',
+        'Dagger;': '\u2021',
+        'dagger;': '\u2020',
+        'daleth;': '\u2138',
+        'Darr;': '\u21a1',
+        'dArr;': '\u21d3',
+        'darr;': '\u2193',
+        'dash;': '\u2010',
+        'Dashv;': '\u2ae4',
+        'dashv;': '\u22a3',
+        'dbkarow;': '\u290f',
+        'dblac;': '\u02dd',
+        'Dcaron;': '\u010e',
+        'dcaron;': '\u010f',
+        'Dcy;': '\u0414',
+        'dcy;': '\u0434',
+        'DD;': '\u2145',
+        'dd;': '\u2146',
+        'ddagger;': '\u2021',
+        'ddarr;': '\u21ca',
+        'DDotrahd;': '\u2911',
+        'ddotseq;': '\u2a77',
+        'deg': '\xb0',
+        'deg;': '\xb0',
+        'Del;': '\u2207',
+        'Delta;': '\u0394',
+        'delta;': '\u03b4',
+        'demptyv;': '\u29b1',
+        'dfisht;': '\u297f',
+        'Dfr;': '\U0001d507',
+        'dfr;': '\U0001d521',
+        'dHar;': '\u2965',
+        'dharl;': '\u21c3',
+        'dharr;': '\u21c2',
+        'DiacriticalAcute;': '\xb4',
+        'DiacriticalDot;': '\u02d9',
+        'DiacriticalDoubleAcute;': '\u02dd',
+        'DiacriticalGrave;': '`',
+        'DiacriticalTilde;': '\u02dc',
+        'diam;': '\u22c4',
+        'Diamond;': '\u22c4',
+        'diamond;': '\u22c4',
+        'diamondsuit;': '\u2666',
+        'diams;': '\u2666',
+        'die;': '\xa8',
+        'DifferentialD;': '\u2146',
+        'digamma;': '\u03dd',
+        'disin;': '\u22f2',
+        'div;': '\xf7',
+        'divide': '\xf7',
+        'divide;': '\xf7',
+        'divideontimes;': '\u22c7',
+        'divonx;': '\u22c7',
+        'DJcy;': '\u0402',
+        'djcy;': '\u0452',
+        'dlcorn;': '\u231e',
+        'dlcrop;': '\u230d',
+        'dollar;': '$',
+        'Dopf;': '\U0001d53b',
+        'dopf;': '\U0001d555',
+        'Dot;': '\xa8',
+        'dot;': '\u02d9',
+        'DotDot;': '\u20dc',
+        'doteq;': '\u2250',
+        'doteqdot;': '\u2251',
+        'DotEqual;': '\u2250',
+        'dotminus;': '\u2238',
+        'dotplus;': '\u2214',
+        'dotsquare;': '\u22a1',
+        'doublebarwedge;': '\u2306',
+        'DoubleContourIntegral;': '\u222f',
+        'DoubleDot;': '\xa8',
+        'DoubleDownArrow;': '\u21d3',
+        'DoubleLeftArrow;': '\u21d0',
+        'DoubleLeftRightArrow;': '\u21d4',
+        'DoubleLeftTee;': '\u2ae4',
+        'DoubleLongLeftArrow;': '\u27f8',
+        'DoubleLongLeftRightArrow;': '\u27fa',
+        'DoubleLongRightArrow;': '\u27f9',
+        'DoubleRightArrow;': '\u21d2',
+        'DoubleRightTee;': '\u22a8',
+        'DoubleUpArrow;': '\u21d1',
+        'DoubleUpDownArrow;': '\u21d5',
+        'DoubleVerticalBar;': '\u2225',
+        'DownArrow;': '\u2193',
+        'Downarrow;': '\u21d3',
+        'downarrow;': '\u2193',
+        'DownArrowBar;': '\u2913',
+        'DownArrowUpArrow;': '\u21f5',
+        'DownBreve;': '\u0311',
+        'downdownarrows;': '\u21ca',
+        'downharpoonleft;': '\u21c3',
+        'downharpoonright;': '\u21c2',
+        'DownLeftRightVector;': '\u2950',
+        'DownLeftTeeVector;': '\u295e',
+        'DownLeftVector;': '\u21bd',
+        'DownLeftVectorBar;': '\u2956',
+        'DownRightTeeVector;': '\u295f',
+        'DownRightVector;': '\u21c1',
+        'DownRightVectorBar;': '\u2957',
+        'DownTee;': '\u22a4',
+        'DownTeeArrow;': '\u21a7',
+        'drbkarow;': '\u2910',
+        'drcorn;': '\u231f',
+        'drcrop;': '\u230c',
+        'Dscr;': '\U0001d49f',
+        'dscr;': '\U0001d4b9',
+        'DScy;': '\u0405',
+        'dscy;': '\u0455',
+        'dsol;': '\u29f6',
+        'Dstrok;': '\u0110',
+        'dstrok;': '\u0111',
+        'dtdot;': '\u22f1',
+        'dtri;': '\u25bf',
+        'dtrif;': '\u25be',
+        'duarr;': '\u21f5',
+        'duhar;': '\u296f',
+        'dwangle;': '\u29a6',
+        'DZcy;': '\u040f',
+        'dzcy;': '\u045f',
+        'dzigrarr;': '\u27ff',
+        'Eacute': '\xc9',
+        'eacute': '\xe9',
+        'Eacute;': '\xc9',
+        'eacute;': '\xe9',
+        'easter;': '\u2a6e',
+        'Ecaron;': '\u011a',
+        'ecaron;': '\u011b',
+        'ecir;': '\u2256',
+        'Ecirc': '\xca',
+        'ecirc': '\xea',
+        'Ecirc;': '\xca',
+        'ecirc;': '\xea',
+        'ecolon;': '\u2255',
+        'Ecy;': '\u042d',
+        'ecy;': '\u044d',
+        'eDDot;': '\u2a77',
+        'Edot;': '\u0116',
+        'eDot;': '\u2251',
+        'edot;': '\u0117',
+        'ee;': '\u2147',
+        'efDot;': '\u2252',
+        'Efr;': '\U0001d508',
+        'efr;': '\U0001d522',
+        'eg;': '\u2a9a',
+        'Egrave': '\xc8',
+        'egrave': '\xe8',
+        'Egrave;': '\xc8',
+        'egrave;': '\xe8',
+        'egs;': '\u2a96',
+        'egsdot;': '\u2a98',
+        'el;': '\u2a99',
+        'Element;': '\u2208',
+        'elinters;': '\u23e7',
+        'ell;': '\u2113',
+        'els;': '\u2a95',
+        'elsdot;': '\u2a97',
+        'Emacr;': '\u0112',
+        'emacr;': '\u0113',
+        'empty;': '\u2205',
+        'emptyset;': '\u2205',
+        'EmptySmallSquare;': '\u25fb',
+        'emptyv;': '\u2205',
+        'EmptyVerySmallSquare;': '\u25ab',
+        'emsp13;': '\u2004',
+        'emsp14;': '\u2005',
+        'emsp;': '\u2003',
+        'ENG;': '\u014a',
+        'eng;': '\u014b',
+        'ensp;': '\u2002',
+        'Eogon;': '\u0118',
+        'eogon;': '\u0119',
+        'Eopf;': '\U0001d53c',
+        'eopf;': '\U0001d556',
+        'epar;': '\u22d5',
+        'eparsl;': '\u29e3',
+        'eplus;': '\u2a71',
+        'epsi;': '\u03b5',
+        'Epsilon;': '\u0395',
+        'epsilon;': '\u03b5',
+        'epsiv;': '\u03f5',
+        'eqcirc;': '\u2256',
+        'eqcolon;': '\u2255',
+        'eqsim;': '\u2242',
+        'eqslantgtr;': '\u2a96',
+        'eqslantless;': '\u2a95',
+        'Equal;': '\u2a75',
+        'equals;': '=',
+        'EqualTilde;': '\u2242',
+        'equest;': '\u225f',
+        'Equilibrium;': '\u21cc',
+        'equiv;': '\u2261',
+        'equivDD;': '\u2a78',
+        'eqvparsl;': '\u29e5',
+        'erarr;': '\u2971',
+        'erDot;': '\u2253',
+        'Escr;': '\u2130',
+        'escr;': '\u212f',
+        'esdot;': '\u2250',
+        'Esim;': '\u2a73',
+        'esim;': '\u2242',
+        'Eta;': '\u0397',
+        'eta;': '\u03b7',
+        'ETH': '\xd0',
+        'eth': '\xf0',
+        'ETH;': '\xd0',
+        'eth;': '\xf0',
+        'Euml': '\xcb',
+        'euml': '\xeb',
+        'Euml;': '\xcb',
+        'euml;': '\xeb',
+        'euro;': '\u20ac',
+        'excl;': '!',
+        'exist;': '\u2203',
+        'Exists;': '\u2203',
+        'expectation;': '\u2130',
+        'ExponentialE;': '\u2147',
+        'exponentiale;': '\u2147',
+        'fallingdotseq;': '\u2252',
+        'Fcy;': '\u0424',
+        'fcy;': '\u0444',
+        'female;': '\u2640',
+        'ffilig;': '\ufb03',
+        'fflig;': '\ufb00',
+        'ffllig;': '\ufb04',
+        'Ffr;': '\U0001d509',
+        'ffr;': '\U0001d523',
+        'filig;': '\ufb01',
+        'FilledSmallSquare;': '\u25fc',
+        'FilledVerySmallSquare;': '\u25aa',
+        'fjlig;': 'fj',
+        'flat;': '\u266d',
+        'fllig;': '\ufb02',
+        'fltns;': '\u25b1',
+        'fnof;': '\u0192',
+        'Fopf;': '\U0001d53d',
+        'fopf;': '\U0001d557',
+        'ForAll;': '\u2200',
+        'forall;': '\u2200',
+        'fork;': '\u22d4',
+        'forkv;': '\u2ad9',
+        'Fouriertrf;': '\u2131',
+        'fpartint;': '\u2a0d',
+        'frac12': '\xbd',
+        'frac12;': '\xbd',
+        'frac13;': '\u2153',
+        'frac14': '\xbc',
+        'frac14;': '\xbc',
+        'frac15;': '\u2155',
+        'frac16;': '\u2159',
+        'frac18;': '\u215b',
+        'frac23;': '\u2154',
+        'frac25;': '\u2156',
+        'frac34': '\xbe',
+        'frac34;': '\xbe',
+        'frac35;': '\u2157',
+        'frac38;': '\u215c',
+        'frac45;': '\u2158',
+        'frac56;': '\u215a',
+        'frac58;': '\u215d',
+        'frac78;': '\u215e',
+        'frasl;': '\u2044',
+        'frown;': '\u2322',
+        'Fscr;': '\u2131',
+        'fscr;': '\U0001d4bb',
+        'gacute;': '\u01f5',
+        'Gamma;': '\u0393',
+        'gamma;': '\u03b3',
+        'Gammad;': '\u03dc',
+        'gammad;': '\u03dd',
+        'gap;': '\u2a86',
+        'Gbreve;': '\u011e',
+        'gbreve;': '\u011f',
+        'Gcedil;': '\u0122',
+        'Gcirc;': '\u011c',
+        'gcirc;': '\u011d',
+        'Gcy;': '\u0413',
+        'gcy;': '\u0433',
+        'Gdot;': '\u0120',
+        'gdot;': '\u0121',
+        'gE;': '\u2267',
+        'ge;': '\u2265',
+        'gEl;': '\u2a8c',
+        'gel;': '\u22db',
+        'geq;': '\u2265',
+        'geqq;': '\u2267',
+        'geqslant;': '\u2a7e',
+        'ges;': '\u2a7e',
+        'gescc;': '\u2aa9',
+        'gesdot;': '\u2a80',
+        'gesdoto;': '\u2a82',
+        'gesdotol;': '\u2a84',
+        'gesl;': '\u22db\ufe00',
+        'gesles;': '\u2a94',
+        'Gfr;': '\U0001d50a',
+        'gfr;': '\U0001d524',
+        'Gg;': '\u22d9',
+        'gg;': '\u226b',
+        'ggg;': '\u22d9',
+        'gimel;': '\u2137',
+        'GJcy;': '\u0403',
+        'gjcy;': '\u0453',
+        'gl;': '\u2277',
+        'gla;': '\u2aa5',
+        'glE;': '\u2a92',
+        'glj;': '\u2aa4',
+        'gnap;': '\u2a8a',
+        'gnapprox;': '\u2a8a',
+        'gnE;': '\u2269',
+        'gne;': '\u2a88',
+        'gneq;': '\u2a88',
+        'gneqq;': '\u2269',
+        'gnsim;': '\u22e7',
+        'Gopf;': '\U0001d53e',
+        'gopf;': '\U0001d558',
+        'grave;': '`',
+        'GreaterEqual;': '\u2265',
+        'GreaterEqualLess;': '\u22db',
+        'GreaterFullEqual;': '\u2267',
+        'GreaterGreater;': '\u2aa2',
+        'GreaterLess;': '\u2277',
+        'GreaterSlantEqual;': '\u2a7e',
+        'GreaterTilde;': '\u2273',
+        'Gscr;': '\U0001d4a2',
+        'gscr;': '\u210a',
+        'gsim;': '\u2273',
+        'gsime;': '\u2a8e',
+        'gsiml;': '\u2a90',
+        'GT': '>',
+        'gt': '>',
+        'GT;': '>',
+        'Gt;': '\u226b',
+        'gt;': '>',
+        'gtcc;': '\u2aa7',
+        'gtcir;': '\u2a7a',
+        'gtdot;': '\u22d7',
+        'gtlPar;': '\u2995',
+        'gtquest;': '\u2a7c',
+        'gtrapprox;': '\u2a86',
+        'gtrarr;': '\u2978',
+        'gtrdot;': '\u22d7',
+        'gtreqless;': '\u22db',
+        'gtreqqless;': '\u2a8c',
+        'gtrless;': '\u2277',
+        'gtrsim;': '\u2273',
+        'gvertneqq;': '\u2269\ufe00',
+        'gvnE;': '\u2269\ufe00',
+        'Hacek;': '\u02c7',
+        'hairsp;': '\u200a',
+        'half;': '\xbd',
+        'hamilt;': '\u210b',
+        'HARDcy;': '\u042a',
+        'hardcy;': '\u044a',
+        'hArr;': '\u21d4',
+        'harr;': '\u2194',
+        'harrcir;': '\u2948',
+        'harrw;': '\u21ad',
+        'Hat;': '^',
+        'hbar;': '\u210f',
+        'Hcirc;': '\u0124',
+        'hcirc;': '\u0125',
+        'hearts;': '\u2665',
+        'heartsuit;': '\u2665',
+        'hellip;': '\u2026',
+        'hercon;': '\u22b9',
+        'Hfr;': '\u210c',
+        'hfr;': '\U0001d525',
+        'HilbertSpace;': '\u210b',
+        'hksearow;': '\u2925',
+        'hkswarow;': '\u2926',
+        'hoarr;': '\u21ff',
+        'homtht;': '\u223b',
+        'hookleftarrow;': '\u21a9',
+        'hookrightarrow;': '\u21aa',
+        'Hopf;': '\u210d',
+        'hopf;': '\U0001d559',
+        'horbar;': '\u2015',
+        'HorizontalLine;': '\u2500',
+        'Hscr;': '\u210b',
+        'hscr;': '\U0001d4bd',
+        'hslash;': '\u210f',
+        'Hstrok;': '\u0126',
+        'hstrok;': '\u0127',
+        'HumpDownHump;': '\u224e',
+        'HumpEqual;': '\u224f',
+        'hybull;': '\u2043',
+        'hyphen;': '\u2010',
+        'Iacute': '\xcd',
+        'iacute': '\xed',
+        'Iacute;': '\xcd',
+        'iacute;': '\xed',
+        'ic;': '\u2063',
+        'Icirc': '\xce',
+        'icirc': '\xee',
+        'Icirc;': '\xce',
+        'icirc;': '\xee',
+        'Icy;': '\u0418',
+        'icy;': '\u0438',
+        'Idot;': '\u0130',
+        'IEcy;': '\u0415',
+        'iecy;': '\u0435',
+        'iexcl': '\xa1',
+        'iexcl;': '\xa1',
+        'iff;': '\u21d4',
+        'Ifr;': '\u2111',
+        'ifr;': '\U0001d526',
+        'Igrave': '\xcc',
+        'igrave': '\xec',
+        'Igrave;': '\xcc',
+        'igrave;': '\xec',
+        'ii;': '\u2148',
+        'iiiint;': '\u2a0c',
+        'iiint;': '\u222d',
+        'iinfin;': '\u29dc',
+        'iiota;': '\u2129',
+        'IJlig;': '\u0132',
+        'ijlig;': '\u0133',
+        'Im;': '\u2111',
+        'Imacr;': '\u012a',
+        'imacr;': '\u012b',
+        'image;': '\u2111',
+        'ImaginaryI;': '\u2148',
+        'imagline;': '\u2110',
+        'imagpart;': '\u2111',
+        'imath;': '\u0131',
+        'imof;': '\u22b7',
+        'imped;': '\u01b5',
+        'Implies;': '\u21d2',
+        'in;': '\u2208',
+        'incare;': '\u2105',
+        'infin;': '\u221e',
+        'infintie;': '\u29dd',
+        'inodot;': '\u0131',
+        'Int;': '\u222c',
+        'int;': '\u222b',
+        'intcal;': '\u22ba',
+        'integers;': '\u2124',
+        'Integral;': '\u222b',
+        'intercal;': '\u22ba',
+        'Intersection;': '\u22c2',
+        'intlarhk;': '\u2a17',
+        'intprod;': '\u2a3c',
+        'InvisibleComma;': '\u2063',
+        'InvisibleTimes;': '\u2062',
+        'IOcy;': '\u0401',
+        'iocy;': '\u0451',
+        'Iogon;': '\u012e',
+        'iogon;': '\u012f',
+        'Iopf;': '\U0001d540',
+        'iopf;': '\U0001d55a',
+        'Iota;': '\u0399',
+        'iota;': '\u03b9',
+        'iprod;': '\u2a3c',
+        'iquest': '\xbf',
+        'iquest;': '\xbf',
+        'Iscr;': '\u2110',
+        'iscr;': '\U0001d4be',
+        'isin;': '\u2208',
+        'isindot;': '\u22f5',
+        'isinE;': '\u22f9',
+        'isins;': '\u22f4',
+        'isinsv;': '\u22f3',
+        'isinv;': '\u2208',
+        'it;': '\u2062',
+        'Itilde;': '\u0128',
+        'itilde;': '\u0129',
+        'Iukcy;': '\u0406',
+        'iukcy;': '\u0456',
+        'Iuml': '\xcf',
+        'iuml': '\xef',
+        'Iuml;': '\xcf',
+        'iuml;': '\xef',
+        'Jcirc;': '\u0134',
+        'jcirc;': '\u0135',
+        'Jcy;': '\u0419',
+        'jcy;': '\u0439',
+        'Jfr;': '\U0001d50d',
+        'jfr;': '\U0001d527',
+        'jmath;': '\u0237',
+        'Jopf;': '\U0001d541',
+        'jopf;': '\U0001d55b',
+        'Jscr;': '\U0001d4a5',
+        'jscr;': '\U0001d4bf',
+        'Jsercy;': '\u0408',
+        'jsercy;': '\u0458',
+        'Jukcy;': '\u0404',
+        'jukcy;': '\u0454',
+        'Kappa;': '\u039a',
+        'kappa;': '\u03ba',
+        'kappav;': '\u03f0',
+        'Kcedil;': '\u0136',
+        'kcedil;': '\u0137',
+        'Kcy;': '\u041a',
+        'kcy;': '\u043a',
+        'Kfr;': '\U0001d50e',
+        'kfr;': '\U0001d528',
+        'kgreen;': '\u0138',
+        'KHcy;': '\u0425',
+        'khcy;': '\u0445',
+        'KJcy;': '\u040c',
+        'kjcy;': '\u045c',
+        'Kopf;': '\U0001d542',
+        'kopf;': '\U0001d55c',
+        'Kscr;': '\U0001d4a6',
+        'kscr;': '\U0001d4c0',
+        'lAarr;': '\u21da',
+        'Lacute;': '\u0139',
+        'lacute;': '\u013a',
+        'laemptyv;': '\u29b4',
+        'lagran;': '\u2112',
+        'Lambda;': '\u039b',
+        'lambda;': '\u03bb',
+        'Lang;': '\u27ea',
+        'lang;': '\u27e8',
+        'langd;': '\u2991',
+        'langle;': '\u27e8',
+        'lap;': '\u2a85',
+        'Laplacetrf;': '\u2112',
+        'laquo': '\xab',
+        'laquo;': '\xab',
+        'Larr;': '\u219e',
+        'lArr;': '\u21d0',
+        'larr;': '\u2190',
+        'larrb;': '\u21e4',
+        'larrbfs;': '\u291f',
+        'larrfs;': '\u291d',
+        'larrhk;': '\u21a9',
+        'larrlp;': '\u21ab',
+        'larrpl;': '\u2939',
+        'larrsim;': '\u2973',
+        'larrtl;': '\u21a2',
+        'lat;': '\u2aab',
+        'lAtail;': '\u291b',
+        'latail;': '\u2919',
+        'late;': '\u2aad',
+        'lates;': '\u2aad\ufe00',
+        'lBarr;': '\u290e',
+        'lbarr;': '\u290c',
+        'lbbrk;': '\u2772',
+        'lbrace;': '{',
+        'lbrack;': '[',
+        'lbrke;': '\u298b',
+        'lbrksld;': '\u298f',
+        'lbrkslu;': '\u298d',
+        'Lcaron;': '\u013d',
+        'lcaron;': '\u013e',
+        'Lcedil;': '\u013b',
+        'lcedil;': '\u013c',
+        'lceil;': '\u2308',
+        'lcub;': '{',
+        'Lcy;': '\u041b',
+        'lcy;': '\u043b',
+        'ldca;': '\u2936',
+        'ldquo;': '\u201c',
+        'ldquor;': '\u201e',
+        'ldrdhar;': '\u2967',
+        'ldrushar;': '\u294b',
+        'ldsh;': '\u21b2',
+        'lE;': '\u2266',
+        'le;': '\u2264',
+        'LeftAngleBracket;': '\u27e8',
+        'LeftArrow;': '\u2190',
+        'Leftarrow;': '\u21d0',
+        'leftarrow;': '\u2190',
+        'LeftArrowBar;': '\u21e4',
+        'LeftArrowRightArrow;': '\u21c6',
+        'leftarrowtail;': '\u21a2',
+        'LeftCeiling;': '\u2308',
+        'LeftDoubleBracket;': '\u27e6',
+        'LeftDownTeeVector;': '\u2961',
+        'LeftDownVector;': '\u21c3',
+        'LeftDownVectorBar;': '\u2959',
+        'LeftFloor;': '\u230a',
+        'leftharpoondown;': '\u21bd',
+        'leftharpoonup;': '\u21bc',
+        'leftleftarrows;': '\u21c7',
+        'LeftRightArrow;': '\u2194',
+        'Leftrightarrow;': '\u21d4',
+        'leftrightarrow;': '\u2194',
+        'leftrightarrows;': '\u21c6',
+        'leftrightharpoons;': '\u21cb',
+        'leftrightsquigarrow;': '\u21ad',
+        'LeftRightVector;': '\u294e',
+        'LeftTee;': '\u22a3',
+        'LeftTeeArrow;': '\u21a4',
+        'LeftTeeVector;': '\u295a',
+        'leftthreetimes;': '\u22cb',
+        'LeftTriangle;': '\u22b2',
+        'LeftTriangleBar;': '\u29cf',
+        'LeftTriangleEqual;': '\u22b4',
+        'LeftUpDownVector;': '\u2951',
+        'LeftUpTeeVector;': '\u2960',
+        'LeftUpVector;': '\u21bf',
+        'LeftUpVectorBar;': '\u2958',
+        'LeftVector;': '\u21bc',
+        'LeftVectorBar;': '\u2952',
+        'lEg;': '\u2a8b',
+        'leg;': '\u22da',
+        'leq;': '\u2264',
+        'leqq;': '\u2266',
+        'leqslant;': '\u2a7d',
+        'les;': '\u2a7d',
+        'lescc;': '\u2aa8',
+        'lesdot;': '\u2a7f',
+        'lesdoto;': '\u2a81',
+        'lesdotor;': '\u2a83',
+        'lesg;': '\u22da\ufe00',
+        'lesges;': '\u2a93',
+        'lessapprox;': '\u2a85',
+        'lessdot;': '\u22d6',
+        'lesseqgtr;': '\u22da',
+        'lesseqqgtr;': '\u2a8b',
+        'LessEqualGreater;': '\u22da',
+        'LessFullEqual;': '\u2266',
+        'LessGreater;': '\u2276',
+        'lessgtr;': '\u2276',
+        'LessLess;': '\u2aa1',
+        'lesssim;': '\u2272',
+        'LessSlantEqual;': '\u2a7d',
+        'LessTilde;': '\u2272',
+        'lfisht;': '\u297c',
+        'lfloor;': '\u230a',
+        'Lfr;': '\U0001d50f',
+        'lfr;': '\U0001d529',
+        'lg;': '\u2276',
+        'lgE;': '\u2a91',
+        'lHar;': '\u2962',
+        'lhard;': '\u21bd',
+        'lharu;': '\u21bc',
+        'lharul;': '\u296a',
+        'lhblk;': '\u2584',
+        'LJcy;': '\u0409',
+        'ljcy;': '\u0459',
+        'Ll;': '\u22d8',
+        'll;': '\u226a',
+        'llarr;': '\u21c7',
+        'llcorner;': '\u231e',
+        'Lleftarrow;': '\u21da',
+        'llhard;': '\u296b',
+        'lltri;': '\u25fa',
+        'Lmidot;': '\u013f',
+        'lmidot;': '\u0140',
+        'lmoust;': '\u23b0',
+        'lmoustache;': '\u23b0',
+        'lnap;': '\u2a89',
+        'lnapprox;': '\u2a89',
+        'lnE;': '\u2268',
+        'lne;': '\u2a87',
+        'lneq;': '\u2a87',
+        'lneqq;': '\u2268',
+        'lnsim;': '\u22e6',
+        'loang;': '\u27ec',
+        'loarr;': '\u21fd',
+        'lobrk;': '\u27e6',
+        'LongLeftArrow;': '\u27f5',
+        'Longleftarrow;': '\u27f8',
+        'longleftarrow;': '\u27f5',
+        'LongLeftRightArrow;': '\u27f7',
+        'Longleftrightarrow;': '\u27fa',
+        'longleftrightarrow;': '\u27f7',
+        'longmapsto;': '\u27fc',
+        'LongRightArrow;': '\u27f6',
+        'Longrightarrow;': '\u27f9',
+        'longrightarrow;': '\u27f6',
+        'looparrowleft;': '\u21ab',
+        'looparrowright;': '\u21ac',
+        'lopar;': '\u2985',
+        'Lopf;': '\U0001d543',
+        'lopf;': '\U0001d55d',
+        'loplus;': '\u2a2d',
+        'lotimes;': '\u2a34',
+        'lowast;': '\u2217',
+        'lowbar;': '_',
+        'LowerLeftArrow;': '\u2199',
+        'LowerRightArrow;': '\u2198',
+        'loz;': '\u25ca',
+        'lozenge;': '\u25ca',
+        'lozf;': '\u29eb',
+        'lpar;': '(',
+        'lparlt;': '\u2993',
+        'lrarr;': '\u21c6',
+        'lrcorner;': '\u231f',
+        'lrhar;': '\u21cb',
+        'lrhard;': '\u296d',
+        'lrm;': '\u200e',
+        'lrtri;': '\u22bf',
+        'lsaquo;': '\u2039',
+        'Lscr;': '\u2112',
+        'lscr;': '\U0001d4c1',
+        'Lsh;': '\u21b0',
+        'lsh;': '\u21b0',
+        'lsim;': '\u2272',
+        'lsime;': '\u2a8d',
+        'lsimg;': '\u2a8f',
+        'lsqb;': '[',
+        'lsquo;': '\u2018',
+        'lsquor;': '\u201a',
+        'Lstrok;': '\u0141',
+        'lstrok;': '\u0142',
+        'LT': '<',
+        'lt': '<',
+        'LT;': '<',
+        'Lt;': '\u226a',
+        'lt;': '<',
+        'ltcc;': '\u2aa6',
+        'ltcir;': '\u2a79',
+        'ltdot;': '\u22d6',
+        'lthree;': '\u22cb',
+        'ltimes;': '\u22c9',
+        'ltlarr;': '\u2976',
+        'ltquest;': '\u2a7b',
+        'ltri;': '\u25c3',
+        'ltrie;': '\u22b4',
+        'ltrif;': '\u25c2',
+        'ltrPar;': '\u2996',
+        'lurdshar;': '\u294a',
+        'luruhar;': '\u2966',
+        'lvertneqq;': '\u2268\ufe00',
+        'lvnE;': '\u2268\ufe00',
+        'macr': '\xaf',
+        'macr;': '\xaf',
+        'male;': '\u2642',
+        'malt;': '\u2720',
+        'maltese;': '\u2720',
+        'Map;': '\u2905',
+        'map;': '\u21a6',
+        'mapsto;': '\u21a6',
+        'mapstodown;': '\u21a7',
+        'mapstoleft;': '\u21a4',
+        'mapstoup;': '\u21a5',
+        'marker;': '\u25ae',
+        'mcomma;': '\u2a29',
+        'Mcy;': '\u041c',
+        'mcy;': '\u043c',
+        'mdash;': '\u2014',
+        'mDDot;': '\u223a',
+        'measuredangle;': '\u2221',
+        'MediumSpace;': '\u205f',
+        'Mellintrf;': '\u2133',
+        'Mfr;': '\U0001d510',
+        'mfr;': '\U0001d52a',
+        'mho;': '\u2127',
+        'micro': '\xb5',
+        'micro;': '\xb5',
+        'mid;': '\u2223',
+        'midast;': '*',
+        'midcir;': '\u2af0',
+        'middot': '\xb7',
+        'middot;': '\xb7',
+        'minus;': '\u2212',
+        'minusb;': '\u229f',
+        'minusd;': '\u2238',
+        'minusdu;': '\u2a2a',
+        'MinusPlus;': '\u2213',
+        'mlcp;': '\u2adb',
+        'mldr;': '\u2026',
+        'mnplus;': '\u2213',
+        'models;': '\u22a7',
+        'Mopf;': '\U0001d544',
+        'mopf;': '\U0001d55e',
+        'mp;': '\u2213',
+        'Mscr;': '\u2133',
+        'mscr;': '\U0001d4c2',
+        'mstpos;': '\u223e',
+        'Mu;': '\u039c',
+        'mu;': '\u03bc',
+        'multimap;': '\u22b8',
+        'mumap;': '\u22b8',
+        'nabla;': '\u2207',
+        'Nacute;': '\u0143',
+        'nacute;': '\u0144',
+        'nang;': '\u2220\u20d2',
+        'nap;': '\u2249',
+        'napE;': '\u2a70\u0338',
+        'napid;': '\u224b\u0338',
+        'napos;': '\u0149',
+        'napprox;': '\u2249',
+        'natur;': '\u266e',
+        'natural;': '\u266e',
+        'naturals;': '\u2115',
+        'nbsp': '\xa0',
+        'nbsp;': '\xa0',
+        'nbump;': '\u224e\u0338',
+        'nbumpe;': '\u224f\u0338',
+        'ncap;': '\u2a43',
+        'Ncaron;': '\u0147',
+        'ncaron;': '\u0148',
+        'Ncedil;': '\u0145',
+        'ncedil;': '\u0146',
+        'ncong;': '\u2247',
+        'ncongdot;': '\u2a6d\u0338',
+        'ncup;': '\u2a42',
+        'Ncy;': '\u041d',
+        'ncy;': '\u043d',
+        'ndash;': '\u2013',
+        'ne;': '\u2260',
+        'nearhk;': '\u2924',
+        'neArr;': '\u21d7',
+        'nearr;': '\u2197',
+        'nearrow;': '\u2197',
+        'nedot;': '\u2250\u0338',
+        'NegativeMediumSpace;': '\u200b',
+        'NegativeThickSpace;': '\u200b',
+        'NegativeThinSpace;': '\u200b',
+        'NegativeVeryThinSpace;': '\u200b',
+        'nequiv;': '\u2262',
+        'nesear;': '\u2928',
+        'nesim;': '\u2242\u0338',
+        'NestedGreaterGreater;': '\u226b',
+        'NestedLessLess;': '\u226a',
+        'NewLine;': '\n',
+        'nexist;': '\u2204',
+        'nexists;': '\u2204',
+        'Nfr;': '\U0001d511',
+        'nfr;': '\U0001d52b',
+        'ngE;': '\u2267\u0338',
+        'nge;': '\u2271',
+        'ngeq;': '\u2271',
+        'ngeqq;': '\u2267\u0338',
+        'ngeqslant;': '\u2a7e\u0338',
+        'nges;': '\u2a7e\u0338',
+        'nGg;': '\u22d9\u0338',
+        'ngsim;': '\u2275',
+        'nGt;': '\u226b\u20d2',
+        'ngt;': '\u226f',
+        'ngtr;': '\u226f',
+        'nGtv;': '\u226b\u0338',
+        'nhArr;': '\u21ce',
+        'nharr;': '\u21ae',
+        'nhpar;': '\u2af2',
+        'ni;': '\u220b',
+        'nis;': '\u22fc',
+        'nisd;': '\u22fa',
+        'niv;': '\u220b',
+        'NJcy;': '\u040a',
+        'njcy;': '\u045a',
+        'nlArr;': '\u21cd',
+        'nlarr;': '\u219a',
+        'nldr;': '\u2025',
+        'nlE;': '\u2266\u0338',
+        'nle;': '\u2270',
+        'nLeftarrow;': '\u21cd',
+        'nleftarrow;': '\u219a',
+        'nLeftrightarrow;': '\u21ce',
+        'nleftrightarrow;': '\u21ae',
+        'nleq;': '\u2270',
+        'nleqq;': '\u2266\u0338',
+        'nleqslant;': '\u2a7d\u0338',
+        'nles;': '\u2a7d\u0338',
+        'nless;': '\u226e',
+        'nLl;': '\u22d8\u0338',
+        'nlsim;': '\u2274',
+        'nLt;': '\u226a\u20d2',
+        'nlt;': '\u226e',
+        'nltri;': '\u22ea',
+        'nltrie;': '\u22ec',
+        'nLtv;': '\u226a\u0338',
+        'nmid;': '\u2224',
+        'NoBreak;': '\u2060',
+        'NonBreakingSpace;': '\xa0',
+        'Nopf;': '\u2115',
+        'nopf;': '\U0001d55f',
+        'not': '\xac',
+        'Not;': '\u2aec',
+        'not;': '\xac',
+        'NotCongruent;': '\u2262',
+        'NotCupCap;': '\u226d',
+        'NotDoubleVerticalBar;': '\u2226',
+        'NotElement;': '\u2209',
+        'NotEqual;': '\u2260',
+        'NotEqualTilde;': '\u2242\u0338',
+        'NotExists;': '\u2204',
+        'NotGreater;': '\u226f',
+        'NotGreaterEqual;': '\u2271',
+        'NotGreaterFullEqual;': '\u2267\u0338',
+        'NotGreaterGreater;': '\u226b\u0338',
+        'NotGreaterLess;': '\u2279',
+        'NotGreaterSlantEqual;': '\u2a7e\u0338',
+        'NotGreaterTilde;': '\u2275',
+        'NotHumpDownHump;': '\u224e\u0338',
+        'NotHumpEqual;': '\u224f\u0338',
+        'notin;': '\u2209',
+        'notindot;': '\u22f5\u0338',
+        'notinE;': '\u22f9\u0338',
+        'notinva;': '\u2209',
+        'notinvb;': '\u22f7',
+        'notinvc;': '\u22f6',
+        'NotLeftTriangle;': '\u22ea',
+        'NotLeftTriangleBar;': '\u29cf\u0338',
+        'NotLeftTriangleEqual;': '\u22ec',
+        'NotLess;': '\u226e',
+        'NotLessEqual;': '\u2270',
+        'NotLessGreater;': '\u2278',
+        'NotLessLess;': '\u226a\u0338',
+        'NotLessSlantEqual;': '\u2a7d\u0338',
+        'NotLessTilde;': '\u2274',
+        'NotNestedGreaterGreater;': '\u2aa2\u0338',
+        'NotNestedLessLess;': '\u2aa1\u0338',
+        'notni;': '\u220c',
+        'notniva;': '\u220c',
+        'notnivb;': '\u22fe',
+        'notnivc;': '\u22fd',
+        'NotPrecedes;': '\u2280',
+        'NotPrecedesEqual;': '\u2aaf\u0338',
+        'NotPrecedesSlantEqual;': '\u22e0',
+        'NotReverseElement;': '\u220c',
+        'NotRightTriangle;': '\u22eb',
+        'NotRightTriangleBar;': '\u29d0\u0338',
+        'NotRightTriangleEqual;': '\u22ed',
+        'NotSquareSubset;': '\u228f\u0338',
+        'NotSquareSubsetEqual;': '\u22e2',
+        'NotSquareSuperset;': '\u2290\u0338',
+        'NotSquareSupersetEqual;': '\u22e3',
+        'NotSubset;': '\u2282\u20d2',
+        'NotSubsetEqual;': '\u2288',
+        'NotSucceeds;': '\u2281',
+        'NotSucceedsEqual;': '\u2ab0\u0338',
+        'NotSucceedsSlantEqual;': '\u22e1',
+        'NotSucceedsTilde;': '\u227f\u0338',
+        'NotSuperset;': '\u2283\u20d2',
+        'NotSupersetEqual;': '\u2289',
+        'NotTilde;': '\u2241',
+        'NotTildeEqual;': '\u2244',
+        'NotTildeFullEqual;': '\u2247',
+        'NotTildeTilde;': '\u2249',
+        'NotVerticalBar;': '\u2224',
+        'npar;': '\u2226',
+        'nparallel;': '\u2226',
+        'nparsl;': '\u2afd\u20e5',
+        'npart;': '\u2202\u0338',
+        'npolint;': '\u2a14',
+        'npr;': '\u2280',
+        'nprcue;': '\u22e0',
+        'npre;': '\u2aaf\u0338',
+        'nprec;': '\u2280',
+        'npreceq;': '\u2aaf\u0338',
+        'nrArr;': '\u21cf',
+        'nrarr;': '\u219b',
+        'nrarrc;': '\u2933\u0338',
+        'nrarrw;': '\u219d\u0338',
+        'nRightarrow;': '\u21cf',
+        'nrightarrow;': '\u219b',
+        'nrtri;': '\u22eb',
+        'nrtrie;': '\u22ed',
+        'nsc;': '\u2281',
+        'nsccue;': '\u22e1',
+        'nsce;': '\u2ab0\u0338',
+        'Nscr;': '\U0001d4a9',
+        'nscr;': '\U0001d4c3',
+        'nshortmid;': '\u2224',
+        'nshortparallel;': '\u2226',
+        'nsim;': '\u2241',
+        'nsime;': '\u2244',
+        'nsimeq;': '\u2244',
+        'nsmid;': '\u2224',
+        'nspar;': '\u2226',
+        'nsqsube;': '\u22e2',
+        'nsqsupe;': '\u22e3',
+        'nsub;': '\u2284',
+        'nsubE;': '\u2ac5\u0338',
+        'nsube;': '\u2288',
+        'nsubset;': '\u2282\u20d2',
+        'nsubseteq;': '\u2288',
+        'nsubseteqq;': '\u2ac5\u0338',
+        'nsucc;': '\u2281',
+        'nsucceq;': '\u2ab0\u0338',
+        'nsup;': '\u2285',
+        'nsupE;': '\u2ac6\u0338',
+        'nsupe;': '\u2289',
+        'nsupset;': '\u2283\u20d2',
+        'nsupseteq;': '\u2289',
+        'nsupseteqq;': '\u2ac6\u0338',
+        'ntgl;': '\u2279',
+        'Ntilde': '\xd1',
+        'ntilde': '\xf1',
+        'Ntilde;': '\xd1',
+        'ntilde;': '\xf1',
+        'ntlg;': '\u2278',
+        'ntriangleleft;': '\u22ea',
+        'ntrianglelefteq;': '\u22ec',
+        'ntriangleright;': '\u22eb',
+        'ntrianglerighteq;': '\u22ed',
+        'Nu;': '\u039d',
+        'nu;': '\u03bd',
+        'num;': '#',
+        'numero;': '\u2116',
+        'numsp;': '\u2007',
+        'nvap;': '\u224d\u20d2',
+        'nVDash;': '\u22af',
+        'nVdash;': '\u22ae',
+        'nvDash;': '\u22ad',
+        'nvdash;': '\u22ac',
+        'nvge;': '\u2265\u20d2',
+        'nvgt;': '>\u20d2',
+        'nvHarr;': '\u2904',
+        'nvinfin;': '\u29de',
+        'nvlArr;': '\u2902',
+        'nvle;': '\u2264\u20d2',
+        'nvlt;': '<\u20d2',
+        'nvltrie;': '\u22b4\u20d2',
+        'nvrArr;': '\u2903',
+        'nvrtrie;': '\u22b5\u20d2',
+        'nvsim;': '\u223c\u20d2',
+        'nwarhk;': '\u2923',
+        'nwArr;': '\u21d6',
+        'nwarr;': '\u2196',
+        'nwarrow;': '\u2196',
+        'nwnear;': '\u2927',
+        'Oacute': '\xd3',
+        'oacute': '\xf3',
+        'Oacute;': '\xd3',
+        'oacute;': '\xf3',
+        'oast;': '\u229b',
+        'ocir;': '\u229a',
+        'Ocirc': '\xd4',
+        'ocirc': '\xf4',
+        'Ocirc;': '\xd4',
+        'ocirc;': '\xf4',
+        'Ocy;': '\u041e',
+        'ocy;': '\u043e',
+        'odash;': '\u229d',
+        'Odblac;': '\u0150',
+        'odblac;': '\u0151',
+        'odiv;': '\u2a38',
+        'odot;': '\u2299',
+        'odsold;': '\u29bc',
+        'OElig;': '\u0152',
+        'oelig;': '\u0153',
+        'ofcir;': '\u29bf',
+        'Ofr;': '\U0001d512',
+        'ofr;': '\U0001d52c',
+        'ogon;': '\u02db',
+        'Ograve': '\xd2',
+        'ograve': '\xf2',
+        'Ograve;': '\xd2',
+        'ograve;': '\xf2',
+        'ogt;': '\u29c1',
+        'ohbar;': '\u29b5',
+        'ohm;': '\u03a9',
+        'oint;': '\u222e',
+        'olarr;': '\u21ba',
+        'olcir;': '\u29be',
+        'olcross;': '\u29bb',
+        'oline;': '\u203e',
+        'olt;': '\u29c0',
+        'Omacr;': '\u014c',
+        'omacr;': '\u014d',
+        'Omega;': '\u03a9',
+        'omega;': '\u03c9',
+        'Omicron;': '\u039f',
+        'omicron;': '\u03bf',
+        'omid;': '\u29b6',
+        'ominus;': '\u2296',
+        'Oopf;': '\U0001d546',
+        'oopf;': '\U0001d560',
+        'opar;': '\u29b7',
+        'OpenCurlyDoubleQuote;': '\u201c',
+        'OpenCurlyQuote;': '\u2018',
+        'operp;': '\u29b9',
+        'oplus;': '\u2295',
+        'Or;': '\u2a54',
+        'or;': '\u2228',
+        'orarr;': '\u21bb',
+        'ord;': '\u2a5d',
+        'order;': '\u2134',
+        'orderof;': '\u2134',
+        'ordf': '\xaa',
+        'ordf;': '\xaa',
+        'ordm': '\xba',
+        'ordm;': '\xba',
+        'origof;': '\u22b6',
+        'oror;': '\u2a56',
+        'orslope;': '\u2a57',
+        'orv;': '\u2a5b',
+        'oS;': '\u24c8',
+        'Oscr;': '\U0001d4aa',
+        'oscr;': '\u2134',
+        'Oslash': '\xd8',
+        'oslash': '\xf8',
+        'Oslash;': '\xd8',
+        'oslash;': '\xf8',
+        'osol;': '\u2298',
+        'Otilde': '\xd5',
+        'otilde': '\xf5',
+        'Otilde;': '\xd5',
+        'otilde;': '\xf5',
+        'Otimes;': '\u2a37',
+        'otimes;': '\u2297',
+        'otimesas;': '\u2a36',
+        'Ouml': '\xd6',
+        'ouml': '\xf6',
+        'Ouml;': '\xd6',
+        'ouml;': '\xf6',
+        'ovbar;': '\u233d',
+        'OverBar;': '\u203e',
+        'OverBrace;': '\u23de',
+        'OverBracket;': '\u23b4',
+        'OverParenthesis;': '\u23dc',
+        'par;': '\u2225',
+        'para': '\xb6',
+        'para;': '\xb6',
+        'parallel;': '\u2225',
+        'parsim;': '\u2af3',
+        'parsl;': '\u2afd',
+        'part;': '\u2202',
+        'PartialD;': '\u2202',
+        'Pcy;': '\u041f',
+        'pcy;': '\u043f',
+        'percnt;': '%',
+        'period;': '.',
+        'permil;': '\u2030',
+        'perp;': '\u22a5',
+        'pertenk;': '\u2031',
+        'Pfr;': '\U0001d513',
+        'pfr;': '\U0001d52d',
+        'Phi;': '\u03a6',
+        'phi;': '\u03c6',
+        'phiv;': '\u03d5',
+        'phmmat;': '\u2133',
+        'phone;': '\u260e',
+        'Pi;': '\u03a0',
+        'pi;': '\u03c0',
+        'pitchfork;': '\u22d4',
+        'piv;': '\u03d6',
+        'planck;': '\u210f',
+        'planckh;': '\u210e',
+        'plankv;': '\u210f',
+        'plus;': '+',
+        'plusacir;': '\u2a23',
+        'plusb;': '\u229e',
+        'pluscir;': '\u2a22',
+        'plusdo;': '\u2214',
+        'plusdu;': '\u2a25',
+        'pluse;': '\u2a72',
+        'PlusMinus;': '\xb1',
+        'plusmn': '\xb1',
+        'plusmn;': '\xb1',
+        'plussim;': '\u2a26',
+        'plustwo;': '\u2a27',
+        'pm;': '\xb1',
+        'Poincareplane;': '\u210c',
+        'pointint;': '\u2a15',
+        'Popf;': '\u2119',
+        'popf;': '\U0001d561',
+        'pound': '\xa3',
+        'pound;': '\xa3',
+        'Pr;': '\u2abb',
+        'pr;': '\u227a',
+        'prap;': '\u2ab7',
+        'prcue;': '\u227c',
+        'prE;': '\u2ab3',
+        'pre;': '\u2aaf',
+        'prec;': '\u227a',
+        'precapprox;': '\u2ab7',
+        'preccurlyeq;': '\u227c',
+        'Precedes;': '\u227a',
+        'PrecedesEqual;': '\u2aaf',
+        'PrecedesSlantEqual;': '\u227c',
+        'PrecedesTilde;': '\u227e',
+        'preceq;': '\u2aaf',
+        'precnapprox;': '\u2ab9',
+        'precneqq;': '\u2ab5',
+        'precnsim;': '\u22e8',
+        'precsim;': '\u227e',
+        'Prime;': '\u2033',
+        'prime;': '\u2032',
+        'primes;': '\u2119',
+        'prnap;': '\u2ab9',
+        'prnE;': '\u2ab5',
+        'prnsim;': '\u22e8',
+        'prod;': '\u220f',
+        'Product;': '\u220f',
+        'profalar;': '\u232e',
+        'profline;': '\u2312',
+        'profsurf;': '\u2313',
+        'prop;': '\u221d',
+        'Proportion;': '\u2237',
+        'Proportional;': '\u221d',
+        'propto;': '\u221d',
+        'prsim;': '\u227e',
+        'prurel;': '\u22b0',
+        'Pscr;': '\U0001d4ab',
+        'pscr;': '\U0001d4c5',
+        'Psi;': '\u03a8',
+        'psi;': '\u03c8',
+        'puncsp;': '\u2008',
+        'Qfr;': '\U0001d514',
+        'qfr;': '\U0001d52e',
+        'qint;': '\u2a0c',
+        'Qopf;': '\u211a',
+        'qopf;': '\U0001d562',
+        'qprime;': '\u2057',
+        'Qscr;': '\U0001d4ac',
+        'qscr;': '\U0001d4c6',
+        'quaternions;': '\u210d',
+        'quatint;': '\u2a16',
+        'quest;': '?',
+        'questeq;': '\u225f',
+        'QUOT': '"',
+        'quot': '"',
+        'QUOT;': '"',
+        'quot;': '"',
+        'rAarr;': '\u21db',
+        'race;': '\u223d\u0331',
+        'Racute;': '\u0154',
+        'racute;': '\u0155',
+        'radic;': '\u221a',
+        'raemptyv;': '\u29b3',
+        'Rang;': '\u27eb',
+        'rang;': '\u27e9',
+        'rangd;': '\u2992',
+        'range;': '\u29a5',
+        'rangle;': '\u27e9',
+        'raquo': '\xbb',
+        'raquo;': '\xbb',
+        'Rarr;': '\u21a0',
+        'rArr;': '\u21d2',
+        'rarr;': '\u2192',
+        'rarrap;': '\u2975',
+        'rarrb;': '\u21e5',
+        'rarrbfs;': '\u2920',
+        'rarrc;': '\u2933',
+        'rarrfs;': '\u291e',
+        'rarrhk;': '\u21aa',
+        'rarrlp;': '\u21ac',
+        'rarrpl;': '\u2945',
+        'rarrsim;': '\u2974',
+        'Rarrtl;': '\u2916',
+        'rarrtl;': '\u21a3',
+        'rarrw;': '\u219d',
+        'rAtail;': '\u291c',
+        'ratail;': '\u291a',
+        'ratio;': '\u2236',
+        'rationals;': '\u211a',
+        'RBarr;': '\u2910',
+        'rBarr;': '\u290f',
+        'rbarr;': '\u290d',
+        'rbbrk;': '\u2773',
+        'rbrace;': '}',
+        'rbrack;': ']',
+        'rbrke;': '\u298c',
+        'rbrksld;': '\u298e',
+        'rbrkslu;': '\u2990',
+        'Rcaron;': '\u0158',
+        'rcaron;': '\u0159',
+        'Rcedil;': '\u0156',
+        'rcedil;': '\u0157',
+        'rceil;': '\u2309',
+        'rcub;': '}',
+        'Rcy;': '\u0420',
+        'rcy;': '\u0440',
+        'rdca;': '\u2937',
+        'rdldhar;': '\u2969',
+        'rdquo;': '\u201d',
+        'rdquor;': '\u201d',
+        'rdsh;': '\u21b3',
+        'Re;': '\u211c',
+        'real;': '\u211c',
+        'realine;': '\u211b',
+        'realpart;': '\u211c',
+        'reals;': '\u211d',
+        'rect;': '\u25ad',
+        'REG': '\xae',
+        'reg': '\xae',
+        'REG;': '\xae',
+        'reg;': '\xae',
+        'ReverseElement;': '\u220b',
+        'ReverseEquilibrium;': '\u21cb',
+        'ReverseUpEquilibrium;': '\u296f',
+        'rfisht;': '\u297d',
+        'rfloor;': '\u230b',
+        'Rfr;': '\u211c',
+        'rfr;': '\U0001d52f',
+        'rHar;': '\u2964',
+        'rhard;': '\u21c1',
+        'rharu;': '\u21c0',
+        'rharul;': '\u296c',
+        'Rho;': '\u03a1',
+        'rho;': '\u03c1',
+        'rhov;': '\u03f1',
+        'RightAngleBracket;': '\u27e9',
+        'RightArrow;': '\u2192',
+        'Rightarrow;': '\u21d2',
+        'rightarrow;': '\u2192',
+        'RightArrowBar;': '\u21e5',
+        'RightArrowLeftArrow;': '\u21c4',
+        'rightarrowtail;': '\u21a3',
+        'RightCeiling;': '\u2309',
+        'RightDoubleBracket;': '\u27e7',
+        'RightDownTeeVector;': '\u295d',
+        'RightDownVector;': '\u21c2',
+        'RightDownVectorBar;': '\u2955',
+        'RightFloor;': '\u230b',
+        'rightharpoondown;': '\u21c1',
+        'rightharpoonup;': '\u21c0',
+        'rightleftarrows;': '\u21c4',
+        'rightleftharpoons;': '\u21cc',
+        'rightrightarrows;': '\u21c9',
+        'rightsquigarrow;': '\u219d',
+        'RightTee;': '\u22a2',
+        'RightTeeArrow;': '\u21a6',
+        'RightTeeVector;': '\u295b',
+        'rightthreetimes;': '\u22cc',
+        'RightTriangle;': '\u22b3',
+        'RightTriangleBar;': '\u29d0',
+        'RightTriangleEqual;': '\u22b5',
+        'RightUpDownVector;': '\u294f',
+        'RightUpTeeVector;': '\u295c',
+        'RightUpVector;': '\u21be',
+        'RightUpVectorBar;': '\u2954',
+        'RightVector;': '\u21c0',
+        'RightVectorBar;': '\u2953',
+        'ring;': '\u02da',
+        'risingdotseq;': '\u2253',
+        'rlarr;': '\u21c4',
+        'rlhar;': '\u21cc',
+        'rlm;': '\u200f',
+        'rmoust;': '\u23b1',
+        'rmoustache;': '\u23b1',
+        'rnmid;': '\u2aee',
+        'roang;': '\u27ed',
+        'roarr;': '\u21fe',
+        'robrk;': '\u27e7',
+        'ropar;': '\u2986',
+        'Ropf;': '\u211d',
+        'ropf;': '\U0001d563',
+        'roplus;': '\u2a2e',
+        'rotimes;': '\u2a35',
+        'RoundImplies;': '\u2970',
+        'rpar;': ')',
+        'rpargt;': '\u2994',
+        'rppolint;': '\u2a12',
+        'rrarr;': '\u21c9',
+        'Rrightarrow;': '\u21db',
+        'rsaquo;': '\u203a',
+        'Rscr;': '\u211b',
+        'rscr;': '\U0001d4c7',
+        'Rsh;': '\u21b1',
+        'rsh;': '\u21b1',
+        'rsqb;': ']',
+        'rsquo;': '\u2019',
+        'rsquor;': '\u2019',
+        'rthree;': '\u22cc',
+        'rtimes;': '\u22ca',
+        'rtri;': '\u25b9',
+        'rtrie;': '\u22b5',
+        'rtrif;': '\u25b8',
+        'rtriltri;': '\u29ce',
+        'RuleDelayed;': '\u29f4',
+        'ruluhar;': '\u2968',
+        'rx;': '\u211e',
+        'Sacute;': '\u015a',
+        'sacute;': '\u015b',
+        'sbquo;': '\u201a',
+        'Sc;': '\u2abc',
+        'sc;': '\u227b',
+        'scap;': '\u2ab8',
+        'Scaron;': '\u0160',
+        'scaron;': '\u0161',
+        'sccue;': '\u227d',
+        'scE;': '\u2ab4',
+        'sce;': '\u2ab0',
+        'Scedil;': '\u015e',
+        'scedil;': '\u015f',
+        'Scirc;': '\u015c',
+        'scirc;': '\u015d',
+        'scnap;': '\u2aba',
+        'scnE;': '\u2ab6',
+        'scnsim;': '\u22e9',
+        'scpolint;': '\u2a13',
+        'scsim;': '\u227f',
+        'Scy;': '\u0421',
+        'scy;': '\u0441',
+        'sdot;': '\u22c5',
+        'sdotb;': '\u22a1',
+        'sdote;': '\u2a66',
+        'searhk;': '\u2925',
+        'seArr;': '\u21d8',
+        'searr;': '\u2198',
+        'searrow;': '\u2198',
+        'sect': '\xa7',
+        'sect;': '\xa7',
+        'semi;': ';',
+        'seswar;': '\u2929',
+        'setminus;': '\u2216',
+        'setmn;': '\u2216',
+        'sext;': '\u2736',
+        'Sfr;': '\U0001d516',
+        'sfr;': '\U0001d530',
+        'sfrown;': '\u2322',
+        'sharp;': '\u266f',
+        'SHCHcy;': '\u0429',
+        'shchcy;': '\u0449',
+        'SHcy;': '\u0428',
+        'shcy;': '\u0448',
+        'ShortDownArrow;': '\u2193',
+        'ShortLeftArrow;': '\u2190',
+        'shortmid;': '\u2223',
+        'shortparallel;': '\u2225',
+        'ShortRightArrow;': '\u2192',
+        'ShortUpArrow;': '\u2191',
+        'shy': '\xad',
+        'shy;': '\xad',
+        'Sigma;': '\u03a3',
+        'sigma;': '\u03c3',
+        'sigmaf;': '\u03c2',
+        'sigmav;': '\u03c2',
+        'sim;': '\u223c',
+        'simdot;': '\u2a6a',
+        'sime;': '\u2243',
+        'simeq;': '\u2243',
+        'simg;': '\u2a9e',
+        'simgE;': '\u2aa0',
+        'siml;': '\u2a9d',
+        'simlE;': '\u2a9f',
+        'simne;': '\u2246',
+        'simplus;': '\u2a24',
+        'simrarr;': '\u2972',
+        'slarr;': '\u2190',
+        'SmallCircle;': '\u2218',
+        'smallsetminus;': '\u2216',
+        'smashp;': '\u2a33',
+        'smeparsl;': '\u29e4',
+        'smid;': '\u2223',
+        'smile;': '\u2323',
+        'smt;': '\u2aaa',
+        'smte;': '\u2aac',
+        'smtes;': '\u2aac\ufe00',
+        'SOFTcy;': '\u042c',
+        'softcy;': '\u044c',
+        'sol;': '/',
+        'solb;': '\u29c4',
+        'solbar;': '\u233f',
+        'Sopf;': '\U0001d54a',
+        'sopf;': '\U0001d564',
+        'spades;': '\u2660',
+        'spadesuit;': '\u2660',
+        'spar;': '\u2225',
+        'sqcap;': '\u2293',
+        'sqcaps;': '\u2293\ufe00',
+        'sqcup;': '\u2294',
+        'sqcups;': '\u2294\ufe00',
+        'Sqrt;': '\u221a',
+        'sqsub;': '\u228f',
+        'sqsube;': '\u2291',
+        'sqsubset;': '\u228f',
+        'sqsubseteq;': '\u2291',
+        'sqsup;': '\u2290',
+        'sqsupe;': '\u2292',
+        'sqsupset;': '\u2290',
+        'sqsupseteq;': '\u2292',
+        'squ;': '\u25a1',
+        'Square;': '\u25a1',
+        'square;': '\u25a1',
+        'SquareIntersection;': '\u2293',
+        'SquareSubset;': '\u228f',
+        'SquareSubsetEqual;': '\u2291',
+        'SquareSuperset;': '\u2290',
+        'SquareSupersetEqual;': '\u2292',
+        'SquareUnion;': '\u2294',
+        'squarf;': '\u25aa',
+        'squf;': '\u25aa',
+        'srarr;': '\u2192',
+        'Sscr;': '\U0001d4ae',
+        'sscr;': '\U0001d4c8',
+        'ssetmn;': '\u2216',
+        'ssmile;': '\u2323',
+        'sstarf;': '\u22c6',
+        'Star;': '\u22c6',
+        'star;': '\u2606',
+        'starf;': '\u2605',
+        'straightepsilon;': '\u03f5',
+        'straightphi;': '\u03d5',
+        'strns;': '\xaf',
+        'Sub;': '\u22d0',
+        'sub;': '\u2282',
+        'subdot;': '\u2abd',
+        'subE;': '\u2ac5',
+        'sube;': '\u2286',
+        'subedot;': '\u2ac3',
+        'submult;': '\u2ac1',
+        'subnE;': '\u2acb',
+        'subne;': '\u228a',
+        'subplus;': '\u2abf',
+        'subrarr;': '\u2979',
+        'Subset;': '\u22d0',
+        'subset;': '\u2282',
+        'subseteq;': '\u2286',
+        'subseteqq;': '\u2ac5',
+        'SubsetEqual;': '\u2286',
+        'subsetneq;': '\u228a',
+        'subsetneqq;': '\u2acb',
+        'subsim;': '\u2ac7',
+        'subsub;': '\u2ad5',
+        'subsup;': '\u2ad3',
+        'succ;': '\u227b',
+        'succapprox;': '\u2ab8',
+        'succcurlyeq;': '\u227d',
+        'Succeeds;': '\u227b',
+        'SucceedsEqual;': '\u2ab0',
+        'SucceedsSlantEqual;': '\u227d',
+        'SucceedsTilde;': '\u227f',
+        'succeq;': '\u2ab0',
+        'succnapprox;': '\u2aba',
+        'succneqq;': '\u2ab6',
+        'succnsim;': '\u22e9',
+        'succsim;': '\u227f',
+        'SuchThat;': '\u220b',
+        'Sum;': '\u2211',
+        'sum;': '\u2211',
+        'sung;': '\u266a',
+        'sup1': '\xb9',
+        'sup1;': '\xb9',
+        'sup2': '\xb2',
+        'sup2;': '\xb2',
+        'sup3': '\xb3',
+        'sup3;': '\xb3',
+        'Sup;': '\u22d1',
+        'sup;': '\u2283',
+        'supdot;': '\u2abe',
+        'supdsub;': '\u2ad8',
+        'supE;': '\u2ac6',
+        'supe;': '\u2287',
+        'supedot;': '\u2ac4',
+        'Superset;': '\u2283',
+        'SupersetEqual;': '\u2287',
+        'suphsol;': '\u27c9',
+        'suphsub;': '\u2ad7',
+        'suplarr;': '\u297b',
+        'supmult;': '\u2ac2',
+        'supnE;': '\u2acc',
+        'supne;': '\u228b',
+        'supplus;': '\u2ac0',
+        'Supset;': '\u22d1',
+        'supset;': '\u2283',
+        'supseteq;': '\u2287',
+        'supseteqq;': '\u2ac6',
+        'supsetneq;': '\u228b',
+        'supsetneqq;': '\u2acc',
+        'supsim;': '\u2ac8',
+        'supsub;': '\u2ad4',
+        'supsup;': '\u2ad6',
+        'swarhk;': '\u2926',
+        'swArr;': '\u21d9',
+        'swarr;': '\u2199',
+        'swarrow;': '\u2199',
+        'swnwar;': '\u292a',
+        'szlig': '\xdf',
+        'szlig;': '\xdf',
+        'Tab;': '\t',
+        'target;': '\u2316',
+        'Tau;': '\u03a4',
+        'tau;': '\u03c4',
+        'tbrk;': '\u23b4',
+        'Tcaron;': '\u0164',
+        'tcaron;': '\u0165',
+        'Tcedil;': '\u0162',
+        'tcedil;': '\u0163',
+        'Tcy;': '\u0422',
+        'tcy;': '\u0442',
+        'tdot;': '\u20db',
+        'telrec;': '\u2315',
+        'Tfr;': '\U0001d517',
+        'tfr;': '\U0001d531',
+        'there4;': '\u2234',
+        'Therefore;': '\u2234',
+        'therefore;': '\u2234',
+        'Theta;': '\u0398',
+        'theta;': '\u03b8',
+        'thetasym;': '\u03d1',
+        'thetav;': '\u03d1',
+        'thickapprox;': '\u2248',
+        'thicksim;': '\u223c',
+        'ThickSpace;': '\u205f\u200a',
+        'thinsp;': '\u2009',
+        'ThinSpace;': '\u2009',
+        'thkap;': '\u2248',
+        'thksim;': '\u223c',
+        'THORN': '\xde',
+        'thorn': '\xfe',
+        'THORN;': '\xde',
+        'thorn;': '\xfe',
+        'Tilde;': '\u223c',
+        'tilde;': '\u02dc',
+        'TildeEqual;': '\u2243',
+        'TildeFullEqual;': '\u2245',
+        'TildeTilde;': '\u2248',
+        'times': '\xd7',
+        'times;': '\xd7',
+        'timesb;': '\u22a0',
+        'timesbar;': '\u2a31',
+        'timesd;': '\u2a30',
+        'tint;': '\u222d',
+        'toea;': '\u2928',
+        'top;': '\u22a4',
+        'topbot;': '\u2336',
+        'topcir;': '\u2af1',
+        'Topf;': '\U0001d54b',
+        'topf;': '\U0001d565',
+        'topfork;': '\u2ada',
+        'tosa;': '\u2929',
+        'tprime;': '\u2034',
+        'TRADE;': '\u2122',
+        'trade;': '\u2122',
+        'triangle;': '\u25b5',
+        'triangledown;': '\u25bf',
+        'triangleleft;': '\u25c3',
+        'trianglelefteq;': '\u22b4',
+        'triangleq;': '\u225c',
+        'triangleright;': '\u25b9',
+        'trianglerighteq;': '\u22b5',
+        'tridot;': '\u25ec',
+        'trie;': '\u225c',
+        'triminus;': '\u2a3a',
+        'TripleDot;': '\u20db',
+        'triplus;': '\u2a39',
+        'trisb;': '\u29cd',
+        'tritime;': '\u2a3b',
+        'trpezium;': '\u23e2',
+        'Tscr;': '\U0001d4af',
+        'tscr;': '\U0001d4c9',
+        'TScy;': '\u0426',
+        'tscy;': '\u0446',
+        'TSHcy;': '\u040b',
+        'tshcy;': '\u045b',
+        'Tstrok;': '\u0166',
+        'tstrok;': '\u0167',
+        'twixt;': '\u226c',
+        'twoheadleftarrow;': '\u219e',
+        'twoheadrightarrow;': '\u21a0',
+        'Uacute': '\xda',
+        'uacute': '\xfa',
+        'Uacute;': '\xda',
+        'uacute;': '\xfa',
+        'Uarr;': '\u219f',
+        'uArr;': '\u21d1',
+        'uarr;': '\u2191',
+        'Uarrocir;': '\u2949',
+        'Ubrcy;': '\u040e',
+        'ubrcy;': '\u045e',
+        'Ubreve;': '\u016c',
+        'ubreve;': '\u016d',
+        'Ucirc': '\xdb',
+        'ucirc': '\xfb',
+        'Ucirc;': '\xdb',
+        'ucirc;': '\xfb',
+        'Ucy;': '\u0423',
+        'ucy;': '\u0443',
+        'udarr;': '\u21c5',
+        'Udblac;': '\u0170',
+        'udblac;': '\u0171',
+        'udhar;': '\u296e',
+        'ufisht;': '\u297e',
+        'Ufr;': '\U0001d518',
+        'ufr;': '\U0001d532',
+        'Ugrave': '\xd9',
+        'ugrave': '\xf9',
+        'Ugrave;': '\xd9',
+        'ugrave;': '\xf9',
+        'uHar;': '\u2963',
+        'uharl;': '\u21bf',
+        'uharr;': '\u21be',
+        'uhblk;': '\u2580',
+        'ulcorn;': '\u231c',
+        'ulcorner;': '\u231c',
+        'ulcrop;': '\u230f',
+        'ultri;': '\u25f8',
+        'Umacr;': '\u016a',
+        'umacr;': '\u016b',
+        'uml': '\xa8',
+        'uml;': '\xa8',
+        'UnderBar;': '_',
+        'UnderBrace;': '\u23df',
+        'UnderBracket;': '\u23b5',
+        'UnderParenthesis;': '\u23dd',
+        'Union;': '\u22c3',
+        'UnionPlus;': '\u228e',
+        'Uogon;': '\u0172',
+        'uogon;': '\u0173',
+        'Uopf;': '\U0001d54c',
+        'uopf;': '\U0001d566',
+        'UpArrow;': '\u2191',
+        'Uparrow;': '\u21d1',
+        'uparrow;': '\u2191',
+        'UpArrowBar;': '\u2912',
+        'UpArrowDownArrow;': '\u21c5',
+        'UpDownArrow;': '\u2195',
+        'Updownarrow;': '\u21d5',
+        'updownarrow;': '\u2195',
+        'UpEquilibrium;': '\u296e',
+        'upharpoonleft;': '\u21bf',
+        'upharpoonright;': '\u21be',
+        'uplus;': '\u228e',
+        'UpperLeftArrow;': '\u2196',
+        'UpperRightArrow;': '\u2197',
+        'Upsi;': '\u03d2',
+        'upsi;': '\u03c5',
+        'upsih;': '\u03d2',
+        'Upsilon;': '\u03a5',
+        'upsilon;': '\u03c5',
+        'UpTee;': '\u22a5',
+        'UpTeeArrow;': '\u21a5',
+        'upuparrows;': '\u21c8',
+        'urcorn;': '\u231d',
+        'urcorner;': '\u231d',
+        'urcrop;': '\u230e',
+        'Uring;': '\u016e',
+        'uring;': '\u016f',
+        'urtri;': '\u25f9',
+        'Uscr;': '\U0001d4b0',
+        'uscr;': '\U0001d4ca',
+        'utdot;': '\u22f0',
+        'Utilde;': '\u0168',
+        'utilde;': '\u0169',
+        'utri;': '\u25b5',
+        'utrif;': '\u25b4',
+        'uuarr;': '\u21c8',
+        'Uuml': '\xdc',
+        'uuml': '\xfc',
+        'Uuml;': '\xdc',
+        'uuml;': '\xfc',
+        'uwangle;': '\u29a7',
+        'vangrt;': '\u299c',
+        'varepsilon;': '\u03f5',
+        'varkappa;': '\u03f0',
+        'varnothing;': '\u2205',
+        'varphi;': '\u03d5',
+        'varpi;': '\u03d6',
+        'varpropto;': '\u221d',
+        'vArr;': '\u21d5',
+        'varr;': '\u2195',
+        'varrho;': '\u03f1',
+        'varsigma;': '\u03c2',
+        'varsubsetneq;': '\u228a\ufe00',
+        'varsubsetneqq;': '\u2acb\ufe00',
+        'varsupsetneq;': '\u228b\ufe00',
+        'varsupsetneqq;': '\u2acc\ufe00',
+        'vartheta;': '\u03d1',
+        'vartriangleleft;': '\u22b2',
+        'vartriangleright;': '\u22b3',
+        'Vbar;': '\u2aeb',
+        'vBar;': '\u2ae8',
+        'vBarv;': '\u2ae9',
+        'Vcy;': '\u0412',
+        'vcy;': '\u0432',
+        'VDash;': '\u22ab',
+        'Vdash;': '\u22a9',
+        'vDash;': '\u22a8',
+        'vdash;': '\u22a2',
+        'Vdashl;': '\u2ae6',
+        'Vee;': '\u22c1',
+        'vee;': '\u2228',
+        'veebar;': '\u22bb',
+        'veeeq;': '\u225a',
+        'vellip;': '\u22ee',
+        'Verbar;': '\u2016',
+        'verbar;': '|',
+        'Vert;': '\u2016',
+        'vert;': '|',
+        'VerticalBar;': '\u2223',
+        'VerticalLine;': '|',
+        'VerticalSeparator;': '\u2758',
+        'VerticalTilde;': '\u2240',
+        'VeryThinSpace;': '\u200a',
+        'Vfr;': '\U0001d519',
+        'vfr;': '\U0001d533',
+        'vltri;': '\u22b2',
+        'vnsub;': '\u2282\u20d2',
+        'vnsup;': '\u2283\u20d2',
+        'Vopf;': '\U0001d54d',
+        'vopf;': '\U0001d567',
+        'vprop;': '\u221d',
+        'vrtri;': '\u22b3',
+        'Vscr;': '\U0001d4b1',
+        'vscr;': '\U0001d4cb',
+        'vsubnE;': '\u2acb\ufe00',
+        'vsubne;': '\u228a\ufe00',
+        'vsupnE;': '\u2acc\ufe00',
+        'vsupne;': '\u228b\ufe00',
+        'Vvdash;': '\u22aa',
+        'vzigzag;': '\u299a',
+        'Wcirc;': '\u0174',
+        'wcirc;': '\u0175',
+        'wedbar;': '\u2a5f',
+        'Wedge;': '\u22c0',
+        'wedge;': '\u2227',
+        'wedgeq;': '\u2259',
+        'weierp;': '\u2118',
+        'Wfr;': '\U0001d51a',
+        'wfr;': '\U0001d534',
+        'Wopf;': '\U0001d54e',
+        'wopf;': '\U0001d568',
+        'wp;': '\u2118',
+        'wr;': '\u2240',
+        'wreath;': '\u2240',
+        'Wscr;': '\U0001d4b2',
+        'wscr;': '\U0001d4cc',
+        'xcap;': '\u22c2',
+        'xcirc;': '\u25ef',
+        'xcup;': '\u22c3',
+        'xdtri;': '\u25bd',
+        'Xfr;': '\U0001d51b',
+        'xfr;': '\U0001d535',
+        'xhArr;': '\u27fa',
+        'xharr;': '\u27f7',
+        'Xi;': '\u039e',
+        'xi;': '\u03be',
+        'xlArr;': '\u27f8',
+        'xlarr;': '\u27f5',
+        'xmap;': '\u27fc',
+        'xnis;': '\u22fb',
+        'xodot;': '\u2a00',
+        'Xopf;': '\U0001d54f',
+        'xopf;': '\U0001d569',
+        'xoplus;': '\u2a01',
+        'xotime;': '\u2a02',
+        'xrArr;': '\u27f9',
+        'xrarr;': '\u27f6',
+        'Xscr;': '\U0001d4b3',
+        'xscr;': '\U0001d4cd',
+        'xsqcup;': '\u2a06',
+        'xuplus;': '\u2a04',
+        'xutri;': '\u25b3',
+        'xvee;': '\u22c1',
+        'xwedge;': '\u22c0',
+        'Yacute': '\xdd',
+        'yacute': '\xfd',
+        'Yacute;': '\xdd',
+        'yacute;': '\xfd',
+        'YAcy;': '\u042f',
+        'yacy;': '\u044f',
+        'Ycirc;': '\u0176',
+        'ycirc;': '\u0177',
+        'Ycy;': '\u042b',
+        'ycy;': '\u044b',
+        'yen': '\xa5',
+        'yen;': '\xa5',
+        'Yfr;': '\U0001d51c',
+        'yfr;': '\U0001d536',
+        'YIcy;': '\u0407',
+        'yicy;': '\u0457',
+        'Yopf;': '\U0001d550',
+        'yopf;': '\U0001d56a',
+        'Yscr;': '\U0001d4b4',
+        'yscr;': '\U0001d4ce',
+        'YUcy;': '\u042e',
+        'yucy;': '\u044e',
+        'yuml': '\xff',
+        'Yuml;': '\u0178',
+        'yuml;': '\xff',
+        'Zacute;': '\u0179',
+        'zacute;': '\u017a',
+        'Zcaron;': '\u017d',
+        'zcaron;': '\u017e',
+        'Zcy;': '\u0417',
+        'zcy;': '\u0437',
+        'Zdot;': '\u017b',
+        'zdot;': '\u017c',
+        'zeetrf;': '\u2128',
+        'ZeroWidthSpace;': '\u200b',
+        'Zeta;': '\u0396',
+        'zeta;': '\u03b6',
+        'Zfr;': '\u2128',
+        'zfr;': '\U0001d537',
+        'ZHcy;': '\u0416',
+        'zhcy;': '\u0436',
+        'zigrarr;': '\u21dd',
+        'Zopf;': '\u2124',
+        'zopf;': '\U0001d56b',
+        'Zscr;': '\U0001d4b5',
+        'zscr;': '\U0001d4cf',
+        'zwj;': '\u200d',
+        'zwnj;': '\u200c',
+    }
+
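+# The table above mirrors the WHATWG HTML5 named character reference list;
+# entries without a trailing ';' are the legacy entities that are also
+# recognised unterminated (e.g. both 'euml' and 'euml;' map to '\xeb',
+# while 'euro' is only valid as 'euro;').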
+try:
+    import http.client as compat_http_client
+except ImportError:  # Python 2
+    import httplib as compat_http_client
+
+try:
+    from urllib.error import HTTPError as compat_HTTPError
+except ImportError:  # Python 2
+    from urllib2 import HTTPError as compat_HTTPError
+
+try:
+    from urllib.request import urlretrieve as compat_urlretrieve
+except ImportError:  # Python 2
+    from urllib import urlretrieve as compat_urlretrieve
+
+try:
+    from html.parser import HTMLParser as compat_HTMLParser
+except ImportError:  # Python 2
+    from HTMLParser import HTMLParser as compat_HTMLParser
+
+try:  # Python 2
+    from HTMLParser import HTMLParseError as compat_HTMLParseError
+except ImportError:  # Python 3
+    try:
+        from html.parser import HTMLParseError as compat_HTMLParseError
+    except ImportError:  # Python >= 3.5
+
+        # HTMLParseError was deprecated in Python 3.3 and removed in
+        # Python 3.5. Introduce a dummy exception for Python >= 3.5 for
+        # compatible and uniform cross-version exception handling
+        class compat_HTMLParseError(Exception):
+            pass
+
+try:
+    from subprocess import DEVNULL
+    compat_subprocess_get_DEVNULL = lambda: DEVNULL
+except ImportError:
+    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
+
+try:
+    import http.server as compat_http_server
+except ImportError:
+    import BaseHTTPServer as compat_http_server
+
+try:
+    compat_str = unicode  # Python 2
+except NameError:
+    compat_str = str
+
+try:
+    from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes
+    from urllib.parse import unquote as compat_urllib_parse_unquote
+    from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus
+except ImportError:  # Python 2
+    _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire')
+                else re.compile(r'([\x00-\x7f]+)'))
+
+    # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus
+    # implementations from cpython 3.4.3's stdlib. Python 2's version
+    # is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244)
+
+    def compat_urllib_parse_unquote_to_bytes(string):
+        """unquote_to_bytes('abc%20def') -> b'abc def'."""
+        # Note: strings are encoded as UTF-8. This is only an issue if it contains
+        # unescaped non-ASCII characters, which URIs should not.
+        if not string:
+            # Is it a string-like object?
+            string.split
+            return b''
+        if isinstance(string, compat_str):
+            string = string.encode('utf-8')
+        bits = string.split(b'%')
+        if len(bits) == 1:
+            return string
+        res = [bits[0]]
+        append = res.append
+        for item in bits[1:]:
+            try:
+                append(compat_urllib_parse._hextochr[item[:2]])
+                append(item[2:])
+            except KeyError:
+                append(b'%')
+                append(item)
+        return b''.join(res)
+
+    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
+        """Replace %xx escapes by their single-character equivalent. The optional
+        encoding and errors parameters specify how to decode percent-encoded
+        sequences into Unicode characters, as accepted by the bytes.decode()
+        method.
+        By default, percent-encoded sequences are decoded with UTF-8, and invalid
+        sequences are replaced by a placeholder character.
+
+        unquote('abc%20def') -> 'abc def'.
+        """
+        if '%' not in string:
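+            # Is it a string-like object?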
+            string.split
+            return string
+        if encoding is None:
+            encoding = 'utf-8'
+        if errors is None:
+            errors = 'replace'
+        bits = _asciire.split(string)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'):
+        """Like unquote(), but also replace plus signs by spaces, as required for
+        unquoting HTML form values.
+
+        unquote_plus('%7e/abc+def') -> '~/abc def'
+        """
+        string = string.replace('+', ' ')
+        return compat_urllib_parse_unquote(string, encoding, errors)
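+    # Doctest-style sketch of the backports above (examples taken from the
+    # docstrings):
+    #
+    #     compat_urllib_parse_unquote_to_bytes('abc%20def')  -> b'abc def'
+    #     compat_urllib_parse_unquote('%E2%82%AC')           -> '\u20ac'
+    #     compat_urllib_parse_unquote_plus('%7e/abc+def')    -> '~/abc def'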
+
+try:
+    from urllib.parse import urlencode as compat_urllib_parse_urlencode
+except ImportError:  # Python 2
+    # Python 2 will choke in urlencode on a mixture of byte and unicode strings.
+    # Possible solutions are to either port it from Python 3 with all its
+    # dependencies or to manually ensure the input query contains only byte
+    # strings. We stick with the latter, recursively encoding the whole query.
+    def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'):
+        def encode_elem(e):
+            if isinstance(e, dict):
+                e = encode_dict(e)
+            elif isinstance(e, (list, tuple,)):
+                list_e = encode_list(e)
+                e = tuple(list_e) if isinstance(e, tuple) else list_e
+            elif isinstance(e, compat_str):
+                e = e.encode(encoding)
+            return e
+
+        def encode_dict(d):
+            return dict((encode_elem(k), encode_elem(v)) for k, v in d.items())
+
+        def encode_list(l):
+            return [encode_elem(e) for e in l]
+
+        return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq)
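+    # For example (illustrative values), a unicode value is encoded to UTF-8
+    # bytes before being handed to the stdlib urlencode:
+    #
+    #     compat_urllib_parse_urlencode({'q': u'caf\xe9'}) -> 'q=caf%C3%A9'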
+
+try:
+    from urllib.request import DataHandler as compat_urllib_request_DataHandler
+except ImportError:  # Python < 3.4
+    # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py
+    class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler):
+        def data_open(self, req):
+            # data URLs as specified in RFC 2397.
+            #
+            # ignores POSTed data
+            #
+            # syntax:
+            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
+            # mediatype := [ type "/" subtype ] *( ";" parameter )
+            # data      := *urlchar
+            # parameter := attribute "=" value
+            url = req.get_full_url()
+
+            scheme, data = url.split(':', 1)
+            mediatype, data = data.split(',', 1)
+
+            # even base64 encoded data URLs might be quoted so unquote in any case:
+            data = compat_urllib_parse_unquote_to_bytes(data)
+            if mediatype.endswith(';base64'):
+                data = binascii.a2b_base64(data)
+                mediatype = mediatype[:-7]
+
+            if not mediatype:
+                mediatype = 'text/plain;charset=US-ASCII'
+
+            headers = email.message_from_string(
+                'Content-type: %s\nContent-length: %d\n' % (mediatype, len(data)))
+
+            return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url)
+
+try:
+    compat_basestring = basestring  # Python 2
+except NameError:
+    compat_basestring = str
+
+try:
+    compat_chr = unichr  # Python 2
+except NameError:
+    compat_chr = chr
+
+try:
+    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
+except ImportError:  # Python 2.6
+    from xml.parsers.expat import ExpatError as compat_xml_parse_error
+
+
+etree = xml.etree.ElementTree
+
+
+class _TreeBuilder(etree.TreeBuilder):
+    def doctype(self, name, pubid, system):
+        pass
+
+
+try:
+    # xml.etree.ElementTree.Element is a factory function in Python <= 2.6 and
+    # the following will crash with:
+    #  TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types
+    isinstance(None, xml.etree.ElementTree.Element)
+    from xml.etree.ElementTree import Element as compat_etree_Element
+except TypeError:  # Python <=2.6
+    from xml.etree.ElementTree import _ElementInterface as compat_etree_Element
+
+if sys.version_info[0] >= 3:
+    def compat_etree_fromstring(text):
+        return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
+else:
+    # Python 2.x tries to encode unicode strings with ASCII (see the
+    # XMLParser._fixtext method)
+    try:
+        _etree_iter = etree.Element.iter
+    except AttributeError:  # Python <=2.6
+        def _etree_iter(root):
+            for el in root.findall('*'):
+                yield el
+                for sub in _etree_iter(el):
+                    yield sub
+
+    # on 2.6 XML doesn't have a parser argument, function copied from CPython
+    # 2.7 source
+    def _XML(text, parser=None):
+        if not parser:
+            parser = etree.XMLParser(target=_TreeBuilder())
+        parser.feed(text)
+        return parser.close()
+
+    def _element_factory(*args, **kwargs):
+        el = etree.Element(*args, **kwargs)
+        for k, v in el.items():
+            if isinstance(v, bytes):
+                el.set(k, v.decode('utf-8'))
+        return el
+
+    def compat_etree_fromstring(text):
+        doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))
+        for el in _etree_iter(doc):
+            if el.text is not None and isinstance(el.text, bytes):
+                el.text = el.text.decode('utf-8')
+        return doc
+
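+# Illustrative behaviour (a sketch; both branches above should agree):
+#   >>> el = compat_etree_fromstring('<root x="y">text</root>')
+#   >>> el.attrib['x'], el.text
+#   ('y', 'text')
+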
+if hasattr(etree, 'register_namespace'):
+    compat_etree_register_namespace = etree.register_namespace
+else:
+    def compat_etree_register_namespace(prefix, uri):
+        """Register a namespace prefix.
+        The registry is global, and any existing mapping for either the
+        given prefix or the namespace URI will be removed.
+        *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
+        attributes in this namespace will be serialized with prefix if possible.
+        ValueError is raised if prefix is reserved or is invalid.
+        """
+        if re.match(r"ns\d+$", prefix):
+            raise ValueError("Prefix format reserved for internal use")
+        for k, v in list(etree._namespace_map.items()):
+            if k == uri or v == prefix:
+                del etree._namespace_map[k]
+        etree._namespace_map[uri] = prefix
+
+if sys.version_info < (2, 7):
+    # Here comes the crazy part: in Python 2.6, if the xpath is a unicode
+    # string, './/node' does not match nodes that are direct children of '.'!
+    def compat_xpath(xpath):
+        if isinstance(xpath, compat_str):
+            xpath = xpath.encode('ascii')
+        return xpath
+else:
+    compat_xpath = lambda xpath: xpath
+
+try:
+    from urllib.parse import parse_qs as compat_parse_qs
+except ImportError:  # Python 2
+    # HACK: The following is the correct parse_qs implementation from CPython 3's stdlib.
+    # Python 2's version is broken (among other things, it lacks the encoding and errors parameters)
+
+    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
+                   encoding='utf-8', errors='replace'):
+        qs, _coerce_result = qs, compat_str
+        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+        r = []
+        for name_value in pairs:
+            if not name_value and not strict_parsing:
+                continue
+            nv = name_value.split('=', 1)
+            if len(nv) != 2:
+                if strict_parsing:
+                    raise ValueError('bad query field: %r' % (name_value,))
+                # Handle case of a control-name with no equal sign
+                if keep_blank_values:
+                    nv.append('')
+                else:
+                    continue
+            if len(nv[1]) or keep_blank_values:
+                name = nv[0].replace('+', ' ')
+                name = compat_urllib_parse_unquote(
+                    name, encoding=encoding, errors=errors)
+                name = _coerce_result(name)
+                value = nv[1].replace('+', ' ')
+                value = compat_urllib_parse_unquote(
+                    value, encoding=encoding, errors=errors)
+                value = _coerce_result(value)
+                r.append((name, value))
+        return r
+
+    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
+                        encoding='utf-8', errors='replace'):
+        parsed_result = {}
+        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
+                           encoding=encoding, errors=errors)
+        for name, value in pairs:
+            if name in parsed_result:
+                parsed_result[name].append(value)
+            else:
+                parsed_result[name] = [value]
+        return parsed_result
+
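+# Illustrative behaviour shared by both implementations (a sketch):
+#   >>> compat_parse_qs('a=1&a=2&b=')
+#   {'a': ['1', '2']}
+#   >>> compat_parse_qs('a=1&a=2&b=', keep_blank_values=True)
+#   {'a': ['1', '2'], 'b': ['']}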
+
+compat_os_name = os._name if os.name == 'java' else os.name
+
+
+if compat_os_name == 'nt':
+    def compat_shlex_quote(s):
+        return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')
+else:
+    try:
+        from shlex import quote as compat_shlex_quote
+    except ImportError:  # Python < 3.3
+        def compat_shlex_quote(s):
+            if re.match(r'^[-_\w./]+$', s):
+                return s
+            else:
+                return "'" + s.replace("'", "'\"'\"'") + "'"
+
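+# Illustrative behaviour on POSIX (a sketch; the Windows branch above quotes
+# with double quotes instead):
+#   >>> compat_shlex_quote('safe-name.txt')
+#   'safe-name.txt'
+#   >>> compat_shlex_quote('file name')
+#   "'file name'"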
+
+try:
+    args = shlex.split('中文')
+    assert (isinstance(args, list)
+            and isinstance(args[0], compat_str)
+            and args[0] == '中文')
+    compat_shlex_split = shlex.split
+except (AssertionError, UnicodeEncodeError):
+    # Works around a shlex issue with unicode strings on some Python 2
+    # versions (see http://bugs.python.org/issue1548891)
+    def compat_shlex_split(s, comments=False, posix=True):
+        if isinstance(s, compat_str):
+            s = s.encode('utf-8')
+        return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))
+
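+# Illustrative behaviour (a sketch):
+#   >>> compat_shlex_split('ffmpeg -i "my file.mp4"')
+#   ['ffmpeg', '-i', 'my file.mp4']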
+
+def compat_ord(c):
+    if type(c) is int:
+        return c
+    else:
+        return ord(c)
+
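+# Illustrative (a sketch): indexing a byte string yields str on Python 2 but
+# int on Python 3; compat_ord normalizes both to int:
+#   >>> compat_ord(b'\n'[0]) == compat_ord('\n') == 10
+#   True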
+
+if sys.version_info >= (3, 0):
+    compat_getenv = os.getenv
+    compat_expanduser = os.path.expanduser
+
+    def compat_setenv(key, value, env=os.environ):
+        env[key] = value
+else:
+    # Environment variables should be decoded with filesystem encoding.
+    # Otherwise it will fail if any non-ASCII characters are present (see #3854 #3217 #2918)
+
+    def compat_getenv(key, default=None):
+        from .utils import get_filesystem_encoding
+        env = os.getenv(key, default)
+        if env:
+            env = env.decode(get_filesystem_encoding())
+        return env
+
+    def compat_setenv(key, value, env=os.environ):
+        def encode(v):
+            from .utils import get_filesystem_encoding
+            return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
+        env[encode(key)] = encode(value)
+
+    # HACK: The default implementations of os.path.expanduser from cpython do not decode
+    # environment variables with filesystem encoding. We will work around this by
+    # providing adjusted implementations.
+    # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
+    # for different platforms with correct environment variables decoding.
+
+    if compat_os_name == 'posix':
+        def compat_expanduser(path):
+            """Expand ~ and ~user constructions.  If user or $HOME is unknown,
+            do nothing."""
+            if not path.startswith('~'):
+                return path
+            i = path.find('/', 1)
+            if i < 0:
+                i = len(path)
+            if i == 1:
+                if 'HOME' not in os.environ:
+                    import pwd
+                    userhome = pwd.getpwuid(os.getuid()).pw_dir
+                else:
+                    userhome = compat_getenv('HOME')
+            else:
+                import pwd
+                try:
+                    pwent = pwd.getpwnam(path[1:i])
+                except KeyError:
+                    return path
+                userhome = pwent.pw_dir
+            userhome = userhome.rstrip('/')
+            return (userhome + path[i:]) or '/'
+    elif compat_os_name in ('nt', 'ce'):
+        def compat_expanduser(path):
+            """Expand ~ and ~user constructs.
+
+            If user or $HOME is unknown, do nothing."""
+            if path[:1] != '~':
+                return path
+            i, n = 1, len(path)
+            while i < n and path[i] not in '/\\':
+                i = i + 1
+
+            if 'HOME' in os.environ:
+                userhome = compat_getenv('HOME')
+            elif 'USERPROFILE' in os.environ:
+                userhome = compat_getenv('USERPROFILE')
+            elif 'HOMEPATH' not in os.environ:
+                return path
+            else:
+                # compat_getenv returns None instead of raising KeyError, so
+                # fall back to an empty drive explicitly
+                drive = compat_getenv('HOMEDRIVE') or ''
+                userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
+
+            if i != 1:  # ~user
+                userhome = os.path.join(os.path.dirname(userhome), path[1:i])
+
+            return userhome + path[i:]
+    else:
+        compat_expanduser = os.path.expanduser
+
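+# Illustrative on POSIX (a sketch; the home directory below is an assumption):
+#   >>> compat_setenv('HOME', '/home/user')
+#   >>> compat_expanduser('~/downloads')
+#   '/home/user/downloads'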
+
+if compat_os_name == 'nt' and sys.version_info < (3, 8):
+    # os.path.realpath on Windows does not follow symbolic links
+    # prior to Python 3.8 (see https://bugs.python.org/issue9949)
+    def compat_realpath(path):
+        while os.path.islink(path):
+            path = os.path.abspath(os.readlink(path))
+        return path
+else:
+    compat_realpath = os.path.realpath
+
+
+if sys.version_info < (3, 0):
+    def compat_print(s):
+        from .utils import preferredencoding
+        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
+else:
+    def compat_print(s):
+        assert isinstance(s, compat_str)
+        print(s)
+
+
+if sys.version_info < (3, 0) and sys.platform == 'win32':
+    def compat_getpass(prompt, *args, **kwargs):
+        if isinstance(prompt, compat_str):
+            from .utils import preferredencoding
+            prompt = prompt.encode(preferredencoding())
+        return getpass.getpass(prompt, *args, **kwargs)
+else:
+    compat_getpass = getpass.getpass
+
+try:
+    compat_input = raw_input
+except NameError:  # Python 3
+    compat_input = input
+
+# Python < 2.6.5 requires kwargs to be bytes
+try:
+    def _testfunc(x):
+        pass
+    _testfunc(**{'x': 0})
+except TypeError:
+    def compat_kwargs(kwargs):
+        return dict((bytes(k), v) for k, v in kwargs.items())
+else:
+    compat_kwargs = lambda kwargs: kwargs
+
+
+try:
+    compat_numeric_types = (int, float, long, complex)
+except NameError:  # Python 3
+    compat_numeric_types = (int, float, complex)
+
+
+try:
+    compat_integer_types = (int, long)
+except NameError:  # Python 3
+    compat_integer_types = (int, )
+
+
+if sys.version_info < (2, 7):
+    def compat_socket_create_connection(address, timeout, source_address=None):
+        host, port = address
+        err = None
+        for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM):
+            af, socktype, proto, canonname, sa = res
+            sock = None
+            try:
+                sock = socket.socket(af, socktype, proto)
+                sock.settimeout(timeout)
+                if source_address:
+                    sock.bind(source_address)
+                sock.connect(sa)
+                return sock
+            except socket.error as _:
+                err = _
+                if sock is not None:
+                    sock.close()
+        if err is not None:
+            raise err
+        else:
+            raise socket.error('getaddrinfo returns an empty list')
+else:
+    compat_socket_create_connection = socket.create_connection
+
+
+# Fix https://github.com/ytdl-org/youtube-dl/issues/4223
+# See http://bugs.python.org/issue9161 for what is broken
+def workaround_optparse_bug9161():
+    op = optparse.OptionParser()
+    og = optparse.OptionGroup(op, 'foo')
+    try:
+        og.add_option('-t')
+    except TypeError:
+        real_add_option = optparse.OptionGroup.add_option
+
+        def _compat_add_option(self, *args, **kwargs):
+            enc = lambda v: (
+                v.encode('ascii', 'replace') if isinstance(v, compat_str)
+                else v)
+            bargs = [enc(a) for a in args]
+            bkwargs = dict(
+                (k, enc(v)) for k, v in kwargs.items())
+            return real_add_option(self, *bargs, **bkwargs)
+        optparse.OptionGroup.add_option = _compat_add_option
+
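+# Illustrative (a sketch; names are arbitrary): on an affected Python 2.x,
+# after calling workaround_optparse_bug9161() an option group accepts
+# non-ASCII help text, degraded to ASCII with replacement characters:
+#   workaround_optparse_bug9161()
+#   parser = optparse.OptionParser()
+#   group = optparse.OptionGroup(parser, 'group')
+#   group.add_option('-x', help=u'caf\xe9')  # no longer raises TypeError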
+
+if hasattr(shutil, 'get_terminal_size'):  # Python >= 3.3
+    compat_get_terminal_size = shutil.get_terminal_size
+else:
+    _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines'])
+
+    def compat_get_terminal_size(fallback=(80, 24)):
+        columns = compat_getenv('COLUMNS')
+        if columns:
+            columns = int(columns)
+        else:
+            columns = None
+        lines = compat_getenv('LINES')
+        if lines:
+            lines = int(lines)
+        else:
+            lines = None
+
+        if columns is None or lines is None or columns <= 0 or lines <= 0:
+            try:
+                sp = subprocess.Popen(
+                    ['stty', 'size'],
+                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                out, err = sp.communicate()
+                _lines, _columns = map(int, out.split())
+            except Exception:
+                _columns, _lines = _terminal_size(*fallback)
+
+            if columns is None or columns <= 0:
+                columns = _columns
+            if lines is None or lines <= 0:
+                lines = _lines
+        return _terminal_size(columns, lines)
+
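+# Illustrative (a sketch): when neither COLUMNS/LINES nor `stty size` yields a
+# usable value, the fallback is returned:
+#   >>> compat_get_terminal_size(fallback=(80, 24))  # in such an environment
+#   terminal_size(columns=80, lines=24)
+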
+try:
+    itertools.count(start=0, step=1)
+    compat_itertools_count = itertools.count
+except TypeError:  # Python 2.6
+    def compat_itertools_count(start=0, step=1):
+        n = start
+        while True:
+            yield n
+            n += step
+
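+# Illustrative (a sketch):
+#   >>> list(itertools.islice(compat_itertools_count(5, 2), 3))
+#   [5, 7, 9]
+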
+if sys.version_info >= (3, 0):
+    from tokenize import tokenize as compat_tokenize_tokenize
+else:
+    from tokenize import generate_tokens as compat_tokenize_tokenize
+
+
+try:
+    struct.pack('!I', 0)
+except TypeError:
+    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
+    # See https://bugs.python.org/issue19099
+    def compat_struct_pack(spec, *args):
+        if isinstance(spec, compat_str):
+            spec = spec.encode('ascii')
+        return struct.pack(spec, *args)
+
+    def compat_struct_unpack(spec, *args):
+        if isinstance(spec, compat_str):
+            spec = spec.encode('ascii')
+        return struct.unpack(spec, *args)
+
+    class compat_Struct(struct.Struct):
+        def __init__(self, fmt):
+            if isinstance(fmt, compat_str):
+                fmt = fmt.encode('ascii')
+            super(compat_Struct, self).__init__(fmt)
+else:
+    compat_struct_pack = struct.pack
+    compat_struct_unpack = struct.unpack
+    if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8):
+        class compat_Struct(struct.Struct):
+            def unpack(self, string):
+                if not isinstance(string, buffer):  # noqa: F821
+                    string = buffer(string)  # noqa: F821
+                return super(compat_Struct, self).unpack(string)
+    else:
+        compat_Struct = struct.Struct
+
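+# Illustrative (a sketch; the bytes values are identical on all supported
+# versions, Python 3 reprs shown):
+#   >>> compat_struct_pack('!I', 1)
+#   b'\x00\x00\x00\x01'
+#   >>> compat_struct_unpack('!I', b'\x00\x00\x00\x01')
+#   (1,)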
+
+try:
+    from future_builtins import zip as compat_zip
+except ImportError:  # Python < 2.6 or Python 3
+    try:
+        from itertools import izip as compat_zip  # Python 2 only
+    except ImportError:
+        compat_zip = zip
+
+
+if sys.version_info < (3, 3):
+    def compat_b64decode(s, *args, **kwargs):
+        if isinstance(s, compat_str):
+            s = s.encode('ascii')
+        return base64.b64decode(s, *args, **kwargs)
+else:
+    compat_b64decode = base64.b64decode
+
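+# Illustrative (a sketch): str input is accepted on every supported version:
+#   >>> compat_b64decode('aGVsbG8=')
+#   b'hello'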
+
+if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, 0):
+    # PyPy2 prior to version 5.4.0 expects byte strings as Windows function
+    # names, see the original PyPy issue [1] and the youtube-dl one [2].
+    # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name
+    # 2. https://github.com/ytdl-org/youtube-dl/pull/4392
+    def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
+        real = ctypes.WINFUNCTYPE(*args, **kwargs)
+
+        def resf(tpl, *args, **kwargs):
+            funcname, dll = tpl
+            return real((str(funcname), dll), *args, **kwargs)
+
+        return resf
+else:
+    def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
+        return ctypes.WINFUNCTYPE(*args, **kwargs)
+
+
+__all__ = [
+    'compat_HTMLParseError',
+    'compat_HTMLParser',
+    'compat_HTTPError',
+    'compat_Struct',
+    'compat_b64decode',
+    'compat_basestring',
+    'compat_chr',
+    'compat_cookiejar',
+    'compat_cookiejar_Cookie',
+    'compat_cookies',
+    'compat_ctypes_WINFUNCTYPE',
+    'compat_etree_Element',
+    'compat_etree_fromstring',
+    'compat_etree_register_namespace',
+    'compat_expanduser',
+    'compat_get_terminal_size',
+    'compat_getenv',
+    'compat_getpass',
+    'compat_html_entities',
+    'compat_html_entities_html5',
+    'compat_http_client',
+    'compat_http_server',
+    'compat_input',
+    'compat_integer_types',
+    'compat_itertools_count',
+    'compat_kwargs',
+    'compat_numeric_types',
+    'compat_ord',
+    'compat_os_name',
+    'compat_parse_qs',
+    'compat_print',
+    'compat_realpath',
+    'compat_setenv',
+    'compat_shlex_quote',
+    'compat_shlex_split',
+    'compat_socket_create_connection',
+    'compat_str',
+    'compat_struct_pack',
+    'compat_struct_unpack',
+    'compat_subprocess_get_DEVNULL',
+    'compat_tokenize_tokenize',
+    'compat_urllib_error',
+    'compat_urllib_parse',
+    'compat_urllib_parse_unquote',
+    'compat_urllib_parse_unquote_plus',
+    'compat_urllib_parse_unquote_to_bytes',
+    'compat_urllib_parse_urlencode',
+    'compat_urllib_parse_urlparse',
+    'compat_urllib_request',
+    'compat_urllib_request_DataHandler',
+    'compat_urllib_response',
+    'compat_urlparse',
+    'compat_urlretrieve',
+    'compat_xml_parse_error',
+    'compat_xpath',
+    'compat_zip',
+    'workaround_optparse_bug9161',
+]
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
new file mode 100644 (file)
index 0000000..4ae81f5
--- /dev/null
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals
+
+from .common import FileDownloader
+from .f4m import F4mFD
+from .hls import HlsFD
+from .http import HttpFD
+from .rtmp import RtmpFD
+from .dash import DashSegmentsFD
+from .rtsp import RtspFD
+from .ism import IsmFD
+from .youtube_live_chat import YoutubeLiveChatReplayFD
+from .external import (
+    get_external_downloader,
+    FFmpegFD,
+)
+
+from ..utils import (
+    determine_protocol,
+)
+
+PROTOCOL_MAP = {
+    'rtmp': RtmpFD,
+    'm3u8_native': HlsFD,
+    'm3u8': FFmpegFD,
+    'mms': RtspFD,
+    'rtsp': RtspFD,
+    'f4m': F4mFD,
+    'http_dash_segments': DashSegmentsFD,
+    'ism': IsmFD,
+    'youtube_live_chat_replay': YoutubeLiveChatReplayFD,
+}
+
+
+def get_suitable_downloader(info_dict, params={}):
+    """Get the downloader class that can handle the info dict."""
+    protocol = determine_protocol(info_dict)
+    info_dict['protocol'] = protocol
+
+    # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict):
+    #     return FFmpegFD
+
+    external_downloader = params.get('external_downloader')
+    if external_downloader is not None:
+        ed = get_external_downloader(external_downloader)
+        if ed.can_download(info_dict):
+            return ed
+
+    if protocol.startswith('m3u8') and info_dict.get('is_live'):
+        return FFmpegFD
+
+    if protocol == 'm3u8' and params.get('hls_prefer_native') is True:
+        return HlsFD
+
+    if protocol == 'm3u8_native' and params.get('hls_prefer_native') is False:
+        return FFmpegFD
+
+    return PROTOCOL_MAP.get(protocol, HttpFD)
+
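+# Illustrative resolution (a sketch; the info dicts are minimal fakes):
+#   >>> get_suitable_downloader({'url': 'https://example.com/v.mp4'}).__name__
+#   'HttpFD'
+#   >>> get_suitable_downloader({'url': 'x', 'protocol': 'm3u8_native'}).__name__
+#   'HlsFD'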
+
+__all__ = [
+    'get_suitable_downloader',
+    'FileDownloader',
+]
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
new file mode 100644 (file)
index 0000000..31c2864
--- /dev/null
@@ -0,0 +1,391 @@
+from __future__ import division, unicode_literals
+
+import os
+import re
+import sys
+import time
+import random
+
+from ..compat import compat_os_name
+from ..utils import (
+    decodeArgument,
+    encodeFilename,
+    error_to_compat_str,
+    format_bytes,
+    shell_quote,
+    timeconvert,
+)
+
+
+class FileDownloader(object):
+    """File Downloader class.
+
+    File downloader objects are the ones responsible for downloading the
+    actual video file and writing it to disk.
+
+    File downloaders accept a lot of parameters. In order not to saturate
+    the object constructor with arguments, they receive a dictionary of
+    options instead.
+
+    Available options:
+
+    verbose:            Print additional info to stdout.
+    quiet:              Do not print messages to stdout.
+    ratelimit:          Download speed limit, in bytes/sec.
+    retries:            Number of times to retry for HTTP error 5xx.
+    buffersize:         Size of download buffer in bytes.
+    noresizebuffer:     Do not automatically resize the download buffer.
+    continuedl:         Try to continue downloads if possible.
+    noprogress:         Do not print the progress bar.
+    logtostderr:        Log messages to stderr instead of stdout.
+    consoletitle:       Display progress in console window's titlebar.
+    nopart:             Do not use temporary .part files.
+    updatetime:         Use the Last-modified header to set output file timestamps.
+    test:               Download only first bytes to test the downloader.
+    min_filesize:       Skip files smaller than this size.
+    max_filesize:       Skip files larger than this size.
+    xattr_set_filesize: Set ytdl.filesize user xattribute with expected size.
+    external_downloader_args:  A list of additional command-line arguments for the
+                        external downloader.
+    hls_use_mpegts:     Use the mpegts container for HLS videos.
+    http_chunk_size:    Size of a chunk for chunk-based HTTP downloading. May be
+                        useful for bypassing bandwidth throttling imposed by
+                        a webserver (experimental)
+
+    Subclasses of this one must re-define the real_download method.
+    """
+
+    _TEST_FILE_SIZE = 10241
+    params = None
+
+    def __init__(self, ydl, params):
+        """Create a FileDownloader object with the given options."""
+        self.ydl = ydl
+        self._progress_hooks = []
+        self.params = params
+        self.add_progress_hook(self.report_progress)
+
+    @staticmethod
+    def format_seconds(seconds):
+        (mins, secs) = divmod(seconds, 60)
+        (hours, mins) = divmod(mins, 60)
+        if hours > 99:
+            return '--:--:--'
+        if hours == 0:
+            return '%02d:%02d' % (mins, secs)
+        else:
+            return '%02d:%02d:%02d' % (hours, mins, secs)
+
+    @staticmethod
+    def calc_percent(byte_counter, data_len):
+        if data_len is None:
+            return None
+        return float(byte_counter) / float(data_len) * 100.0
+
+    @staticmethod
+    def format_percent(percent):
+        if percent is None:
+            return '---.-%'
+        return '%6s' % ('%3.1f%%' % percent)
+
+    @staticmethod
+    def calc_eta(start, now, total, current):
+        if total is None:
+            return None
+        if now is None:
+            now = time.time()
+        dif = now - start
+        if current == 0 or dif < 0.001:  # One millisecond
+            return None
+        rate = float(current) / dif
+        return int((float(total) - float(current)) / rate)
+
+    @staticmethod
+    def format_eta(eta):
+        if eta is None:
+            return '--:--'
+        return FileDownloader.format_seconds(eta)
+
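+    # Illustrative (a sketch):
+    #   >>> FileDownloader.format_seconds(3661)
+    #   '01:01:01'
+    #   >>> FileDownloader.format_eta(None)
+    #   '--:--'
+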
+    @staticmethod
+    def calc_speed(start, now, bytes):
+        dif = now - start
+        if bytes == 0 or dif < 0.001:  # One millisecond
+            return None
+        return float(bytes) / dif
+
+    @staticmethod
+    def format_speed(speed):
+        if speed is None:
+            return '%10s' % '---b/s'
+        return '%10s' % ('%s/s' % format_bytes(speed))
+
+    @staticmethod
+    def format_retries(retries):
+        return 'inf' if retries == float('inf') else '%.0f' % retries
+
+    @staticmethod
+    def best_block_size(elapsed_time, bytes):
+        new_min = max(bytes / 2.0, 1.0)
+        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
+        if elapsed_time < 0.001:
+            return int(new_max)
+        rate = bytes / elapsed_time
+        if rate > new_max:
+            return int(new_max)
+        if rate < new_min:
+            return int(new_min)
+        return int(rate)
+
+    @staticmethod
+    def parse_bytes(bytestr):
+        """Parse a string indicating a byte quantity into an integer."""
+        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
+        if matchobj is None:
+            return None
+        number = float(matchobj.group(1))
+        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
+        return int(round(number * multiplier))
+
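+    # Illustrative (a sketch):
+    #   >>> FileDownloader.parse_bytes('10M')
+    #   10485760
+    #   >>> FileDownloader.parse_bytes('1.5G')
+    #   1610612736
+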
+    def to_screen(self, *args, **kargs):
+        self.ydl.to_screen(*args, **kargs)
+
+    def to_stderr(self, message):
+        self.ydl.to_screen(message)
+
+    def to_console_title(self, message):
+        self.ydl.to_console_title(message)
+
+    def trouble(self, *args, **kargs):
+        self.ydl.trouble(*args, **kargs)
+
+    def report_warning(self, *args, **kargs):
+        self.ydl.report_warning(*args, **kargs)
+
+    def report_error(self, *args, **kargs):
+        self.ydl.report_error(*args, **kargs)
+
+    def slow_down(self, start_time, now, byte_counter):
+        """Sleep if the download speed is over the rate limit."""
+        rate_limit = self.params.get('ratelimit')
+        if rate_limit is None or byte_counter == 0:
+            return
+        if now is None:
+            now = time.time()
+        elapsed = now - start_time
+        if elapsed <= 0.0:
+            return
+        speed = float(byte_counter) / elapsed
+        if speed > rate_limit:
+            sleep_time = float(byte_counter) / rate_limit - elapsed
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+
+    def temp_name(self, filename):
+        """Returns a temporary filename for the given filename."""
+        if self.params.get('nopart', False) or filename == '-' or \
+                (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))):
+            return filename
+        return filename + '.part'
+
+    def undo_temp_name(self, filename):
+        if filename.endswith('.part'):
+            return filename[:-len('.part')]
+        return filename
+
+    def ytdl_filename(self, filename):
+        return filename + '.ytdl'
+
+    def try_rename(self, old_filename, new_filename):
+        try:
+            if old_filename == new_filename:
+                return
+            os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
+        except (IOError, OSError) as err:
+            self.report_error('unable to rename file: %s' % error_to_compat_str(err))
+
+    def try_utime(self, filename, last_modified_hdr):
+        """Try to set the last-modified time of the given file."""
+        if last_modified_hdr is None:
+            return
+        if not os.path.isfile(encodeFilename(filename)):
+            return
+        timestr = last_modified_hdr
+        if timestr is None:
+            return
+        filetime = timeconvert(timestr)
+        if filetime is None:
+            return filetime
+        # Ignore obviously invalid dates
+        if filetime == 0:
+            return
+        try:
+            os.utime(filename, (time.time(), filetime))
+        except Exception:
+            pass
+        return filetime
+
+    def report_destination(self, filename):
+        """Report destination filename."""
+        self.to_screen('[download] Destination: ' + filename)
+
+    def _report_progress_status(self, msg, is_last_line=False):
+        fullmsg = '[download] ' + msg
+        if self.params.get('progress_with_newline', False):
+            self.to_screen(fullmsg)
+        else:
+            if compat_os_name == 'nt':
+                prev_len = getattr(self, '_report_progress_prev_line_length',
+                                   0)
+                if prev_len > len(fullmsg):
+                    fullmsg += ' ' * (prev_len - len(fullmsg))
+                self._report_progress_prev_line_length = len(fullmsg)
+                clear_line = '\r'
+            else:
+                clear_line = ('\r\x1b[K' if sys.stderr.isatty() else '\r')
+            self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line)
+        self.to_console_title('youtube-dlc ' + msg)
+
+    def report_progress(self, s):
+        if s['status'] == 'finished':
+            if self.params.get('noprogress', False):
+                self.to_screen('[download] Download completed')
+            else:
+                msg_template = '100%%'
+                if s.get('total_bytes') is not None:
+                    s['_total_bytes_str'] = format_bytes(s['total_bytes'])
+                    msg_template += ' of %(_total_bytes_str)s'
+                if s.get('elapsed') is not None:
+                    s['_elapsed_str'] = self.format_seconds(s['elapsed'])
+                    msg_template += ' in %(_elapsed_str)s'
+                self._report_progress_status(
+                    msg_template % s, is_last_line=True)
+
+        if self.params.get('noprogress'):
+            return
+
+        if s['status'] != 'downloading':
+            return
+
+        if s.get('eta') is not None:
+            s['_eta_str'] = self.format_eta(s['eta'])
+        else:
+            s['_eta_str'] = 'Unknown ETA'
+
+        if s.get('total_bytes') and s.get('downloaded_bytes') is not None:
+            s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes'])
+        elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None:
+            s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate'])
+        else:
+            if s.get('downloaded_bytes') == 0:
+                s['_percent_str'] = self.format_percent(0)
+            else:
+                s['_percent_str'] = 'Unknown %'
+
+        if s.get('speed') is not None:
+            s['_speed_str'] = self.format_speed(s['speed'])
+        else:
+            s['_speed_str'] = 'Unknown speed'
+
+        if s.get('total_bytes') is not None:
+            s['_total_bytes_str'] = format_bytes(s['total_bytes'])
+            msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s'
+        elif s.get('total_bytes_estimate') is not None:
+            s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate'])
+            msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s'
+        else:
+            if s.get('downloaded_bytes') is not None:
+                s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes'])
+                if s.get('elapsed'):
+                    s['_elapsed_str'] = self.format_seconds(s['elapsed'])
+                    msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)'
+                else:
+                    msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s'
+            else:
+                msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s'
+
+        self._report_progress_status(msg_template % s)
+
+    def report_resuming_byte(self, resume_len):
+        """Report attempt to resume at given byte."""
+        self.to_screen('[download] Resuming download at byte %s' % resume_len)
+
+    def report_retry(self, err, count, retries):
+        """Report retry in case of HTTP error 5xx"""
+        self.to_screen(
+            '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...'
+            % (error_to_compat_str(err), count, self.format_retries(retries)))
+
+    def report_file_already_downloaded(self, file_name):
+        """Report file has already been fully downloaded."""
+        try:
+            self.to_screen('[download] %s has already been downloaded' % file_name)
+        except UnicodeEncodeError:
+            self.to_screen('[download] The file has already been downloaded')
+
+    def report_unable_to_resume(self):
+        """Report it was impossible to resume download."""
+        self.to_screen('[download] Unable to resume')
+
+    def download(self, filename, info_dict):
+        """Download to a filename using the info from info_dict
+        Return True on success and False otherwise
+        """
+
+        nooverwrites_and_exists = (
+            self.params.get('nooverwrites', False)
+            and os.path.exists(encodeFilename(filename))
+        )
+
+        if not hasattr(filename, 'write'):
+            continuedl_and_exists = (
+                self.params.get('continuedl', True)
+                and os.path.isfile(encodeFilename(filename))
+                and not self.params.get('nopart', False)
+            )
+
+            # Check file already present
+            if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
+                self.report_file_already_downloaded(filename)
+                self._hook_progress({
+                    'filename': filename,
+                    'status': 'finished',
+                    'total_bytes': os.path.getsize(encodeFilename(filename)),
+                })
+                return True
+
+        min_sleep_interval = self.params.get('sleep_interval')
+        if min_sleep_interval:
+            max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval)
+            sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval)
+            self.to_screen(
+                '[download] Sleeping %s seconds...' % (
+                    int(sleep_interval) if sleep_interval.is_integer()
+                    else '%.2f' % sleep_interval))
+            time.sleep(sleep_interval)
+
+        return self.real_download(filename, info_dict)
+
+    def real_download(self, filename, info_dict):
+        """Real download process. Redefine in subclasses."""
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def _hook_progress(self, status):
+        for ph in self._progress_hooks:
+            ph(status)
+
+    def add_progress_hook(self, ph):
+        # See YoutubeDL.py (search for progress_hooks) for a description of
+        # this interface
+        self._progress_hooks.append(ph)
+
+    def _debug_cmd(self, args, exe=None):
+        if not self.params.get('verbose', False):
+            return
+
+        str_args = [decodeArgument(a) for a in args]
+
+        if exe is None:
+            exe = os.path.basename(str_args[0])
+
+        self.to_screen('[debug] %s command line: %s' % (
+            exe, shell_quote(str_args)))
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
new file mode 100644 (file)
index 0000000..c6d674b
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+from .fragment import FragmentFD
+from ..compat import compat_urllib_error
+from ..utils import (
+    DownloadError,
+    urljoin,
+)
+
+
+class DashSegmentsFD(FragmentFD):
+    """
+    Download segments in a DASH manifest
+    """
+
+    FD_NAME = 'dashsegments'
+
+    def real_download(self, filename, info_dict):
+        fragment_base_url = info_dict.get('fragment_base_url')
+        fragments = info_dict['fragments'][:1] if self.params.get(
+            'test', False) else info_dict['fragments']
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(fragments),
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        fragment_retries = self.params.get('fragment_retries', 0)
+        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
+
+        frag_index = 0
+        for i, fragment in enumerate(fragments):
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
+            # In DASH, the first segment contains necessary headers to
+            # generate a valid MP4 file, so always abort for the first segment
+            fatal = i == 0 or not skip_unavailable_fragments
+            count = 0
+            while count <= fragment_retries:
+                try:
+                    fragment_url = fragment.get('url')
+                    if not fragment_url:
+                        assert fragment_base_url
+                        fragment_url = urljoin(fragment_base_url, fragment['path'])
+                    success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
+                    if not success:
+                        return False
+                    self._append_fragment(ctx, frag_content)
+                    break
+                except compat_urllib_error.HTTPError as err:
+                    # YouTube may often return 404 HTTP error for a fragment, causing the
+                    # whole download to fail. However, if the same fragment is immediately
+                    # retried with the same request data, this usually succeeds (1-2 attempts
+                    # are usually enough), thus allowing the whole file to be downloaded
+                    # successfully. To be future-proof we will retry all fragments that fail
+                    # with any HTTP error.
+                    count += 1
+                    if count <= fragment_retries:
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
+                except DownloadError:
+                    # Don't retry the fragment if the error occurred during HTTP downloading
+                    # itself, since that has its own retry settings
+                    if not fatal:
+                        self.report_skip_fragment(frag_index)
+                        break
+                    raise
+
+            if count > fragment_retries:
+                if not fatal:
+                    self.report_skip_fragment(frag_index)
+                    continue
+                self.report_error('giving up after %s fragment retries' % fragment_retries)
+                return False
+
+        self._finish_frag_download(ctx)
+
+        return True
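+
+# Illustrative shape of the input this downloader consumes (a sketch; the
+# field values are made up):
+#   info_dict = {
+#       'url': 'https://cdn.example.com/manifest.mpd',
+#       'protocol': 'http_dash_segments',
+#       'fragment_base_url': 'https://cdn.example.com/seg/',
+#       'fragments': [{'path': 'init.mp4'}, {'path': '0001.m4s'}],
+#   }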
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
new file mode 100644 (file)
index 0000000..c31f891
--- /dev/null
@@ -0,0 +1,371 @@
+from __future__ import unicode_literals
+
+import os.path
+import re
+import subprocess
+import sys
+import time
+
+from .common import FileDownloader
+from ..compat import (
+    compat_setenv,
+    compat_str,
+)
+from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS
+from ..utils import (
+    cli_option,
+    cli_valueless_option,
+    cli_bool_option,
+    cli_configuration_args,
+    encodeFilename,
+    encodeArgument,
+    handle_youtubedl_headers,
+    check_executable,
+    is_outdated_version,
+)
+
+
+class ExternalFD(FileDownloader):
+    def real_download(self, filename, info_dict):
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+
+        try:
+            started = time.time()
+            retval = self._call_downloader(tmpfilename, info_dict)
+        except KeyboardInterrupt:
+            if not info_dict.get('is_live'):
+                raise
+            # Cancelling a live-stream download should be considered a correct
+            # and expected termination, so all postprocessing should still
+            # take place
+            retval = 0
+            self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
+        if retval == 0:
+            status = {
+                'filename': filename,
+                'status': 'finished',
+                'elapsed': time.time() - started,
+            }
+            if filename != '-':
+                fsize = os.path.getsize(encodeFilename(tmpfilename))
+                self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
+                self.try_rename(tmpfilename, filename)
+                status.update({
+                    'downloaded_bytes': fsize,
+                    'total_bytes': fsize,
+                })
+            self._hook_progress(status)
+            return True
+        else:
+            self.to_stderr('\n')
+            self.report_error('%s exited with code %d' % (
+                self.get_basename(), retval))
+            return False
+
+    @classmethod
+    def get_basename(cls):
+        return cls.__name__[:-2].lower()
+
+    @property
+    def exe(self):
+        return self.params.get('external_downloader')
+
+    @classmethod
+    def available(cls):
+        return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT])
+
+    @classmethod
+    def supports(cls, info_dict):
+        return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps')
+
+    @classmethod
+    def can_download(cls, info_dict):
+        return cls.available() and cls.supports(info_dict)
+
+    def _option(self, command_option, param):
+        return cli_option(self.params, command_option, param)
+
+    def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None):
+        return cli_bool_option(self.params, command_option, param, true_value, false_value, separator)
+
+    def _valueless_option(self, command_option, param, expected_value=True):
+        return cli_valueless_option(self.params, command_option, param, expected_value)
+
+    def _configuration_args(self, default=[]):
+        return cli_configuration_args(self.params, 'external_downloader_args', default)
+
+    def _call_downloader(self, tmpfilename, info_dict):
+        """ Either overwrite this or implement _make_cmd """
+        cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)]
+
+        self._debug_cmd(cmd)
+
+        p = subprocess.Popen(
+            cmd, stderr=subprocess.PIPE)
+        _, stderr = p.communicate()
+        if p.returncode != 0:
+            self.to_stderr(stderr.decode('utf-8', 'replace'))
+        return p.returncode
+
+
+class CurlFD(ExternalFD):
+    AVAILABLE_OPT = '-V'
+
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = [self.exe, '--location', '-o', tmpfilename]
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['--header', '%s: %s' % (key, val)]
+        cmd += self._bool_option('--continue-at', 'continuedl', '-', '0')
+        cmd += self._valueless_option('--silent', 'noprogress')
+        cmd += self._valueless_option('--verbose', 'verbose')
+        cmd += self._option('--limit-rate', 'ratelimit')
+        retry = self._option('--retry', 'retries')
+        if len(retry) == 2:
+            if retry[1] in ('inf', 'infinite'):
+                retry[1] = '2147483647'
+            cmd += retry
+        cmd += self._option('--max-filesize', 'max_filesize')
+        cmd += self._option('--interface', 'source_address')
+        cmd += self._option('--proxy', 'proxy')
+        cmd += self._valueless_option('--insecure', 'nocheckcertificate')
+        cmd += self._configuration_args()
+        cmd += ['--', info_dict['url']]
+        return cmd
+
+    def _call_downloader(self, tmpfilename, info_dict):
+        cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)]
+
+        self._debug_cmd(cmd)
+
+        # curl writes the progress to stderr so don't capture it.
+        p = subprocess.Popen(cmd)
+        p.communicate()
+        return p.returncode
+
+
+class AxelFD(ExternalFD):
+    AVAILABLE_OPT = '-V'
+
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = [self.exe, '-o', tmpfilename]
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['-H', '%s: %s' % (key, val)]
+        cmd += self._configuration_args()
+        cmd += ['--', info_dict['url']]
+        return cmd
+
+
+class WgetFD(ExternalFD):
+    AVAILABLE_OPT = '--version'
+
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['--header', '%s: %s' % (key, val)]
+        cmd += self._option('--limit-rate', 'ratelimit')
+        retry = self._option('--tries', 'retries')
+        if len(retry) == 2:
+            if retry[1] in ('inf', 'infinite'):
+                retry[1] = '0'
+            cmd += retry
+        cmd += self._option('--bind-address', 'source_address')
+        cmd += self._option('--proxy', 'proxy')
+        cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')
+        cmd += self._configuration_args()
+        cmd += ['--', info_dict['url']]
+        return cmd
+
+
+class Aria2cFD(ExternalFD):
+    AVAILABLE_OPT = '-v'
+
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = [self.exe, '-c']
+        cmd += self._configuration_args([
+            '--min-split-size', '1M', '--max-connection-per-server', '4'])
+        dn = os.path.dirname(tmpfilename)
+        if dn:
+            cmd += ['--dir', dn]
+        cmd += ['--out', os.path.basename(tmpfilename)]
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['--header', '%s: %s' % (key, val)]
+        cmd += self._option('--interface', 'source_address')
+        cmd += self._option('--all-proxy', 'proxy')
+        cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
+        cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=')
+        cmd += ['--', info_dict['url']]
+        return cmd
+
+
+class HttpieFD(ExternalFD):
+    @classmethod
+    def available(cls):
+        return check_executable('http', ['--version'])
+
+    def _make_cmd(self, tmpfilename, info_dict):
+        cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
+        for key, val in info_dict['http_headers'].items():
+            cmd += ['%s:%s' % (key, val)]
+        return cmd
+
+
+class FFmpegFD(ExternalFD):
+    @classmethod
+    def supports(cls, info_dict):
+        return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms')
+
+    @classmethod
+    def available(cls):
+        return FFmpegPostProcessor().available
+
+    def _call_downloader(self, tmpfilename, info_dict):
+        url = info_dict['url']
+        ffpp = FFmpegPostProcessor(downloader=self)
+        if not ffpp.available:
+            self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
+            return False
+        ffpp.check_version()
+
+        args = [ffpp.executable, '-y']
+
+        for log_level in ('quiet', 'verbose'):
+            if self.params.get(log_level, False):
+                args += ['-loglevel', log_level]
+                break
+
+        seekable = info_dict.get('_seekable')
+        if seekable is not None:
+            # Setting -seekable prevents ffmpeg from guessing whether the server
+            # supports seeking (by adding the header `Range: bytes=0-`), which
+            # can cause problems in some cases
+            # https://github.com/ytdl-org/youtube-dl/issues/11800#issuecomment-275037127
+            # http://trac.ffmpeg.org/ticket/6125#comment:10
+            args += ['-seekable', '1' if seekable else '0']
+
+        args += self._configuration_args()
+
+        # start_time = info_dict.get('start_time') or 0
+        # if start_time:
+        #     args += ['-ss', compat_str(start_time)]
+        # end_time = info_dict.get('end_time')
+        # if end_time:
+        #     args += ['-t', compat_str(end_time - start_time)]
+
+        if info_dict['http_headers'] and re.match(r'^https?://', url):
+            # Trailing \r\n after each HTTP header is important to prevent a warning from ffmpeg/avconv:
+            # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
+            headers = handle_youtubedl_headers(info_dict['http_headers'])
+            args += [
+                '-headers',
+                ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
+
+        env = None
+        proxy = self.params.get('proxy')
+        if proxy:
+            if not re.match(r'^[\da-zA-Z]+://', proxy):
+                proxy = 'http://%s' % proxy
+
+            if proxy.startswith('socks'):
+                self.report_warning(
+                    '%s does not support SOCKS proxies. Downloading is likely to fail. '
+                    'Consider adding --hls-prefer-native to your command.' % self.get_basename())
+
+            # Since December 2015 ffmpeg has supported the -http_proxy option (see
+            # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd)
+            # We could switch to the following code if we are able to detect version properly
+            # args += ['-http_proxy', proxy]
+            env = os.environ.copy()
+            compat_setenv('HTTP_PROXY', proxy, env=env)
+            compat_setenv('http_proxy', proxy, env=env)
+
+        protocol = info_dict.get('protocol')
+
+        if protocol == 'rtmp':
+            player_url = info_dict.get('player_url')
+            page_url = info_dict.get('page_url')
+            app = info_dict.get('app')
+            play_path = info_dict.get('play_path')
+            tc_url = info_dict.get('tc_url')
+            flash_version = info_dict.get('flash_version')
+            live = info_dict.get('rtmp_live', False)
+            conn = info_dict.get('rtmp_conn')
+            if player_url is not None:
+                args += ['-rtmp_swfverify', player_url]
+            if page_url is not None:
+                args += ['-rtmp_pageurl', page_url]
+            if app is not None:
+                args += ['-rtmp_app', app]
+            if play_path is not None:
+                args += ['-rtmp_playpath', play_path]
+            if tc_url is not None:
+                args += ['-rtmp_tcurl', tc_url]
+            if flash_version is not None:
+                args += ['-rtmp_flashver', flash_version]
+            if live:
+                args += ['-rtmp_live', 'live']
+            if isinstance(conn, list):
+                for entry in conn:
+                    args += ['-rtmp_conn', entry]
+            elif isinstance(conn, compat_str):
+                args += ['-rtmp_conn', conn]
+
+        args += ['-i', url, '-c', 'copy']
+
+        if self.params.get('test', False):
+            args += ['-fs', compat_str(self._TEST_FILE_SIZE)]
+
+        if protocol in ('m3u8', 'm3u8_native'):
+            if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
+                args += ['-f', 'mpegts']
+            else:
+                args += ['-f', 'mp4']
+                if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')):
+                    args += ['-bsf:a', 'aac_adtstoasc']
+        elif protocol == 'rtmp':
+            args += ['-f', 'flv']
+        else:
+            args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])]
+
+        args = [encodeArgument(opt) for opt in args]
+        args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
+
+        self._debug_cmd(args)
+
+        proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env)
+        try:
+            retval = proc.wait()
+        except KeyboardInterrupt:
+            # subprocess.run would send the SIGKILL signal to ffmpeg and the
+            # mp4 file couldn't be played, but if we ask ffmpeg to quit it
+            # produces a file that is playable (this is mostly useful for live
+            # streams). Note that Windows is not affected and produces playable
+            # files (see https://github.com/ytdl-org/youtube-dl/issues/8300).
+            if sys.platform != 'win32':
+                proc.communicate(b'q')
+            raise
+        return retval
+
+
+class AVconvFD(FFmpegFD):
+    pass
+
+
+_BY_NAME = dict(
+    (klass.get_basename(), klass)
+    for name, klass in globals().items()
+    if name.endswith('FD') and name != 'ExternalFD'
+)
+
+
+def list_external_downloaders():
+    return sorted(_BY_NAME.keys())
+
+
+def get_external_downloader(external_downloader):
+    """ Given the name of the executable, see whether we support the given
+        downloader . """
+    # Drop .exe extension on Windows
+    bn = os.path.splitext(os.path.basename(external_downloader))[0]
+    return _BY_NAME[bn]
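+
+# Illustrative (a sketch): get_external_downloader('/usr/bin/aria2c.exe')
+# returns Aria2cFD; the directory and the .exe suffix are stripped before the
+# lookup in _BY_NAME.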
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
new file mode 100644 (file)
index 0000000..8dd3c2e
--- /dev/null
@@ -0,0 +1,438 @@
+from __future__ import division, unicode_literals
+
+import io
+import itertools
+import time
+
+from .fragment import FragmentFD
+from ..compat import (
+    compat_b64decode,
+    compat_etree_fromstring,
+    compat_urlparse,
+    compat_urllib_error,
+    compat_urllib_parse_urlparse,
+    compat_struct_pack,
+    compat_struct_unpack,
+)
+from ..utils import (
+    fix_xml_ampersands,
+    xpath_text,
+)
+
+
+class DataTruncatedError(Exception):
+    pass
+
+
+class FlvReader(io.BytesIO):
+    """
+    Reader for Flv files
+    The file format is documented in https://www.adobe.com/devnet/f4v.html
+    """
+
+    def read_bytes(self, n):
+        data = self.read(n)
+        if len(data) < n:
+            raise DataTruncatedError(
+                'FlvReader error: needed %d bytes but only got %d' % (
+                    n, len(data)))
+        return data
+
+    # Utility functions for reading numbers and strings
+    def read_unsigned_long_long(self):
+        return compat_struct_unpack('!Q', self.read_bytes(8))[0]
+
+    def read_unsigned_int(self):
+        return compat_struct_unpack('!I', self.read_bytes(4))[0]
+
+    def read_unsigned_char(self):
+        return compat_struct_unpack('!B', self.read_bytes(1))[0]
+
+    def read_string(self):
+        res = b''
+        while True:
+            char = self.read_bytes(1)
+            if char == b'\x00':
+                break
+            res += char
+        return res
+
+    def read_box_info(self):
+        """
+        Read a box and return the info as a tuple: (box_size, box_type, box_data)
+        """
+        real_size = size = self.read_unsigned_int()
+        box_type = self.read_bytes(4)
+        header_end = 8
+        if size == 1:
+            real_size = self.read_unsigned_long_long()
+            header_end = 16
+        return real_size, box_type, self.read_bytes(real_size - header_end)
+
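+    # Illustrative (a sketch): read_box_info() on a box starting with
+    # b'\x00\x00\x00\x10abst' returns (16, b'abst', <the next 8 bytes>),
+    # since the 8-byte header is not part of the returned payload.
+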
+    def read_asrt(self):
+        # version
+        self.read_unsigned_char()
+        # flags
+        self.read_bytes(3)
+        quality_entry_count = self.read_unsigned_char()
+        # QualityEntryCount
+        for i in range(quality_entry_count):
+            self.read_string()
+
+        segment_run_count = self.read_unsigned_int()
+        segments = []
+        for i in range(segment_run_count):
+            first_segment = self.read_unsigned_int()
+            fragments_per_segment = self.read_unsigned_int()
+            segments.append((first_segment, fragments_per_segment))
+
+        return {
+            'segment_run': segments,
+        }
+
+    def read_afrt(self):
+        # version
+        self.read_unsigned_char()
+        # flags
+        self.read_bytes(3)
+        # time scale
+        self.read_unsigned_int()
+
+        quality_entry_count = self.read_unsigned_char()
+        # QualitySegmentUrlModifiers
+        for i in range(quality_entry_count):
+            self.read_string()
+
+        fragments_count = self.read_unsigned_int()
+        fragments = []
+        for i in range(fragments_count):
+            first = self.read_unsigned_int()
+            first_ts = self.read_unsigned_long_long()
+            duration = self.read_unsigned_int()
+            if duration == 0:
+                discontinuity_indicator = self.read_unsigned_char()
+            else:
+                discontinuity_indicator = None
+            fragments.append({
+                'first': first,
+                'ts': first_ts,
+                'duration': duration,
+                'discontinuity_indicator': discontinuity_indicator,
+            })
+
+        return {
+            'fragments': fragments,
+        }
+
+    def read_abst(self):
+        # version
+        self.read_unsigned_char()
+        # flags
+        self.read_bytes(3)
+
+        self.read_unsigned_int()  # BootstrapinfoVersion
+        # Profile,Live,Update,Reserved
+        flags = self.read_unsigned_char()
+        live = flags & 0x20 != 0
+        # time scale
+        self.read_unsigned_int()
+        # CurrentMediaTime
+        self.read_unsigned_long_long()
+        # SmpteTimeCodeOffset
+        self.read_unsigned_long_long()
+
+        self.read_string()  # MovieIdentifier
+        server_count = self.read_unsigned_char()
+        # ServerEntryTable
+        for i in range(server_count):
+            self.read_string()
+        quality_count = self.read_unsigned_char()
+        # QualityEntryTable
+        for i in range(quality_count):
+            self.read_string()
+        # DrmData
+        self.read_string()
+        # MetaData
+        self.read_string()
+
+        segments_count = self.read_unsigned_char()
+        segments = []
+        for i in range(segments_count):
+            box_size, box_type, box_data = self.read_box_info()
+            assert box_type == b'asrt'
+            segment = FlvReader(box_data).read_asrt()
+            segments.append(segment)
+        fragments_run_count = self.read_unsigned_char()
+        fragments = []
+        for i in range(fragments_run_count):
+            box_size, box_type, box_data = self.read_box_info()
+            assert box_type == b'afrt'
+            fragments.append(FlvReader(box_data).read_afrt())
+
+        return {
+            'segments': segments,
+            'fragments': fragments,
+            'live': live,
+        }
+
+    def read_bootstrap_info(self):
+        total_size, box_type, box_data = self.read_box_info()
+        assert box_type == b'abst'
+        return FlvReader(box_data).read_abst()
+
+
+def read_bootstrap_info(bootstrap_bytes):
+    return FlvReader(bootstrap_bytes).read_bootstrap_info()
+
+
+def build_fragments_list(boot_info):
+    """ Return a list of (segment, fragment) for each fragment in the video """
+    res = []
+    segment_run_table = boot_info['segments'][0]
+    fragment_run_entry_table = boot_info['fragments'][0]['fragments']
+    first_frag_number = fragment_run_entry_table[0]['first']
+    fragments_counter = itertools.count(first_frag_number)
+    for segment, fragments_count in segment_run_table['segment_run']:
+        # In some live HDS streams (for example Rai), `fragments_count` is
+        # abnormal and causes out-of-memory errors. It's OK to change the
+        # number of fragments for live streams, as they are updated periodically.
+        if fragments_count == 4294967295 and boot_info['live']:
+            fragments_count = 2
+        for _ in range(fragments_count):
+            res.append((segment, next(fragments_counter)))
+
+    if boot_info['live']:
+        res = res[-2:]
+
+    return res
+
+
+def write_unsigned_int(stream, val):
+    stream.write(compat_struct_pack('!I', val))
+
+
+def write_unsigned_int_24(stream, val):
+    stream.write(compat_struct_pack('!I', val)[1:])
+
+
+def write_flv_header(stream):
+    """Writes the FLV header to stream"""
+    # FLV header
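+    # Signature 'FLV' + version 1, flags 0x05 (audio and video present),
+    # a 4-byte header size of 9, then a 4-byte PreviousTagSize0 of 0.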
+    stream.write(b'FLV\x01')
+    stream.write(b'\x05')
+    stream.write(b'\x00\x00\x00\x09')
+    stream.write(b'\x00\x00\x00\x00')
+
+
+def write_metadata_tag(stream, metadata):
+    """Writes optional metadata tag to stream"""
+    SCRIPT_TAG = b'\x12'
+    FLV_TAG_HEADER_LEN = 11
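+    # FLV tag layout: 1-byte tag type, 3-byte payload size, 3-byte timestamp,
+    # 1-byte extended timestamp and 3-byte stream id (all zero here), the
+    # payload, then a 4-byte PreviousTagSize trailer (header + payload size).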
+
+    if metadata:
+        stream.write(SCRIPT_TAG)
+        write_unsigned_int_24(stream, len(metadata))
+        stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
+        stream.write(metadata)
+        write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))
+
+
+def remove_encrypted_media(media):
+    return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib
+                                 and 'drmAdditionalHeaderSetId' not in e.attrib,
+                       media))
+
+
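+# Expands an f4m tag name into ElementTree's Clark notation, e.g.
+# _add_ns('media') -> '{http://ns.adobe.com/f4m/1.0}media' and
+# _add_ns('baseURL', 2) -> '{http://ns.adobe.com/f4m/2.0}baseURL'.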
+def _add_ns(prop, ver=1):
+    return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop)
+
+
+def get_base_url(manifest):
+    base_url = xpath_text(
+        manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)],
+        'base URL', default=None)
+    if base_url:
+        base_url = base_url.strip()
+    return base_url
+
+
+class F4mFD(FragmentFD):
+    """
+    A downloader for f4m manifests, also known as Adobe HDS.
+    """
+
+    FD_NAME = 'f4m'
+
+    def _get_unencrypted_media(self, doc):
+        media = doc.findall(_add_ns('media'))
+        if not media:
+            self.report_error('No media found')
+        for e in (doc.findall(_add_ns('drmAdditionalHeader'))
+                  + doc.findall(_add_ns('drmAdditionalHeaderSet'))):
+            # If the id attribute is missing, the header is valid for all media
+            # nodes without a drmAdditionalHeaderId or drmAdditionalHeaderSetId
+            # attribute
+            if 'id' not in e.attrib:
+                self.report_error('Missing ID in f4m DRM')
+        media = remove_encrypted_media(media)
+        if not media:
+            self.report_error('Unsupported DRM')
+        return media
+
+    def _get_bootstrap_from_url(self, bootstrap_url):
+        bootstrap = self.ydl.urlopen(bootstrap_url).read()
+        return read_bootstrap_info(bootstrap)
+
+    def _update_live_fragments(self, bootstrap_url, latest_fragment):
+        fragments_list = []
+        retries = 30
+        while (not fragments_list) and (retries > 0):
+            boot_info = self._get_bootstrap_from_url(bootstrap_url)
+            fragments_list = build_fragments_list(boot_info)
+            fragments_list = [f for f in fragments_list if f[1] > latest_fragment]
+            if not fragments_list:
+                # Retry after a while
+                time.sleep(5.0)
+                retries -= 1
+
+        if not fragments_list:
+            self.report_error('Failed to update fragments')
+
+        return fragments_list
+
+    def _parse_bootstrap_node(self, node, base_url):
+        # Sometimes non-empty inline bootstrap info can be specified along
+        # with a bootstrap url attribute (e.g. dummy inline bootstrap info
+        # containing whitespace characters in [1]). We prefer the bootstrap
+        # url over inline bootstrap info when both are present.
+        # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m
+        bootstrap_url = node.get('url')
+        if bootstrap_url:
+            bootstrap_url = compat_urlparse.urljoin(
+                base_url, bootstrap_url)
+            boot_info = self._get_bootstrap_from_url(bootstrap_url)
+        else:
+            bootstrap_url = None
+            bootstrap = compat_b64decode(node.text)
+            boot_info = read_bootstrap_info(bootstrap)
+        return boot_info, bootstrap_url
+
+    def real_download(self, filename, info_dict):
+        man_url = info_dict['url']
+        requested_bitrate = info_dict.get('tbr')
+        self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
+
+        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
+        man_url = urlh.geturl()
+        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+        # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244
+        # and https://github.com/ytdl-org/youtube-dl/issues/7823)
+        manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip()
+
+        doc = compat_etree_fromstring(manifest)
+        formats = [(int(f.attrib.get('bitrate', -1)), f)
+                   for f in self._get_unencrypted_media(doc)]
+        if requested_bitrate is None or len(formats) == 1:
+            # get the best format
+            formats = sorted(formats, key=lambda f: f[0])
+            rate, media = formats[-1]
+        else:
+            rate, media = list(filter(
+                lambda f: int(f[0]) == requested_bitrate, formats))[0]
+
+        # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec.
+        man_base_url = get_base_url(doc) or man_url
+
+        base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url'])
+        bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
+        boot_info, bootstrap_url = self._parse_bootstrap_node(
+            bootstrap_node, man_base_url)
+        live = boot_info['live']
+        metadata_node = media.find(_add_ns('metadata'))
+        if metadata_node is not None:
+            metadata = compat_b64decode(metadata_node.text)
+        else:
+            metadata = None
+
+        fragments_list = build_fragments_list(boot_info)
+        test = self.params.get('test', False)
+        if test:
+            # We only download the first fragment
+            fragments_list = fragments_list[:1]
+        total_frags = len(fragments_list)
+        # For some Akamai manifests we'll need to add a query to the fragment url
+        akamai_pv = xpath_text(doc, _add_ns('pv-2.0'))
+
+        ctx = {
+            'filename': filename,
+            'total_frags': total_frags,
+            'live': live,
+        }
+
+        self._prepare_frag_download(ctx)
+
+        dest_stream = ctx['dest_stream']
+
+        if ctx['complete_frags_downloaded_bytes'] == 0:
+            write_flv_header(dest_stream)
+            if not live:
+                write_metadata_tag(dest_stream, metadata)
+
+        base_url_parsed = compat_urllib_parse_urlparse(base_url)
+
+        self._start_frag_download(ctx)
+
+        frag_index = 0
+        while fragments_list:
+            seg_i, frag_i = fragments_list.pop(0)
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
+            name = 'Seg%d-Frag%d' % (seg_i, frag_i)
+            query = []
+            if base_url_parsed.query:
+                query.append(base_url_parsed.query)
+            if akamai_pv:
+                query.append(akamai_pv.strip(';'))
+            if info_dict.get('extra_param_to_segment_url'):
+                query.append(info_dict['extra_param_to_segment_url'])
+            url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
+            try:
+                success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
+                if not success:
+                    return False
+                reader = FlvReader(down_data)
+                while True:
+                    try:
+                        _, box_type, box_data = reader.read_box_info()
+                    except DataTruncatedError:
+                        if test:
+                            # In tests, segments may be truncated, and thus
+                            # FlvReader may not be able to parse the whole
+                            # chunk. If so, write the segment as is
+                            # See https://github.com/ytdl-org/youtube-dl/issues/9214
+                            dest_stream.write(down_data)
+                            break
+                        raise
+                    if box_type == b'mdat':
+                        self._append_fragment(ctx, box_data)
+                        break
+            except (compat_urllib_error.HTTPError, ) as err:
+                if live and (err.code == 404 or err.code == 410):
+                    # We didn't keep up with the live window. Continue
+                    # with the next available fragment.
+                    msg = 'Fragment %d unavailable' % frag_i
+                    self.report_warning(msg)
+                    fragments_list = []
+                else:
+                    raise
+
+            if not fragments_list and not test and live and bootstrap_url:
+                fragments_list = self._update_live_fragments(bootstrap_url, frag_i)
+                total_frags += len(fragments_list)
+                if fragments_list and (fragments_list[0][1] > frag_i + 1):
+                    msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1))
+                    self.report_warning(msg)
+
+        self._finish_frag_download(ctx)
+
+        return True
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
new file mode 100644 (file)
index 0000000..9339b3a
--- /dev/null
@@ -0,0 +1,269 @@
+from __future__ import division, unicode_literals
+
+import os
+import time
+import json
+
+from .common import FileDownloader
+from .http import HttpFD
+from ..utils import (
+    error_to_compat_str,
+    encodeFilename,
+    sanitize_open,
+    sanitized_Request,
+)
+
+
+class HttpQuietDownloader(HttpFD):
+    def to_screen(self, *args, **kargs):
+        pass
+
+
+class FragmentFD(FileDownloader):
+    """
+    A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests).
+
+    Available options:
+
+    fragment_retries:   Number of times to retry a fragment for HTTP error (DASH
+                        and hlsnative only)
+    skip_unavailable_fragments:
+                        Skip unavailable fragments (DASH and hlsnative only)
+    keep_fragments:     Keep downloaded fragments on disk after downloading is
+                        finished
+
+    For each incomplete fragment download, youtube-dlc keeps a special
+    bookkeeping file with download state and metadata on disk (in the future
+    such files will be used for any incomplete download handled by
+    youtube-dlc). This file is used to properly handle resuming, check
+    download file consistency and detect potential errors. The file has a
+    .ytdl extension and is a standard JSON file of the following format:
+
+    extractor:
+        Dictionary of extractor related data. TBD.
+
+    downloader:
+        Dictionary of downloader related data. May contain following data:
+            current_fragment:
+                Dictionary with current (being downloaded) fragment data:
+                index:  0-based index of current fragment among all fragments
+            fragment_count:
+                Total count of fragments
+
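+    A minimal example of the file contents (hypothetical values):
+
+        {"downloader": {"current_fragment": {"index": 3}, "fragment_count": 42}}
+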
+    This feature is experimental and file format may change in future.
+    """
+
+    def report_retry_fragment(self, err, frag_index, count, retries):
+        self.to_screen(
+            '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...'
+            % (error_to_compat_str(err), frag_index, count, self.format_retries(retries)))
+
+    def report_skip_fragment(self, frag_index):
+        self.to_screen('[download] Skipping fragment %d...' % frag_index)
+
+    def _prepare_url(self, info_dict, url):
+        headers = info_dict.get('http_headers')
+        return sanitized_Request(url, None, headers) if headers else url
+
+    def _prepare_and_start_frag_download(self, ctx):
+        self._prepare_frag_download(ctx)
+        self._start_frag_download(ctx)
+
+    @staticmethod
+    def __do_ytdl_file(ctx):
+        return not ctx['live'] and not ctx['tmpfilename'] == '-'
+
+    def _read_ytdl_file(self, ctx):
+        assert 'ytdl_corrupt' not in ctx
+        stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
+        try:
+            ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+        except Exception:
+            ctx['ytdl_corrupt'] = True
+        finally:
+            stream.close()
+
+    def _write_ytdl_file(self, ctx):
+        frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
+        downloader = {
+            'current_fragment': {
+                'index': ctx['fragment_index'],
+            },
+        }
+        if ctx.get('fragment_count') is not None:
+            downloader['fragment_count'] = ctx['fragment_count']
+        frag_index_stream.write(json.dumps({'downloader': downloader}))
+        frag_index_stream.close()
+
+    def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
+        fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
+        success = ctx['dl'].download(fragment_filename, {
+            'url': frag_url,
+            'http_headers': headers or info_dict.get('http_headers'),
+        })
+        if not success:
+            return False, None
+        down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
+        ctx['fragment_filename_sanitized'] = frag_sanitized
+        frag_content = down.read()
+        down.close()
+        return True, frag_content
+
+    def _append_fragment(self, ctx, frag_content):
+        try:
+            ctx['dest_stream'].write(frag_content)
+            ctx['dest_stream'].flush()
+        finally:
+            if self.__do_ytdl_file(ctx):
+                self._write_ytdl_file(ctx)
+            if not self.params.get('keep_fragments', False):
+                os.remove(encodeFilename(ctx['fragment_filename_sanitized']))
+            del ctx['fragment_filename_sanitized']
+
+    def _prepare_frag_download(self, ctx):
+        if 'live' not in ctx:
+            ctx['live'] = False
+        if not ctx['live']:
+            total_frags_str = '%d' % ctx['total_frags']
+            ad_frags = ctx.get('ad_frags', 0)
+            if ad_frags:
+                total_frags_str += ' (not including %d ad)' % ad_frags
+        else:
+            total_frags_str = 'unknown (live)'
+        self.to_screen(
+            '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str))
+        self.report_destination(ctx['filename'])
+        dl = HttpQuietDownloader(
+            self.ydl,
+            {
+                'continuedl': True,
+                'quiet': True,
+                'noprogress': True,
+                'ratelimit': self.params.get('ratelimit'),
+                'retries': self.params.get('retries', 0),
+                'nopart': self.params.get('nopart', False),
+                'test': self.params.get('test', False),
+            }
+        )
+        tmpfilename = self.temp_name(ctx['filename'])
+        open_mode = 'wb'
+        resume_len = 0
+
+        # Establish possible resume length
+        if os.path.isfile(encodeFilename(tmpfilename)):
+            open_mode = 'ab'
+            resume_len = os.path.getsize(encodeFilename(tmpfilename))
+
+        # Should be initialized before ytdl file check
+        ctx.update({
+            'tmpfilename': tmpfilename,
+            'fragment_index': 0,
+        })
+
+        if self.__do_ytdl_file(ctx):
+            if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
+                self._read_ytdl_file(ctx)
+                is_corrupt = ctx.get('ytdl_corrupt') is True
+                is_inconsistent = ctx['fragment_index'] > 0 and resume_len == 0
+                if is_corrupt or is_inconsistent:
+                    message = (
+                        '.ytdl file is corrupt' if is_corrupt else
+                        'Inconsistent state of incomplete fragment download')
+                    self.report_warning(
+                        '%s. Restarting from the beginning...' % message)
+                    ctx['fragment_index'] = resume_len = 0
+                    if 'ytdl_corrupt' in ctx:
+                        del ctx['ytdl_corrupt']
+                    self._write_ytdl_file(ctx)
+            else:
+                self._write_ytdl_file(ctx)
+                assert ctx['fragment_index'] == 0
+
+        dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
+
+        ctx.update({
+            'dl': dl,
+            'dest_stream': dest_stream,
+            'tmpfilename': tmpfilename,
+            # Total size of complete fragments downloaded so far, in bytes
+            'complete_frags_downloaded_bytes': resume_len,
+        })
+
+    def _start_frag_download(self, ctx):
+        resume_len = ctx['complete_frags_downloaded_bytes']
+        total_frags = ctx['total_frags']
+        # This dict stores the download progress; it's updated by the
+        # progress hook
+        state = {
+            'status': 'downloading',
+            'downloaded_bytes': resume_len,
+            'fragment_index': ctx['fragment_index'],
+            'fragment_count': total_frags,
+            'filename': ctx['filename'],
+            'tmpfilename': ctx['tmpfilename'],
+        }
+
+        start = time.time()
+        ctx.update({
+            'started': start,
+            # Number of bytes of the current fragment downloaded as of the
+            # previous frag progress hook invocation
+            'prev_frag_downloaded_bytes': 0,
+        })
+
+        def frag_progress_hook(s):
+            if s['status'] not in ('downloading', 'finished'):
+                return
+
+            time_now = time.time()
+            state['elapsed'] = time_now - start
+            frag_total_bytes = s.get('total_bytes') or 0
+            if not ctx['live']:
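+                # Estimate the total size from the mean size of the fragments
+                # downloaded so far, including the one in progress.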
+                estimated_size = (
+                    (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes)
+                    / (state['fragment_index'] + 1) * total_frags)
+                state['total_bytes_estimate'] = estimated_size
+
+            if s['status'] == 'finished':
+                state['fragment_index'] += 1
+                ctx['fragment_index'] = state['fragment_index']
+                state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
+                ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
+                ctx['prev_frag_downloaded_bytes'] = 0
+            else:
+                frag_downloaded_bytes = s['downloaded_bytes']
+                state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes']
+                if not ctx['live']:
+                    state['eta'] = self.calc_eta(
+                        start, time_now, estimated_size - resume_len,
+                        state['downloaded_bytes'] - resume_len)
+                state['speed'] = s.get('speed') or ctx.get('speed')
+                ctx['speed'] = state['speed']
+                ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
+            self._hook_progress(state)
+
+        ctx['dl'].add_progress_hook(frag_progress_hook)
+
+        return start
+
+    def _finish_frag_download(self, ctx):
+        ctx['dest_stream'].close()
+        if self.__do_ytdl_file(ctx):
+            ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename']))
+            if os.path.isfile(ytdl_filename):
+                os.remove(ytdl_filename)
+        elapsed = time.time() - ctx['started']
+
+        if ctx['tmpfilename'] == '-':
+            downloaded_bytes = ctx['complete_frags_downloaded_bytes']
+        else:
+            self.try_rename(ctx['tmpfilename'], ctx['filename'])
+            downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename']))
+
+        self._hook_progress({
+            'downloaded_bytes': downloaded_bytes,
+            'total_bytes': downloaded_bytes,
+            'filename': ctx['filename'],
+            'status': 'finished',
+            'elapsed': elapsed,
+        })
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
new file mode 100644 (file)
index 0000000..84bc349
--- /dev/null
@@ -0,0 +1,210 @@
+from __future__ import unicode_literals
+
+import re
+import binascii
+try:
+    from Crypto.Cipher import AES
+    can_decrypt_frag = True
+except ImportError:
+    can_decrypt_frag = False
+
+from .fragment import FragmentFD
+from .external import FFmpegFD
+
+from ..compat import (
+    compat_urllib_error,
+    compat_urlparse,
+    compat_struct_pack,
+)
+from ..utils import (
+    parse_m3u8_attributes,
+    update_url_query,
+)
+
+
+class HlsFD(FragmentFD):
+    """ A limited implementation that does not require ffmpeg """
+
+    FD_NAME = 'hlsnative'
+
+    @staticmethod
+    def can_download(manifest, info_dict):
+        UNSUPPORTED_FEATURES = (
+            r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)',  # encrypted streams [1]
+            # r'#EXT-X-BYTERANGE',  # playlists composed of byte ranges of media files [2]
+
+            # The live streams heuristic does not always work (e.g. geo-restricted to Germany
+            # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
+            # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)',  # live streams [3]
+
+            # This heuristic is also not correct, since segments may not be appended either.
+            # Twitch VODs of finished streams have EXT-X-PLAYLIST-TYPE:EVENT even though
+            # no segments will ever be appended to the end of the playlist.
+            # r'#EXT-X-PLAYLIST-TYPE:EVENT',  # media segments may be appended to the end of
+            #                                 # event media playlists [4]
+
+            # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
+            # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
+            # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
+            # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
+        )
+        check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]
+        is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest
+        check_results.append(can_decrypt_frag or not is_aes128_enc)
+        check_results.append(not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest))
+        check_results.append(not info_dict.get('is_live'))
+        return all(check_results)
+
+    def real_download(self, filename, info_dict):
+        man_url = info_dict['url']
+        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
+
+        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
+        man_url = urlh.geturl()
+        s = urlh.read().decode('utf-8', 'ignore')
+
+        if not self.can_download(s, info_dict):
+            if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'):
+                self.report_error('pycrypto not found. Please install it.')
+                return False
+            self.report_warning(
+                'hlsnative has detected features it does not support, '
+                'extraction will be delegated to ffmpeg')
+            fd = FFmpegFD(self.ydl, self.params)
+            for ph in self._progress_hooks:
+                fd.add_progress_hook(ph)
+            return fd.real_download(filename, info_dict)
+
+        def is_ad_fragment_start(s):
+            return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s
+                    or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))
+
+        def is_ad_fragment_end(s):
+            return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s
+                    or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))
+
+        media_frags = 0
+        ad_frags = 0
+        ad_frag_next = False
+        for line in s.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith('#'):
+                if is_ad_fragment_start(line):
+                    ad_frag_next = True
+                elif is_ad_fragment_end(line):
+                    ad_frag_next = False
+                continue
+            if ad_frag_next:
+                ad_frags += 1
+                continue
+            media_frags += 1
+
+        ctx = {
+            'filename': filename,
+            'total_frags': media_frags,
+            'ad_frags': ad_frags,
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        fragment_retries = self.params.get('fragment_retries', 0)
+        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
+        test = self.params.get('test', False)
+
+        extra_query = None
+        extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
+        if extra_param_to_segment_url:
+            extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url)
+        i = 0
+        media_sequence = 0
+        decrypt_info = {'METHOD': 'NONE'}
+        byte_range = {}
+        frag_index = 0
+        ad_frag_next = False
+        for line in s.splitlines():
+            line = line.strip()
+            if line:
+                if not line.startswith('#'):
+                    if ad_frag_next:
+                        continue
+                    frag_index += 1
+                    if frag_index <= ctx['fragment_index']:
+                        continue
+                    frag_url = (
+                        line
+                        if re.match(r'^https?://', line)
+                        else compat_urlparse.urljoin(man_url, line))
+                    if extra_query:
+                        frag_url = update_url_query(frag_url, extra_query)
+                    count = 0
+                    headers = info_dict.get('http_headers', {})
+                    if byte_range:
+                        headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'])
+                    while count <= fragment_retries:
+                        try:
+                            success, frag_content = self._download_fragment(
+                                ctx, frag_url, info_dict, headers)
+                            if not success:
+                                return False
+                            break
+                        except compat_urllib_error.HTTPError as err:
+                            # Unavailable (possibly temporary) fragments may be served.
+                            # First we retry, then either skip or abort.
+                            # See https://github.com/ytdl-org/youtube-dl/issues/10165
+                            # and https://github.com/ytdl-org/youtube-dl/issues/10448.
+                            count += 1
+                            if count <= fragment_retries:
+                                self.report_retry_fragment(err, frag_index, count, fragment_retries)
+                    if count > fragment_retries:
+                        if skip_unavailable_fragments:
+                            i += 1
+                            media_sequence += 1
+                            self.report_skip_fragment(frag_index)
+                            continue
+                        self.report_error(
+                            'giving up after %s fragment retries' % fragment_retries)
+                        return False
+                    if decrypt_info['METHOD'] == 'AES-128':
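+                        # Per the HLS spec, when no IV attribute is present the
+                        # IV is the 16-byte big-endian media sequence number
+                        # ('>8xq' packs 8 zero bytes plus a 64-bit integer).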
+                        iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
+                        decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
+                            self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
+                        frag_content = AES.new(
+                            decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
+                    self._append_fragment(ctx, frag_content)
+                    # We only download the first fragment during the test
+                    if test:
+                        break
+                    i += 1
+                    media_sequence += 1
+                elif line.startswith('#EXT-X-KEY'):
+                    decrypt_url = decrypt_info.get('URI')
+                    decrypt_info = parse_m3u8_attributes(line[11:])
+                    if decrypt_info['METHOD'] == 'AES-128':
+                        if 'IV' in decrypt_info:
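+                            # The IV attribute is a 0x-prefixed hex string;
+                            # strip the prefix and left-pad to 16 bytes.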
+                            decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
+                        if not re.match(r'^https?://', decrypt_info['URI']):
+                            decrypt_info['URI'] = compat_urlparse.urljoin(
+                                man_url, decrypt_info['URI'])
+                        if extra_query:
+                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
+                        if decrypt_url != decrypt_info['URI']:
+                            decrypt_info['KEY'] = None
+                elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
+                    media_sequence = int(line[22:])
+                elif line.startswith('#EXT-X-BYTERANGE'):
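+                    # Tag format is <length>[@<offset>]; when the offset is
+                    # omitted, the sub-range starts where the previous one ended.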
+                    splitted_byte_range = line[17:].split('@')
+                    sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
+                    byte_range = {
+                        'start': sub_range_start,
+                        'end': sub_range_start + int(splitted_byte_range[0]),
+                    }
+                elif is_ad_fragment_start(line):
+                    ad_frag_next = True
+                elif is_ad_fragment_end(line):
+                    ad_frag_next = False
+
+        self._finish_frag_download(ctx)
+
+        return True
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
new file mode 100644 (file)
index 0000000..5046878
--- /dev/null
@@ -0,0 +1,354 @@
+from __future__ import unicode_literals
+
+import errno
+import os
+import socket
+import time
+import random
+import re
+
+from .common import FileDownloader
+from ..compat import (
+    compat_str,
+    compat_urllib_error,
+)
+from ..utils import (
+    ContentTooShortError,
+    encodeFilename,
+    int_or_none,
+    sanitize_open,
+    sanitized_Request,
+    write_xattr,
+    XAttrMetadataError,
+    XAttrUnavailableError,
+)
+
+
+class HttpFD(FileDownloader):
+    def real_download(self, filename, info_dict):
+        url = info_dict['url']
+
+        class DownloadContext(dict):
+            __getattr__ = dict.get
+            __setattr__ = dict.__setitem__
+            __delattr__ = dict.__delitem__
+
+        ctx = DownloadContext()
+        ctx.filename = filename
+        ctx.tmpfilename = self.temp_name(filename)
+        ctx.stream = None
+
+        # Do not include the Accept-Encoding header
+        headers = {'Youtubedl-no-compression': 'True'}
+        add_headers = info_dict.get('http_headers')
+        if add_headers:
+            headers.update(add_headers)
+
+        is_test = self.params.get('test', False)
+        chunk_size = self._TEST_FILE_SIZE if is_test else (
+            info_dict.get('downloader_options', {}).get('http_chunk_size')
+            or self.params.get('http_chunk_size') or 0)
+
+        ctx.open_mode = 'wb'
+        ctx.resume_len = 0
+        ctx.data_len = None
+        ctx.block_size = self.params.get('buffersize', 1024)
+        ctx.start_time = time.time()
+        ctx.chunk_size = None
+
+        if self.params.get('continuedl', True):
+            # Establish possible resume length
+            if os.path.isfile(encodeFilename(ctx.tmpfilename)):
+                ctx.resume_len = os.path.getsize(
+                    encodeFilename(ctx.tmpfilename))
+
+        ctx.is_resume = ctx.resume_len > 0
+
+        count = 0
+        retries = self.params.get('retries', 0)
+
+        class SucceedDownload(Exception):
+            pass
+
+        class RetryDownload(Exception):
+            def __init__(self, source_error):
+                self.source_error = source_error
+
+        class NextFragment(Exception):
+            pass
+
+        def set_range(req, start, end):
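+            # e.g. set_range(req, 500, 999) adds 'Range: bytes=500-999';
+            # with end=None the range is open-ended ('bytes=500-').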
+            range_header = 'bytes=%d-' % start
+            if end:
+                range_header += compat_str(end)
+            req.add_header('Range', range_header)
+
+        def establish_connection():
+            ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size)
+                              if not is_test and chunk_size else chunk_size)
+            if ctx.resume_len > 0:
+                range_start = ctx.resume_len
+                if ctx.is_resume:
+                    self.report_resuming_byte(ctx.resume_len)
+                ctx.open_mode = 'ab'
+            elif ctx.chunk_size > 0:
+                range_start = 0
+            else:
+                range_start = None
+            ctx.is_resume = False
+            range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None
+            if range_end and ctx.data_len is not None and range_end >= ctx.data_len:
+                range_end = ctx.data_len - 1
+            has_range = range_start is not None
+            ctx.has_range = has_range
+            request = sanitized_Request(url, None, headers)
+            if has_range:
+                set_range(request, range_start, range_end)
+            # Establish connection
+            try:
+                ctx.data = self.ydl.urlopen(request)
+                # When trying to resume, the Content-Range HTTP header of the response
+                # has to be checked against the requested Range HTTP header. This is
+                # because some webservers don't support resuming and serve the whole
+                # file with no Content-Range set in the response despite the requested
+                # Range (see
+                # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799)
+                if has_range:
+                    content_range = ctx.data.headers.get('Content-Range')
+                    if content_range:
+                        content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range)
+                        # Content-Range is present and matches requested Range, resume is possible
+                        if content_range_m:
+                            if range_start == int(content_range_m.group(1)):
+                                content_range_end = int_or_none(content_range_m.group(2))
+                                content_len = int_or_none(content_range_m.group(3))
+                                accept_content_len = (
+                                    # Non-chunked download
+                                    not ctx.chunk_size
+                                    # Chunked download and requested piece or
+                                    # its part is promised to be served
+                                    or content_range_end == range_end
+                                    or content_len < range_end)
+                                if accept_content_len:
+                                    ctx.data_len = content_len
+                                    return
+                    # Content-Range is either not present or invalid. Assuming the remote
+                    # webserver is trying to send the whole file, resume is not possible,
+                    # so wipe the local file and perform an entire redownload
+                    self.report_unable_to_resume()
+                    ctx.resume_len = 0
+                    ctx.open_mode = 'wb'
+                ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None))
+                return
+            except (compat_urllib_error.HTTPError, ) as err:
+                if err.code == 416:
+                    # Unable to resume (requested range not satisfiable)
+                    try:
+                        # Open the connection again without the range header
+                        ctx.data = self.ydl.urlopen(
+                            sanitized_Request(url, None, headers))
+                        content_length = ctx.data.info()['Content-Length']
+                    except (compat_urllib_error.HTTPError, ) as err:
+                        if err.code < 500 or err.code >= 600:
+                            raise
+                    else:
+                        # Examine the reported length
+                        if (content_length is not None
+                                and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
+                            # The file had already been fully downloaded.
+                            # Explanation of the above condition: in issue #175 it was revealed that
+                            # YouTube sometimes adds or removes a few bytes from the end of the file,
+                            # changing the file size slightly and causing problems for some users. So
+                            # I decided to implement a suggested change and consider the file
+                            # completely downloaded if the file size differs less than 100 bytes from
+                            # the one in the hard drive.
+                            self.report_file_already_downloaded(ctx.filename)
+                            self.try_rename(ctx.tmpfilename, ctx.filename)
+                            self._hook_progress({
+                                'filename': ctx.filename,
+                                'status': 'finished',
+                                'downloaded_bytes': ctx.resume_len,
+                                'total_bytes': ctx.resume_len,
+                            })
+                            raise SucceedDownload()
+                        else:
+                            # The length does not match, we start the download over
+                            self.report_unable_to_resume()
+                            ctx.resume_len = 0
+                            ctx.open_mode = 'wb'
+                            return
+                elif err.code < 500 or err.code >= 600:
+                    # Unexpected HTTP error
+                    raise
+                raise RetryDownload(err)
+            except socket.error as err:
+                if err.errno != errno.ECONNRESET:
+                    # Connection reset is no problem, just retry
+                    raise
+                raise RetryDownload(err)
+
+        def download():
+            data_len = ctx.data.info().get('Content-length', None)
+
+            # Range HTTP header may be ignored/unsupported by a webserver
+            # (e.g. extractor/scivee.py, extractor/bambuser.py).
+            # However, for a test we still would like to download just a piece of a file.
+            # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
+            # block size when downloading a file.
+            if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+                data_len = self._TEST_FILE_SIZE
+
+            if data_len is not None:
+                data_len = int(data_len) + ctx.resume_len
+                min_data_len = self.params.get('min_filesize')
+                max_data_len = self.params.get('max_filesize')
+                if min_data_len is not None and data_len < min_data_len:
+                    self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
+                    return False
+                if max_data_len is not None and data_len > max_data_len:
+                    self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
+                    return False
+
+            byte_counter = 0 + ctx.resume_len
+            block_size = ctx.block_size
+            start = time.time()
+
+            # Measure time over the whole while-loop, so slow_down() and best_block_size() work together properly
+            now = None  # needed for slow_down() in the first loop run
+            before = start  # start measuring
+
+            def retry(e):
+                to_stdout = ctx.tmpfilename == '-'
+                if not to_stdout:
+                    ctx.stream.close()
+                ctx.stream = None
+                ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename))
+                raise RetryDownload(e)
+
+            while True:
+                try:
+                    # Download and write
+                    data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter))
+                # socket.timeout is a subclass of socket.error but may not have
+                # errno set
+                except socket.timeout as e:
+                    retry(e)
+                except socket.error as e:
+                    if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT):
+                        raise
+                    retry(e)
+
+                byte_counter += len(data_block)
+
+                # exit loop when download is finished
+                if len(data_block) == 0:
+                    break
+
+                # Open destination file just in time
+                if ctx.stream is None:
+                    try:
+                        ctx.stream, ctx.tmpfilename = sanitize_open(
+                            ctx.tmpfilename, ctx.open_mode)
+                        assert ctx.stream is not None
+                        ctx.filename = self.undo_temp_name(ctx.tmpfilename)
+                        self.report_destination(ctx.filename)
+                    except (OSError, IOError) as err:
+                        self.report_error('unable to open for writing: %s' % str(err))
+                        return False
+
+                    if self.params.get('xattr_set_filesize', False) and data_len is not None:
+                        try:
+                            write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
+                        except (XAttrUnavailableError, XAttrMetadataError) as err:
+                            self.report_error('unable to set filesize xattr: %s' % str(err))
+
+                try:
+                    ctx.stream.write(data_block)
+                except (IOError, OSError) as err:
+                    self.to_stderr('\n')
+                    self.report_error('unable to write data: %s' % str(err))
+                    return False
+
+                # Apply rate limit
+                self.slow_down(start, now, byte_counter - ctx.resume_len)
+
+                # end measuring of one loop run
+                now = time.time()
+                after = now
+
+                # Adjust block size
+                if not self.params.get('noresizebuffer', False):
+                    block_size = self.best_block_size(after - before, len(data_block))
+
+                before = after
+
+                # Progress message
+                speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
+                if ctx.data_len is None:
+                    eta = None
+                else:
+                    eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len)
+
+                self._hook_progress({
+                    'status': 'downloading',
+                    'downloaded_bytes': byte_counter,
+                    'total_bytes': ctx.data_len,
+                    'tmpfilename': ctx.tmpfilename,
+                    'filename': ctx.filename,
+                    'eta': eta,
+                    'speed': speed,
+                    'elapsed': now - ctx.start_time,
+                })
+
+                if data_len is not None and byte_counter == data_len:
+                    break
+
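+            # Chunked download mode: if this ranged request ended before the
+            # advertised total length, loop back via NextFragment and request
+            # the next chunk starting at the current offset.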
+            if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len:
+                ctx.resume_len = byte_counter
+                # ctx.block_size = block_size
+                raise NextFragment()
+
+            if ctx.stream is None:
+                self.to_stderr('\n')
+                self.report_error('Did not get any data blocks')
+                return False
+            if ctx.tmpfilename != '-':
+                ctx.stream.close()
+
+            if data_len is not None and byte_counter != data_len:
+                err = ContentTooShortError(byte_counter, int(data_len))
+                if count <= retries:
+                    retry(err)
+                raise err
+
+            self.try_rename(ctx.tmpfilename, ctx.filename)
+
+            # Update file modification time
+            if self.params.get('updatetime', True):
+                info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))
+
+            self._hook_progress({
+                'downloaded_bytes': byte_counter,
+                'total_bytes': byte_counter,
+                'filename': ctx.filename,
+                'status': 'finished',
+                'elapsed': time.time() - ctx.start_time,
+            })
+
+            return True
+
+        while count <= retries:
+            try:
+                establish_connection()
+                return download()
+            except RetryDownload as e:
+                count += 1
+                if count <= retries:
+                    self.report_retry(e.source_error, count, retries)
+                continue
+            except NextFragment:
+                continue
+            except SucceedDownload:
+                return True
+
+        self.report_error('giving up after %s retries' % retries)
+        return False
diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py
new file mode 100644 (file)
index 0000000..1ca666b
--- /dev/null
@@ -0,0 +1,259 @@
+from __future__ import unicode_literals
+
+import time
+import binascii
+import io
+
+from .fragment import FragmentFD
+from ..compat import (
+    compat_Struct,
+    compat_urllib_error,
+)
+
+
+u8 = compat_Struct('>B')
+u88 = compat_Struct('>Bx')
+u16 = compat_Struct('>H')
+u1616 = compat_Struct('>Hxx')
+u32 = compat_Struct('>I')
+u64 = compat_Struct('>Q')
+
+s88 = compat_Struct('>bx')
+s16 = compat_Struct('>h')
+s1616 = compat_Struct('>hxx')
+s32 = compat_Struct('>i')
+
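+# Identity transformation matrix for movie/track headers: 16.16 fixed-point
+# 1.0 (0x10000) on the first two diagonal entries and 2.30 fixed-point 1.0
+# (0x40000000) as the final entry.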
+unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000)
+
+TRACK_ENABLED = 0x1
+TRACK_IN_MOVIE = 0x2
+TRACK_IN_PREVIEW = 0x4
+
+SELF_CONTAINED = 0x1
+
+
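+# ISO BMFF box helpers: a box is a 4-byte big-endian size (8 + payload
+# length) and a 4-byte type followed by the payload; a "full box" also
+# carries a 1-byte version and 3-byte flags at the start of its payload.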
+def box(box_type, payload):
+    return u32.pack(8 + len(payload)) + box_type + payload
+
+
+def full_box(box_type, version, flags, payload):
+    return box(box_type, u8.pack(version) + u32.pack(flags)[1:] + payload)
+
+
+def write_piff_header(stream, params):
+    track_id = params['track_id']
+    fourcc = params['fourcc']
+    duration = params['duration']
+    timescale = params.get('timescale', 10000000)
+    language = params.get('language', 'und')
+    height = params.get('height', 0)
+    width = params.get('width', 0)
+    is_audio = width == 0 and height == 0
+    creation_time = modification_time = int(time.time())
+
+    ftyp_payload = b'isml'  # major brand
+    ftyp_payload += u32.pack(1)  # minor version
+    ftyp_payload += b'piff' + b'iso2'  # compatible brands
+    stream.write(box(b'ftyp', ftyp_payload))  # File Type Box
+
+    mvhd_payload = u64.pack(creation_time)
+    mvhd_payload += u64.pack(modification_time)
+    mvhd_payload += u32.pack(timescale)
+    mvhd_payload += u64.pack(duration)
+    mvhd_payload += s1616.pack(1)  # rate
+    mvhd_payload += s88.pack(1)  # volume
+    mvhd_payload += u16.pack(0)  # reserved
+    mvhd_payload += u32.pack(0) * 2  # reserved
+    mvhd_payload += unity_matrix
+    mvhd_payload += u32.pack(0) * 6  # pre defined
+    mvhd_payload += u32.pack(0xffffffff)  # next track id
+    moov_payload = full_box(b'mvhd', 1, 0, mvhd_payload)  # Movie Header Box
+
+    tkhd_payload = u64.pack(creation_time)
+    tkhd_payload += u64.pack(modification_time)
+    tkhd_payload += u32.pack(track_id)  # track id
+    tkhd_payload += u32.pack(0)  # reserved
+    tkhd_payload += u64.pack(duration)
+    tkhd_payload += u32.pack(0) * 2  # reserved
+    tkhd_payload += s16.pack(0)  # layer
+    tkhd_payload += s16.pack(0)  # alternate group
+    tkhd_payload += s88.pack(1 if is_audio else 0)  # volume
+    tkhd_payload += u16.pack(0)  # reserved
+    tkhd_payload += unity_matrix
+    tkhd_payload += u1616.pack(width)
+    tkhd_payload += u1616.pack(height)
+    trak_payload = full_box(b'tkhd', 1, TRACK_ENABLED | TRACK_IN_MOVIE | TRACK_IN_PREVIEW, tkhd_payload)  # Track Header Box
+
+    mdhd_payload = u64.pack(creation_time)
+    mdhd_payload += u64.pack(modification_time)
+    mdhd_payload += u32.pack(timescale)
+    mdhd_payload += u64.pack(duration)
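+    # ISO 639-2 language code packed as three 5-bit letters (each offset by
+    # 0x60) into the low 15 bits; 'und' becomes 0x55c4.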
+    mdhd_payload += u16.pack(((ord(language[0]) - 0x60) << 10) | ((ord(language[1]) - 0x60) << 5) | (ord(language[2]) - 0x60))
+    mdhd_payload += u16.pack(0)  # pre defined
+    mdia_payload = full_box(b'mdhd', 1, 0, mdhd_payload)  # Media Header Box
+
+    hdlr_payload = u32.pack(0)  # pre defined
+    hdlr_payload += b'soun' if is_audio else b'vide'  # handler type
+    hdlr_payload += u32.pack(0) * 3  # reserved
+    hdlr_payload += (b'Sound' if is_audio else b'Video') + b'Handler\0'  # name
+    mdia_payload += full_box(b'hdlr', 0, 0, hdlr_payload)  # Handler Reference Box
+
+    if is_audio:
+        smhd_payload = s88.pack(0)  # balance
+        smhd_payload += u16.pack(0)  # reserved
+        media_header_box = full_box(b'smhd', 0, 0, smhd_payload)  # Sound Media Header
+    else:
+        vmhd_payload = u16.pack(0)  # graphics mode
+        vmhd_payload += u16.pack(0) * 3  # opcolor
+        media_header_box = full_box(b'vmhd', 0, 1, vmhd_payload)  # Video Media Header
+    minf_payload = media_header_box
+
+    dref_payload = u32.pack(1)  # entry count
+    dref_payload += full_box(b'url ', 0, SELF_CONTAINED, b'')  # Data Entry URL Box
+    dinf_payload = full_box(b'dref', 0, 0, dref_payload)  # Data Reference Box
+    minf_payload += box(b'dinf', dinf_payload)  # Data Information Box
+
+    stsd_payload = u32.pack(1)  # entry count
+
+    sample_entry_payload = u8.pack(0) * 6  # reserved
+    sample_entry_payload += u16.pack(1)  # data reference index
+    if is_audio:
+        sample_entry_payload += u32.pack(0) * 2  # reserved
+        sample_entry_payload += u16.pack(params.get('channels', 2))
+        sample_entry_payload += u16.pack(params.get('bits_per_sample', 16))
+        sample_entry_payload += u16.pack(0)  # pre defined
+        sample_entry_payload += u16.pack(0)  # reserved
+        sample_entry_payload += u1616.pack(params['sampling_rate'])
+
+        if fourcc == 'AACL':
+            sample_entry_box = box(b'mp4a', sample_entry_payload)
+    else:
+        sample_entry_payload += u16.pack(0)  # pre defined
+        sample_entry_payload += u16.pack(0)  # reserved
+        sample_entry_payload += u32.pack(0) * 3  # pre defined
+        sample_entry_payload += u16.pack(width)
+        sample_entry_payload += u16.pack(height)
+        sample_entry_payload += u1616.pack(0x48)  # horiz resolution 72 dpi
+        sample_entry_payload += u1616.pack(0x48)  # vert resolution 72 dpi
+        sample_entry_payload += u32.pack(0)  # reserved
+        sample_entry_payload += u16.pack(1)  # frame count
+        sample_entry_payload += u8.pack(0) * 32  # compressor name
+        sample_entry_payload += u16.pack(0x18)  # depth
+        sample_entry_payload += s16.pack(-1)  # pre defined
+
+        codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8'))
+        if fourcc in ('H264', 'AVC1'):
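+            # codec_private_data holds the SPS and PPS separated by 4-byte
+            # start codes (0x00000001); split them out to build the avcC box.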
+            sps, pps = codec_private_data.split(u32.pack(1))[1:]
+            avcc_payload = u8.pack(1)  # configuration version
+            avcc_payload += sps[1:4]  # avc profile indication + profile compatibility + avc level indication
+            avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1))  # complete representation (1) + reserved (11111) + length size minus one
+            avcc_payload += u8.pack(1)  # reserved (0) + number of sps (0000001)
+            avcc_payload += u16.pack(len(sps))
+            avcc_payload += sps
+            avcc_payload += u8.pack(1)  # number of pps
+            avcc_payload += u16.pack(len(pps))
+            avcc_payload += pps
+            sample_entry_payload += box(b'avcC', avcc_payload)  # AVC Decoder Configuration Record
+            sample_entry_box = box(b'avc1', sample_entry_payload)  # AVC Simple Entry
+    stsd_payload += sample_entry_box
+
+    stbl_payload = full_box(b'stsd', 0, 0, stsd_payload)  # Sample Description Box
+
+    stts_payload = u32.pack(0)  # entry count
+    stbl_payload += full_box(b'stts', 0, 0, stts_payload)  # Decoding Time to Sample Box
+
+    stsc_payload = u32.pack(0)  # entry count
+    stbl_payload += full_box(b'stsc', 0, 0, stsc_payload)  # Sample To Chunk Box
+
+    stco_payload = u32.pack(0)  # entry count
+    stbl_payload += full_box(b'stco', 0, 0, stco_payload)  # Chunk Offset Box
+
+    minf_payload += box(b'stbl', stbl_payload)  # Sample Table Box
+
+    mdia_payload += box(b'minf', minf_payload)  # Media Information Box
+
+    trak_payload += box(b'mdia', mdia_payload)  # Media Box
+
+    moov_payload += box(b'trak', trak_payload)  # Track Box
+
+    mehd_payload = u64.pack(duration)
+    mvex_payload = full_box(b'mehd', 1, 0, mehd_payload)  # Movie Extends Header Box
+
+    trex_payload = u32.pack(track_id)  # track id
+    trex_payload += u32.pack(1)  # default sample description index
+    trex_payload += u32.pack(0)  # default sample duration
+    trex_payload += u32.pack(0)  # default sample size
+    trex_payload += u32.pack(0)  # default sample flags
+    mvex_payload += full_box(b'trex', 0, 0, trex_payload)  # Track Extends Box
+
+    moov_payload += box(b'mvex', mvex_payload)  # Movie Extends Box
+    stream.write(box(b'moov', moov_payload))  # Movie Box
+
+
+def extract_box_data(data, box_sequence):
+    data_reader = io.BytesIO(data)
+    while True:
+        box_size = u32.unpack(data_reader.read(4))[0]
+        box_type = data_reader.read(4)
+        if box_type == box_sequence[0]:
+            box_data = data_reader.read(box_size - 8)
+            if len(box_sequence) == 1:
+                return box_data
+            return extract_box_data(box_data, box_sequence[1:])
+        data_reader.seek(box_size - 8, 1)
+
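+# A usage sketch: extract_box_data recursively walks a nested ISOBMFF box
+# sequence and returns the payload of the innermost box. IsmFD below uses it
+# on the first fragment to recover the track id:
+#
+#     tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
+#     track_id = u32.unpack(tfhd_data[4:8])[0]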
+
+class IsmFD(FragmentFD):
+    """
+    Download segments in an ISM manifest
+    """
+
+    FD_NAME = 'ism'
+
+    def real_download(self, filename, info_dict):
+        segments = info_dict['fragments'][:1] if self.params.get(
+            'test', False) else info_dict['fragments']
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(segments),
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        fragment_retries = self.params.get('fragment_retries', 0)
+        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
+
+        track_written = False
+        frag_index = 0
+        for i, segment in enumerate(segments):
+            frag_index += 1
+            if frag_index <= ctx['fragment_index']:
+                continue
+            count = 0
+            while count <= fragment_retries:
+                try:
+                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
+                    if not success:
+                        return False
+                    if not track_written:
+                        tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
+                        info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
+                        write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
+                        track_written = True
+                    self._append_fragment(ctx, frag_content)
+                    break
+                except compat_urllib_error.HTTPError as err:
+                    count += 1
+                    if count <= fragment_retries:
+                        self.report_retry_fragment(err, frag_index, count, fragment_retries)
+            if count > fragment_retries:
+                if skip_unavailable_fragments:
+                    self.report_skip_fragment(frag_index)
+                    continue
+                self.report_error('giving up after %s fragment retries' % fragment_retries)
+                return False
+
+        self._finish_frag_download(ctx)
+
+        return True
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
new file mode 100644
index 0000000..fbb7f51
--- /dev/null
@@ -0,0 +1,214 @@
+from __future__ import unicode_literals
+
+import os
+import re
+import subprocess
+import time
+
+from .common import FileDownloader
+from ..compat import compat_str
+from ..utils import (
+    check_executable,
+    encodeFilename,
+    encodeArgument,
+    get_exe_version,
+)
+
+
+def rtmpdump_version():
+    return get_exe_version(
+        'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)')
+
+
+class RtmpFD(FileDownloader):
+    def real_download(self, filename, info_dict):
+        def run_rtmpdump(args):
+            start = time.time()
+            resume_percent = None
+            resume_downloaded_data_len = None
+            proc = subprocess.Popen(args, stderr=subprocess.PIPE)
+            cursor_in_new_line = True
+            proc_stderr_closed = False
+            try:
+                while not proc_stderr_closed:
+                    # read line from stderr
+                    line = ''
+                    while True:
+                        char = proc.stderr.read(1)
+                        if not char:
+                            proc_stderr_closed = True
+                            break
+                        if char in [b'\r', b'\n']:
+                            break
+                        line += char.decode('ascii', 'replace')
+                    if not line:
+                        # proc_stderr_closed is True
+                        continue
+                    mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
+                    if mobj:
+                        downloaded_data_len = int(float(mobj.group(1)) * 1024)
+                        percent = float(mobj.group(2))
+                        if not resume_percent:
+                            resume_percent = percent
+                            resume_downloaded_data_len = downloaded_data_len
+                        time_now = time.time()
+                        eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent)
+                        speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len)
+                        data_len = None
+                        if percent > 0:
+                            data_len = int(downloaded_data_len * 100 / percent)
+                        self._hook_progress({
+                            'status': 'downloading',
+                            'downloaded_bytes': downloaded_data_len,
+                            'total_bytes_estimate': data_len,
+                            'tmpfilename': tmpfilename,
+                            'filename': filename,
+                            'eta': eta,
+                            'elapsed': time_now - start,
+                            'speed': speed,
+                        })
+                        cursor_in_new_line = False
+                    else:
+                        # no percent for live streams
+                        mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line)
+                        if mobj:
+                            downloaded_data_len = int(float(mobj.group(1)) * 1024)
+                            time_now = time.time()
+                            speed = self.calc_speed(start, time_now, downloaded_data_len)
+                            self._hook_progress({
+                                'downloaded_bytes': downloaded_data_len,
+                                'tmpfilename': tmpfilename,
+                                'filename': filename,
+                                'status': 'downloading',
+                                'elapsed': time_now - start,
+                                'speed': speed,
+                            })
+                            cursor_in_new_line = False
+                        elif self.params.get('verbose', False):
+                            if not cursor_in_new_line:
+                                self.to_screen('')
+                            cursor_in_new_line = True
+                            self.to_screen('[rtmpdump] ' + line)
+            finally:
+                proc.wait()
+            if not cursor_in_new_line:
+                self.to_screen('')
+            return proc.returncode
+
+        url = info_dict['url']
+        player_url = info_dict.get('player_url')
+        page_url = info_dict.get('page_url')
+        app = info_dict.get('app')
+        play_path = info_dict.get('play_path')
+        tc_url = info_dict.get('tc_url')
+        flash_version = info_dict.get('flash_version')
+        live = info_dict.get('rtmp_live', False)
+        conn = info_dict.get('rtmp_conn')
+        protocol = info_dict.get('rtmp_protocol')
+        real_time = info_dict.get('rtmp_real_time', False)
+        no_resume = info_dict.get('no_resume', False)
+        continue_dl = self.params.get('continuedl', True)
+
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+        test = self.params.get('test', False)
+
+        # Check for rtmpdump first
+        if not check_executable('rtmpdump', ['-h']):
+            self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it.')
+            return False
+
+        # Download using rtmpdump. rtmpdump returns exit code 2 when
+        # the connection was interrupted and resuming appears to be
+        # possible. This is part of rtmpdump's normal usage, AFAIK.
+        basic_args = [
+            'rtmpdump', '--verbose', '-r', url,
+            '-o', tmpfilename]
+        if player_url is not None:
+            basic_args += ['--swfVfy', player_url]
+        if page_url is not None:
+            basic_args += ['--pageUrl', page_url]
+        if app is not None:
+            basic_args += ['--app', app]
+        if play_path is not None:
+            basic_args += ['--playpath', play_path]
+        if tc_url is not None:
+            basic_args += ['--tcUrl', tc_url]
+        if test:
+            basic_args += ['--stop', '1']
+        if flash_version is not None:
+            basic_args += ['--flashVer', flash_version]
+        if live:
+            basic_args += ['--live']
+        if isinstance(conn, list):
+            for entry in conn:
+                basic_args += ['--conn', entry]
+        elif isinstance(conn, compat_str):
+            basic_args += ['--conn', conn]
+        if protocol is not None:
+            basic_args += ['--protocol', protocol]
+        if real_time:
+            basic_args += ['--realtime']
+
+        args = basic_args
+        if not no_resume and continue_dl and not live:
+            args += ['--resume']
+        if not live and continue_dl:
+            args += ['--skip', '1']
+
+        args = [encodeArgument(a) for a in args]
+
+        self._debug_cmd(args, exe='rtmpdump')
+
+        RD_SUCCESS = 0
+        RD_FAILED = 1
+        RD_INCOMPLETE = 2
+        RD_NO_CONNECT = 3
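+        # rtmpdump exit codes: RD_SUCCESS is a complete download; RD_INCOMPLETE
+        # and RD_FAILED are retried below with --resume (RD_FAILED additionally
+        # passes --skip 1) for as long as the file keeps growing; RD_NO_CONNECT
+        # aborts immediately.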
+
+        started = time.time()
+
+        try:
+            retval = run_rtmpdump(args)
+        except KeyboardInterrupt:
+            if not info_dict.get('is_live'):
+                raise
+            retval = RD_SUCCESS
+            self.to_screen('\n[rtmpdump] Interrupted by user')
+
+        if retval == RD_NO_CONNECT:
+            self.report_error('[rtmpdump] Could not connect to RTMP server.')
+            return False
+
+        while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live:
+            prevsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize)
+            time.sleep(5.0)  # This seems to be needed
+            args = basic_args + ['--resume']
+            if retval == RD_FAILED:
+                args += ['--skip', '1']
+            args = [encodeArgument(a) for a in args]
+            retval = run_rtmpdump(args)
+            cursize = os.path.getsize(encodeFilename(tmpfilename))
+            if prevsize == cursize and retval == RD_FAILED:
+                break
+            # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
+            if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
+                self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
+                retval = RD_SUCCESS
+                break
+        if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE):
+            fsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize)
+            self.try_rename(tmpfilename, filename)
+            self._hook_progress({
+                'downloaded_bytes': fsize,
+                'total_bytes': fsize,
+                'filename': filename,
+                'status': 'finished',
+                'elapsed': time.time() - started,
+            })
+            return True
+        else:
+            self.to_stderr('\n')
+            self.report_error('rtmpdump exited with code %d' % retval)
+            return False
diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py
new file mode 100644
index 0000000..939358b
--- /dev/null
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+import os
+import subprocess
+
+from .common import FileDownloader
+from ..utils import (
+    check_executable,
+    encodeFilename,
+)
+
+
+class RtspFD(FileDownloader):
+    def real_download(self, filename, info_dict):
+        url = info_dict['url']
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+
+        if check_executable('mplayer', ['-h']):
+            args = [
+                'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy',
+                '-dumpstream', '-dumpfile', tmpfilename, url]
+        elif check_executable('mpv', ['-h']):
+            args = [
+                'mpv', '-really-quiet', '--vo=null', '--stream-dump=' + tmpfilename, url]
+        else:
+            self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install one of them.')
+            return False
+
+        self._debug_cmd(args)
+
+        retval = subprocess.call(args)
+        if retval == 0:
+            fsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen('\r[%s] %s bytes' % (args[0], fsize))
+            self.try_rename(tmpfilename, filename)
+            self._hook_progress({
+                'downloaded_bytes': fsize,
+                'total_bytes': fsize,
+                'filename': filename,
+                'status': 'finished',
+            })
+            return True
+        else:
+            self.to_stderr('\n')
+            self.report_error('%s exited with code %d' % (args[0], retval))
+            return False
diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py
new file mode 100644
index 0000000..4932dd9
--- /dev/null
@@ -0,0 +1,94 @@
+from __future__ import division, unicode_literals
+
+import re
+import json
+
+from .fragment import FragmentFD
+
+
+class YoutubeLiveChatReplayFD(FragmentFD):
+    """ Downloads YouTube live chat replays fragment by fragment """
+
+    FD_NAME = 'youtube_live_chat_replay'
+
+    def real_download(self, filename, info_dict):
+        video_id = info_dict['video_id']
+        self.to_screen('[%s] Downloading live chat' % self.FD_NAME)
+
+        test = self.params.get('test', False)
+
+        ctx = {
+            'filename': filename,
+            'live': True,
+            'total_frags': None,
+        }
+
+        def dl_fragment(url):
+            headers = info_dict.get('http_headers', {})
+            return self._download_fragment(ctx, url, info_dict, headers)
+
+        def parse_yt_initial_data(data):
+            window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});'
+            var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});'
+            for patt in window_patt, var_patt:
+                try:
+                    raw_json = re.search(patt, data).group(1)
+                    return json.loads(raw_json)
+                except AttributeError:
+                    continue
+
+        self._prepare_and_start_frag_download(ctx)
+
+        success, raw_fragment = dl_fragment(
+            'https://www.youtube.com/watch?v={}'.format(video_id))
+        if not success:
+            return False
+        data = parse_yt_initial_data(raw_fragment)
+        continuation_id = data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
+        # no data yet but required to call _append_fragment
+        self._append_fragment(ctx, b'')
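+        # Replay protocol, as implemented below: the first continuation id is
+        # scraped from the watch page's ytInitialData; every subsequent response
+        # yields the next continuation id plus the last chat item's video offset,
+        # which (rewound by 5 seconds for overlap) seeds playerOffsetMs of the
+        # following request.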
+
+        first = True
+        offset = None
+        while continuation_id is not None:
+            data = None
+            if first:
+                url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id)
+                success, raw_fragment = dl_fragment(url)
+                if not success:
+                    return False
+                data = parse_yt_initial_data(raw_fragment)
+            else:
+                url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay'
+                       + '?continuation={}'.format(continuation_id)
+                       + '&playerOffsetMs={}'.format(offset - 5000)
+                       + '&hidden=false'
+                       + '&pbj=1')
+                success, raw_fragment = dl_fragment(url)
+                if not success:
+                    return False
+                data = json.loads(raw_fragment)['response']
+
+            first = False
+            continuation_id = None
+
+            live_chat_continuation = data['continuationContents']['liveChatContinuation']
+            offset = None
+            processed_fragment = bytearray()
+            if 'actions' in live_chat_continuation:
+                for action in live_chat_continuation['actions']:
+                    if 'replayChatItemAction' in action:
+                        replay_chat_item_action = action['replayChatItemAction']
+                        offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
+                    processed_fragment.extend(
+                        json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
+                continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+
+            self._append_fragment(ctx, processed_fragment)
+
+            if test or offset is None:
+                break
+
+        self._finish_frag_download(ctx)
+
+        return True
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
new file mode 100644
index 0000000..18d8dbc
--- /dev/null
@@ -0,0 +1,46 @@
+from __future__ import unicode_literals
+
+try:
+    from .lazy_extractors import *
+    from .lazy_extractors import _ALL_CLASSES
+    _LAZY_LOADER = True
+except ImportError:
+    _LAZY_LOADER = False
+    from .extractors import *
+
+    _ALL_CLASSES = [
+        klass
+        for name, klass in globals().items()
+        if name.endswith('IE') and name != 'GenericIE'
+    ]
+    _ALL_CLASSES.append(GenericIE)
+
+
+def gen_extractor_classes():
+    """ Return a list of supported extractors.
+    The order does matter; the first extractor matched is the one handling the URL.
+    """
+    return _ALL_CLASSES
+
+
+def gen_extractors():
+    """ Return a list of an instance of every supported extractor.
+    The order does matter; the first extractor matched is the one handling the URL.
+    """
+    return [klass() for klass in gen_extractor_classes()]
+
+
+def list_extractors(age_limit):
+    """
+    Return a list of extractors that are suitable for the given age,
+    sorted by extractor ID.
+    """
+
+    return sorted(
+        filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()),
+        key=lambda ie: ie.IE_NAME.lower())
+
+
+def get_info_extractor(ie_name):
+    """Returns the info extractor class with the given ie_name"""
+    return globals()[ie_name + 'IE']
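+
+
+# A usage sketch (assuming the package is importable and the extractor exists):
+#
+#     from youtube_dl.extractor import get_info_extractor
+#     ie = get_info_extractor('Youtube')  # resolves globals()['YoutubeIE']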
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
new file mode 100644
index 0000000..6637f4f
--- /dev/null
@@ -0,0 +1,193 @@
+from __future__ import unicode_literals
+
+import hashlib
+import hmac
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    js_to_json,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+    unescapeHTML,
+    update_url_query,
+)
+
+
+class ABCIE(InfoExtractor):
+    IE_NAME = 'abc.net.au'
+    _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
+        'md5': 'cb3dd03b18455a661071ee1e28344d9f',
+        'info_dict': {
+            'id': '5868334',
+            'ext': 'mp4',
+            'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',
+            'description': 'md5:809ad29c67a05f54eb41f2a105693a67',
+        },
+        'skip': 'this video has expired',
+    }, {
+        'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326',
+        'md5': 'db2a5369238b51f9811ad815b69dc086',
+        'info_dict': {
+            'id': 'NvqvPeNZsHU',
+            'ext': 'mp4',
+            'upload_date': '20150816',
+            'uploader': 'ABC News (Australia)',
+            'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef',
+            'uploader_id': 'NewsOnABC',
+            'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill',
+        },
+        'add_ie': ['Youtube'],
+        'skip': 'Not accessible from Travis CI server',
+    }, {
+        'url': 'http://www.abc.net.au/news/2015-10-23/nab-lifts-interest-rates-following-westpac-and-cba/6880080',
+        'md5': 'b96eee7c9edf4fc5a358a0252881cc1f',
+        'info_dict': {
+            'id': '6880080',
+            'ext': 'mp3',
+            'title': 'NAB lifts interest rates, following Westpac and CBA',
+            'description': 'md5:f13d8edc81e462fce4a0437c7dc04728',
+        },
+    }, {
+        'url': 'http://www.abc.net.au/news/2015-10-19/6866214',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        mobj = re.search(
+            r'inline(?P<type>Video|Audio|YouTube)Data\.push\((?P<json_data>[^)]+)\);',
+            webpage)
+        if mobj is None:
+            expired = self._html_search_regex(r'(?s)class="expired-(?:video|audio)".+?<span>(.+?)</span>', webpage, 'expired', None)
+            if expired:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, expired), expected=True)
+            raise ExtractorError('Unable to extract video urls')
+
+        urls_info = self._parse_json(
+            mobj.group('json_data'), video_id, transform_source=js_to_json)
+
+        if not isinstance(urls_info, list):
+            urls_info = [urls_info]
+
+        if mobj.group('type') == 'YouTube':
+            return self.playlist_result([
+                self.url_result(url_info['url']) for url_info in urls_info])
+
+        formats = [{
+            'url': url_info['url'],
+            'vcodec': url_info.get('codec') if mobj.group('type') == 'Video' else 'none',
+            'width': int_or_none(url_info.get('width')),
+            'height': int_or_none(url_info.get('height')),
+            'tbr': int_or_none(url_info.get('bitrate')),
+            'filesize': int_or_none(url_info.get('filesize')),
+        } for url_info in urls_info]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
+
+
+class ABCIViewIE(InfoExtractor):
+    IE_NAME = 'abc.net.au:iview'
+    _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P<id>[^/?#]+)'
+    _GEO_COUNTRIES = ['AU']
+
+    # ABC iview programs are normally available for 14 days only.
+    _TESTS = [{
+        'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00',
+        'md5': '67715ce3c78426b11ba167d875ac6abf',
+        'info_dict': {
+            'id': 'LE1927H001S00',
+            'ext': 'mp4',
+            'title': "Series 11 Ep 1",
+            'series': "Gruen",
+            'description': 'md5:52cc744ad35045baf6aded2ce7287f67',
+            'upload_date': '20190925',
+            'uploader_id': 'abc1',
+            'timestamp': 1569445289,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_params = self._download_json(
+            'https://iview.abc.net.au/api/programs/' + video_id, video_id)
+        title = unescapeHTML(video_params.get('title') or video_params['seriesTitle'])
+        stream = next(s for s in video_params['playlist'] if s.get('type') in ('program', 'livestream'))
+
+        house_number = video_params.get('episodeHouseNumber') or video_id
+        path = '/auth/hls/sign?ts={0}&hn={1}&d=android-tablet'.format(
+            int(time.time()), house_number)
+        sig = hmac.new(
+            b'android.content.res.Resources',
+            path.encode('utf-8'), hashlib.sha256).hexdigest()
+        token = self._download_webpage(
+            'http://iview.abc.net.au{0}&sig={1}'.format(path, sig), video_id)
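+        # The signed path (timestamp, episode house number, device string) is
+        # authenticated with HMAC-SHA256 under a static key; the sign endpoint
+        # replies with an 'hdnea' token that tokenize_url below attaches to the
+        # stream URLs.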
+
+        def tokenize_url(url, token):
+            return update_url_query(url, {
+                'hdnea': token,
+            })
+
+        for sd in ('720', 'sd', 'sd-low'):
+            sd_url = try_get(
+                stream, lambda x: x['streams']['hls'][sd], compat_str)
+            if not sd_url:
+                continue
+            formats = self._extract_m3u8_formats(
+                tokenize_url(sd_url, token), video_id, 'mp4',
+                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+            if formats:
+                break
+        self._sort_formats(formats)
+
+        subtitles = {}
+        src_vtt = stream.get('captions', {}).get('src-vtt')
+        if src_vtt:
+            subtitles['en'] = [{
+                'url': src_vtt,
+                'ext': 'vtt',
+            }]
+
+        is_live = video_params.get('livestream') == '1'
+        if is_live:
+            title = self._live_title(title)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_params.get('description'),
+            'thumbnail': video_params.get('thumbnail'),
+            'duration': int_or_none(video_params.get('eventDuration')),
+            'timestamp': parse_iso8601(video_params.get('pubDate'), ' '),
+            'series': unescapeHTML(video_params.get('seriesTitle')),
+            'series_id': video_params.get('seriesHouseNumber') or video_id[:7],
+            'season_number': int_or_none(self._search_regex(
+                r'\bSeries\s+(\d+)\b', title, 'season number', default=None)),
+            'episode_number': int_or_none(self._search_regex(
+                r'\bEp\s+(\d+)\b', title, 'episode number', default=None)),
+            'episode_id': house_number,
+            'uploader_id': video_params.get('channel'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py
new file mode 100644
index 0000000..8b407bf
--- /dev/null
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import re
+import time
+
+from .amp import AMPIE
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..compat import compat_urlparse
+
+
+class AbcNewsVideoIE(AMPIE):
+    IE_NAME = 'abcnews:video'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            abcnews\.go\.com/
+                            (?:
+                                [^/]+/video/(?P<display_id>[0-9a-z-]+)-|
+                                video/embed\?.*?\bid=
+                            )|
+                            fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/
+                        )
+                        (?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
+        'info_dict': {
+            'id': '20411932',
+            'ext': 'mp4',
+            'display_id': 'week-exclusive-irans-foreign-minister-zarif',
+            'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
+            'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
+            'duration': 180,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://abcnews.go.com/video/embed?id=46979033',
+        'only_matching': True,
+    }, {
+        'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        video_id = mobj.group('id')
+        info_dict = self._extract_feed_info(
+            'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+        return info_dict
+
+
+class AbcNewsIE(InfoExtractor):
+    IE_NAME = 'abcnews'
+    _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
+        'info_dict': {
+            'id': '10505354',
+            'ext': 'flv',
+            'display_id': 'dramatic-video-rare-death-job-america',
+            'title': 'Occupational Hazards',
+            'description': 'Nightline investigates the dangers that lurk at various jobs.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20100428',
+            'timestamp': 1272412800,
+        },
+        'add_ie': ['AbcNewsVideo'],
+    }, {
+        'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
+        'info_dict': {
+            'id': '38897857',
+            'ext': 'mp4',
+            'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
+            'title': 'Justin Timberlake Drops Hints For Secret Single',
+            'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
+            'upload_date': '20160515',
+            'timestamp': 1463329500,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+            # The embedded YouTube video is blocked due to copyright issues
+            'playlist_items': '1',
+        },
+        'add_ie': ['AbcNewsVideo'],
+    }, {
+        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        video_url = self._search_regex(
+            r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
+        full_video_url = compat_urlparse.urljoin(url, video_url)
+
+        youtube_url = YoutubeIE._extract_url(webpage)
+
+        timestamp = None
+        date_str = self._html_search_regex(
+            r'<span[^>]+class="timestamp">([^<]+)</span>',
+            webpage, 'timestamp', fatal=False)
+        if date_str:
+            tz_offset = 0
+            if date_str.endswith(' ET'):  # Eastern Time
+                tz_offset = -5
+                date_str = date_str[:-3]
+            date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
+            for date_format in date_formats:
+                try:
+                    timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
+                except ValueError:
+                    continue
+            if timestamp is not None:
+                timestamp -= tz_offset * 3600
+
+        entry = {
+            '_type': 'url_transparent',
+            'ie_key': AbcNewsVideoIE.ie_key(),
+            'url': full_video_url,
+            'id': video_id,
+            'display_id': display_id,
+            'timestamp': timestamp,
+        }
+
+        if youtube_url:
+            entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]
+            return self.playlist_result(entries)
+
+        return entry
diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py
new file mode 100644
index 0000000..0bc69a6
--- /dev/null
@@ -0,0 +1,137 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    dict_get,
+    int_or_none,
+    try_get,
+)
+
+
+class ABCOTVSIE(InfoExtractor):
+    IE_NAME = 'abcotvs'
+    IE_DESC = 'ABC Owned Television Stations'
+    _VALID_URL = r'https?://(?P<site>abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P<display_id>[^/]+))?/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
+            'info_dict': {
+                'id': '472548',
+                'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
+                'ext': 'mp4',
+                'title': 'East Bay museum celebrates synthesized music',
+                'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'timestamp': 1421118520,
+                'upload_date': '20150113',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://abc7news.com/472581',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/',
+            'only_matching': True,
+        },
+    ]
+    _SITE_MAP = {
+        '6abc': 'wpvi',
+        'abc11': 'wtvd',
+        'abc13': 'ktrk',
+        'abc30': 'kfsn',
+        'abc7': 'kabc',
+        'abc7chicago': 'wls',
+        'abc7news': 'kgo',
+        'abc7ny': 'wabc',
+    }
+
+    def _real_extract(self, url):
+        site, display_id, video_id = re.match(self._VALID_URL, url).groups()
+        display_id = display_id or video_id
+        station = self._SITE_MAP[site]
+
+        data = self._download_json(
+            'https://api.abcotvs.com/v2/content', display_id, query={
+                'id': video_id,
+                'key': 'otv.web.%s.story' % station,
+                'station': station,
+            })['data']
+        video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data
+        video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id))
+        title = video.get('title') or video['linkText']
+
+        formats = []
+        m3u8_url = video.get('m3u8')
+        if m3u8_url:
+            formats = self._extract_m3u8_formats(
+                video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False)
+        mp4_url = video.get('mp4')
+        if mp4_url:
+            formats.append({
+                'abr': 128,
+                'format_id': 'https',
+                'height': 360,
+                'url': mp4_url,
+                'width': 640,
+            })
+        self._sort_formats(formats)
+
+        image = video.get('image') or {}
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': dict_get(video, ('description', 'caption'), try_get(video, lambda x: x['meta']['description'])),
+            'thumbnail': dict_get(image, ('source', 'dynamicSource')),
+            'timestamp': int_or_none(video.get('date')),
+            'duration': int_or_none(video.get('length')),
+            'formats': formats,
+        }
+
+
+class ABCOTVSClipsIE(InfoExtractor):
+    IE_NAME = 'abcotvs:clips'
+    _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://clips.abcotvs.com/kabc/video/214814',
+        'info_dict': {
+            'id': '214814',
+            'ext': 'mp4',
+            'title': 'SpaceX launch pad explosion destroys rocket, satellite',
+            'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b',
+            'upload_date': '20160901',
+            'timestamp': 1472756695,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0]
+        title = video_data['title']
+        formats = self._extract_m3u8_formats(
+            video_data['videoURL'].split('?')[0], video_id, 'mp4')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('thumbnailURL'),
+            'duration': int_or_none(video_data.get('duration')),
+            'timestamp': int_or_none(video_data.get('pubDate')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py
new file mode 100644
index 0000000..3409550
--- /dev/null
@@ -0,0 +1,41 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class AcademicEarthCourseIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
+    IE_NAME = 'AcademicEarth:Course'
+    _TEST = {
+        'url': 'http://academicearth.org/playlists/laws-of-nature/',
+        'info_dict': {
+            'id': 'laws-of-nature',
+            'title': 'Laws of Nature',
+            'description': 'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.',
+        },
+        'playlist_count': 3,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+        title = self._html_search_regex(
+            r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, 'title')
+        description = self._html_search_regex(
+            r'<p class="excerpt"[^>]*?>(.*?)</p>',
+            webpage, 'description', fatal=False)
+        urls = re.findall(
+            r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
+            webpage)
+        entries = [self.url_result(u) for u in urls]
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': title,
+            'description': description,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py
new file mode 100644
index 0000000..b17c792
--- /dev/null
@@ -0,0 +1,135 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import functools
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    float_or_none,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+    OnDemandPagedList,
+)
+
+
+class ACastIE(InfoExtractor):
+    IE_NAME = 'acast'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?:embed|www)\.)?acast\.com/|
+                            play\.acast\.com/s/
+                        )
+                        (?P<channel>[^/]+)/(?P<id>[^/#?]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
+        'md5': '16d936099ec5ca2d5869e3a813ee8dc4',
+        'info_dict': {
+            'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
+            'ext': 'mp3',
+            'title': '2. Raggarmordet - Röster ur det förflutna',
+            'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4',
+            'timestamp': 1477346700,
+            'upload_date': '20161024',
+            'duration': 2766.602563,
+            'creator': 'Anton Berg & Martin Johnson',
+            'series': 'Spår',
+            'episode': '2. Raggarmordet - Röster ur det förflutna',
+        }
+    }, {
+        'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
+        'only_matching': True,
+    }, {
+        'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
+        'only_matching': True,
+    }, {
+        'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel, display_id = re.match(self._VALID_URL, url).groups()
+        s = self._download_json(
+            'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id),
+            display_id)
+        media_url = s['url']
+        if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id):
+            episode_url = s.get('episodeUrl')
+            if episode_url:
+                display_id = episode_url
+            else:
+                channel, display_id = re.match(self._VALID_URL, s['link']).groups()
+        cast_data = self._download_json(
+            'https://play-api.acast.com/splash/%s/%s' % (channel, display_id),
+            display_id)['result']
+        e = cast_data['episode']
+        title = e.get('name') or s['title']
+        return {
+            'id': compat_str(e['id']),
+            'display_id': display_id,
+            'url': media_url,
+            'title': title,
+            'description': e.get('summary') or clean_html(e.get('description') or s.get('description')),
+            'thumbnail': e.get('image'),
+            'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')),
+            'duration': float_or_none(e.get('duration') or s.get('duration')),
+            'filesize': int_or_none(e.get('contentLength')),
+            'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str),
+            'series': try_get(cast_data, lambda x: x['show']['name'], compat_str),
+            'season_number': int_or_none(e.get('seasonNumber')),
+            'episode': title,
+            'episode_number': int_or_none(e.get('episodeNumber')),
+        }
+
+
+class ACastChannelIE(InfoExtractor):
+    IE_NAME = 'acast:channel'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?acast\.com/|
+                            play\.acast\.com/s/
+                        )
+                        (?P<id>[^/#?]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://www.acast.com/todayinfocus',
+        'info_dict': {
+            'id': '4efc5294-5385-4847-98bd-519799ce5786',
+            'title': 'Today in Focus',
+            'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
+        },
+        'playlist_mincount': 35,
+    }, {
+        'url': 'http://play.acast.com/s/ft-banking-weekly',
+        'only_matching': True,
+    }]
+    _API_BASE_URL = 'https://play.acast.com/api/'
+    _PAGE_SIZE = 10
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)
+
+    def _fetch_page(self, channel_slug, page):
+        casts = self._download_json(
+            self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
+            channel_slug, note='Downloading page %d of channel data' % page)
+        for cast in casts:
+            yield self.url_result(
+                'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']),
+                'ACast', cast['id'])
+
+    def _real_extract(self, url):
+        channel_slug = self._match_id(url)
+        channel_data = self._download_json(
+            self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug)
+        entries = OnDemandPagedList(functools.partial(
+            self._fetch_page, channel_slug), self._PAGE_SIZE)
+        return self.playlist_result(entries, compat_str(
+            channel_data['id']), channel_data['name'], channel_data.get('description'))
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py
new file mode 100644
index 0000000..c95ad21
--- /dev/null
@@ -0,0 +1,207 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import binascii
+import json
+import os
+import random
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+    compat_b64decode,
+    compat_ord,
+)
+from ..utils import (
+    bytes_to_intlist,
+    bytes_to_long,
+    ExtractorError,
+    float_or_none,
+    intlist_to_bytes,
+    long_to_bytes,
+    pkcs1pad,
+    strip_or_none,
+    urljoin,
+)
+
+
+class ADNIE(InfoExtractor):
+    IE_DESC = 'Anime Digital Network'
+    _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
+        'md5': 'e497370d847fd79d9d4c74be55575c7a',
+        'info_dict': {
+            'id': '7778',
+            'ext': 'mp4',
+            'title': 'Blue Exorcist - Kyôto Saga - Épisode 1',
+            'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
+        }
+    }
+    _BASE_URL = 'http://animedigitalnetwork.fr'
+    _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537)
+    _POS_ALIGN_MAP = {
+        'start': 1,
+        'end': 3,
+    }
+    _LINE_ALIGN_MAP = {
+        'middle': 8,
+        'end': 4,
+    }
+
+    @staticmethod
+    def _ass_subtitles_timecode(seconds):
+        return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)
+
+    def _get_subtitles(self, sub_path, video_id):
+        if not sub_path:
+            return None
+
+        enc_subtitles = self._download_webpage(
+            urljoin(self._BASE_URL, sub_path),
+            video_id, 'Downloading subtitles location', fatal=False) or '{}'
+        subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location')
+        if subtitle_location:
+            enc_subtitles = self._download_webpage(
+                urljoin(self._BASE_URL, subtitle_location),
+                video_id, 'Downloading subtitles data', fatal=False,
+                headers={'Origin': 'https://animedigitalnetwork.fr'})
+        if not enc_subtitles:
+            return None
+
+        # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
+        dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
+            bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
+            bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')),
+            bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
+        ))
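+        # Payload layout: the first 24 base64 characters decode to the AES-CBC
+        # IV, the remainder is the ciphertext; the key is self._K plus a static
+        # hex suffix, and the last plaintext byte gives the padding length
+        # (PKCS#7 style) stripped below.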
+        subtitles_json = self._parse_json(
+            dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(),
+            None, fatal=False)
+        if not subtitles_json:
+            return None
+
+        subtitles = {}
+        for sub_lang, sub in subtitles_json.items():
+            ssa = '''[Script Info]
+ScriptType:V4.00
+[V4 Styles]
+Format: Name,Fontname,Fontsize,PrimaryColour,SecondaryColour,TertiaryColour,BackColour,Bold,Italic,BorderStyle,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,AlphaLevel,Encoding
+Style: Default,Arial,18,16777215,16777215,16777215,0,-1,0,1,1,0,2,20,20,20,0,0
+[Events]
+Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
+            for current in sub:
+                start, end, text, line_align, position_align = (
+                    float_or_none(current.get('startTime')),
+                    float_or_none(current.get('endTime')),
+                    current.get('text'), current.get('lineAlign'),
+                    current.get('positionAlign'))
+                if start is None or end is None or text is None:
+                    continue
+                alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0)
+                ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % (
+                    self._ass_subtitles_timecode(start),
+                    self._ass_subtitles_timecode(end),
+                    '{\\a%d}' % alignment if alignment != 2 else '',
+                    text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))
+
+            if sub_lang == 'vostf':
+                sub_lang = 'fr'
+            subtitles.setdefault(sub_lang, []).extend([{
+                'ext': 'json',
+                'data': json.dumps(sub),
+            }, {
+                'ext': 'ssa',
+                'data': ssa,
+            }])
+        return subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        player_config = self._parse_json(self._search_regex(
+            r'playerConfig\s*=\s*({.+});', webpage,
+            'player config', default='{}'), video_id, fatal=False)
+        if not player_config:
+            config_url = urljoin(self._BASE_URL, self._search_regex(
+                r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"',
+                webpage, 'config url'))
+            player_config = self._download_json(
+                config_url, video_id,
+                'Downloading player config JSON metadata')['player']
+
+        video_info = {}
+        video_info_str = self._search_regex(
+            r'videoInfo\s*=\s*({.+});', webpage,
+            'video info', fatal=False)
+        if video_info_str:
+            video_info = self._parse_json(
+                video_info_str, video_id, fatal=False) or {}
+
+        options = player_config.get('options') or {}
+        metas = options.get('metas') or {}
+        links = player_config.get('links') or {}
+        sub_path = player_config.get('subtitles')
+        error = None
+        if not links:
+            links_url = player_config.get('linksurl') or options['videoUrl']
+            token = options['token']
+            self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
+            message = bytes_to_intlist(json.dumps({
+                'k': self._K,
+                'e': 60,
+                't': token,
+            }))
+            padded_message = intlist_to_bytes(pkcs1pad(message, 128))
+            n, e = self._RSA_KEY
+            encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
+            authorization = base64.b64encode(encrypted_message).decode()
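+            # self._K and the API token are sent RSA-encrypted (PKCS#1-padded
+            # message, textbook pow(m, e, n) with the hardcoded public key) as
+            # a Bearer authorization header on the links request.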
+            links_data = self._download_json(
+                urljoin(self._BASE_URL, links_url), video_id,
+                'Downloading links JSON metadata', headers={
+                    'Authorization': 'Bearer ' + authorization,
+                })
+            links = links_data.get('links') or {}
+            metas = metas or links_data.get('meta') or {}
+            sub_path = sub_path or links_data.get('subtitles') or \
+                'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id
+            sub_path += '&token=' + token
+            error = links_data.get('error')
+        title = metas.get('title') or video_info['title']
+
+        formats = []
+        for format_id, qualities in links.items():
+            if not isinstance(qualities, dict):
+                continue
+            for quality, load_balancer_url in qualities.items():
+                load_balancer_data = self._download_json(
+                    load_balancer_url, video_id,
+                    'Downloading %s %s JSON metadata' % (format_id, quality),
+                    fatal=False) or {}
+                m3u8_url = load_balancer_data.get('location')
+                if not m3u8_url:
+                    continue
+                m3u8_formats = self._extract_m3u8_formats(
+                    m3u8_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id, fatal=False)
+                if format_id == 'vf':
+                    for f in m3u8_formats:
+                        f['language'] = 'fr'
+                formats.extend(m3u8_formats)
+        if not error:
+            error = options.get('error')
+        if not formats and error:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': strip_or_none(metas.get('summary') or video_info.get('resume')),
+            'thumbnail': video_info.get('image'),
+            'formats': formats,
+            'subtitles': self.extract_subtitles(sub_path, video_id),
+            'episode': metas.get('subtitle') or video_info.get('videoTitle'),
+            'series': video_info.get('playlistTitle'),
+        }
diff --git a/youtube_dl/extractor/adobeconnect.py b/youtube_dl/extractor/adobeconnect.py
new file mode 100644
index 0000000..728549e
--- /dev/null
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
+class AdobeConnectIE(InfoExtractor):
+    _VALID_URL = r'https?://\w+\.adobeconnect\.com/(?P<id>[\w-]+)'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+        qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
+        is_live = qs.get('isLive', ['false'])[0] == 'true'
+        formats = []
+        for con_string in qs['conStrings'][0].split(','):
+            formats.append({
+                'format_id': con_string.split('://')[0],
+                'app': compat_urlparse.quote('?' + con_string.split('?')[1] + 'flvplayerapp/' + qs['appInstance'][0]),
+                'ext': 'flv',
+                'play_path': 'mp4:' + qs['streamName'][0],
+                'rtmp_conn': 'S:' + qs['ticket'][0],
+                'rtmp_live': is_live,
+                'url': con_string,
+            })
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'formats': formats,
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
new file mode 100644
index 0000000..38dca1b
--- /dev/null
@@ -0,0 +1,1572 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+import xml.etree.ElementTree as etree
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_kwargs,
+    compat_urlparse,
+)
+from ..utils import (
+    unescapeHTML,
+    urlencode_postdata,
+    unified_timestamp,
+    ExtractorError,
+    NO_DEFAULT,
+)
+
+
+MSO_INFO = {
+    'DTV': {
+        'name': 'DIRECTV',
+        'username_field': 'username',
+        'password_field': 'password',
+    },
+    'ATT': {
+        'name': 'AT&T U-verse',
+        'username_field': 'userid',
+        'password_field': 'password',
+    },
+    'ATTOTT': {
+        'name': 'DIRECTV NOW',
+        'username_field': 'email',
+        'password_field': 'loginpassword',
+    },
+    'Rogers': {
+        'name': 'Rogers',
+        'username_field': 'UserName',
+        'password_field': 'UserPassword',
+    },
+    'Comcast_SSO': {
+        'name': 'Comcast XFINITY',
+        'username_field': 'user',
+        'password_field': 'passwd',
+    },
+    'TWC': {
+        'name': 'Time Warner Cable | Spectrum',
+        'username_field': 'Ecom_User_ID',
+        'password_field': 'Ecom_Password',
+    },
+    'Brighthouse': {
+        'name': 'Bright House Networks | Spectrum',
+        'username_field': 'j_username',
+        'password_field': 'j_password',
+    },
+    'Charter_Direct': {
+        'name': 'Charter Spectrum',
+        'username_field': 'IDToken1',
+        'password_field': 'IDToken2',
+    },
+    'Verizon': {
+        'name': 'Verizon FiOS',
+        'username_field': 'IDToken1',
+        'password_field': 'IDToken2',
+    },
+    'thr030': {
+        'name': '3 Rivers Communications'
+    },
+    'com140': {
+        'name': 'Access Montana'
+    },
+    'acecommunications': {
+        'name': 'AcenTek'
+    },
+    'acm010': {
+        'name': 'Acme Communications'
+    },
+    'ada020': {
+        'name': 'Adams Cable Service'
+    },
+    'alb020': {
+        'name': 'Albany Mutual Telephone'
+    },
+    'algona': {
+        'name': 'Algona Municipal Utilities'
+    },
+    'allwest': {
+        'name': 'All West Communications'
+    },
+    'all025': {
+        'name': 'Allen\'s Communications'
+    },
+    'spl010': {
+        'name': 'Alliance Communications'
+    },
+    'all070': {
+        'name': 'ALLO Communications'
+    },
+    'alpine': {
+        'name': 'Alpine Communications'
+    },
+    'hun015': {
+        'name': 'American Broadband'
+    },
+    'nwc010': {
+        'name': 'American Broadband Missouri'
+    },
+    'com130-02': {
+        'name': 'American Community Networks'
+    },
+    'com130-01': {
+        'name': 'American Warrior Networks'
+    },
+    'tom020': {
+        'name': 'Amherst Telephone/Tomorrow Valley'
+    },
+    'tvc020': {
+        'name': 'Andycable'
+    },
+    'arkwest': {
+        'name': 'Arkwest Communications'
+    },
+    'art030': {
+        'name': 'Arthur Mutual Telephone Company'
+    },
+    'arvig': {
+        'name': 'Arvig'
+    },
+    'nttcash010': {
+        'name': 'Ashland Home Net'
+    },
+    'astound': {
+        'name': 'Astound (now Wave)'
+    },
+    'dix030': {
+        'name': 'ATC Broadband'
+    },
+    'ara010': {
+        'name': 'ATC Communications'
+    },
+    'she030-02': {
+        'name': 'Ayersville Communications'
+    },
+    'baldwin': {
+        'name': 'Baldwin Lightstream'
+    },
+    'bal040': {
+        'name': 'Ballard TV'
+    },
+    'cit025': {
+        'name': 'Bardstown Cable TV'
+    },
+    'bay030': {
+        'name': 'Bay Country Communications'
+    },
+    'tel095': {
+        'name': 'Beaver Creek Cooperative Telephone'
+    },
+    'bea020': {
+        'name': 'Beaver Valley Cable'
+    },
+    'bee010': {
+        'name': 'Bee Line Cable'
+    },
+    'wir030': {
+        'name': 'Beehive Broadband'
+    },
+    'bra020': {
+        'name': 'BELD'
+    },
+    'bel020': {
+        'name': 'Bellevue Municipal Cable'
+    },
+    'vol040-01': {
+        'name': 'Ben Lomand Connect / BLTV'
+    },
+    'bev010': {
+        'name': 'BEVCOMM'
+    },
+    'big020': {
+        'name': 'Big Sandy Broadband'
+    },
+    'ble020': {
+        'name': 'Bledsoe Telephone Cooperative'
+    },
+    'bvt010': {
+        'name': 'Blue Valley Tele-Communications'
+    },
+    'bra050': {
+        'name': 'Brandenburg Telephone Co.'
+    },
+    'bte010': {
+        'name': 'Bristol Tennessee Essential Services'
+    },
+    'annearundel': {
+        'name': 'Broadstripe'
+    },
+    'btc010': {
+        'name': 'BTC Communications'
+    },
+    'btc040': {
+        'name': 'BTC Vision - Nahunta'
+    },
+    'bul010': {
+        'name': 'Bulloch Telephone Cooperative'
+    },
+    'but010': {
+        'name': 'Butler-Bremer Communications'
+    },
+    'tel160-csp': {
+        'name': 'C Spire SNAP'
+    },
+    'csicable': {
+        'name': 'Cable Services Inc.'
+    },
+    'cableamerica': {
+        'name': 'CableAmerica'
+    },
+    'cab038': {
+        'name': 'CableSouth Media 3'
+    },
+    'weh010-camtel': {
+        'name': 'Cam-Tel Company'
+    },
+    'car030': {
+        'name': 'Cameron Communications'
+    },
+    'canbytel': {
+        'name': 'Canby Telcom'
+    },
+    'crt020': {
+        'name': 'CapRock Tv'
+    },
+    'car050': {
+        'name': 'Carnegie Cable'
+    },
+    'cas': {
+        'name': 'CAS Cable'
+    },
+    'casscomm': {
+        'name': 'CASSCOMM'
+    },
+    'mid180-02': {
+        'name': 'Catalina Broadband Solutions'
+    },
+    'cccomm': {
+        'name': 'CC Communications'
+    },
+    'nttccde010': {
+        'name': 'CDE Lightband'
+    },
+    'cfunet': {
+        'name': 'Cedar Falls Utilities'
+    },
+    'dem010-01': {
+        'name': 'Celect-Bloomer Telephone Area'
+    },
+    'dem010-02': {
+        'name': 'Celect-Bruce Telephone Area'
+    },
+    'dem010-03': {
+        'name': 'Celect-Citizens Connected Area'
+    },
+    'dem010-04': {
+        'name': 'Celect-Elmwood/Spring Valley Area'
+    },
+    'dem010-06': {
+        'name': 'Celect-Mosaic Telecom'
+    },
+    'dem010-05': {
+        'name': 'Celect-West WI Telephone Area'
+    },
+    'net010-02': {
+        'name': 'Cellcom/Nsight Telservices'
+    },
+    'cen100': {
+        'name': 'CentraCom'
+    },
+    'nttccst010': {
+        'name': 'Central Scott / CSTV'
+    },
+    'cha035': {
+        'name': 'Chaparral CableVision'
+    },
+    'cha050': {
+        'name': 'Chariton Valley Communication Corporation, Inc.'
+    },
+    'cha060': {
+        'name': 'Chatmoss Cablevision'
+    },
+    'nttcche010': {
+        'name': 'Cherokee Communications'
+    },
+    'che050': {
+        'name': 'Chesapeake Bay Communications'
+    },
+    'cimtel': {
+        'name': 'Cim-Tel Cable, LLC.'
+    },
+    'cit180': {
+        'name': 'Citizens Cablevision - Floyd, VA'
+    },
+    'cit210': {
+        'name': 'Citizens Cablevision, Inc.'
+    },
+    'cit040': {
+        'name': 'Citizens Fiber'
+    },
+    'cit250': {
+        'name': 'Citizens Mutual'
+    },
+    'war040': {
+        'name': 'Citizens Telephone Corporation'
+    },
+    'wat025': {
+        'name': 'City Of Monroe'
+    },
+    'wadsworth': {
+        'name': 'CityLink'
+    },
+    'nor100': {
+        'name': 'CL Tel'
+    },
+    'cla010': {
+        'name': 'Clarence Telephone and Cedar Communications'
+    },
+    'ser060': {
+        'name': 'Clear Choice Communications'
+    },
+    'tac020': {
+        'name': 'Click! Cable TV'
+    },
+    'war020': {
+        'name': 'CLICK1.NET'
+    },
+    'cml010': {
+        'name': 'CML Telephone Cooperative Association'
+    },
+    'cns': {
+        'name': 'CNS'
+    },
+    'com160': {
+        'name': 'Co-Mo Connect'
+    },
+    'coa020': {
+        'name': 'Coast Communications'
+    },
+    'coa030': {
+        'name': 'Coaxial Cable TV'
+    },
+    'mid055': {
+        'name': 'Cobalt TV (Mid-State Community TV)'
+    },
+    'col070': {
+        'name': 'Columbia Power & Water Systems'
+    },
+    'col080': {
+        'name': 'Columbus Telephone'
+    },
+    'nor105': {
+        'name': 'Communications 1 Cablevision, Inc.'
+    },
+    'com150': {
+        'name': 'Community Cable & Broadband'
+    },
+    'com020': {
+        'name': 'Community Communications Company'
+    },
+    'coy010': {
+        'name': 'commZoom'
+    },
+    'com025': {
+        'name': 'Complete Communication Services'
+    },
+    'cat020': {
+        'name': 'Comporium'
+    },
+    'com071': {
+        'name': 'ComSouth Telesys'
+    },
+    'consolidatedcable': {
+        'name': 'Consolidated'
+    },
+    'conwaycorp': {
+        'name': 'Conway Corporation'
+    },
+    'coo050': {
+        'name': 'Coon Valley Telecommunications Inc'
+    },
+    'coo080': {
+        'name': 'Cooperative Telephone Company'
+    },
+    'cpt010': {
+        'name': 'CP-TEL'
+    },
+    'cra010': {
+        'name': 'Craw-Kan Telephone'
+    },
+    'crestview': {
+        'name': 'Crestview Cable Communications'
+    },
+    'cross': {
+        'name': 'Cross TV'
+    },
+    'cro030': {
+        'name': 'Crosslake Communications'
+    },
+    'ctc040': {
+        'name': 'CTC - Brainerd MN'
+    },
+    'phe030': {
+        'name': 'CTV-Beam - East Alabama'
+    },
+    'cun010': {
+        'name': 'Cunningham Telephone & Cable'
+    },
+    'dpc010': {
+        'name': 'D & P Communications'
+    },
+    'dak030': {
+        'name': 'Dakota Central Telecommunications'
+    },
+    'nttcdel010': {
+        'name': 'Delcambre Telephone LLC'
+    },
+    'tel160-del': {
+        'name': 'Delta Telephone Company'
+    },
+    'sal040': {
+        'name': 'DiamondNet'
+    },
+    'ind060-dc': {
+        'name': 'Direct Communications'
+    },
+    'doy010': {
+        'name': 'Doylestown Cable TV'
+    },
+    'dic010': {
+        'name': 'DRN'
+    },
+    'dtc020': {
+        'name': 'DTC'
+    },
+    'dtc010': {
+        'name': 'DTC Cable (Delhi)'
+    },
+    'dum010': {
+        'name': 'Dumont Telephone Company'
+    },
+    'dun010': {
+        'name': 'Dunkerton Telephone Cooperative'
+    },
+    'cci010': {
+        'name': 'Duo County Telecom'
+    },
+    'eagle': {
+        'name': 'Eagle Communications'
+    },
+    'weh010-east': {
+        'name': 'East Arkansas Cable TV'
+    },
+    'eatel': {
+        'name': 'EATEL Video, LLC'
+    },
+    'ell010': {
+        'name': 'ECTA'
+    },
+    'emerytelcom': {
+        'name': 'Emery Telcom Video LLC'
+    },
+    'nor200': {
+        'name': 'Empire Access'
+    },
+    'endeavor': {
+        'name': 'Endeavor Communications'
+    },
+    'sun045': {
+        'name': 'Enhanced Telecommunications Corporation'
+    },
+    'mid030': {
+        'name': 'enTouch'
+    },
+    'epb020': {
+        'name': 'EPB Smartnet'
+    },
+    'jea010': {
+        'name': 'EPlus Broadband'
+    },
+    'com065': {
+        'name': 'ETC'
+    },
+    'ete010': {
+        'name': 'Etex Communications'
+    },
+    'fbc-tele': {
+        'name': 'F&B Communications'
+    },
+    'fal010': {
+        'name': 'Falcon Broadband'
+    },
+    'fam010': {
+        'name': 'FamilyView CableVision'
+    },
+    'far020': {
+        'name': 'Farmers Mutual Telephone Company'
+    },
+    'fay010': {
+        'name': 'Fayetteville Public Utilities'
+    },
+    'sal060': {
+        'name': 'fibrant'
+    },
+    'fid010': {
+        'name': 'Fidelity Communications'
+    },
+    'for030': {
+        'name': 'FJ Communications'
+    },
+    'fli020': {
+        'name': 'Flint River Communications'
+    },
+    'far030': {
+        'name': 'FMT - Jesup'
+    },
+    'foo010': {
+        'name': 'Foothills Communications'
+    },
+    'for080': {
+        'name': 'Forsyth CableNet'
+    },
+    'fbcomm': {
+        'name': 'Frankfort Plant Board'
+    },
+    'tel160-fra': {
+        'name': 'Franklin Telephone Company'
+    },
+    'nttcftc010': {
+        'name': 'FTC'
+    },
+    'fullchannel': {
+        'name': 'Full Channel, Inc.'
+    },
+    'gar040': {
+        'name': 'Gardonville Cooperative Telephone Association'
+    },
+    'gbt010': {
+        'name': 'GBT Communications, Inc.'
+    },
+    'tec010': {
+        'name': 'Genuine Telecom'
+    },
+    'clr010': {
+        'name': 'Giant Communications'
+    },
+    'gla010': {
+        'name': 'Glasgow EPB'
+    },
+    'gle010': {
+        'name': 'Glenwood Telecommunications'
+    },
+    'gra060': {
+        'name': 'GLW Broadband Inc.'
+    },
+    'goldenwest': {
+        'name': 'Golden West Cablevision'
+    },
+    'vis030': {
+        'name': 'Grantsburg Telcom'
+    },
+    'gpcom': {
+        'name': 'Great Plains Communications'
+    },
+    'gri010': {
+        'name': 'Gridley Cable Inc'
+    },
+    'hbc010': {
+        'name': 'H&B Cable Services'
+    },
+    'hae010': {
+        'name': 'Haefele TV Inc.'
+    },
+    'htc010': {
+        'name': 'Halstad Telephone Company'
+    },
+    'har005': {
+        'name': 'Harlan Municipal Utilities'
+    },
+    'har020': {
+        'name': 'Hart Communications'
+    },
+    'ced010': {
+        'name': 'Hartelco TV'
+    },
+    'hea040': {
+        'name': 'Heart of Iowa Communications Cooperative'
+    },
+    'htc020': {
+        'name': 'Hickory Telephone Company'
+    },
+    'nttchig010': {
+        'name': 'Highland Communication Services'
+    },
+    'hig030': {
+        'name': 'Highland Media'
+    },
+    'spc010': {
+        'name': 'Hilliary Communications'
+    },
+    'hin020': {
+        'name': 'Hinton CATV Co.'
+    },
+    'hometel': {
+        'name': 'HomeTel Entertainment, Inc.'
+    },
+    'hoodcanal': {
+        'name': 'Hood Canal Communications'
+    },
+    'weh010-hope': {
+        'name': 'Hope - Prescott Cable TV'
+    },
+    'horizoncable': {
+        'name': 'Horizon Cable TV, Inc.'
+    },
+    'hor040': {
+        'name': 'Horizon Chillicothe Telephone'
+    },
+    'htc030': {
+        'name': 'HTC Communications Co. - IL'
+    },
+    'htccomm': {
+        'name': 'HTC Communications, Inc. - IA'
+    },
+    'wal005': {
+        'name': 'Huxley Communications'
+    },
+    'imon': {
+        'name': 'ImOn Communications'
+    },
+    'ind040': {
+        'name': 'Independence Telecommunications'
+    },
+    'rrc010': {
+        'name': 'Inland Networks'
+    },
+    'stc020': {
+        'name': 'Innovative Cable TV St Croix'
+    },
+    'car100': {
+        'name': 'Innovative Cable TV St Thomas-St John'
+    },
+    'icc010': {
+        'name': 'Inside Connect Cable'
+    },
+    'int100': {
+        'name': 'Integra Telecom'
+    },
+    'int050': {
+        'name': 'Interstate Telecommunications Coop'
+    },
+    'irv010': {
+        'name': 'Irvine Cable'
+    },
+    'k2c010': {
+        'name': 'K2 Communications'
+    },
+    'kal010': {
+        'name': 'Kalida Telephone Company, Inc.'
+    },
+    'kal030': {
+        'name': 'Kalona Cooperative Telephone Company'
+    },
+    'kmt010': {
+        'name': 'KMTelecom'
+    },
+    'kpu010': {
+        'name': 'KPU Telecommunications'
+    },
+    'kuh010': {
+        'name': 'Kuhn Communications, Inc.'
+    },
+    'lak130': {
+        'name': 'Lakeland Communications'
+    },
+    'lan010': {
+        'name': 'Langco'
+    },
+    'lau020': {
+        'name': 'Laurel Highland Total Communications, Inc.'
+    },
+    'leh010': {
+        'name': 'Lehigh Valley Cooperative Telephone'
+    },
+    'bra010': {
+        'name': 'Limestone Cable/Bracken Cable'
+    },
+    'loc020': {
+        'name': 'LISCO'
+    },
+    'lit020': {
+        'name': 'Litestream'
+    },
+    'tel140': {
+        'name': 'LivCom'
+    },
+    'loc010': {
+        'name': 'LocalTel Communications'
+    },
+    'weh010-longview': {
+        'name': 'Longview - Kilgore Cable TV'
+    },
+    'lon030': {
+        'name': 'Lonsdale Video Ventures, LLC'
+    },
+    'lns010': {
+        'name': 'Lost Nation-Elwood Telephone Co.'
+    },
+    'nttclpc010': {
+        'name': 'LPC Connect'
+    },
+    'lumos': {
+        'name': 'Lumos Networks'
+    },
+    'madison': {
+        'name': 'Madison Communications'
+    },
+    'mad030': {
+        'name': 'Madison County Cable Inc.'
+    },
+    'nttcmah010': {
+        'name': 'Mahaska Communication Group'
+    },
+    'mar010': {
+        'name': 'Marne & Elk Horn Telephone Company'
+    },
+    'mcc040': {
+        'name': 'McClure Telephone Co.'
+    },
+    'mctv': {
+        'name': 'MCTV'
+    },
+    'merrimac': {
+        'name': 'Merrimac Communications Ltd.'
+    },
+    'metronet': {
+        'name': 'Metronet'
+    },
+    'mhtc': {
+        'name': 'MHTC'
+    },
+    'midhudson': {
+        'name': 'Mid-Hudson Cable'
+    },
+    'midrivers': {
+        'name': 'Mid-Rivers Communications'
+    },
+    'mid045': {
+        'name': 'Midstate Communications'
+    },
+    'mil080': {
+        'name': 'Milford Communications'
+    },
+    'min030': {
+        'name': 'MINET'
+    },
+    'nttcmin010': {
+        'name': 'Minford TV'
+    },
+    'san040-02': {
+        'name': 'Mitchell Telecom'
+    },
+    'mlg010': {
+        'name': 'MLGC'
+    },
+    'mon060': {
+        'name': 'Mon-Cre TVE'
+    },
+    'mou110': {
+        'name': 'Mountain Telephone'
+    },
+    'mou050': {
+        'name': 'Mountain Village Cable'
+    },
+    'mtacomm': {
+        'name': 'MTA Communications, LLC'
+    },
+    'mtc010': {
+        'name': 'MTC Cable'
+    },
+    'med040': {
+        'name': 'MTC Technologies'
+    },
+    'man060': {
+        'name': 'MTCC'
+    },
+    'mtc030': {
+        'name': 'MTCO Communications'
+    },
+    'mul050': {
+        'name': 'Mulberry Telecommunications'
+    },
+    'mur010': {
+        'name': 'Murray Electric System'
+    },
+    'musfiber': {
+        'name': 'MUS FiberNET'
+    },
+    'mpw': {
+        'name': 'Muscatine Power & Water'
+    },
+    'nttcsli010': {
+        'name': 'myEVTV.com'
+    },
+    'nor115': {
+        'name': 'NCC'
+    },
+    'nor260': {
+        'name': 'NDTC'
+    },
+    'nctc': {
+        'name': 'Nebraska Central Telecom, Inc.'
+    },
+    'nel020': {
+        'name': 'Nelsonville TV Cable'
+    },
+    'nem010': {
+        'name': 'Nemont'
+    },
+    'new075': {
+        'name': 'New Hope Telephone Cooperative'
+    },
+    'nor240': {
+        'name': 'NICP'
+    },
+    'cic010': {
+        'name': 'NineStar Connect'
+    },
+    'nktelco': {
+        'name': 'NKTelco'
+    },
+    'nortex': {
+        'name': 'Nortex Communications'
+    },
+    'nor140': {
+        'name': 'North Central Telephone Cooperative'
+    },
+    'nor030': {
+        'name': 'Northland Communications'
+    },
+    'nor075': {
+        'name': 'Northwest Communications'
+    },
+    'nor125': {
+        'name': 'Norwood Light Broadband'
+    },
+    'net010': {
+        'name': 'Nsight Telservices'
+    },
+    'dur010': {
+        'name': 'Ntec'
+    },
+    'nts010': {
+        'name': 'NTS Communications'
+    },
+    'new045': {
+        'name': 'NU-Telecom'
+    },
+    'nulink': {
+        'name': 'NuLink'
+    },
+    'jam030': {
+        'name': 'NVC'
+    },
+    'far035': {
+        'name': 'OmniTel Communications'
+    },
+    'onesource': {
+        'name': 'OneSource Communications'
+    },
+    'cit230': {
+        'name': 'Opelika Power Services'
+    },
+    'daltonutilities': {
+        'name': 'OptiLink'
+    },
+    'mid140': {
+        'name': 'OPTURA'
+    },
+    'ote010': {
+        'name': 'OTEC Communication Company'
+    },
+    'cci020': {
+        'name': 'Packerland Broadband'
+    },
+    'pan010': {
+        'name': 'Panora Telco/Guthrie Center Communications'
+    },
+    'otter': {
+        'name': 'Park Region Telephone & Otter Tail Telcom'
+    },
+    'mid050': {
+        'name': 'Partner Communications Cooperative'
+    },
+    'fib010': {
+        'name': 'Pathway'
+    },
+    'paulbunyan': {
+        'name': 'Paul Bunyan Communications'
+    },
+    'pem020': {
+        'name': 'Pembroke Telephone Company'
+    },
+    'mck010': {
+        'name': 'Peoples Rural Telephone Cooperative'
+    },
+    'pul010': {
+        'name': 'PES Energize'
+    },
+    'phi010': {
+        'name': 'Philippi Communications System'
+    },
+    'phonoscope': {
+        'name': 'Phonoscope Cable'
+    },
+    'pin070': {
+        'name': 'Pine Belt Communications, Inc.'
+    },
+    'weh010-pine': {
+        'name': 'Pine Bluff Cable TV'
+    },
+    'pin060': {
+        'name': 'Pineland Telephone Cooperative'
+    },
+    'cam010': {
+        'name': 'Pinpoint Communications'
+    },
+    'pio060': {
+        'name': 'Pioneer Broadband'
+    },
+    'pioncomm': {
+        'name': 'Pioneer Communications'
+    },
+    'pioneer': {
+        'name': 'Pioneer DTV'
+    },
+    'pla020': {
+        'name': 'Plant TiftNet, Inc.'
+    },
+    'par010': {
+        'name': 'PLWC'
+    },
+    'pro035': {
+        'name': 'PMT'
+    },
+    'vik011': {
+        'name': 'Polar Cablevision'
+    },
+    'pottawatomie': {
+        'name': 'Pottawatomie Telephone Co.'
+    },
+    'premiercomm': {
+        'name': 'Premier Communications'
+    },
+    'psc010': {
+        'name': 'PSC'
+    },
+    'pan020': {
+        'name': 'PTCI'
+    },
+    'qco010': {
+        'name': 'QCOL'
+    },
+    'qua010': {
+        'name': 'Quality Cablevision'
+    },
+    'rad010': {
+        'name': 'Radcliffe Telephone Company'
+    },
+    'car040': {
+        'name': 'Rainbow Communications'
+    },
+    'rai030': {
+        'name': 'Rainier Connect'
+    },
+    'ral010': {
+        'name': 'Ralls Technologies'
+    },
+    'rct010': {
+        'name': 'RC Technologies'
+    },
+    'red040': {
+        'name': 'Red River Communications'
+    },
+    'ree010': {
+        'name': 'Reedsburg Utility Commission'
+    },
+    'mol010': {
+        'name': 'Reliance Connects- Oregon'
+    },
+    'res020': {
+        'name': 'Reserve Telecommunications'
+    },
+    'weh010-resort': {
+        'name': 'Resort TV Cable'
+    },
+    'rld010': {
+        'name': 'Richland Grant Telephone Cooperative, Inc.'
+    },
+    'riv030': {
+        'name': 'River Valley Telecommunications Coop'
+    },
+    'rockportcable': {
+        'name': 'Rock Port Cablevision'
+    },
+    'rsf010': {
+        'name': 'RS Fiber'
+    },
+    'rtc': {
+        'name': 'RTC Communication Corp'
+    },
+    'res040': {
+        'name': 'RTC-Reservation Telephone Coop.'
+    },
+    'rte010': {
+        'name': 'RTEC Communications'
+    },
+    'stc010': {
+        'name': 'S&T'
+    },
+    'san020': {
+        'name': 'San Bruno Cable TV'
+    },
+    'san040-01': {
+        'name': 'Santel'
+    },
+    'sav010': {
+        'name': 'SCI Broadband-Savage Communications Inc.'
+    },
+    'sco050': {
+        'name': 'Scottsboro Electric Power Board'
+    },
+    'scr010': {
+        'name': 'Scranton Telephone Company'
+    },
+    'selco': {
+        'name': 'SELCO'
+    },
+    'she010': {
+        'name': 'Shentel'
+    },
+    'she030': {
+        'name': 'Sherwood Mutual Telephone Association, Inc.'
+    },
+    'ind060-ssc': {
+        'name': 'Silver Star Communications'
+    },
+    'sjoberg': {
+        'name': 'Sjoberg\'s Inc.'
+    },
+    'sou025': {
+        'name': 'SKT'
+    },
+    'sky050': {
+        'name': 'SkyBest TV'
+    },
+    'nttcsmi010': {
+        'name': 'Smithville Communications'
+    },
+    'woo010': {
+        'name': 'Solarus'
+    },
+    'sou075': {
+        'name': 'South Central Rural Telephone Cooperative'
+    },
+    'sou065': {
+        'name': 'South Holt Cablevision, Inc.'
+    },
+    'sou035': {
+        'name': 'South Slope Cooperative Communications'
+    },
+    'spa020': {
+        'name': 'Spanish Fork Community Network'
+    },
+    'spe010': {
+        'name': 'Spencer Municipal Utilities'
+    },
+    'spi005': {
+        'name': 'Spillway Communications, Inc.'
+    },
+    'srt010': {
+        'name': 'SRT'
+    },
+    'cccsmc010': {
+        'name': 'St. Maarten Cable TV'
+    },
+    'sta025': {
+        'name': 'Star Communications'
+    },
+    'sco020': {
+        'name': 'STE'
+    },
+    'uin010': {
+        'name': 'STRATA Networks'
+    },
+    'sum010': {
+        'name': 'Sumner Cable TV'
+    },
+    'pie010': {
+        'name': 'Surry TV/PCSI TV'
+    },
+    'swa010': {
+        'name': 'Swayzee Communications'
+    },
+    'sweetwater': {
+        'name': 'Sweetwater Cable Television Co'
+    },
+    'weh010-talequah': {
+        'name': 'Tahlequah Cable TV'
+    },
+    'tct': {
+        'name': 'TCT'
+    },
+    'tel050': {
+        'name': 'Tele-Media Company'
+    },
+    'com050': {
+        'name': 'The Community Agency'
+    },
+    'thr020': {
+        'name': 'Three River'
+    },
+    'cab140': {
+        'name': 'Town & Country Technologies'
+    },
+    'tra010': {
+        'name': 'Trans-Video'
+    },
+    'tre010': {
+        'name': 'Trenton TV Cable Company'
+    },
+    'tcc': {
+        'name': 'Tri County Communications Cooperative'
+    },
+    'tri025': {
+        'name': 'TriCounty Telecom'
+    },
+    'tri110': {
+        'name': 'TrioTel Communications, Inc.'
+    },
+    'tro010': {
+        'name': 'Troy Cablevision, Inc.'
+    },
+    'tsc': {
+        'name': 'TSC'
+    },
+    'cit220': {
+        'name': 'Tullahoma Utilities Board'
+    },
+    'tvc030': {
+        'name': 'TV Cable of Rensselaer'
+    },
+    'tvc015': {
+        'name': 'TVC Cable'
+    },
+    'cab180': {
+        'name': 'TVision'
+    },
+    'twi040': {
+        'name': 'Twin Lakes'
+    },
+    'tvtinc': {
+        'name': 'Twin Valley'
+    },
+    'uis010': {
+        'name': 'Union Telephone Company'
+    },
+    'uni110': {
+        'name': 'United Communications - TN'
+    },
+    'uni120': {
+        'name': 'United Services'
+    },
+    'uss020': {
+        'name': 'US Sonet'
+    },
+    'cab060': {
+        'name': 'USA Communications'
+    },
+    'she005': {
+        'name': 'USA Communications/Shellsburg, IA'
+    },
+    'val040': {
+        'name': 'Valley TeleCom Group'
+    },
+    'val025': {
+        'name': 'Valley Telecommunications'
+    },
+    'val030': {
+        'name': 'Valparaiso Broadband'
+    },
+    'cla050': {
+        'name': 'Vast Broadband'
+    },
+    'sul015': {
+        'name': 'Venture Communications Cooperative, Inc.'
+    },
+    'ver025': {
+        'name': 'Vernon Communications Co-op'
+    },
+    'weh010-vicksburg': {
+        'name': 'Vicksburg Video'
+    },
+    'vis070': {
+        'name': 'Vision Communications'
+    },
+    'volcanotel': {
+        'name': 'Volcano Vision, Inc.'
+    },
+    'vol040-02': {
+        'name': 'VolFirst / BLTV'
+    },
+    'ver070': {
+        'name': 'VTel'
+    },
+    'nttcvtx010': {
+        'name': 'VTX1'
+    },
+    'bci010-02': {
+        'name': 'Vyve Broadband'
+    },
+    'wab020': {
+        'name': 'Wabash Mutual Telephone'
+    },
+    'waitsfield': {
+        'name': 'Waitsfield Cable'
+    },
+    'wal010': {
+        'name': 'Walnut Communications'
+    },
+    'wavebroadband': {
+        'name': 'Wave'
+    },
+    'wav030': {
+        'name': 'Waverly Communications Utility'
+    },
+    'wbi010': {
+        'name': 'WBI'
+    },
+    'web020': {
+        'name': 'Webster-Calhoun Cooperative Telephone Association'
+    },
+    'wes005': {
+        'name': 'West Alabama TV Cable'
+    },
+    'carolinata': {
+        'name': 'West Carolina Communications'
+    },
+    'wct010': {
+        'name': 'West Central Telephone Association'
+    },
+    'wes110': {
+        'name': 'West River Cooperative Telephone Company'
+    },
+    'ani030': {
+        'name': 'WesTel Systems'
+    },
+    'westianet': {
+        'name': 'Western Iowa Networks'
+    },
+    'nttcwhi010': {
+        'name': 'Whidbey Telecom'
+    },
+    'weh010-white': {
+        'name': 'White County Cable TV'
+    },
+    'wes130': {
+        'name': 'Wiatel'
+    },
+    'wik010': {
+        'name': 'Wiktel'
+    },
+    'wil070': {
+        'name': 'Wilkes Communications, Inc./RiverStreet Networks'
+    },
+    'wil015': {
+        'name': 'Wilson Communications'
+    },
+    'win010': {
+        'name': 'Windomnet/SMBS'
+    },
+    'win090': {
+        'name': 'Windstream Cable TV'
+    },
+    'wcta': {
+        'name': 'Winnebago Cooperative Telecom Association'
+    },
+    'wtc010': {
+        'name': 'WTC'
+    },
+    'wil040': {
+        'name': 'WTC Communications, Inc.'
+    },
+    'wya010': {
+        'name': 'Wyandotte Cable'
+    },
+    'hin020-02': {
+        'name': 'X-Stream Services'
+    },
+    'xit010': {
+        'name': 'XIT Communications'
+    },
+    'yel010': {
+        'name': 'Yelcot Communications'
+    },
+    'mid180-01': {
+        'name': 'yondoo'
+    },
+    'cou060': {
+        'name': 'Zito Media'
+    },
+}
+
+
+class AdobePassIE(InfoExtractor):
+    _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
+    _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
+    _MVPD_CACHE = 'ap-mvpd'
+
+    _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page'
+
+    def _download_webpage_handle(self, *args, **kwargs):
+        headers = self.geo_verification_headers()
+        headers.update(kwargs.get('headers', {}))
+        kwargs['headers'] = headers
+        return super(AdobePassIE, self)._download_webpage_handle(
+            *args, **compat_kwargs(kwargs))
+
+    @staticmethod
+    def _get_mvpd_resource(provider_id, title, guid, rating):
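+        # Builds the MRSS <rss> resource document Adobe Pass expects, e.g.
+        # (values illustrative):
+        #   <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
+        #     <channel><title>cnn</title><item><title>Some Episode</title>
+        #     <guid>abc123</guid>
+        #     <media:rating scheme="urn:v-chip">TV-14</media:rating>
+        #   </item></channel></rss>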
+        channel = etree.Element('channel')
+        channel_title = etree.SubElement(channel, 'title')
+        channel_title.text = provider_id
+        item = etree.SubElement(channel, 'item')
+        resource_title = etree.SubElement(item, 'title')
+        resource_title.text = title
+        resource_guid = etree.SubElement(item, 'guid')
+        resource_guid.text = guid
+        resource_rating = etree.SubElement(item, 'media:rating')
+        resource_rating.attrib = {'scheme': 'urn:v-chip'}
+        resource_rating.text = rating
+        return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>'
+
+    def _extract_mvpd_auth(self, url, video_id, requestor_id, resource):
+        def xml_text(xml_str, tag):
+            return self._search_regex(
+                '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag)
+
+        def is_expired(token, date_ele):
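+            # Expiry values carry a '_GMT' or ' GMT' suffix that
+            # unified_timestamp cannot parse; strip it before comparing
+            # against the current time.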
+            token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele)))
+            return token_expires and token_expires <= int(time.time())
+
+        def post_form(form_page_res, note, data={}):
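+            # Re-submit the form found in form_page_res, carrying over all
+            # hidden inputs and overlaying the supplied data (e.g. credentials).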
+            form_page, urlh = form_page_res
+            post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
+            if not re.match(r'https?://', post_url):
+                post_url = compat_urlparse.urljoin(urlh.geturl(), post_url)
+            form_data = self._hidden_inputs(form_page)
+            form_data.update(data)
+            return self._download_webpage_handle(
+                post_url, video_id, note, data=urlencode_postdata(form_data), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                })
+
+        def raise_mvpd_required():
+            raise ExtractorError(
+                'This video is only available for users of participating TV providers. '
+                'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier '
+                'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True)
+
+        def extract_redirect_url(html, url=None, fatal=False):
+            # TODO: eliminate code duplication with generic extractor and move
+            # redirection code into _download_webpage_handle
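+            # Matches e.g. <meta http-equiv="refresh" content="0;URL='https://...'">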
+            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
+            redirect_url = self._search_regex(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
+                html, 'meta refresh redirect',
+                default=NO_DEFAULT if fatal else None, fatal=fatal)
+            if not redirect_url:
+                return None
+            if url:
+                redirect_url = compat_urlparse.urljoin(url, unescapeHTML(redirect_url))
+            return redirect_url
+
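+        # Base headers for the Adobe Pass service endpoints; ap_19/ap_23 are
+        # filled in below once an authentication token is available.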
+        mvpd_headers = {
+            'ap_42': 'anonymous',
+            'ap_11': 'Linux i686',
+            'ap_z': self._USER_AGENT,
+            'User-Agent': self._USER_AGENT,
+        }
+
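+        # 'resource' may be either a bare GUID or a full MRSS document (as
+        # built by _get_mvpd_resource); extract the <guid> in the latter case.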
+        guid = xml_text(resource, 'guid') if '<' in resource else resource
+        count = 0
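+        # Make at most two attempts: a '<pendingLogout' response from any
+        # service call clears the cached tokens and triggers one retry.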
+        while count < 2:
+            requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {}
+            authn_token = requestor_info.get('authn_token')
+            if authn_token and is_expired(authn_token, 'simpleTokenExpires'):
+                authn_token = None
+            if not authn_token:
+                # TODO add support for other TV Providers
+                mso_id = self._downloader.params.get('ap_mso')
+                if not mso_id:
+                    raise_mvpd_required()
+                username, password = self._get_login_info('ap_username', 'ap_password', mso_id)
+                if not username or not password:
+                    raise_mvpd_required()
+                mso_info = MSO_INFO[mso_id]
+
+                provider_redirect_page_res = self._download_webpage_handle(
+                    self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
+                    'Downloading Provider Redirect Page', query={
+                        'noflash': 'true',
+                        'mso_id': mso_id,
+                        'requestor_id': requestor_id,
+                        'no_iframe': 'false',
+                        'domain_name': 'adobe.com',
+                        'redirect_url': url,
+                    })
+
+                if mso_id == 'Comcast_SSO':
+                    # Comcast page flow varies by video site and whether you
+                    # are on Comcast's network.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    if 'automatically signing you in' in provider_redirect_page:
+                        oauth_redirect_url = self._html_search_regex(
+                            r'window\.location\s*=\s*[\'"]([^\'"]+)',
+                            provider_redirect_page, 'oauth redirect')
+                        self._download_webpage(
+                            oauth_redirect_url, video_id, 'Confirming auto login')
+                    else:
+                        if '<form name="signin"' in provider_redirect_page:
+                            provider_login_page_res = provider_redirect_page_res
+                        elif 'http-equiv="refresh"' in provider_redirect_page:
+                            oauth_redirect_url = extract_redirect_url(
+                                provider_redirect_page, fatal=True)
+                            provider_login_page_res = self._download_webpage_handle(
+                                oauth_redirect_url, video_id,
+                                self._DOWNLOADING_LOGIN_PAGE)
+                        else:
+                            provider_login_page_res = post_form(
+                                provider_redirect_page_res,
+                                self._DOWNLOADING_LOGIN_PAGE)
+
+                        mvpd_confirm_page_res = post_form(
+                            provider_login_page_res, 'Logging in', {
+                                mso_info['username_field']: username,
+                                mso_info['password_field']: password,
+                            })
+                        mvpd_confirm_page, urlh = mvpd_confirm_page_res
+                        if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page:
+                            post_form(mvpd_confirm_page_res, 'Confirming Login')
+                elif mso_id == 'Verizon':
+                    # In general, if you're connecting from a Verizon-assigned IP,
+                    # you will not actually pass your credentials.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    if 'Please wait ...' in provider_redirect_page:
+                        saml_redirect_url = self._html_search_regex(
+                            r'self\.parent\.location=(["\'])(?P<url>.+?)\1',
+                            provider_redirect_page,
+                            'SAML Redirect URL', group='url')
+                        saml_login_page = self._download_webpage(
+                            saml_redirect_url, video_id,
+                            'Downloading SAML Login Page')
+                    else:
+                        saml_login_page_res = post_form(
+                            provider_redirect_page_res, 'Logging in', {
+                                mso_info['username_field']: username,
+                                mso_info['password_field']: password,
+                            })
+                        saml_login_page, urlh = saml_login_page_res
+                        if 'Please try again.' in saml_login_page:
+                            raise ExtractorError(
+                                'We\'re sorry, but either the User ID or Password entered is not correct.')
+                    saml_login_url = self._search_regex(
+                        r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1',
+                        saml_login_page, 'SAML Login URL', group='url')
+                    saml_response_json = self._download_json(
+                        saml_login_url, video_id, 'Downloading SAML Response',
+                        headers={'Content-Type': 'text/xml'})
+                    self._download_webpage(
+                        saml_response_json['targetValue'], video_id,
+                        'Confirming Login', data=urlencode_postdata({
+                            'SAMLResponse': saml_response_json['SAMLResponse'],
+                            'RelayState': saml_response_json['RelayState']
+                        }), headers={
+                            'Content-Type': 'application/x-www-form-urlencoded'
+                        })
+                else:
+                    # Some providers (e.g. DIRECTV NOW) have another meta refresh
+                    # based redirect that should be followed.
+                    provider_redirect_page, urlh = provider_redirect_page_res
+                    provider_refresh_redirect_url = extract_redirect_url(
+                        provider_redirect_page, url=urlh.geturl())
+                    if provider_refresh_redirect_url:
+                        provider_redirect_page_res = self._download_webpage_handle(
+                            provider_refresh_redirect_url, video_id,
+                            'Downloading Provider Redirect Page (meta refresh)')
+                    provider_login_page_res = post_form(
+                        provider_redirect_page_res, self._DOWNLOADING_LOGIN_PAGE)
+                    mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
+                        mso_info.get('username_field', 'username'): username,
+                        mso_info.get('password_field', 'password'): password,
+                    })
+                    if mso_id != 'Rogers':
+                        post_form(mvpd_confirm_page_res, 'Confirming Login')
+
+                session = self._download_webpage(
+                    self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
+                    'Retrieving Session', data=urlencode_postdata({
+                        '_method': 'GET',
+                        'requestor_id': requestor_id,
+                    }), headers=mvpd_headers)
+                if '<pendingLogout' in session:
+                    self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
+                    count += 1
+                    continue
+                authn_token = unescapeHTML(xml_text(session, 'authnToken'))
+                requestor_info['authn_token'] = authn_token
+                self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
+
+            authz_token = requestor_info.get(guid)
+            if authz_token and is_expired(authz_token, 'simpleTokenTTL'):
+                authz_token = None
+            if not authz_token:
+                authorize = self._download_webpage(
+                    self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id,
+                    'Retrieving Authorization Token', data=urlencode_postdata({
+                        'resource_id': resource,
+                        'requestor_id': requestor_id,
+                        'authentication_token': authn_token,
+                        'mso_id': xml_text(authn_token, 'simpleTokenMsoID'),
+                        'userMeta': '1',
+                    }), headers=mvpd_headers)
+                if '<pendingLogout' in authorize:
+                    self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
+                    count += 1
+                    continue
+                if '<error' in authorize:
+                    raise ExtractorError(xml_text(authorize, 'details'), expected=True)
+                authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
+                requestor_info[guid] = authz_token
+                self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info)
+
+            mvpd_headers.update({
+                'ap_19': xml_text(authn_token, 'simpleSamlNameID'),
+                'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'),
+            })
+
+            short_authorize = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize',
+                video_id, 'Retrieving Media Token', data=urlencode_postdata({
+                    'authz_token': authz_token,
+                    'requestor_id': requestor_id,
+                    'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'),
+                    'hashed_guid': 'false',
+                }), headers=mvpd_headers)
+            if '<pendingLogout' in short_authorize:
+                self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})
+                count += 1
+                continue
+            return short_authorize
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
new file mode 100644 (file)
index 0000000..80060f0
--- /dev/null
@@ -0,0 +1,288 @@
+from __future__ import unicode_literals
+
+import functools
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    ISO639Utils,
+    OnDemandPagedList,
+    parse_duration,
+    str_or_none,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class AdobeTVBaseIE(InfoExtractor):
+    def _call_api(self, path, video_id, query, note=None):
+        return self._download_json(
+            'http://tv.adobe.com/api/v4/' + path,
+            video_id, note, query=query)['data']
+
+    def _parse_subtitles(self, video_data, url_key):
+        subtitles = {}
+        for translation in video_data.get('translations', []):
+            vtt_path = translation.get(url_key)
+            if not vtt_path:
+                continue
+            lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+            subtitles.setdefault(lang, []).append({
+                'ext': 'vtt',
+                'url': vtt_path,
+            })
+        return subtitles
+
+    def _parse_video_data(self, video_data):
+        video_id = compat_str(video_data['id'])
+        title = video_data['title']
+
+        s3_extracted = False
+        formats = []
+        for source in video_data.get('videos', []):
+            source_url = source.get('url')
+            if not source_url:
+                continue
+            f = {
+                'format_id': source.get('quality_level'),
+                'fps': int_or_none(source.get('frame_rate')),
+                'height': int_or_none(source.get('height')),
+                'tbr': int_or_none(source.get('video_data_rate')),
+                'width': int_or_none(source.get('width')),
+                'url': source_url,
+            }
+            original_filename = source.get('original_filename')
+            if original_filename:
+                if not (f.get('height') and f.get('width')):
+                    mobj = re.search(r'_(\d+)x(\d+)', original_filename)
+                    if mobj:
+                        f.update({
+                            'height': int(mobj.group(2)),
+                            'width': int(mobj.group(1)),
+                        })
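+                # Expose the s3:// master file once, rewritten to its public
+                # HTTPS S3 URL, as a preferred 'original' format.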
+                if original_filename.startswith('s3://') and not s3_extracted:
+                    formats.append({
+                        'format_id': 'original',
+                        'preference': 1,
+                        'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'),
+                    })
+                    s3_extracted = True
+            formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('thumbnail'),
+            'upload_date': unified_strdate(video_data.get('start_date')),
+            'duration': parse_duration(video_data.get('duration')),
+            'view_count': str_to_int(video_data.get('playcount')),
+            'formats': formats,
+            'subtitles': self._parse_subtitles(video_data, 'vtt'),
+        }
+
+
+class AdobeTVEmbedIE(AdobeTVBaseIE):
+    IE_NAME = 'adobetv:embed'
+    _VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tv.adobe.com/embed/22/4153',
+        'md5': 'c8c0461bf04d54574fc2b4d07ac6783a',
+        'info_dict': {
+            'id': '4153',
+            'ext': 'flv',
+            'title': 'Creating Graphics Optimized for BlackBerry',
+            'description': 'md5:eac6e8dced38bdaae51cd94447927459',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'upload_date': '20091109',
+            'duration': 377,
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_data = self._call_api(
+            'episode/' + video_id, video_id, {'disclosure': 'standard'})[0]
+        return self._parse_video_data(video_data)
+
+
+class AdobeTVIE(AdobeTVBaseIE):
+    IE_NAME = 'adobetv'
+    _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/',
+        'md5': '9bc5727bcdd55251f35ad311ca74fa1e',
+        'info_dict': {
+            'id': '10981',
+            'ext': 'mp4',
+            'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop',
+            'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'upload_date': '20110914',
+            'duration': 60,
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        language, show_urlname, urlname = re.match(self._VALID_URL, url).groups()
+        if not language:
+            language = 'en'
+
+        video_data = self._call_api(
+            'episode/get', urlname, {
+                'disclosure': 'standard',
+                'language': language,
+                'show_urlname': show_urlname,
+                'urlname': urlname,
+            })[0]
+        return self._parse_video_data(video_data)
+
+
+class AdobeTVPlaylistBaseIE(AdobeTVBaseIE):
+    _PAGE_SIZE = 25
+
+    def _fetch_page(self, display_id, query, page):
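+        # OnDemandPagedList requests 0-based page indices; the API is 1-based.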
+        page += 1
+        query['page'] = page
+        for element_data in self._call_api(
+                self._RESOURCE, display_id, query, 'Downloading Page %d' % page):
+            yield self._process_data(element_data)
+
+    def _extract_playlist_entries(self, display_id, query):
+        return OnDemandPagedList(functools.partial(
+            self._fetch_page, display_id, query), self._PAGE_SIZE)
+
+
+class AdobeTVShowIE(AdobeTVPlaylistBaseIE):
+    IE_NAME = 'adobetv:show'
+    _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost',
+        'info_dict': {
+            'id': '36',
+            'title': 'The Complete Picture with Julieanne Kost',
+            'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27',
+        },
+        'playlist_mincount': 136,
+    }
+    _RESOURCE = 'episode'
+    _process_data = AdobeTVBaseIE._parse_video_data
+
+    def _real_extract(self, url):
+        language, show_urlname = re.match(self._VALID_URL, url).groups()
+        if not language:
+            language = 'en'
+        query = {
+            'disclosure': 'standard',
+            'language': language,
+            'show_urlname': show_urlname,
+        }
+
+        show_data = self._call_api(
+            'show/get', show_urlname, query)[0]
+
+        return self.playlist_result(
+            self._extract_playlist_entries(show_urlname, query),
+            str_or_none(show_data.get('id')),
+            show_data.get('show_name'),
+            show_data.get('show_description'))
+
+
+class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
+    IE_NAME = 'adobetv:channel'
+    _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?'
+
+    _TEST = {
+        'url': 'http://tv.adobe.com/channel/development',
+        'info_dict': {
+            'id': 'development',
+        },
+        'playlist_mincount': 96,
+    }
+    _RESOURCE = 'show'
+
+    def _process_data(self, show_data):
+        return self.url_result(
+            show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id')))
+
+    def _real_extract(self, url):
+        language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups()
+        if not language:
+            language = 'en'
+        query = {
+            'channel_urlname': channel_urlname,
+            'language': language,
+        }
+        if category_urlname:
+            query['category_urlname'] = category_urlname
+
+        return self.playlist_result(
+            self._extract_playlist_entries(channel_urlname, query),
+            channel_urlname)
+
+
+class AdobeTVVideoIE(AdobeTVBaseIE):
+    IE_NAME = 'adobetv:video'
+    _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
+
+    _TEST = {
+        # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners
+        'url': 'https://video.tv.adobe.com/v/2456/',
+        'md5': '43662b577c018ad707a63766462b1e87',
+        'info_dict': {
+            'id': '2456',
+            'ext': 'mp4',
+            'title': 'New experience with Acrobat DC',
+            'description': 'New experience with Acrobat DC',
+            'duration': 248.667,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
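+        # The player configuration is embedded in the page as a JavaScript
+        # 'bridge' object; extract it and parse it as JSON.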
+        video_data = self._parse_json(self._search_regex(
+            r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
+        title = video_data['title']
+
+        formats = []
+        sources = video_data.get('sources') or []
+        for source in sources:
+            source_src = source.get('src')
+            if not source_src:
+                continue
+            formats.append({
+                'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
+                'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])),
+                'height': int_or_none(source.get('height') or None),
+                'tbr': int_or_none(source.get('bitrate') or None),
+                'width': int_or_none(source.get('width') or None),
+                'url': source_src,
+            })
+        self._sort_formats(formats)
+
+        # For both metadata and downloaded files, the duration varies among
+        # formats, so pick the maximum.
+        duration = max(filter(None, [
+            float_or_none(source.get('duration'), scale=1000)
+            for source in sources]))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('video', {}).get('poster'),
+            'duration': duration,
+            'subtitles': self._parse_subtitles(video_data, 'vttPath'),
+        }
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
new file mode 100644 (file)
index 0000000..8d1d9ac
--- /dev/null
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .turner import TurnerBaseIE
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    mimetype2ext,
+    parse_age_limit,
+    parse_iso8601,
+    strip_or_none,
+    try_get,
+)
+
+
+class AdultSwimIE(TurnerBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
+
+    _TESTS = [{
+        'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
+        'info_dict': {
+            'id': 'rQxZvXQ4ROaSOqq-or2Mow',
+            'ext': 'mp4',
+            'title': 'Rick and Morty - Pilot',
+            'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
+            'timestamp': 1543294800,
+            'upload_date': '20181127',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
+        'info_dict': {
+            'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
+            'ext': 'mp4',
+            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
+            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
+            'upload_date': '20080124',
+            'timestamp': 1201150800,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': '404 Not Found',
+    }, {
+        'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
+        'info_dict': {
+            'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
+            'ext': 'mp4',
+            'title': 'Decker - Inside Decker: A New Hero',
+            'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
+            'timestamp': 1469480460,
+            'upload_date': '20160725',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        'url': 'http://www.adultswim.com/videos/attack-on-titan',
+        'info_dict': {
+            'id': 'attack-on-titan',
+            'title': 'Attack on Titan',
+            'description': 'md5:41caa9416906d90711e31dc00cb7db7e',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://www.adultswim.com/videos/streams/williams-stream',
+        'info_dict': {
+            'id': 'd8DEBj7QRfetLsRgFnGEyg',
+            'ext': 'mp4',
+            'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'description': 'original programming',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': '404 Not Found',
+    }]
+
+    def _real_extract(self, url):
+        show_path, episode_path = re.match(self._VALID_URL, url).groups()
+        display_id = episode_path or show_path
+        query = '''query {
+  getShowBySlug(slug:"%s") {
+    %%s
+  }
+}''' % show_path
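+        # The remaining %s placeholder is filled in below with either the
+        # per-episode fields or a listing of the show's episodes.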
+        if episode_path:
+            query = query % '''title
+    getVideoBySlug(slug:"%s") {
+      _id
+      auth
+      description
+      duration
+      episodeNumber
+      launchDate
+      mediaID
+      seasonNumber
+      poster
+      title
+      tvRating
+    }''' % episode_path
+        else:
+            query = query % '''metaDescription
+    title
+    videos(first:1000,sort:["episode_number"]) {
+      edges {
+        node {
+           _id
+           slug
+        }
+      }
+    }'''
+        show_data = self._download_json(
+            'https://www.adultswim.com/api/search', display_id,
+            data=json.dumps({'query': query}).encode(),
+            headers={'Content-Type': 'application/json'})['data']['getShowBySlug']
+        if episode_path:
+            video_data = show_data['getVideoBySlug']
+            video_id = video_data['_id']
+            episode_title = title = video_data['title']
+            series = show_data.get('title')
+            if series:
+                title = '%s - %s' % (series, title)
+            info = {
+                'id': video_id,
+                'title': title,
+                'description': strip_or_none(video_data.get('description')),
+                'duration': float_or_none(video_data.get('duration')),
+                'formats': [],
+                'subtitles': {},
+                'age_limit': parse_age_limit(video_data.get('tvRating')),
+                'thumbnail': video_data.get('poster'),
+                'timestamp': parse_iso8601(video_data.get('launchDate')),
+                'series': series,
+                'season_number': int_or_none(video_data.get('seasonNumber')),
+                'episode': episode_title,
+                'episode_number': int_or_none(video_data.get('episodeNumber')),
+            }
+
+            auth = video_data.get('auth')
+            media_id = video_data.get('mediaID')
+            if media_id:
+                info.update(self._extract_ngtv_info(media_id, {
+                    # CDN_TOKEN_APP_ID from:
+                    # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js
+                    'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE',
+                }, {
+                    'url': url,
+                    'site_name': 'AdultSwim',
+                    'auth_required': auth,
+                }))
+
+            if not auth:
+                extract_data = self._download_json(
+                    'https://www.adultswim.com/api/shows/v1/videos/' + video_id,
+                    video_id, query={'fields': 'stream'}, fatal=False) or {}
+                assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or []
+                for asset in assets:
+                    asset_url = asset.get('url')
+                    if not asset_url:
+                        continue
+                    ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type')))
+                    if ext == 'm3u8':
+                        info['formats'].extend(self._extract_m3u8_formats(
+                            asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    elif ext == 'f4m':
+                        continue
+                        # info['formats'].extend(self._extract_f4m_formats(
+                        #     asset_url, video_id, f4m_id='hds', fatal=False))
+                    elif ext in ('scc', 'ttml', 'vtt'):
+                        info['subtitles'].setdefault('en', []).append({
+                            'url': asset_url,
+                        })
+            self._sort_formats(info['formats'])
+
+            return info
+        else:
+            entries = []
+            for edge in show_data.get('videos', {}).get('edges', []):
+                video = edge.get('node') or {}
+                slug = video.get('slug')
+                if not slug:
+                    continue
+                entries.append(self.url_result(
+                    'http://adultswim.com/videos/%s/%s' % (show_path, slug),
+                    'AdultSwim', video.get('_id')))
+            return self.playlist_result(
+                entries, show_path, show_data.get('title'),
+                strip_or_none(show_data.get('metaDescription')))
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
new file mode 100644 (file)
index 0000000..611b948
--- /dev/null
@@ -0,0 +1,247 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .theplatform import ThePlatformIE
+from ..utils import (
+    extract_attributes,
+    ExtractorError,
+    int_or_none,
+    smuggle_url,
+    update_url_query,
+)
+from ..compat import (
+    compat_urlparse,
+)
+
+
+class AENetworksBaseIE(ThePlatformIE):
+    _THEPLATFORM_KEY = 'crazyjava'
+    _THEPLATFORM_SECRET = 's3cr3t'
+
+    def _extract_aen_smil(self, smil_url, video_id, auth=None):
+        query = {'mbr': 'true'}
+        if auth:
+            query['auth'] = auth
+        TP_SMIL_QUERY = [{
+            'assetTypes': 'high_video_ak',
+            'switch': 'hls_high_ak'
+        }, {
+            'assetTypes': 'high_video_s3'
+        }, {
+            'assetTypes': 'high_video_s3',
+            'switch': 'hls_ingest_fastly'
+        }]
+        formats = []
+        subtitles = {}
+        last_e = None
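+        # Try each asset-type query in turn and merge the results; the last
+        # error is only raised if no query yielded any formats.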
+        for q in TP_SMIL_QUERY:
+            q.update(query)
+            m_url = update_url_query(smil_url, q)
+            m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
+            try:
+                tp_formats, tp_subtitles = self._extract_theplatform_smil(
+                    m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes']))
+            except ExtractorError as e:
+                last_e = e
+                continue
+            formats.extend(tp_formats)
+            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+        if last_e and not formats:
+            raise last_e
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class AENetworksIE(AENetworksBaseIE):
+    IE_NAME = 'aenetworks'
+    IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?P<domain>
+                            (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
+                            fyi\.tv
+                        )/
+                        (?:
+                            shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|
+                            movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?|
+                            specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)|
+                            collections/[^/]+/(?P<collection_display_id>[^/]+)
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
+        'info_dict': {
+            'id': '22253814',
+            'ext': 'mp4',
+            'title': 'Winter is Coming',
+            'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
+            'timestamp': 1338306241,
+            'upload_date': '20120529',
+            'uploader': 'AENE-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'http://www.history.com/shows/ancient-aliens/season-1',
+        'info_dict': {
+            'id': '71889446852',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
+        'info_dict': {
+            'id': 'SERIES4317',
+            'title': 'Atlanta Plastic',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
+        'only_matching': True
+    }, {
+        'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
+        'only_matching': True
+    }, {
+        'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
+        'only_matching': True
+    }, {
+        'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
+        'only_matching': True
+    }, {
+        'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us',
+        'only_matching': True
+    }, {
+        'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
+        'only_matching': True
+    }, {
+        'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward',
+        'only_matching': True
+    }, {
+        'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story',
+        'only_matching': True
+    }]
+    _DOMAIN_TO_REQUESTOR_ID = {
+        'history.com': 'HISTORY',
+        'aetv.com': 'AETV',
+        'mylifetime.com': 'LIFETIME',
+        'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB',
+        'fyi.tv': 'FYI',
+    }
+
+    def _real_extract(self, url):
+        domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups()
+        display_id = show_path or movie_display_id or special_display_id or collection_display_id
+        webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers())
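+        # /shows/<show> yields a playlist of seasons (or falls through to a
+        # single season), /shows/<show>/season-N a playlist of episodes;
+        # deeper paths are handled as a single video below.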
+        if show_path:
+            url_parts = show_path.split('/')
+            url_parts_len = len(url_parts)
+            if url_parts_len == 1:
+                entries = []
+                for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
+                    entries.append(self.url_result(
+                        compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
+                if entries:
+                    return self.playlist_result(
+                        entries, self._html_search_meta('aetn:SeriesId', webpage),
+                        self._html_search_meta('aetn:SeriesTitle', webpage))
+                else:
+                    # single season
+                    url_parts_len = 2
+            if url_parts_len == 2:
+                entries = []
+                for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
+                    episode_attributes = extract_attributes(episode_item)
+                    episode_url = compat_urlparse.urljoin(
+                        url, episode_attributes['data-canonical'])
+                    entries.append(self.url_result(
+                        episode_url, 'AENetworks',
+                        episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
+                return self.playlist_result(
+                    entries, self._html_search_meta('aetn:SeasonId', webpage))
+
+        video_id = self._html_search_meta('aetn:VideoID', webpage)
+        media_url = self._search_regex(
+            [r"media_url\s*=\s*'(?P<url>[^']+)'",
+             r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)',
+             r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
+            webpage, 'video url', group='url')
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        auth = None
+        if theplatform_metadata.get('AETN$isBehindWall'):
+            requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
+            resource = self._get_mvpd_resource(
+                requestor_id, theplatform_metadata['title'],
+                theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
+                theplatform_metadata['ratings'][0]['rating'])
+            auth = self._extract_mvpd_auth(
+                url, video_id, requestor_id, resource)
+        info.update(self._search_json_ld(webpage, video_id, fatal=False))
+        info.update(self._extract_aen_smil(media_url, video_id, auth))
+        return info
+
+
+class HistoryTopicIE(AENetworksBaseIE):
+    IE_NAME = 'history:topic'
+    IE_DESC = 'History.com Topic'
+    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P<id>[\w+-]+?)-video'
+    _TESTS = [{
+        'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video',
+        'info_dict': {
+            'id': '40700995724',
+            'ext': 'mp4',
+            'title': "History of Valentine’s Day",
+            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+            'timestamp': 1375819729,
+            'upload_date': '20130806',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }]
+
+    def theplatform_url_result(self, theplatform_url, video_id, query):
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': smuggle_url(
+                update_url_query(theplatform_url, query),
+                {
+                    'sig': {
+                        'key': self._THEPLATFORM_KEY,
+                        'secret': self._THEPLATFORM_SECRET,
+                    },
+                    'force_smil_url': True
+                }),
+            'ie_key': 'ThePlatform',
+        }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
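+        # The topic page embeds a <phoenix-iframe> whose src carries the
+        # ThePlatform id (tpid) used to query the A+E feeds API.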
+        video_id = self._search_regex(
+            r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid')
+        result = self._download_json(
+            'https://feeds.video.aetnd.com/api/v2/history/videos',
+            video_id, query={'filter[id]': video_id})['results'][0]
+        title = result['title']
+        info = self._extract_aen_smil(result['publicUrl'], video_id)
+        info.update({
+            'title': title,
+            'description': result.get('description'),
+            'duration': int_or_none(result.get('duration')),
+            'timestamp': int_or_none(result.get('added'), 1000),
+        })
+        return info
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
new file mode 100644 (file)
index 0000000..6275e52
--- /dev/null
@@ -0,0 +1,367 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_xpath
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    url_or_none,
+    urlencode_postdata,
+    xpath_text,
+)
+
+
+class AfreecaTVIE(InfoExtractor):
+    IE_NAME = 'afreecatv'
+    IE_DESC = 'afreecatv.com'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
+                            (?:
+                                /app/(?:index|read_ucc_bbs)\.cgi|
+                                /player/[Pp]layer\.(?:swf|html)
+                            )\?.*?\bnTitleNo=|
+                            vod\.afreecatv\.com/PLAYER/STATION/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _NETRC_MACHINE = 'afreecatv'
+    _TESTS = [{
+        'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
+        'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
+        'info_dict': {
+            'id': '36164052',
+            'ext': 'mp4',
+            'title': '데일리 에이프릴 요정들의 시상식!',
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': 'dailyapril',
+            'uploader_id': 'dailyapril',
+            'upload_date': '20160503',
+        },
+        'skip': 'Video is gone',
+    }, {
+        'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
+        'info_dict': {
+            'id': '36153164',
+            'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': 'dailyapril',
+            'uploader_id': 'dailyapril',
+        },
+        'playlist_count': 2,
+        'playlist': [{
+            'md5': 'd8b7c174568da61d774ef0203159bf97',
+            'info_dict': {
+                'id': '36153164_1',
+                'ext': 'mp4',
+                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+                'upload_date': '20160502',
+            },
+        }, {
+            'md5': '58f2ce7f6044e34439ab2d50612ab02b',
+            'info_dict': {
+                'id': '36153164_2',
+                'ext': 'mp4',
+                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+                'upload_date': '20160502',
+            },
+        }],
+        'skip': 'Video is gone',
+    }, {
+        'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
+        'info_dict': {
+            'id': '18650793',
+            'ext': 'mp4',
+            'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': '윈아디',
+            'uploader_id': 'badkids',
+            'duration': 107,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
+        'info_dict': {
+            'id': '10481652',
+            'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': 'dailyapril',
+            'uploader_id': 'dailyapril',
+            'duration': 6492,
+        },
+        'playlist_count': 2,
+        'playlist': [{
+            'md5': 'd8b7c174568da61d774ef0203159bf97',
+            'info_dict': {
+                'id': '20160502_c4c62b9d_174361386_1',
+                'ext': 'mp4',
+                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
+                'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+                'uploader': 'dailyapril',
+                'uploader_id': 'dailyapril',
+                'upload_date': '20160502',
+                'duration': 3601,
+            },
+        }, {
+            'md5': '58f2ce7f6044e34439ab2d50612ab02b',
+            'info_dict': {
+                'id': '20160502_39e739bb_174361386_2',
+                'ext': 'mp4',
+                'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
+                'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+                'uploader': 'dailyapril',
+                'uploader_id': 'dailyapril',
+                'upload_date': '20160502',
+                'duration': 2891,
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # non standard key
+        'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
+        'info_dict': {
+            'id': '20170411_BE689A0E_190960999_1_2_h',
+            'ext': 'mp4',
+            'title': '혼자사는여자집',
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': '♥이슬이',
+            'uploader_id': 'dasl8121',
+            'upload_date': '20170411',
+            'duration': 213,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # PARTIAL_ADULT
+        'url': 'http://vod.afreecatv.com/PLAYER/STATION/32028439',
+        'info_dict': {
+            'id': '20180327_27901457_202289533_1',
+            'ext': 'mp4',
+            'title': '[생]빨개요♥ (part 1)',
+            'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+            'uploader': '[SA]서아',
+            'uploader_id': 'bjdyrksu',
+            'upload_date': '20180327',
+            'duration': 3601,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['adult content'],
+    }, {
+        'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
+        'only_matching': True,
+    }, {
+        'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def parse_video_key(key):
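+        # e.g. '20160502_c4c62b9d_174361386_1'
+        # -> {'upload_date': '20160502', 'part': 1}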
+        video_key = {}
+        m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
+        if m:
+            video_key['upload_date'] = m.group('upload_date')
+            video_key['part'] = int(m.group('part'))
+        return video_key
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'szWork': 'login',
+            'szType': 'json',
+            'szUid': username,
+            'szPassword': password,
+            'isSaveId': 'false',
+            'szScriptVar': 'oLoginRet',
+            'szAction': '',
+        }
+
+        response = self._download_json(
+            'https://login.afreecatv.com/app/LoginAction.php', None,
+            'Logging in', data=urlencode_postdata(login_form))
+
+        _ERRORS = {
+            -4: 'Your account has been suspended due to a violation of our terms and policies.',
+            -5: 'https://member.afreecatv.com/app/user_delete_progress.php',
+            -6: 'https://login.afreecatv.com/membership/changeMember.php',
+            -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.",
+            -9: 'https://member.afreecatv.com/app/pop_login_block.php',
+            -11: 'https://login.afreecatv.com/afreeca/second_login.php',
+            -12: 'https://member.afreecatv.com/app/user_security.php',
+            0: 'The username does not exist or you have entered the wrong password.',
+            -1: 'The username does not exist or you have entered the wrong password.',
+            -3: 'You have entered your username/password incorrectly.',
+            -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.',
+            -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.',
+            -32008: 'You have failed to log in. Please contact our Help Center.',
+        }
+
+        result = int_or_none(response.get('RESULT'))
+        if result != 1:
+            error = _ERRORS.get(result, 'You have failed to log in.')
+            raise ExtractorError(
+                'Unable to log in: %s said: %s' % (self.IE_NAME, error),
+                expected=True)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if re.search(r'alert\(["\']This video has been deleted', webpage):
+            raise ExtractorError(
+                'Video %s has been deleted' % video_id, expected=True)
+
+        station_id = self._search_regex(
+            r'nStationNo\s*=\s*(\d+)', webpage, 'station')
+        bbs_id = self._search_regex(
+            r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs')
+        video_id = self._search_regex(
+            r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id)
+
+        partial_view = False
+        for _ in range(2):
+            query = {
+                'nTitleNo': video_id,
+                'nStationNo': station_id,
+                'nBbsNo': bbs_id,
+            }
+            if partial_view:
+                query['partialView'] = 'SKIP_ADULT'
+            video_xml = self._download_xml(
+                'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php',
+                video_id, 'Downloading video info XML%s'
+                % (' (skipping adult)' if partial_view else ''),
+                headers={
+                    'Referer': url,
+                }, query=query)
+
+            flag = xpath_text(video_xml, './track/flag', 'flag', default=None)
+            if flag == 'SUCCEED':
+                break
+            if flag == 'PARTIAL_ADULT':
+                self._downloader.report_warning(
+                    'In accordance with local laws and regulations, underage users are restricted from watching adult content. '
+                    'Only content suitable for all ages will be downloaded. '
+                    'Provide account credentials if you wish to download restricted content.')
+                partial_view = True
+                continue
+            elif flag == 'ADULT':
+                error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.'
+            else:
+                error = flag
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        else:
+            raise ExtractorError('Unable to download video info')
+
+        video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
+        if video_element is None or video_element.text is None:
+            raise ExtractorError(
+                'Video %s does not exist' % video_id, expected=True)
+
+        video_url = video_element.text.strip()
+
+        title = xpath_text(video_xml, './track/title', 'title', fatal=True)
+
+        uploader = xpath_text(video_xml, './track/nickname', 'uploader')
+        uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
+        duration = int_or_none(xpath_text(
+            video_xml, './track/duration', 'duration'))
+        thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
+
+        common_entry = {
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'thumbnail': thumbnail,
+        }
+
+        info = common_entry.copy()
+        info.update({
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+        })
+
+        if not video_url:
+            entries = []
+            file_elements = video_element.findall(compat_xpath('./file'))
+            one = len(file_elements) == 1
+            for file_num, file_element in enumerate(file_elements, start=1):
+                file_url = url_or_none(file_element.text)
+                if not file_url:
+                    continue
+                key = file_element.get('key', '')
+                upload_date = self._search_regex(
+                    r'^(\d{8})_', key, 'upload date', default=None)
+                file_duration = int_or_none(file_element.get('duration'))
+                format_id = key if key else '%s_%s' % (video_id, file_num)
+                if determine_ext(file_url) == 'm3u8':
+                    formats = self._extract_m3u8_formats(
+                        file_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls',
+                        note='Downloading part %d m3u8 information' % file_num)
+                else:
+                    formats = [{
+                        'url': file_url,
+                        'format_id': 'http',
+                    }]
+                if not formats:
+                    continue
+                self._sort_formats(formats)
+                file_info = common_entry.copy()
+                file_info.update({
+                    'id': format_id,
+                    'title': title if one else '%s (part %d)' % (title, file_num),
+                    'upload_date': upload_date,
+                    'duration': file_duration,
+                    'formats': formats,
+                })
+                entries.append(file_info)
+            entries_info = info.copy()
+            entries_info.update({
+                '_type': 'multi_video',
+                'entries': entries,
+            })
+            return entries_info
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
+
+        if determine_ext(video_url) == 'm3u8':
+            info['formats'] = self._extract_m3u8_formats(
+                video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        else:
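+            # RTMP URL: the part before 'mp4:' is the app URL, the remainder
+            # is the play path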
+            app, playpath = video_url.split('mp4:')
+            info.update({
+                'url': app,
+                'ext': 'flv',
+                'play_path': 'mp4:' + playpath,
+                'rtmp_live': True,  # downloading won't end without this
+            })
+
+        return info
diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py
new file mode 100644 (file)
index 0000000..9e38136
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class AirMozillaIE(InfoExtractor):
+    _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
+    _TEST = {
+        'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
+        'md5': '8d02f53ee39cf006009180e21df1f3ba',
+        'info_dict': {
+            'id': '6x4q2w',
+            'ext': 'mp4',
+            'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
+            'thumbnail': r're:https?://.*/poster\.jpg',
+            'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
+            'timestamp': 1422487800,
+            'upload_date': '20150128',
+            'location': 'SFO Commons',
+            'duration': 3780,
+            'view_count': int,
+            'categories': ['Main', 'Privacy'],
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id')
+
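+        # The vid.ly embed script passes the JWPlayer setup to initCallback();
+        # extract that argument and parse it as JSON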
+        embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
+        jwconfig = self._parse_json(self._search_regex(
+            r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config']
+
+        info_dict = self._parse_jwplayer_data(jwconfig, video_id)
+        view_count = int_or_none(self._html_search_regex(
+            r'Views since archived: ([0-9]+)',
+            webpage, 'view count', fatal=False))
+        timestamp = parse_iso8601(self._html_search_regex(
+            r'<time datetime="(.*?)"', webpage, 'timestamp', fatal=False))
+        duration = parse_duration(self._search_regex(
+            r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)',
+            webpage, 'duration', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'url': self._og_search_url(webpage),
+            'display_id': display_id,
+            'description': self._og_search_description(webpage),
+            'timestamp': timestamp,
+            'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
+            'duration': duration,
+            'view_count': view_count,
+            'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py
new file mode 100644 (file)
index 0000000..6f241e6
--- /dev/null
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    try_get,
+)
+
+
+class AliExpressLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://live.aliexpress.com/live/2800002704436634',
+        'md5': 'e729e25d47c5e557f2630eaf99b740a5',
+        'info_dict': {
+            'id': '2800002704436634',
+            'ext': 'mp4',
+            'title': 'CASIMA7.22',
+            'thumbnail': r're:http://.*\.jpg',
+            'uploader': 'CASIMA Official Store',
+            'timestamp': 1500717600,
+            'upload_date': '20170722',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
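+        # Player metadata is embedded in the page as a JS assignment:
+        # runParams = {...};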
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var',
+                webpage, 'runParams'),
+            video_id)
+
+        title = data['title']
+
+        formats = self._extract_m3u8_formats(
+            data['replyStreamUrl'], video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': data.get('coverUrl'),
+            'uploader': try_get(
+                data, lambda x: x['followBar']['name'], compat_str),
+            'timestamp': float_or_none(data.get('startTimeLong'), scale=1000),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
new file mode 100644 (file)
index 0000000..c68be31
--- /dev/null
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class AlJazeeraIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
+        'info_dict': {
+            'id': '3792260579001',
+            'ext': 'mp4',
+            'title': 'The Slum - Episode 1: Deliverance',
+            'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
+            'uploader_id': '665003303001',
+            'timestamp': 1411116829,
+            'upload_date': '20140919',
+        },
+        'add_ie': ['BrightcoveNew'],
+        'skip': 'Not accessible from Travis CI server',
+    }, {
+        'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+        'only_matching': True,
+    }]
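+    # All extracted videos are delegated to Brightcove; the account id is
+    # fixed in the player URL template below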
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        program_name = self._match_id(url)
+        webpage = self._download_webpage(url, program_name)
+        brightcove_id = self._search_regex(
+            r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
+        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py
new file mode 100644 (file)
index 0000000..cd533ac
--- /dev/null
@@ -0,0 +1,132 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    qualities,
+    remove_end,
+    try_get,
+    unified_timestamp,
+    url_basename,
+)
+
+
+class AllocineIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?allocine\.fr/(?:article|video|film)/(?:fichearticle_gen_carticle=|player_gen_cmedia=|fichefilm_gen_cfilm=|video-)(?P<id>[0-9]+)(?:\.html)?'
+
+    _TESTS = [{
+        'url': 'http://www.allocine.fr/article/fichearticle_gen_carticle=18635087.html',
+        'md5': '0c9fcf59a841f65635fa300ac43d8269',
+        'info_dict': {
+            'id': '19546517',
+            'display_id': '18635087',
+            'ext': 'mp4',
+            'title': 'Astérix - Le Domaine des Dieux Teaser VF',
+            'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
+            'thumbnail': r're:http://.*\.jpg',
+            'duration': 39,
+            'timestamp': 1404273600,
+            'upload_date': '20140702',
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html',
+        'md5': 'd0cdce5d2b9522ce279fdfec07ff16e0',
+        'info_dict': {
+            'id': '19540403',
+            'display_id': '19540403',
+            'ext': 'mp4',
+            'title': 'Planes 2 Bande-annonce VF',
+            'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway',
+            'thumbnail': r're:http://.*\.jpg',
+            'duration': 69,
+            'timestamp': 1385659800,
+            'upload_date': '20131128',
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html',
+        'md5': '101250fb127ef9ca3d73186ff22a47ce',
+        'info_dict': {
+            'id': '19544709',
+            'display_id': '19544709',
+            'ext': 'mp4',
+            'title': 'Dragons 2 - Bande annonce finale VF',
+            'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a',
+            'thumbnail': r're:http://.*\.jpg',
+            'duration': 144,
+            'timestamp': 1397589900,
+            'upload_date': '20140415',
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://www.allocine.fr/video/video-19550147/',
+        'md5': '3566c0668c0235e2d224fd8edb389f67',
+        'info_dict': {
+            'id': '19550147',
+            'ext': 'mp4',
+            'title': 'Faux Raccord N°123 - Les gaffes de Cliffhanger',
+            'description': 'md5:bc734b83ffa2d8a12188d9eb48bb6354',
+            'thumbnail': r're:http://.*\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        formats = []
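+        # format id ranking, from lowest to highest quality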
+        quality = qualities(['ld', 'md', 'hd'])
+
+        model = self._html_search_regex(
+            r'data-model="([^"]+)"', webpage, 'data model', default=None)
+        if model:
+            model_data = self._parse_json(model, display_id)
+            video = model_data['videos'][0]
+            title = video['title']
+            for video_url in video['sources'].values():
+                video_id, format_id = url_basename(video_url).split('_')[:2]
+                formats.append({
+                    'format_id': format_id,
+                    'quality': quality(format_id),
+                    'url': video_url,
+                })
+            duration = int_or_none(video.get('duration'))
+            view_count = int_or_none(video.get('view_count'))
+            timestamp = unified_timestamp(try_get(
+                video, lambda x: x['added_at']['date'], compat_str))
+        else:
+            video_id = display_id
+            media_data = self._download_json(
+                'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
+            title = remove_end(
+                self._html_search_regex(
+                    r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
+                ' - AlloCiné')
+            for key, value in media_data['video'].items():
+                if not key.endswith('Path'):
+                    continue
+                format_id = key[:-len('Path')]
+                formats.append({
+                    'format_id': format_id,
+                    'quality': quality(format_id),
+                    'url': value,
+                })
+            duration, view_count, timestamp = [None] * 3
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/alphaporno.py b/youtube_dl/extractor/alphaporno.py
new file mode 100644 (file)
index 0000000..3a6d99f
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    parse_duration,
+    parse_filesize,
+    int_or_none,
+)
+
+
+class AlphaPornoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?alphaporno\.com/videos/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.alphaporno.com/videos/sensual-striptease-porn-with-samantha-alexandra/',
+        'md5': 'feb6d3bba8848cd54467a87ad34bd38e',
+        'info_dict': {
+            'id': '258807',
+            'display_id': 'sensual-striptease-porn-with-samantha-alexandra',
+            'ext': 'mp4',
+            'title': 'Sensual striptease porn with Samantha Alexandra',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'timestamp': 1418694611,
+            'upload_date': '20141216',
+            'duration': 387,
+            'filesize_approx': 54120000,
+            'tbr': 1145,
+            'categories': list,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r"video_id\s*:\s*'([^']+)'", webpage, 'video id', default=None)
+
+        video_url = self._search_regex(
+            r"video_url\s*:\s*'([^']+)'", webpage, 'video url')
+        ext = self._html_search_meta(
+            'encodingFormat', webpage, 'ext', default='.mp4')[1:]
+
+        title = self._search_regex(
+            [r'<meta content="([^"]+)" itemprop="description">',
+             r'class="title" itemprop="name">([^<]+)<'],
+            webpage, 'title')
+        thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail')
+        timestamp = parse_iso8601(self._html_search_meta(
+            'uploadDate', webpage, 'upload date'))
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration'))
+        filesize_approx = parse_filesize(self._html_search_meta(
+            'contentSize', webpage, 'file size'))
+        bitrate = int_or_none(self._html_search_meta(
+            'bitrate', webpage, 'bitrate'))
+        categories = self._html_search_meta(
+            'keywords', webpage, 'categories', default='').split(',')
+
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'ext': ext,
+            'title': title,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'filesize_approx': filesize_approx,
+            'tbr': bitrate,
+            'categories': categories,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py
new file mode 100644 (file)
index 0000000..6fb3d6c
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .theplatform import ThePlatformIE
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    try_get,
+    update_url_query,
+)
+
+
+class AMCNetworksIE(ThePlatformIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1',
+        'md5': '',
+        'info_dict': {
+            'id': 's3MX01Nl4vPH',
+            'ext': 'mp4',
+            'title': 'Maron - Season 4 - Step 1',
+            'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.',
+            'age_limit': 17,
+            'upload_date': '20160505',
+            'timestamp': 1462468831,
+            'uploader': 'AMCN',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Requires TV provider accounts',
+    }, {
+        'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ifc.com/movies/chaos',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
+        media_url = self._search_regex(
+            r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)',
+            webpage, 'media url')
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'link\.theplatform\.com/s/([^?]+)',
+            media_url, 'theplatform_path'), display_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        video_id = theplatform_metadata['pid']
+        title = theplatform_metadata['title']
+        rating = try_get(
+            theplatform_metadata, lambda x: x['ratings'][0]['rating'])
+        auth_required = self._search_regex(
+            r'window\.authRequired\s*=\s*(true|false);',
+            webpage, 'auth required')
+        if auth_required == 'true':
+            requestor_id = self._search_regex(
+                r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)',
+                webpage, 'requestor id')
+            resource = self._get_mvpd_resource(
+                requestor_id, title, video_id, rating)
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, requestor_id, resource)
+        media_url = update_url_query(media_url, query)
+        formats, subtitles = self._extract_theplatform_smil(
+            media_url, video_id)
+        self._sort_formats(formats)
+        info.update({
+            'id': video_id,
+            'subtitles': subtitles,
+            'formats': formats,
+            'age_limit': parse_age_limit(rating),
+        })
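+        # ThePlatform custom fields are namespaced ($xmlns); use the first
+        # declared namespace to pull series/season/episode metadata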
+        ns_keys = theplatform_metadata.get('$xmlns', {}).keys()
+        if ns_keys:
+            ns = list(ns_keys)[0]
+            series = theplatform_metadata.get(ns + '$show')
+            season_number = int_or_none(
+                theplatform_metadata.get(ns + '$season'))
+            episode = theplatform_metadata.get(ns + '$episodeTitle')
+            episode_number = int_or_none(
+                theplatform_metadata.get(ns + '$episode'))
+            if season_number:
+                title = 'Season %d - %s' % (season_number, title)
+            if series:
+                title = '%s - %s' % (series, title)
+            info.update({
+                'title': title,
+                'series': series,
+                'season_number': season_number,
+                'episode': episode,
+                'episode_number': episode_number,
+            })
+        return info
diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py
new file mode 100644 (file)
index 0000000..9c9d77a
--- /dev/null
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    js_to_json,
+    try_get,
+    unified_strdate,
+)
+
+
+class AmericasTestKitchenIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers',
+        'md5': 'b861c3e365ac38ad319cfd509c30577f',
+        'info_dict': {
+            'id': '5b400b9ee338f922cb06450c',
+            'title': 'Weeknight Japanese Suppers',
+            'ext': 'mp4',
+            'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8',
+            'thumbnail': r're:^https?://',
+            'timestamp': 1523664000,
+            'upload_date': '20180414',
+            'release_date': '20180414',
+            'series': "America's Test Kitchen",
+            'season_number': 18,
+            'episode': 'Weeknight Japanese Suppers',
+            'episode_number': 15,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_data = self._parse_json(
+            self._search_regex(
+                r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>',
+                webpage, 'initial context'),
+            video_id, js_to_json)
+
+        ep_data = try_get(
+            video_data,
+            (lambda x: x['episodeDetail']['content']['data'],
+             lambda x: x['videoDetail']['content']['data']), dict)
+        ep_meta = ep_data.get('full_video', {})
+
+        zype_id = ep_data.get('zype_id') or ep_meta['zype_id']
+
+        title = ep_data.get('title') or ep_meta.get('title')
+        description = clean_html(ep_meta.get('episode_description') or ep_data.get(
+            'description') or ep_meta.get('description'))
+        thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url'])
+        release_date = unified_strdate(ep_data.get('aired_at'))
+
+        season_number = int_or_none(ep_meta.get('season_number'))
+        episode = ep_meta.get('title')
+        episode_number = int_or_none(ep_meta.get('episode_number'))
+
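+        # Playback is delegated to the Zype embed player via a
+        # url_transparent result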
+        return {
+            '_type': 'url_transparent',
+            'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id,
+            'ie_key': 'Zype',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'release_date': release_date,
+            'series': "America's Test Kitchen",
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+        }
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
new file mode 100644 (file)
index 0000000..7ff098c
--- /dev/null
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    url_or_none,
+)
+
+
+class AMPIE(InfoExtractor):
+    # parse Akamai Adaptive Media Player feed
+    def _extract_feed_info(self, url):
+        feed = self._download_json(
+            url, None, 'Downloading Akamai AMP feed',
+            'Unable to download Akamai AMP feed')
+        item = feed.get('channel', {}).get('item')
+        if not item:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
+
+        video_id = item['guid']
+
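+        # media-* fields may be nested under media-group or live directly on
+        # the item; fall back to the un-prefixed name as a last resort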
+        def get_media_node(name, default=None):
+            media_name = 'media-%s' % name
+            media_group = item.get('media-group') or item
+            return media_group.get(media_name) or item.get(media_name) or item.get(name, default)
+
+        thumbnails = []
+        media_thumbnail = get_media_node('thumbnail')
+        if media_thumbnail:
+            if isinstance(media_thumbnail, dict):
+                media_thumbnail = [media_thumbnail]
+            for thumbnail_data in media_thumbnail:
+                thumbnail = thumbnail_data.get('@attributes', {})
+                thumbnail_url = url_or_none(thumbnail.get('url'))
+                if not thumbnail_url:
+                    continue
+                thumbnails.append({
+                    'url': self._proto_relative_url(thumbnail_url, 'http:'),
+                    'width': int_or_none(thumbnail.get('width')),
+                    'height': int_or_none(thumbnail.get('height')),
+                })
+
+        subtitles = {}
+        media_subtitle = get_media_node('subTitle')
+        if media_subtitle:
+            if isinstance(media_subtitle, dict):
+                media_subtitle = [media_subtitle]
+            for subtitle_data in media_subtitle:
+                subtitle = subtitle_data.get('@attributes', {})
+                subtitle_href = url_or_none(subtitle.get('href'))
+                if not subtitle_href:
+                    continue
+                subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+                    'url': subtitle_href,
+                    'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+                })
+
+        formats = []
+        media_content = get_media_node('content')
+        if isinstance(media_content, dict):
+            media_content = [media_content]
+        for media_data in media_content:
+            media = media_data.get('@attributes', {})
+            media_url = url_or_none(media.get('url'))
+            if not media_url:
+                continue
+            ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+                    video_id, f4m_id='hds', fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
+                    'url': media_url,
+                    'tbr': int_or_none(media.get('bitrate')),
+                    'filesize': int_or_none(media.get('fileSize')),
+                    'ext': ext,
+                })
+
+        self._sort_formats(formats)
+
+        timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))
+
+        return {
+            'id': video_id,
+            'title': get_media_node('title'),
+            'description': get_media_node('description'),
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py
new file mode 100644 (file)
index 0000000..00ce684
--- /dev/null
@@ -0,0 +1,293 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    ExtractorError,
+    url_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class AnimeOnDemandIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)'
+    _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
+    _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
+    _NETRC_MACHINE = 'animeondemand'
+    # German-speaking countries of Europe
+    _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']
+    _TESTS = [{
+        # jap, OmU
+        'url': 'https://www.anime-on-demand.de/anime/161',
+        'info_dict': {
+            'id': '161',
+            'title': 'Grimgar, Ashes and Illusions (OmU)',
+            'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
+        },
+        'playlist_mincount': 4,
+    }, {
+        # Film wording is used instead of Episode, ger/jap, Dub/OmU
+        'url': 'https://www.anime-on-demand.de/anime/39',
+        'only_matching': True,
+    }, {
+        # Episodes without titles, jap, OmU
+        'url': 'https://www.anime-on-demand.de/anime/162',
+        'only_matching': True,
+    }, {
+        # ger/jap, Dub/OmU, account required
+        'url': 'https://www.anime-on-demand.de/anime/169',
+        'only_matching': True,
+    }, {
+        # Full length film, non-series, ger/jap, Dub/OmU, account required
+        'url': 'https://www.anime-on-demand.de/anime/185',
+        'only_matching': True,
+    }, {
+        # Flash videos
+        'url': 'https://www.anime-on-demand.de/anime/12',
+        'only_matching': True,
+    }]
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page:
+            self.raise_geo_restricted(
+                '%s is only available in German-speaking countries of Europe' % self.IE_NAME)
+
+        login_form = self._form_hidden_inputs('new_user', login_page)
+
+        login_form.update({
+            'user[login]': username,
+            'user[password]': password,
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+            'post url', default=self._LOGIN_URL, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = urljoin(self._LOGIN_URL, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in',
+            data=urlencode_postdata(login_form), headers={
+                'Referer': self._LOGIN_URL,
+            })
+
+        if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
+            error = self._search_regex(
+                r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>',
+                response, 'error', default=None, group='error')
+            if error:
+                raise ExtractorError('Unable to log in: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        anime_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, anime_id)
+
+        if 'data-playlist=' not in webpage:
+            self._download_webpage(
+                self._APPLY_HTML5_URL, anime_id,
+                'Activating HTML5 beta', 'Unable to apply HTML5 beta')
+            webpage = self._download_webpage(url, anime_id)
+
+        csrf_token = self._html_search_meta(
+            'csrf-token', webpage, 'csrf token', fatal=True)
+
+        anime_title = self._html_search_regex(
+            r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>',
+            webpage, 'anime name')
+        anime_description = self._html_search_regex(
+            r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
+            webpage, 'anime description', default=None)
+
+        entries = []
+
+        def extract_info(html, video_id, num=None):
+            title, description = [None] * 2
+            formats = []
+
+            for input_ in re.findall(
+                    r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):
+                attributes = extract_attributes(input_)
+                title = attributes.get('data-dialog-header')
+                playlist_urls = []
+                for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):
+                    playlist_url = attributes.get(playlist_key)
+                    if isinstance(playlist_url, compat_str) and re.match(
+                            r'/?[\da-zA-Z]+', playlist_url):
+                        playlist_urls.append(attributes[playlist_key])
+                if not playlist_urls:
+                    continue
+
+                lang = attributes.get('data-lang')
+                lang_note = attributes.get('value')
+
+                for playlist_url in playlist_urls:
+                    kind = self._search_regex(
+                        r'videomaterialurl/\d+/([^/]+)/',
+                        playlist_url, 'media kind', default=None)
+                    format_id_list = []
+                    if lang:
+                        format_id_list.append(lang)
+                    if kind:
+                        format_id_list.append(kind)
+                    if not format_id_list and num is not None:
+                        format_id_list.append(compat_str(num))
+                    format_id = '-'.join(format_id_list)
+                    format_note = ', '.join(filter(None, (kind, lang_note)))
+                    item_id_list = []
+                    if format_id:
+                        item_id_list.append(format_id)
+                    item_id_list.append('videomaterial')
+                    playlist = self._download_json(
+                        urljoin(url, playlist_url), video_id,
+                        'Downloading %s JSON' % ' '.join(item_id_list),
+                        headers={
+                            'X-Requested-With': 'XMLHttpRequest',
+                            'X-CSRF-Token': csrf_token,
+                            'Referer': url,
+                            'Accept': 'application/json, text/javascript, */*; q=0.01',
+                        }, fatal=False)
+                    if not playlist:
+                        continue
+                    stream_url = url_or_none(playlist.get('streamurl'))
+                    if stream_url:
+                        rtmp = re.search(
+                            r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
+                            stream_url)
+                        if rtmp:
+                            formats.append({
+                                'url': rtmp.group('url'),
+                                'app': rtmp.group('app'),
+                                'play_path': rtmp.group('playpath'),
+                                'page_url': url,
+                                'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf',
+                                'rtmp_real_time': True,
+                                'format_id': 'rtmp',
+                                'ext': 'flv',
+                            })
+                            continue
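+                    # 'startvideo' is an index into the playlist array,
+                    # selecting which entry to extract.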
+                    start_video = playlist.get('startvideo', 0)
+                    playlist = playlist.get('playlist')
+                    if not playlist or not isinstance(playlist, list):
+                        continue
+                    playlist = playlist[start_video]
+                    title = playlist.get('title')
+                    if not title:
+                        continue
+                    description = playlist.get('description')
+                    for source in playlist.get('sources', []):
+                        file_ = source.get('file')
+                        if not file_:
+                            continue
+                        ext = determine_ext(file_)
+                        format_id_list = [lang, kind]
+                        if ext == 'm3u8':
+                            format_id_list.append('hls')
+                        elif source.get('type') == 'video/dash' or ext == 'mpd':
+                            format_id_list.append('dash')
+                        format_id = '-'.join(filter(None, format_id_list))
+                        if ext == 'm3u8':
+                            file_formats = self._extract_m3u8_formats(
+                                file_, video_id, 'mp4',
+                                entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)
+                        elif source.get('type') == 'video/dash' or ext == 'mpd':
+                            # DASH extraction appears to be deliberately
+                            # disabled here; the call below is unreachable
+                            # and kept only for reference.
+                            continue
+                            file_formats = self._extract_mpd_formats(
+                                file_, video_id, mpd_id=format_id, fatal=False)
+                        else:
+                            continue
+                        for f in file_formats:
+                            f.update({
+                                'language': lang,
+                                'format_note': format_note,
+                            })
+                        formats.extend(file_formats)
+
+            return {
+                'title': title,
+                'description': description,
+                'formats': formats,
+            }
+
+        def extract_entries(html, video_id, common_info, num=None):
+            info = extract_info(html, video_id, num)
+
+            if info['formats']:
+                self._sort_formats(info['formats'])
+                f = common_info.copy()
+                f.update(info)
+                entries.append(f)
+
+            # Extract teaser/trailer only when full episode is not available
+            if not info['formats']:
+                m = re.search(
+                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
+                    html)
+                if m:
+                    f = common_info.copy()
+                    f.update({
+                        'id': '%s-%s' % (f['id'], m.group('kind').lower()),
+                        'title': m.group('title'),
+                        'url': urljoin(url, m.group('href')),
+                    })
+                    entries.append(f)
+
+        def extract_episodes(html):
+            for num, episode_html in enumerate(re.findall(
+                    r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
+                episodebox_title = self._search_regex(
+                    (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
+                     r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
+                    episode_html, 'episodebox title', default=None, group='title')
+                if not episodebox_title:
+                    continue
+
+                episode_number = int(self._search_regex(
+                    r'(?:Episode|Film)\s*(\d+)',
+                    episodebox_title, 'episode number', default=num))
+                episode_title = self._search_regex(
+                    r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
+                    episodebox_title, 'episode title', default=None)
+
+                video_id = 'episode-%d' % episode_number
+
+                common_info = {
+                    'id': video_id,
+                    'series': anime_title,
+                    'episode': episode_title,
+                    'episode_number': episode_number,
+                }
+
+                extract_entries(episode_html, video_id, common_info)
+
+        def extract_film(html, video_id):
+            common_info = {
+                'id': anime_id,
+                'title': anime_title,
+                'description': anime_description,
+            }
+            extract_entries(html, video_id, common_info)
+
+        extract_episodes(webpage)
+
+        if not entries:
+            extract_film(webpage, anime_id)
+
+        return self.playlist_result(entries, anime_id, anime_title, anime_description)
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
new file mode 100644
index 0000000..84e8410
--- /dev/null
@@ -0,0 +1,323 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import hashlib
+import json
+import random
+import re
+import time
+
+from .common import InfoExtractor
+from ..aes import aes_encrypt
+from ..compat import compat_str
+from ..utils import (
+    bytes_to_intlist,
+    determine_ext,
+    intlist_to_bytes,
+    int_or_none,
+    strip_jsonp,
+    unescapeHTML,
+    unsmuggle_url,
+)
+
+
+def md5_text(s):
+    if not isinstance(s, compat_str):
+        s = compat_str(s)
+    return hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+class AnvatoIE(InfoExtractor):
+    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
+    # Copied from anvplayer.min.js
+    _ANVACK_TABLE = {
+        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
+        'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA',
+        'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP',
+        'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv',
+        'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7',
+        'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR',
+        'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg',
+        'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto',
+        'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY',
+        'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh',
+        'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK',
+        'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D',
+        'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad',
+        'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp',
+        'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih',
+        'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR',
+        'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW',
+        'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su',
+        'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q',
+        'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5',
+        'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3',
+        'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI',
+        'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s',
+        'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz',
+        'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg',
+        'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x',
+        'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH',
+        'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX',
+        'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc',
+        'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK',
+        'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7',
+        'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C',
+        'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e',
+        'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1',
+        'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re',
+        'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51',
+        'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho',
+        'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9',
+        'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH',
+        'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F',
+        'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo',
+        'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR',
+        'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa',
+        'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk',
+        'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ',
+        'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ',
+        'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m',
+        'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b',
+        'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3',
+        'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK',
+        'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+        'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+        'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F',
+        'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx',
+        'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ',
+        'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH',
+        'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm',
+        'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt',
+        'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl',
+        'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b',
+        'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV',
+        'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg',
+        'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk',
+        'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT',
+        'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa',
+        'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv',
+        'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k',
+        'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI',
+        'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr',
+        'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw',
+        'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K',
+        'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH',
+        'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK',
+        'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu',
+        'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+        'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+        'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK',
+        'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n',
+        'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD',
+        'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk',
+        'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
+        'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
+        'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
+        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
+    }
+
+    _MCP_TO_ACCESS_KEY_TABLE = {
+        'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+        'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+        'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+        'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+        'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+        'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+        'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+    }
+
+    _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
+
+    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
+    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
+
+    _TESTS = [{
+        # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
+        'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
+        'info_dict': {
+            'id': '4465496',
+            'ext': 'mp4',
+            'title': 'VIDEO: Humpback whale breaches right next to NH boat',
+            'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
+            'duration': 22,
+            'timestamp': 1534855680,
+            'upload_date': '20180821',
+            'uploader': 'ANV',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
+        'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
+        'only_matching': True,
+    }]
+
+    def __init__(self, *args, **kwargs):
+        super(AnvatoIE, self).__init__(*args, **kwargs)
+        self.__server_time = None
+
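+    # The API server time is fetched once per extractor instance and cached;
+    # it feeds both the URL signature and the anvstk token below.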
+    def _server_time(self, access_key, video_id):
+        if self.__server_time is not None:
+            return self.__server_time
+
+        self.__server_time = int(self._download_json(
+            self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
+            note='Fetching server time')['server_time'])
+
+        return self.__server_time
+
+    def _api_prefix(self, access_key):
+        return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
+
+    def _get_video_json(self, access_key, video_id):
+        # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
+        video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
+        server_time = self._server_time(access_key, video_id)
+        input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
+
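+        # Sign the request by AES-encrypting the first 64 bytes of
+        # '<server_time>~<md5(url)>~<md5(server_time)>' with the static
+        # _AUTH_KEY and appending the result base64-encoded as
+        # X-Anvato-Adst-Auth.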
+        auth_secret = intlist_to_bytes(aes_encrypt(
+            bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
+
+        video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
+        anvrid = md5_text(time.time() * 1000 * random.random())[:30]
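+        # anvrid is a random 30-character request id; anvstk is an md5 over
+        # '<access_key>|<anvrid>|<server_time>|<secret>', where the secret is
+        # the per-key entry from _ANVACK_TABLE (falling back to _API_KEY).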
+        payload = {
+            'api': {
+                'anvrid': anvrid,
+                'anvstk': md5_text('%s|%s|%d|%s' % (
+                    access_key, anvrid, server_time,
+                    self._ANVACK_TABLE.get(access_key, self._API_KEY))),
+                'anvts': server_time,
+            },
+        }
+
+        return self._download_json(
+            video_data_url, video_id, transform_source=strip_jsonp,
+            data=json.dumps(payload).encode('utf-8'))
+
+    def _get_anvato_videos(self, access_key, video_id):
+        video_data = self._get_video_json(access_key, video_id)
+
+        formats = []
+        for published_url in video_data['published_urls']:
+            video_url = published_url['embed_url']
+            media_format = published_url.get('format')
+            ext = determine_ext(video_url)
+
+            if ext == 'smil' or media_format == 'smil':
+                formats.extend(self._extract_smil_formats(video_url, video_id))
+                continue
+
+            tbr = int_or_none(published_url.get('kbps'))
+            a_format = {
+                'url': video_url,
+                'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
+                'tbr': tbr if tbr != 0 else None,
+            }
+
+            if media_format == 'm3u8' and tbr is not None:
+                a_format.update({
+                    'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+                    'ext': 'mp4',
+                })
+            elif media_format == 'm3u8-variant' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                continue
+            elif ext == 'mp3' or media_format == 'mp3':
+                a_format['vcodec'] = 'none'
+            else:
+                a_format.update({
+                    'width': int_or_none(published_url.get('width')),
+                    'height': int_or_none(published_url.get('height')),
+                })
+            formats.append(a_format)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for caption in video_data.get('captions', []):
+            a_caption = {
+                'url': caption['url'],
+                'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
+            }
+            subtitles.setdefault(caption['language'], []).append(a_caption)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_data.get('def_title'),
+            'description': video_data.get('def_description'),
+            'tags': video_data['def_tags'].split(',') if video_data.get('def_tags') else None,
+            'categories': video_data.get('categories'),
+            'thumbnail': video_data.get('thumbnail'),
+            'timestamp': int_or_none(video_data.get(
+                'ts_published') or video_data.get('ts_added')),
+            'uploader': video_data.get('mcp_id'),
+            'duration': int_or_none(video_data.get('duration')),
+            'subtitles': subtitles,
+        }
+
+    @staticmethod
+    def _extract_urls(ie, webpage, video_id):
+        entries = []
+        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+            anvplayer_data = ie._parse_json(
+                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+                fatal=False)
+            if not anvplayer_data:
+                continue
+            video = anvplayer_data.get('video')
+            if not isinstance(video, compat_str) or not video.isdigit():
+                continue
+            access_key = anvplayer_data.get('accessKey')
+            if not access_key:
+                mcp = anvplayer_data.get('mcp')
+                if mcp:
+                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+                        mcp.lower())
+            if not access_key:
+                continue
+            entries.append(ie.url_result(
+                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+                video_id=video))
+        return entries
+
+    def _extract_anvato_videos(self, webpage, video_id):
+        anvplayer_data = self._parse_json(
+            self._html_search_regex(
+                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+            video_id)
+        return self._get_anvato_videos(
+            anvplayer_data['accessKey'], anvplayer_data['video'])
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
+
+        mobj = re.match(self._VALID_URL, url)
+        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+        if access_key not in self._ANVACK_TABLE:
+            access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
+                access_key) or access_key
+        return self._get_anvato_videos(access_key, video_id)
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
new file mode 100644
index 0000000..e87994a
--- /dev/null
@@ -0,0 +1,135 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    url_or_none,
+)
+
+
+class AolIE(InfoExtractor):
+    IE_NAME = 'aol.com'
+    _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)'
+
+    _TESTS = [{
+        # video with 5min ID
+        'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/',
+        'md5': '18ef68f48740e86ae94b98da815eec42',
+        'info_dict': {
+            'id': '518167793',
+            'ext': 'mp4',
+            'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
+            'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.',
+            'timestamp': 1395405060,
+            'upload_date': '20140321',
+            'uploader': 'Newsy Studio',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # video with vidible ID
+        'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/',
+        'info_dict': {
+            'id': '5707d6b8e4b090497b04f706',
+            'ext': 'mp4',
+            'title': 'Netflix is Raising Rates',
+            'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. Veuer’s Carly Figueroa has more.',
+            'upload_date': '20160408',
+            'timestamp': 1460123280,
+            'uploader': 'Veuer',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/',
+        'only_matching': True,
+    }, {
+        'url': 'aol-video:5707d6b8e4b090497b04f706',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        response = self._download_json(
+            'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id,
+            video_id)['response']
+        if response['statusText'] != 'Ok':
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True)
+
+        video_data = response['data']
+        formats = []
+        m3u8_url = url_or_none(video_data.get('videoMasterPlaylist'))
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+        for rendition in video_data.get('renditions', []):
+            video_url = url_or_none(rendition.get('url'))
+            if not video_url:
+                continue
+            ext = rendition.get('format')
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                f = {
+                    'url': video_url,
+                    'format_id': rendition.get('quality'),
+                }
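+                # Rendition dimensions are encoded either directly in the
+                # URL (e.g. '640x360') or as w/h query parameters.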
+                mobj = re.search(r'(\d+)x(\d+)', video_url)
+                if mobj:
+                    f.update({
+                        'width': int(mobj.group(1)),
+                        'height': int(mobj.group(2)),
+                    })
+                else:
+                    qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query)
+                    f.update({
+                        'width': int_or_none(qs.get('w', [None])[0]),
+                        'height': int_or_none(qs.get('h', [None])[0]),
+                    })
+                formats.append(f)
+        self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
+
+        return {
+            'id': video_id,
+            'title': video_data['title'],
+            'duration': int_or_none(video_data.get('duration')),
+            'timestamp': int_or_none(video_data.get('publishDate')),
+            'view_count': int_or_none(video_data.get('views')),
+            'description': video_data.get('description'),
+            'uploader': video_data.get('videoOwner'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py
new file mode 100644
index 0000000..98ccdaa
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    url_or_none,
+)
+
+
+class APAIE(InfoExtractor):
+    _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TESTS = [{
+        'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
+        'md5': '2b12292faeb0a7d930c778c7a5b4759b',
+        'info_dict': {
+            'id': 'jjv85FdZ',
+            'ext': 'mp4',
+            'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 254,
+            'timestamp': 1519211149,
+            'upload_date': '20180221',
+        },
+    }, {
+        'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78',
+        'only_matching': True,
+    }, {
+        'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76',
+        'only_matching': True,
+    }, {
+        'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        jwplatform_id = self._search_regex(
+            r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage,
+            'jwplatform id', default=None)
+
+        if jwplatform_id:
+            return self.url_result(
+                'jwplatform:' + jwplatform_id, ie='JWPlatform',
+                video_id=video_id)
+
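+        # No JW Platform id embedded: fall back to the sources array defined
+        # inline in the page's JavaScript.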
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'),
+            video_id, transform_source=js_to_json)
+
+        formats = []
+        for source in sources:
+            if not isinstance(source, dict):
+                continue
+            source_url = url_or_none(source.get('file'))
+            if not source_url:
+                continue
+            ext = determine_ext(source_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': source_url,
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._search_regex(
+            r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'thumbnail', fatal=False, group='url')
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py
new file mode 100644
index 0000000..883dcee
--- /dev/null
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    merge_dicts,
+    mimetype2ext,
+    url_or_none,
+)
+
+
+class AparatIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://www.aparat.com/v/wP8On',
+        'md5': '131aca2e14fe7c4dcb3c4877ba300c89',
+        'info_dict': {
+            'id': 'wP8On',
+            'ext': 'mp4',
+            'title': 'تیم گلکسی 11 - زومیت',
+            'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028',
+            'duration': 231,
+            'timestamp': 1387394859,
+            'upload_date': '20131218',
+            'view_count': int,
+        },
+    }, {
+        # multiple formats
+        'url': 'https://www.aparat.com/v/8dflw/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Provides more metadata
+        webpage = self._download_webpage(url, video_id, fatal=False)
+
+        if not webpage:
+            # Note: There is an easier-to-parse configuration at
+            # http://www.aparat.com/video/video/config/videohash/%video_id
+            # but the URL in there does not work
+            webpage = self._download_webpage(
+                'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
+                video_id)
+
+        options = self._parse_json(
+            self._search_regex(
+                r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)',
+                webpage, 'options', group='value'),
+            video_id)
+
+        player = options['plugins']['sabaPlayerPlugin']
+
+        formats = []
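+        # multiSRC is a list of source groups; each group is itself a list
+        # of {'src': ..., 'type': ..., 'label': ...} dicts.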
+        for sources in player['multiSRC']:
+            for item in sources:
+                if not isinstance(item, dict):
+                    continue
+                file_url = url_or_none(item.get('src'))
+                if not file_url:
+                    continue
+                item_type = item.get('type')
+                if item_type == 'application/vnd.apple.mpegurl':
+                    formats.extend(self._extract_m3u8_formats(
+                        file_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        fatal=False))
+                else:
+                    ext = mimetype2ext(item_type)
+                    label = item.get('label')
+                    formats.append({
+                        'url': file_url,
+                        'ext': ext,
+                        'format_id': 'http-%s' % (label or ext),
+                        'height': int_or_none(self._search_regex(
+                            r'(\d+)[pP]', label or '', 'height',
+                            default=None)),
+                    })
+        self._sort_formats(
+            formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        if not info.get('title'):
+            info['title'] = player['title']
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'thumbnail': url_or_none(options.get('poster')),
+            'duration': int_or_none(player.get('duration')),
+            'formats': formats,
+        })
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
new file mode 100644
index 0000000..a84b8b1
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    str_to_int,
+    ExtractorError
+)
+
+
+class AppleConnectIE(InfoExtractor):
+    _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
+    _TEST = {
+        'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+        'md5': 'e7c38568a01ea45402570e6029206723',
+        'info_dict': {
+            'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
+            'ext': 'm4v',
+            'title': 'Energy',
+            'uploader': 'Drake',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20150710',
+            'timestamp': 1436545535,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        try:
+            video_json = self._html_search_regex(
+                r'class="auc-video-data">(\{.*?\})', webpage, 'json')
+        except ExtractorError:
+            raise ExtractorError('This post doesn\'t contain a video', expected=True)
+
+        video_data = self._parse_json(video_json, video_id)
+        timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
+        like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+
+        return {
+            'id': video_id,
+            'url': video_data['sslSrc'],
+            'title': video_data['title'],
+            'description': video_data['description'],
+            'uploader': video_data['artistName'],
+            'thumbnail': video_data['artworkUrl'],
+            'timestamp': timestamp,
+            'like_count': like_count,
+        }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
new file mode 100644
index 0000000..b5ed2b8
--- /dev/null
@@ -0,0 +1,288 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    unified_strdate,
+)
+
+
+class AppleTrailersIE(InfoExtractor):
+    IE_NAME = 'appletrailers'
+    _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _TESTS = [{
+        'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
+        'info_dict': {
+            'id': '5111',
+            'title': 'Man of Steel',
+        },
+        'playlist': [
+            {
+                'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
+                'info_dict': {
+                    'id': 'manofsteel-trailer4',
+                    'ext': 'mov',
+                    'duration': 111,
+                    'title': 'Trailer 4',
+                    'upload_date': '20130523',
+                    'uploader_id': 'wb',
+                },
+            },
+            {
+                'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
+                'info_dict': {
+                    'id': 'manofsteel-trailer3',
+                    'ext': 'mov',
+                    'duration': 182,
+                    'title': 'Trailer 3',
+                    'upload_date': '20130417',
+                    'uploader_id': 'wb',
+                },
+            },
+            {
+                'md5': 'd0f1e1150989b9924679b441f3404d48',
+                'info_dict': {
+                    'id': 'manofsteel-trailer',
+                    'ext': 'mov',
+                    'duration': 148,
+                    'title': 'Trailer',
+                    'upload_date': '20121212',
+                    'uploader_id': 'wb',
+                },
+            },
+            {
+                'md5': '5fe08795b943eb2e757fa95cb6def1cb',
+                'info_dict': {
+                    'id': 'manofsteel-teaser',
+                    'ext': 'mov',
+                    'duration': 93,
+                    'title': 'Teaser',
+                    'upload_date': '20120721',
+                    'uploader_id': 'wb',
+                },
+            },
+        ]
+    }, {
+        'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
+        'info_dict': {
+            'id': '4489',
+            'title': 'Blackthorn',
+        },
+        'playlist_mincount': 2,
+        'expected_warnings': ['Unable to download JSON metadata'],
+    }, {
+        # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
+        'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
+        'info_dict': {
+            'id': '15881',
+            'title': 'Kung Fu Panda 3',
+        },
+        'playlist_mincount': 4,
+    }, {
+        'url': 'http://trailers.apple.com/ca/metropole/autrui/',
+        'only_matching': True,
+    }, {
+        'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
+        'only_matching': True,
+    }]
+
+    _JSON_RE = r'iTunes.playURL\((.*?)\);'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        movie = mobj.group('movie')
+        uploader_id = mobj.group('company')
+
+        webpage = self._download_webpage(url, movie)
+        film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
+        film_data = self._download_json(
+            'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
+            film_id, fatal=False)
+
+        if film_data:
+            entries = []
+            for clip in film_data.get('clips', []):
+                clip_title = clip['title']
+
+                formats = []
+                for version, version_data in clip.get('versions', {}).items():
+                    for size, size_data in version_data.get('sizes', {}).items():
+                        src = size_data.get('src')
+                        if not src:
+                            continue
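+                        # The listed src appears to be a reference movie;
+                        # rewriting e.g. '_720p.mov' to '_h720p.mov' points
+                        # at the directly downloadable file (the fallback
+                        # path below applies the same rewrite).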
+                        formats.append({
+                            'format_id': '%s-%s' % (version, size),
+                            'url': re.sub(r'_(\d+p\.mov)', r'_h\1', src),
+                            'width': int_or_none(size_data.get('width')),
+                            'height': int_or_none(size_data.get('height')),
+                            'language': version[:2],
+                        })
+                self._sort_formats(formats)
+
+                entries.append({
+                    'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
+                    'formats': formats,
+                    'title': clip_title,
+                    'thumbnail': clip.get('screen') or clip.get('thumb'),
+                    'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
+                    'upload_date': unified_strdate(clip.get('posted')),
+                    'uploader_id': uploader_id,
+                })
+
+            page_data = film_data.get('page', {})
+            return self.playlist_result(entries, film_id, page_data.get('movie_title'))
+
+        playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
+
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
+            s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
+            # The single quotes in the onClick attributes are not escaped,
+            # so pages like http://trailers.apple.com/trailers/wb/gravity/
+            # could not be parsed without this fix
+
+            def _clean_json(m):
+                return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = '<html>%s</html>' % s
+            return s
+        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
+
+        playlist = []
+        for li in doc.findall('./div/ul/li'):
+            on_click = li.find('.//a').attrib['onClick']
+            trailer_info_json = self._search_regex(self._JSON_RE,
+                                                   on_click, 'trailer info')
+            trailer_info = json.loads(trailer_info_json)
+            first_url = trailer_info.get('url')
+            if not first_url:
+                continue
+            title = trailer_info['title']
+            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
+            thumbnail = li.find('.//img').attrib['src']
+            upload_date = trailer_info['posted'].replace('-', '')
+
+            runtime = trailer_info['runtime']
+            m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
+            duration = None
+            if m:
+                duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
+
+            trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
+            settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
+            settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
+
+            formats = []
+            for format in settings['metadata']['sizes']:
+                # The src is a file pointing to the real video file
+                format_url = re.sub(r'_(\d*p\.mov)', r'_h\1', format['src'])
+                formats.append({
+                    'url': format_url,
+                    'format': format['type'],
+                    'width': int_or_none(format['width']),
+                    'height': int_or_none(format['height']),
+                })
+
+            self._sort_formats(formats)
+
+            playlist.append({
+                '_type': 'video',
+                'id': video_id,
+                'formats': formats,
+                'title': title,
+                'duration': duration,
+                'thumbnail': thumbnail,
+                'upload_date': upload_date,
+                'uploader_id': uploader_id,
+                'http_headers': {
+                    'User-Agent': 'QuickTime compatible (youtube-dlc)',
+                },
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': movie,
+            'entries': playlist,
+        }
+
+
+class AppleTrailersSectionIE(InfoExtractor):
+    IE_NAME = 'appletrailers:section'
+    _SECTIONS = {
+        'justadded': {
+            'feed_path': 'just_added',
+            'title': 'Just Added',
+        },
+        'exclusive': {
+            'feed_path': 'exclusive',
+            'title': 'Exclusive',
+        },
+        'justhd': {
+            'feed_path': 'just_hd',
+            'title': 'Just HD',
+        },
+        'mostpopular': {
+            'feed_path': 'most_pop',
+            'title': 'Most Popular',
+        },
+        'moviestudios': {
+            'feed_path': 'studios',
+            'title': 'Movie Studios',
+        },
+    }
+    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
+    _TESTS = [{
+        'url': 'http://trailers.apple.com/#section=justadded',
+        'info_dict': {
+            'title': 'Just Added',
+            'id': 'justadded',
+        },
+        'playlist_mincount': 80,
+    }, {
+        'url': 'http://trailers.apple.com/#section=exclusive',
+        'info_dict': {
+            'title': 'Exclusive',
+            'id': 'exclusive',
+        },
+        'playlist_mincount': 80,
+    }, {
+        'url': 'http://trailers.apple.com/#section=justhd',
+        'info_dict': {
+            'title': 'Just HD',
+            'id': 'justhd',
+        },
+        'playlist_mincount': 80,
+    }, {
+        'url': 'http://trailers.apple.com/#section=mostpopular',
+        'info_dict': {
+            'title': 'Most Popular',
+            'id': 'mostpopular',
+        },
+        'playlist_mincount': 30,
+    }, {
+        'url': 'http://trailers.apple.com/#section=moviestudios',
+        'info_dict': {
+            'title': 'Movie Studios',
+            'id': 'moviestudios',
+        },
+        'playlist_mincount': 80,
+    }]
+
+    def _real_extract(self, url):
+        section = self._match_id(url)
+        section_data = self._download_json(
+            'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'],
+            section)
+        entries = [
+            self.url_result('http://trailers.apple.com' + e['location'])
+            for e in section_data]
+        return self.playlist_result(entries, section, self._SECTIONS[section]['title'])
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
new file mode 100644 (file)
index 0000000..c79c58e
--- /dev/null
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+    clean_html,
+)
+
+
+class ArchiveOrgIE(InfoExtractor):
+    IE_NAME = 'archive.org'
+    IE_DESC = 'archive.org videos'
+    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
+    _TESTS = [{
+        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'md5': '8af1d4cf447933ed3c7f4871162602db',
+        'info_dict': {
+            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+            'ext': 'ogg',
+            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
+            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
+            'upload_date': '19681210',
+            'uploader': 'SRI International'
+        }
+    }, {
+        'url': 'https://archive.org/details/Cops1922',
+        'md5': '0869000b4ce265e8ca62738b336b268a',
+        'info_dict': {
+            'id': 'Cops1922',
+            'ext': 'mp4',
+            'title': 'Buster Keaton\'s "Cops" (1922)',
+            'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
+        }
+    }, {
+        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://archive.org/embed/' + video_id, video_id)
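+        # The embed page inlines a JW Player call Play('<id>', [playlist], {...});
+        # grab the playlist argument and feed it to the JW Player parser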
+        jwplayer_playlist = self._parse_json(self._search_regex(
+            r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
+            webpage, 'jwplayer playlist'), video_id)
+        info = self._parse_jwplayer_data(
+            {'playlist': jwplayer_playlist}, video_id, base_url=url)
+
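+        # Fields in the archive.org JSON metadata are lists; take the first
+        # element when present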
+        def get_optional(metadata, field):
+            return metadata.get(field, [None])[0]
+
+        metadata = self._download_json(
+            'http://archive.org/details/' + video_id, video_id, query={
+                'output': 'json',
+            })['metadata']
+        info.update({
+            'title': get_optional(metadata, 'title') or info.get('title'),
+            'description': clean_html(get_optional(metadata, 'description')),
+        })
+        if info.get('_type') != 'playlist':
+            info.update({
+                'uploader': get_optional(metadata, 'creator'),
+                'upload_date': unified_strdate(get_optional(metadata, 'date')),
+            })
+        return info
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
new file mode 100644 (file)
index 0000000..5b7b2dd
--- /dev/null
@@ -0,0 +1,422 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from .generic import GenericIE
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    qualities,
+    str_or_none,
+    try_get,
+    unified_strdate,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
+    xpath_text,
+)
+from ..compat import compat_etree_fromstring
+
+
+class ARDMediathekBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['DE']
+
+    def _extract_media_info(self, media_info_url, webpage, video_id):
+        media_info = self._download_json(
+            media_info_url, video_id, 'Downloading media JSON')
+        return self._parse_media_info(media_info, video_id, '"fsk"' in webpage)
+
+    def _parse_media_info(self, media_info, video_id, fsk):
+        formats = self._extract_formats(media_info, video_id)
+
+        if not formats:
+            if fsk:
+                raise ExtractorError(
+                    'This video is only available after 20:00', expected=True)
+            elif media_info.get('_geoblocked'):
+                self.raise_geo_restricted(
+                    'This video is not available due to geoblocking',
+                    countries=self._GEO_COUNTRIES)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitle_url = media_info.get('_subtitleUrl')
+        if subtitle_url:
+            subtitles['de'] = [{
+                'ext': 'ttml',
+                'url': subtitle_url,
+            }]
+
+        return {
+            'id': video_id,
+            'duration': int_or_none(media_info.get('_duration')),
+            'thumbnail': media_info.get('_previewImage'),
+            'is_live': media_info.get('_isLive') is True,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _extract_formats(self, media_info, video_id):
+        type_ = media_info.get('_type')
+        media_array = media_info.get('_mediaArray', [])
+        formats = []
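+        # Each _mediaArray entry carries a _mediaStreamArray; a stream's _stream
+        # field may be a single URL or a list of URLs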
+        for num, media in enumerate(media_array):
+            for stream in media.get('_mediaStreamArray', []):
+                stream_urls = stream.get('_stream')
+                if not stream_urls:
+                    continue
+                if not isinstance(stream_urls, list):
+                    stream_urls = [stream_urls]
+                quality = stream.get('_quality')
+                server = stream.get('_server')
+                for stream_url in stream_urls:
+                    if not url_or_none(stream_url):
+                        continue
+                    ext = determine_ext(stream_url)
+                    if quality != 'auto' and ext in ('f4m', 'm3u8'):
+                        continue
+                    if ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            update_url_query(stream_url, {
+                                'hdcore': '3.1.1',
+                                'plugin': 'aasp-3.1.1.69.124'
+                            }), video_id, f4m_id='hds', fatal=False))
+                    elif ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    else:
+                        if server and server.startswith('rtmp'):
+                            f = {
+                                'url': server,
+                                'play_path': stream_url,
+                                'format_id': 'a%s-rtmp-%s' % (num, quality),
+                            }
+                        else:
+                            f = {
+                                'url': stream_url,
+                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
+                            }
+                        m = re.search(
+                            r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$',
+                            stream_url)
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        if type_ == 'audio':
+                            f['vcodec'] = 'none'
+                        formats.append(f)
+        return formats
+
+
+class ARDMediathekIE(ARDMediathekBaseIE):
+    IE_NAME = 'ARD:mediathek'
+    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
+
+    _TESTS = [{
+        # available till 26.07.2022
+        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
+        'info_dict': {
+            'id': '44726822',
+            'ext': 'mp4',
+            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
+            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
+            'duration': 1740,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
+        'only_matching': True,
+    }, {
+        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
+        'only_matching': True,
+    }, {
+        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        # determine video id from url
+        m = re.match(self._VALID_URL, url)
+
+        document_id = None
+
+        numid = re.search(r'documentId=([0-9]+)', url)
+        if numid:
+            document_id = video_id = numid.group(1)
+        else:
+            video_id = m.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        ERRORS = (
+            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
+            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
+             'Video %s is no longer available'),
+        )
+
+        for pattern, message in ERRORS:
+            if pattern in webpage:
+                raise ExtractorError(message % video_id, expected=True)
+
+        if re.search(r'[\?&]rss($|[=&])', url):
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
+            if doc.tag == 'rss':
+                return GenericIE()._extract_rss(url, video_id, doc)
+
+        title = self._html_search_regex(
+            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
+             r'<meta name="dcterms\.title" content="(.*?)"/>',
+             r'<h4 class="headline">(.*?)</h4>',
+             r'<title[^>]*>(.*?)</title>'],
+            webpage, 'title')
+        description = self._html_search_meta(
+            'dcterms.abstract', webpage, 'description', default=None)
+        if description is None:
+            description = self._html_search_meta(
+                'description', webpage, 'meta description', default=None)
+        if description is None:
+            description = self._html_search_regex(
+                r'<p\s+class="teasertext">(.+?)</p>',
+                webpage, 'teaser text', default=None)
+
+        # Thumbnail is sometimes not present.
+        # It is in the mobile version, but that seems to use a different URL
+        # structure altogether.
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+
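+        # Old-style pages inline mediaCollection.addMediaStream(...) calls; scrape
+        # the stream URLs directly and fall back to the play/media JSON otherwise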
+        media_streams = re.findall(r'''(?x)
+            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
+            "([^"]+)"''', webpage)
+
+        if media_streams:
+            QUALITIES = qualities(['lo', 'hi', 'hq'])
+            formats = []
+            for furl in set(media_streams):
+                if furl.endswith('.f4m'):
+                    fid = 'f4m'
+                else:
+                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
+                    fid = fid_m.group(1) if fid_m else None
+                formats.append({
+                    'quality': QUALITIES(fid),
+                    'format_id': fid,
+                    'url': furl,
+                })
+            self._sort_formats(formats)
+            info = {
+                'formats': formats,
+            }
+        else:  # request JSON file
+            if not document_id:
+                video_id = self._search_regex(
+                    r'/play/(?:config|media)/(\d+)', webpage, 'media id')
+            info = self._extract_media_info(
+                'http://www.ardmediathek.de/play/media/%s' % video_id,
+                webpage, video_id)
+
+        info.update({
+            'id': video_id,
+            'title': self._live_title(title) if info.get('is_live') else title,
+            'description': description,
+            'thumbnail': thumbnail,
+        })
+
+        return info
+
+
+class ARDIE(InfoExtractor):
+    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+    _TESTS = [{
+        # available till 14.02.2019
+        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
+        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
+        'info_dict': {
+            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
+            'id': '102',
+            'ext': 'mp4',
+            'duration': 4435.0,
+            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
+            'upload_date': '20180214',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+
+        player_url = mobj.group('mainurl') + '~playerXml.xml'
+        doc = self._download_xml(player_url, display_id)
+        video_node = doc.find('./video')
+        upload_date = unified_strdate(xpath_text(
+            video_node, './broadcastDate'))
+        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
+
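+        # The player XML lists one <asset> per quality; RTMP assets carry a
+        # serverPrefix plus a playpath, plain HTTP assets just a fileName URL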
+        formats = []
+        for a in video_node.findall('.//asset'):
+            f = {
+                'format_id': a.attrib['type'],
+                'width': int_or_none(a.find('./frameWidth').text),
+                'height': int_or_none(a.find('./frameHeight').text),
+                'vbr': int_or_none(a.find('./bitrateVideo').text),
+                'abr': int_or_none(a.find('./bitrateAudio').text),
+                'vcodec': a.find('./codecVideo').text,
+                'tbr': int_or_none(a.find('./totalBitrate').text),
+            }
+            if a.find('./serverPrefix').text:
+                f['url'] = a.find('./serverPrefix').text
+                f['playpath'] = a.find('./fileName').text
+            else:
+                f['url'] = a.find('./fileName').text
+            formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': mobj.group('id'),
+            'formats': formats,
+            'display_id': display_id,
+            'title': video_node.find('./title').text,
+            'duration': parse_duration(video_node.find('./duration').text),
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+        }
+
+
+class ARDBetaMediathekIE(ARDMediathekBaseIE):
+    _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
+        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+        'info_dict': {
+            'display_id': 'die-robuste-roswita',
+            'id': '70153354',
+            'title': 'Die robuste Roswita',
+            'description': r're:^Der Mord.*trüber ist als die Ilm.',
+            'duration': 5316,
+            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
+            'timestamp': 1577047500,
+            'upload_date': '20191222',
+            'ext': 'mp4',
+        },
+    }, {
+        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
+        'only_matching': True,
+    }, {
+        'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+        display_id = mobj.group('display_id')
+        if display_id:
+            display_id = display_id.rstrip('/')
+        if not display_id:
+            display_id = video_id
+
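+        # The mediathek exposes a GraphQL gateway; POST a playerPage query to get
+        # the media collection along with title, synopsis and tracking data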
+        player_page = self._download_json(
+            'https://api.ardmediathek.de/public-gateway',
+            display_id, data=json.dumps({
+                'query': '''{
+  playerPage(client:"%s", clipId: "%s") {
+    blockedByFsk
+    broadcastedOn
+    maturityContentRating
+    mediaCollection {
+      _duration
+      _geoblocked
+      _isLive
+      _mediaArray {
+        _mediaStreamArray {
+          _quality
+          _server
+          _stream
+        }
+      }
+      _previewImage
+      _subtitleUrl
+      _type
+    }
+    show {
+      title
+    }
+    synopsis
+    title
+    tracking {
+      atiCustomVars {
+        contentId
+      }
+    }
+  }
+}''' % (mobj.group('client'), video_id),
+            }).encode(), headers={
+                'Content-Type': 'application/json'
+            })['data']['playerPage']
+        title = player_page['title']
+        content_id = str_or_none(try_get(
+            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
+        media_collection = player_page.get('mediaCollection') or {}
+        if not media_collection and content_id:
+            media_collection = self._download_json(
+                'https://www.ardmediathek.de/play/media/' + content_id,
+                content_id, fatal=False) or {}
+        info = self._parse_media_info(
+            media_collection, content_id or video_id,
+            player_page.get('blockedByFsk'))
+        age_limit = None
+        description = player_page.get('synopsis')
+        maturity_content_rating = player_page.get('maturityContentRating')
+        if maturity_content_rating:
+            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
+        if not age_limit and description:
+            age_limit = int_or_none(self._search_regex(
+                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
+        info.update({
+            'age_limit': age_limit,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
+            'series': try_get(player_page, lambda x: x['show']['title']),
+        })
+        return info
diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py
new file mode 100644 (file)
index 0000000..854f587
--- /dev/null
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    strip_jsonp,
+)
+
+
+class ArkenaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                video\.arkena\.com/play2/embed/player\?|
+                                play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)
+                            )
+                        '''
+    _TESTS = [{
+        'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411',
+        'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
+        'info_dict': {
+            'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'description': 'Royalty free test video',
+            'timestamp': 1432816365,
+            'upload_date': '20150528',
+            'is_live': False,
+        },
+    }, {
+        'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893',
+        'only_matching': True,
+    }, {
+        'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972',
+        'only_matching': True,
+    }, {
+        'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/',
+        'only_matching': True,
+    }, {
+        'url': 'http://video.arkena.com/play2/embed/player?accountId=472718&mediaId=35763b3b-00090078-bf604299&pageStyling=styled',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        account_id = mobj.group('account_id')
+
+        # Handle http://video.arkena.com/play2/embed/player URL
+        if not video_id:
+            qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+            video_id = qs.get('mediaId', [None])[0]
+            account_id = qs.get('accountId', [None])[0]
+            if not video_id or not account_id:
+                raise ExtractorError('Invalid URL', expected=True)
+
+        playlist = self._download_json(
+            'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_'
+            % (video_id, account_id),
+            video_id, transform_source=strip_jsonp)['Playlist'][0]
+
+        media_info = playlist['MediaInfo']
+        title = media_info['Title']
+        media_files = playlist['MediaFiles']
+
+        is_live = False
+        formats = []
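+        # MediaFiles is keyed by delivery kind (m3u8, flash, dash, silverlight, ...);
+        # dispatch each kind to the matching format extractor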
+        for kind_case, kind_formats in media_files.items():
+            kind = kind_case.lower()
+            for f in kind_formats:
+                f_url = f.get('Url')
+                if not f_url:
+                    continue
+                is_live = f.get('Live') == 'true'
+                exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))
+                if kind == 'm3u8' or 'm3u8' in exts:
+                    formats.extend(self._extract_m3u8_formats(
+                        f_url, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id=kind, fatal=False, live=is_live))
+                elif kind == 'flash' or 'f4m' in exts:
+                    formats.extend(self._extract_f4m_formats(
+                        f_url, video_id, f4m_id=kind, fatal=False))
+                elif kind == 'dash' or 'mpd' in exts:
+                    formats.extend(self._extract_mpd_formats(
+                        f_url, video_id, mpd_id=kind, fatal=False))
+                elif kind == 'silverlight':
+                    # TODO: process when ism is supported (see
+                    # https://github.com/ytdl-org/youtube-dl/issues/8118)
+                    continue
+                else:
+                    tbr = float_or_none(f.get('Bitrate'), 1000)
+                    formats.append({
+                        'url': f_url,
+                        'format_id': '%s-%d' % (kind, tbr) if tbr else kind,
+                        'tbr': tbr,
+                    })
+        self._sort_formats(formats)
+
+        description = media_info.get('Description')
+        video_id = media_info.get('VideoId') or video_id
+        timestamp = parse_iso8601(media_info.get('PublishDate'))
+        thumbnails = [{
+            'url': thumbnail['Url'],
+            'width': int_or_none(thumbnail.get('Size')),
+        } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'is_live': is_live,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
new file mode 100644 (file)
index 0000000..2bd3bfe
--- /dev/null
@@ -0,0 +1,201 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+    try_get,
+    unified_strdate,
+)
+
+# There are different sources of video on arte.tv, and the extraction process
+# differs for each one. The videos usually expire in 7 days, so we can't
+# add tests.
+
+
+class ArteTVBaseIE(InfoExtractor):
+    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
+        info = self._download_json(json_url, video_id)
+        player_info = info['videoJsonPlayer']
+
+        vsr = try_get(player_info, lambda x: x['VSR'], dict)
+        if not vsr:
+            error = None
+            if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
+                error = try_get(
+                    player_info, lambda x: x['custom_msg']['msg'], compat_str)
+            if not error:
+                error = 'Video %s is not available' % (
+                    player_info.get('VID') or video_id)
+            raise ExtractorError(error, expected=True)
+
+        upload_date_str = player_info.get('shootingDate')
+        if not upload_date_str:
+            upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
+
+        title = (player_info.get('VTI') or title or player_info['VID']).strip()
+        subtitle = player_info.get('VSU', '').strip()
+        if subtitle:
+            title += ' - %s' % subtitle
+
+        info_dict = {
+            'id': player_info['VID'],
+            'title': title,
+            'description': player_info.get('VDE'),
+            'upload_date': unified_strdate(upload_date_str),
+            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+        }
+        qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
+
+        LANGS = {
+            'fr': 'F',
+            'de': 'A',
+            'en': 'E[ANG]',
+            'es': 'E[ESP]',
+            'it': 'E[ITA]',
+            'pl': 'E[POL]',
+        }
+
+        langcode = LANGS.get(lang, lang)
+
+        formats = []
+        for format_id, format_dict in vsr.items():
+            f = dict(format_dict)
+            versionCode = f.get('versionCode')
+            l = re.escape(langcode)
+
+            # Language preference from most to least priority
+            # Reference: section 6.8 of
+            # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
+            PREFERENCES = (
+                # original version in requested language, without subtitles
+                r'VO{0}$'.format(l),
+                # original version in requested language, with partial subtitles in requested language
+                r'VO{0}-ST{0}$'.format(l),
+                # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'VO{0}-STM{0}$'.format(l),
+                # non-original (dubbed) version in requested language, without subtitles
+                r'V{0}$'.format(l),
+                # non-original (dubbed) version in requested language, with partial subtitles in requested language
+                r'V{0}-ST{0}$'.format(l),
+                # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'V{0}-STM{0}$'.format(l),
+                # original version in requested language, with partial subtitles in different language
+                r'VO{0}-ST(?!{0}).+?$'.format(l),
+                # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
+                r'VO{0}-STM(?!{0}).+?$'.format(l),
+                # original version in different language, with partial subtitles in requested language
+                r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
+                # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
+                r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
+                # original version in different language, without subtitles
+                r'VO(?:(?!{0}))?$'.format(l),
+                # original version in different language, with partial subtitles in different language
+                r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
+                # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
+                r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
+            )
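+            # e.g. for lang='fr' (langcode 'F'), 'VOF' matches the first pattern
+            # and gets the highest preference, 'VF' the fourth; a version code
+            # matching no pattern falls through to lang_pref = -1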
+
+            for pref, p in enumerate(PREFERENCES):
+                if re.match(p, versionCode or ''):
+                    lang_pref = len(PREFERENCES) - pref
+                    break
+            else:
+                lang_pref = -1
+
+            fmt = {
+                'format_id': format_id,
+                'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
+                'language_preference': lang_pref,
+                'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
+                'width': int_or_none(f.get('width')),
+                'height': int_or_none(f.get('height')),
+                'tbr': int_or_none(f.get('bitrate')),
+                'quality': qfunc(f.get('quality')),
+            }
+
+            if f.get('mediaType') == 'rtmp':
+                fmt['url'] = f['streamer']
+                fmt['play_path'] = 'mp4:' + f['url']
+                fmt['ext'] = 'flv'
+            else:
+                fmt['url'] = f['url']
+
+            formats.append(fmt)
+
+        self._check_formats(formats, video_id)
+        self._sort_formats(formats)
+
+        info_dict['formats'] = formats
+        return info_dict
+
+
+class ArteTVPlus7IE(ArteTVBaseIE):
+    IE_NAME = 'arte.tv:+7'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
+
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'info_dict': {
+            'id': '088501-000-A',
+            'ext': 'mp4',
+            'title': 'Mexico: Stealing Petrol to Survive',
+            'upload_date': '20190628',
+        },
+    }]
+
+    def _real_extract(self, url):
+        lang, video_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_from_json_url(
+            'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
+            video_id, lang)
+
+
+class ArteTVEmbedIE(ArteTVPlus7IE):
+    IE_NAME = 'arte.tv:embed'
+    _VALID_URL = r'''(?x)
+        https://www\.arte\.tv
+        /player/v3/index\.php\?json_url=
+        (?P<json_url>
+            https?://api\.arte\.tv/api/player/v1/config/
+            (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
+        )
+    '''
+
+    _TESTS = []
+
+    def _real_extract(self, url):
+        json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_from_json_url(json_url, video_id, lang)
+
+
+class ArteTVPlaylistIE(ArteTVBaseIE):
+    IE_NAME = 'arte.tv:playlist'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
+
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
+        'info_dict': {
+            'id': 'RC-016954',
+            'title': 'Earn a Living',
+            'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
+        },
+        'playlist_mincount': 6,
+    }]
+
+    def _real_extract(self, url):
+        lang, playlist_id = re.match(self._VALID_URL, url).groups()
+        collection = self._download_json(
+            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
+            % (lang, playlist_id), playlist_id)
+        title = collection.get('title')
+        description = collection.get('shortDescription') or collection.get('teaserText')
+        entries = [
+            self._extract_from_json_url(
+                video['jsonUrl'], video.get('programId') or playlist_id, lang)
+            for video in collection['videos'] if video.get('jsonUrl')]
+        return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py
new file mode 100644 (file)
index 0000000..0348e68
--- /dev/null
@@ -0,0 +1,145 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from ..utils import extract_attributes
+
+
+class AsianCrushIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))'
+    _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % _VALID_URL_BASE
+    _TESTS = [{
+        'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/',
+        'md5': 'c3b740e48d0ba002a42c0b72857beae6',
+        'info_dict': {
+            'id': '1_y4tmjm5r',
+            'ext': 'mp4',
+            'title': 'Women Who Flirt',
+            'description': 'md5:7e986615808bcfb11756eb503a751487',
+            'timestamp': 1496936429,
+            'upload_date': '20170608',
+            'uploader_id': 'craig@crifkin.com',
+        },
+    }, {
+        'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.midnightpulp.com/video/010400v/drifters/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
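+        # Videos are hosted on Kaltura: read entry/partner ids from the embedded
+        # iEmbedVars, then resolve the Kaltura id via the embeddedVideoPlayer API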
+        entry_id, partner_id, title = [None] * 3
+
+        embed_vars = self._parse_json(
+            self._search_regex(
+                r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars',
+                default='{}'), video_id, fatal=False)
+        if embed_vars:
+            entry_id = embed_vars.get('entry_id')
+            partner_id = embed_vars.get('partner_id')
+            title = embed_vars.get('vid_label')
+
+        if not entry_id:
+            entry_id = self._search_regex(
+                r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id')
+
+        player = self._download_webpage(
+            'https://api.%s/embeddedVideoPlayer' % host, video_id,
+            query={'id': entry_id})
+
+        kaltura_id = self._search_regex(
+            r'entry_id["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', player,
+            'kaltura id', group='id')
+
+        if not partner_id:
+            partner_id = self._search_regex(
+                r'/p(?:artner_id)?/(\d+)', player, 'partner id',
+                default='513551')
+
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+\bclass=["\']description["\'][^>]*>(.+?)</div>',
+            webpage, 'description', fatal=False)
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
+            'ie_key': KalturaIE.ie_key(),
+            'id': video_id,
+            'title': title,
+            'description': description,
+        }
+
+
+class AsianCrushPlaylistIE(InfoExtractor):
+    _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/',
+        'info_dict': {
+            'id': '12481',
+            'title': 'Scholar Who Walks the Night',
+            'description': 'md5:7addd7c5132a09fd4741152d96cce886',
+        },
+        'playlist_count': 20,
+    }, {
+        'url': 'https://www.yuyutv.com/series/013920s/peep-show/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.midnightpulp.com/series/016375s/mononoke/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = []
+
+        for mobj in re.finditer(
+                r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL,
+                webpage):
+            attrs = extract_attributes(mobj.group(0))
+            if attrs.get('class') == 'clearfix':
+                entries.append(self.url_result(
+                    mobj.group('url'), ie=AsianCrushIE.ie_key()))
+
+        title = self._html_search_regex(
+            r'(?s)<h1\b[^>]*\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage,
+            'title', default=None) or self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title',
+            default=None) or self._search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+        if title:
+            title = re.sub(r'\s*\|\s*.+?$', '', title)
+
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:description', webpage, 'description', fatal=False)
+
+        return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
new file mode 100644 (file)
index 0000000..c2cec98
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    urlencode_postdata,
+)
+
+
+class AtresPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/[^/]+/[^/]+/[^/]+/[^/]+/(?P<display_id>.+?)_(?P<id>[0-9a-f]{24})'
+    _NETRC_MACHINE = 'atresplayer'
+    _TESTS = [
+        {
+            'url': 'https://www.atresplayer.com/antena3/series/pequenas-coincidencias/temporada-1/capitulo-7-asuntos-pendientes_5d4aa2c57ed1a88fc715a615/',
+            'info_dict': {
+                'id': '5d4aa2c57ed1a88fc715a615',
+                'ext': 'mp4',
+                'title': 'Capítulo 7: Asuntos pendientes',
+                'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc',
+                'duration': 3413,
+            },
+            'params': {
+                'format': 'bestvideo',
+            },
+            'skip': 'This video is only available for registered users'
+        },
+        {
+            'url': 'https://www.atresplayer.com/lasexta/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_5ad08edf986b2855ed47adc4/',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.atresplayer.com/antena3/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_5ad51046986b2886722ccdea/',
+            'only_matching': True,
+        },
+    ]
+    _API_BASE = 'https://api.atresplayer.com/'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _handle_error(self, e, code):
+        if isinstance(e.cause, compat_HTTPError) and e.cause.code == code:
+            error = self._parse_json(e.cause.read(), None)
+            if error.get('error') == 'required_registered':
+                self.raise_login_required()
+            raise ExtractorError(error['error_description'], expected=True)
+        raise
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
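+        # Prime the session by loading the login page, post the credentials to the
+        # account API and follow the returned targetUrl to complete the login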
+        self._request_webpage(
+            self._API_BASE + 'login', None, 'Downloading login page')
+
+        try:
+            target_url = self._download_json(
+                'https://account.atresmedia.com/api/login', None,
+                'Logging in', headers={
+                    'Content-Type': 'application/x-www-form-urlencoded'
+                }, data=urlencode_postdata({
+                    'username': username,
+                    'password': password,
+                }))['targetUrl']
+        except ExtractorError as e:
+            self._handle_error(e, 400)
+
+        self._request_webpage(target_url, None, 'Following Target URL')
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        try:
+            episode = self._download_json(
+                self._API_BASE + 'client/v1/player/episode/' + video_id, video_id)
+        except ExtractorError as e:
+            self._handle_error(e, 403)
+
+        title = episode['titulo']
+
+        formats = []
+        for source in episode.get('sources', []):
+            src = source.get('src')
+            if not src:
+                continue
+            src_type = source.get('type')
+            if src_type == 'application/vnd.apple.mpegurl':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif src_type == 'application/dash+xml':
+                formats.extend(self._extract_mpd_formats(
+                    src, video_id, mpd_id='dash', fatal=False))
+        self._sort_formats(formats)
+
+        heartbeat = episode.get('heartbeat') or {}
+        omniture = episode.get('omniture') or {}
+        def get_meta(x):
+            return heartbeat.get(x) or omniture.get(x)
+
+        return {
+            'display_id': display_id,
+            'id': video_id,
+            'title': title,
+            'description': episode.get('descripcion'),
+            'thumbnail': episode.get('imgPoster'),
+            'duration': int_or_none(episode.get('duration')),
+            'formats': formats,
+            'channel': get_meta('channel'),
+            'season': get_meta('season'),
+            'episode_number': int_or_none(get_meta('episodeNumber')),
+        }
diff --git a/youtube_dl/extractor/atttechchannel.py b/youtube_dl/extractor/atttechchannel.py
new file mode 100644 (file)
index 0000000..8f93fb3
--- /dev/null
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class ATTTechChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://techchannel\.att\.com/play-video\.cfm/([^/]+/)*(?P<id>.+)'
+    _TEST = {
+        'url': 'http://techchannel.att.com/play-video.cfm/2014/1/27/ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
+        'info_dict': {
+            'id': '11316',
+            'display_id': 'ATT-Archives-The-UNIX-System-Making-Computers-Easier-to-Use',
+            'ext': 'flv',
+            'title': 'AT&T Archives : The UNIX System: Making Computers Easier to Use',
+            'description': 'A 1982 film about UNIX is the foundation for software in use around Bell Labs and AT&T.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20140127',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._search_regex(
+            r"url\s*:\s*'(rtmp://[^']+)'",
+            webpage, 'video URL')
+
+        video_id = self._search_regex(
+            r'mediaid\s*=\s*(\d+)',
+            webpage, 'video id', fatal=False)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'[Rr]elease\s+date:\s*(\d{1,2}/\d{1,2}/\d{4})',
+            webpage, 'upload date', fatal=False), False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py
new file mode 100644 (file)
index 0000000..95e572d
--- /dev/null
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class ATVAtIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)'
+    _TESTS = [{
+        'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/',
+        'md5': 'c3b6b975fb3150fc628572939df205f2',
+        'info_dict': {
+            'id': '1698447',
+            'ext': 'mp4',
+            'title': 'DI, 21.03.17 | 20:05 Uhr 1/1',
+        }
+    }, {
+        'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_data = self._parse_json(unescapeHTML(self._search_regex(
+            [r'flashPlayerOptions\s*=\s*(["\'])(?P<json>(?:(?!\1).)+)\1',
+             r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P<json>[^"]+)"'],
+            webpage, 'player data', group='json')),
+            display_id)['config']['initial_video']
+
+        video_id = video_data['id']
+        video_title = video_data['title']
+
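+        # A video is split into one or more parts, each with its own formats;
+        # expose them as a multi_video entry list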
+        parts = []
+        for part in video_data.get('parts', []):
+            part_id = part['id']
+            part_title = part['title']
+
+            formats = []
+            for source in part.get('sources', []):
+                source_url = source.get('src')
+                if not source_url:
+                    continue
+                ext = determine_ext(source_url)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        source_url, part_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                else:
+                    formats.append({
+                        'format_id': source.get('delivery'),
+                        'url': source_url,
+                    })
+            self._sort_formats(formats)
+
+            parts.append({
+                'id': part_id,
+                'title': part_title,
+                'thumbnail': part.get('preview_image_url'),
+                'duration': int_or_none(part.get('duration')),
+                'is_live': part.get('is_livestream'),
+                'formats': formats,
+            })
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'title': video_title,
+            'entries': parts,
+        }
diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py
new file mode 100644 (file)
index 0000000..6bd48ef
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class AudiMediaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?:video/)?(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
+        'md5': '79a8b71c46d49042609795ab59779b66',
+        'info_dict': {
+            'id': '1565',
+            'ext': 'mp4',
+            'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test',
+            'description': 'md5:60e5d30a78ced725f7b8d34370762941',
+            'upload_date': '20151124',
+            'timestamp': 1448354940,
+            'duration': 74022,
+            'view_count': int,
+        }
+    }, {
+        'url': 'https://www.audi-mediacenter.com/en/audimediatv/video/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-2991',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        raw_payload = self._search_regex([
+            r'class="amtv-embed"[^>]+id="([0-9a-z-]+)"',
+            r'id="([0-9a-z-]+)"[^>]+class="amtv-embed"',
+            r'class=\\"amtv-embed\\"[^>]+id=\\"([0-9a-z-]+)\\"',
+            r'id=\\"([0-9a-z-]+)\\"[^>]+class=\\"amtv-embed\\"',
+            r'id=(?:\\)?"(amtve-[a-z]-\d+-[a-z]{2})',
+        ], webpage, 'raw payload')
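+        # Per the id patterns above, the payload has the form
+        # '<prefix>-<stage_mode>-<video_id>-<lang>'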
+        _, stage_mode, video_id, _ = raw_payload.split('-')
+
+        # TODO: handle s and e stage_mode (live streams and ended live streams)
+        if stage_mode not in ('s', 'e'):
+            video_data = self._download_json(
+                'https://www.audimedia.tv/api/video/v1/videos/' + video_id,
+                video_id, query={
+                    'embed[]': ['video_versions', 'thumbnail_image'],
+                })['results']
+            formats = []
+
+            stream_url_hls = video_data.get('stream_url_hls')
+            if stream_url_hls:
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url_hls, video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+            stream_url_hds = video_data.get('stream_url_hds')
+            if stream_url_hds:
+                formats.extend(self._extract_f4m_formats(
+                    stream_url_hds + '?hdcore=3.4.0',
+                    video_id, f4m_id='hds', fatal=False))
+
+            for video_version in video_data.get('video_versions', []):
+                video_version_url = video_version.get('download_url') or video_version.get('stream_url')
+                if not video_version_url:
+                    continue
+                f = {
+                    'url': video_version_url,
+                    'width': int_or_none(video_version.get('width')),
+                    'height': int_or_none(video_version.get('height')),
+                    'abr': int_or_none(video_version.get('audio_bitrate')),
+                    'vbr': int_or_none(video_version.get('video_bitrate')),
+                }
+                bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None)
+                if bitrate:
+                    f.update({
+                        'format_id': 'http-%s' % bitrate,
+                    })
+                formats.append(f)
+            self._sort_formats(formats)
+
+            return {
+                'id': video_id,
+                'title': video_data['title'],
+                'description': video_data.get('subtitle'),
+                'thumbnail': video_data.get('thumbnail_image', {}).get('file'),
+                'timestamp': parse_iso8601(video_data.get('publication_date')),
+                'duration': int_or_none(video_data.get('duration')),
+                'view_count': int_or_none(video_data.get('view_count')),
+                'formats': formats,
+            }
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
new file mode 100644 (file)
index 0000000..c51837b
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    float_or_none,
+)
+
+
+class AudioBoomIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://audioboom.com/posts/7398103-asim-chaudhry',
+        'md5': '7b00192e593ff227e6a315486979a42d',
+        'info_dict': {
+            'id': '7398103',
+            'ext': 'mp3',
+            'title': 'Asim Chaudhry',
+            'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc',
+            'duration': 4000.99,
+            'uploader': 'Sue Perkins: An hour or so with...',
+            'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins',
+        }
+    }, {
+        'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        clip = None
+
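+        # Newer pages embed the clip metadata as JSON in a data-new-clip-store
+        # attribute; fall back to Open Graph / meta tags when it is missing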
+        clip_store = self._parse_json(
+            self._html_search_regex(
+                r'data-new-clip-store=(["\'])(?P<json>{.+?})\1',
+                webpage, 'clip store', default='{}', group='json'),
+            video_id, fatal=False)
+        if clip_store:
+            clips = clip_store.get('clips')
+            if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+                clip = clips[0]
+
+        def from_clip(field):
+            if clip:
+                return clip.get(field)
+
+        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+            'audio', webpage, 'audio url')
+        title = from_clip('title') or self._html_search_meta(
+            ['og:title', 'og:audio:title', 'audio_title'], webpage)
+        description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage)
+
+        duration = float_or_none(from_clip('duration') or self._html_search_meta(
+            'weibo:audio:duration', webpage))
+
+        uploader = from_clip('author') or self._html_search_meta(
+            ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader')
+        uploader_url = from_clip('author_url') or self._html_search_meta(
+            'audioboo:channel', webpage, 'uploader url')
+
+        return {
+            'id': video_id,
+            'url': audio_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+        }
diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py
new file mode 100644 (file)
index 0000000..cc77713
--- /dev/null
@@ -0,0 +1,145 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import time
+
+from .common import InfoExtractor
+from .soundcloud import SoundcloudIE
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    url_basename,
+)
+
+
+class AudiomackIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P<id>[\w/-]+)'
+    IE_NAME = 'audiomack'
+    _TESTS = [
+        # hosted on audiomack
+        {
+            'url': 'http://www.audiomack.com/song/roosh-williams/extraordinary',
+            'info_dict':
+            {
+                'id': '310086',
+                'ext': 'mp3',
+                'uploader': 'Roosh Williams',
+                'title': 'Extraordinary'
+            }
+        },
+        # audiomack wrapper around soundcloud song
+        {
+            'add_ie': ['Soundcloud'],
+            'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',
+            'info_dict': {
+                'id': '258901379',
+                'ext': 'mp3',
+                'description': 'mamba day freestyle for the legend Kobe Bryant ',
+                'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]',
+                'uploader': 'ILOVEMAKONNEN',
+                'upload_date': '20160414',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        # URLs end with [uploader name]/[uploader title];
+        # the title is whatever the user typed in and is rarely
+        # the proper song title. Real metadata is in the API response.
+        album_url_tag = self._match_id(url)
+
+        # Request the extended version of the API for extra fields like artist and title
+        api_response = self._download_json(
+            'http://www.audiomack.com/api/music/url/song/%s?extended=1&_=%d' % (
+                album_url_tag, time.time()),
+            album_url_tag)
+
+        # The API reports errors inconsistently
+        if 'url' not in api_response or not api_response['url'] or 'error' in api_response:
+            raise ExtractorError('Invalid url %s' % url)
+
+        # Audiomack wraps a lot of SoundCloud tracks in its branded wrapper;
+        # if so, pass the work off to the SoundCloud extractor
+        if SoundcloudIE.suitable(api_response['url']):
+            return self.url_result(api_response['url'], SoundcloudIE.ie_key())
+
+        return {
+            'id': compat_str(api_response.get('id', album_url_tag)),
+            'uploader': api_response.get('artist'),
+            'title': api_response.get('title'),
+            'url': api_response['url'],
+        }
+
+
+class AudiomackAlbumIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P<id>[\w/-]+)'
+    IE_NAME = 'audiomack:album'
+    _TESTS = [
+        # Standard album playlist
+        {
+            'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape',
+            'playlist_count': 15,
+            'info_dict':
+            {
+                'id': '812251',
+                'title': 'Tha Tour: Part 2 (Official Mixtape)'
+            }
+        },
+        # Album playlist ripped from fakeshoredrive with no metadata
+        {
+            'url': 'http://www.audiomack.com/album/fakeshoredrive/ppp-pistol-p-project',
+            'info_dict': {
+                'title': 'PPP (Pistol P Project)',
+                'id': '837572',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)',
+                    'id': '837577',
+                    'ext': 'mp3',
+                    'uploader': 'Lil Herb a.k.a. G Herbo',
+                }
+            }],
+            'params': {
+                'playliststart': 9,
+                'playlistend': 9,
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        # URLs end with [uploader name]/[uploader title];
+        # the title is whatever the user typed in and is rarely
+        # the proper song title. Real metadata is in the API response.
+        album_url_tag = self._match_id(url)
+        result = {'_type': 'playlist', 'entries': []}
+        # There is no single endpoint for album metadata; it is included/repeated in each song's metadata.
+        # Therefore we don't know how many songs the album has and must request tracks until the API fails.
+        for track_no in itertools.count():
+            # Get song's metadata
+            api_response = self._download_json(
+                'http://www.audiomack.com/api/music/url/album/%s/%d?extended=1&_=%d'
+                % (album_url_tag, track_no, time.time()), album_url_tag,
+                note='Querying song information (%d)' % (track_no + 1))
+
+            # Total failure, only occurs when url is totally wrong
+            # Won't happen in middle of valid playlist (next case)
+            if 'url' not in api_response or 'error' in api_response:
+                raise ExtractorError('Invalid url for track %d of album url %s' % (track_no, url))
+            # URL is good but song id doesn't exist - usually means end of playlist
+            elif not api_response['url']:
+                break
+            else:
+                # Pull out the album metadata and add to result (if it exists)
+                for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]:
+                    if apikey in api_response and resultkey not in result:
+                        result[resultkey] = api_response[apikey]
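+                # Illustrative (hypothetical URL): url_basename('http://host/some-song.mp3')
+                # is 'some-song.mp3', so the fallback song_id becomes 'some-song'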
+                song_id = url_basename(api_response['url']).rpartition('.')[0]
+                result['entries'].append({
+                    'id': compat_str(api_response.get('id', song_id)),
+                    'uploader': api_response.get('artist'),
+                    'title': api_response.get('title', song_id),
+                    'url': api_response['url'],
+                })
+        return result
diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py
new file mode 100644 (file)
index 0000000..a2603bb
--- /dev/null
@@ -0,0 +1,185 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlencode,
+    compat_str,
+)
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    smuggle_url,
+    unsmuggle_url,
+    urlencode_postdata,
+)
+
+
+class AWAANIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
+
+    def _real_extract(self, url):
+        show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
+        if video_id and int(video_id) > 0:
+            return self.url_result(
+                'http://awaan.ae/media/%s' % video_id, 'AWAANVideo')
+        elif season_id and int(season_id) > 0:
+            return self.url_result(smuggle_url(
+                'http://awaan.ae/program/season/%s' % season_id,
+                {'show_id': show_id}), 'AWAANSeason')
+        else:
+            return self.url_result(
+                'http://awaan.ae/program/%s' % show_id, 'AWAANSeason')
+
+
+class AWAANBaseIE(InfoExtractor):
+    def _parse_video_data(self, video_data, video_id, is_live):
+        title = video_data.get('title_en') or video_data['title_ar']
+        img = video_data.get('img')
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': video_data.get('description_en') or video_data.get('description_ar'),
+            'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None,
+            'duration': int_or_none(video_data.get('duration')),
+            'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
+            'is_live': is_live,
+        }
+
+
+class AWAANVideoIE(AWAANBaseIE):
+    IE_NAME = 'awaan:video'
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
+        'md5': '5f61c33bfc7794315c671a62d43116aa',
+        'info_dict':
+        {
+            'id': '17375',
+            'ext': 'mp4',
+            'title': 'رحلة العمر : الحلقة 1',
+            'description': 'md5:0156e935d870acb8ef0a66d24070c6d6',
+            'duration': 2041,
+            'timestamp': 1227504126,
+            'upload_date': '20081124',
+            'uploader_id': '71',
+        },
+    }, {
+        'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_data = self._download_json(
+            'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id,
+            video_id, headers={'Origin': 'http://awaan.ae'})
+        info = self._parse_video_data(video_data, video_id, False)
+
+        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse_urlencode({
+            'id': video_data['id'],
+            'user_id': video_data['user_id'],
+            'signature': video_data['signature'],
+            'countries': 'Q0M=',
+            'filter': 'DENY',
+        })
+        info.update({
+            '_type': 'url_transparent',
+            'url': embed_url,
+            'ie_key': 'MangomoloVideo',
+        })
+        return info
+
+
+class AWAANLiveIE(AWAANBaseIE):
+    IE_NAME = 'awaan:live'
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://awaan.ae/live/6/dubai-tv',
+        'info_dict': {
+            'id': '6',
+            'ext': 'mp4',
+            'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'upload_date': '20150107',
+            'timestamp': 1420588800,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel_data = self._download_json(
+            'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id,
+            channel_id, headers={'Origin': 'http://awaan.ae'})
+        info = self._parse_video_data(channel_data, channel_id, True)
+
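+        # The Mangomolo embed expects the numeric ids base64-encoded, e.g. a
+        # hypothetical user_id of '100' would be sent as 'MTAw'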
+        embed_url = 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + compat_urllib_parse_urlencode({
+            'id': base64.b64encode(channel_data['user_id'].encode()).decode(),
+            'channelid': base64.b64encode(channel_data['id'].encode()).decode(),
+            'signature': channel_data['signature'],
+            'countries': 'Q0M=',
+            'filter': 'DENY',
+        })
+        info.update({
+            '_type': 'url_transparent',
+            'url': embed_url,
+            'ie_key': 'MangomoloLive',
+        })
+        return info
+
+
+class AWAANSeasonIE(InfoExtractor):
+    IE_NAME = 'awaan:season'
+    _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
+    _TEST = {
+        'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
+        'info_dict':
+        {
+            'id': '7910',
+            'title': 'محاضرات الشيخ الشعراوي',
+        },
+        'playlist_mincount': 27,
+    }
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        show_id, season_id = re.match(self._VALID_URL, url).groups()
+
+        data = {}
+        if season_id:
+            data['season'] = season_id
+            show_id = smuggled_data.get('show_id')
+            if show_id is None:
+                season = self._download_json(
+                    'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id,
+                    season_id, headers={'Origin': 'http://awaan.ae'})
+                show_id = season['id']
+        data['show_id'] = show_id
+        show = self._download_json(
+            'http://admin.mangomolo.com/analytics/index.php/plus/show',
+            show_id, data=urlencode_postdata(data), headers={
+                'Origin': 'http://awaan.ae',
+                'Content-Type': 'application/x-www-form-urlencoded'
+            })
+        if not season_id:
+            season_id = show['default_season']
+        for season in show['seasons']:
+            if season['id'] == season_id:
+                title = season.get('title_en') or season['title_ar']
+
+                entries = []
+                for video in show['videos']:
+                    video_id = compat_str(video['id'])
+                    entries.append(self.url_result(
+                        'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id))
+
+                return self.playlist_result(entries, season_id, title)
diff --git a/youtube_dl/extractor/aws.py b/youtube_dl/extractor/aws.py
new file mode 100644 (file)
index 0000000..dccfeaf
--- /dev/null
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+import hashlib
+import hmac
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlencode
+
+
+class AWSIE(InfoExtractor):
+    _AWS_ALGORITHM = 'AWS4-HMAC-SHA256'
+    _AWS_REGION = 'us-east-1'
+
+    def _aws_execute_api(self, aws_dict, video_id, query=None):
+        query = query or {}
+        amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
+        date = amz_date[:8]
+        headers = {
+            'Accept': 'application/json',
+            'Host': self._AWS_PROXY_HOST,
+            'X-Amz-Date': amz_date,
+            'X-Api-Key': self._AWS_API_KEY
+        }
+        session_token = aws_dict.get('session_token')
+        if session_token:
+            headers['X-Amz-Security-Token'] = session_token
+
+        def aws_hash(s):
+            return hashlib.sha256(s.encode('utf-8')).hexdigest()
+
+        # Task 1: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-canonical-request.html
+        canonical_querystring = compat_urllib_parse_urlencode(query)
+        canonical_headers = ''
+        for header_name, header_value in sorted(headers.items()):
+            canonical_headers += '%s:%s\n' % (header_name.lower(), header_value)
+        signed_headers = ';'.join([header.lower() for header in sorted(headers.keys())])
+        canonical_request = '\n'.join([
+            'GET',
+            aws_dict['uri'],
+            canonical_querystring,
+            canonical_headers,
+            signed_headers,
+            aws_hash('')
+        ])
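+        # The canonical request is the newline-joined method, URI, query string,
+        # canonical headers, signed header names and hash of the (empty) payload,
+        # per the SigV4 documentation linked above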
+
+        # Task 2: http://docs.aws.amazon.com/general/latest/gr/sigv4-create-string-to-sign.html
+        credential_scope_list = [date, self._AWS_REGION, 'execute-api', 'aws4_request']
+        credential_scope = '/'.join(credential_scope_list)
+        string_to_sign = '\n'.join([self._AWS_ALGORITHM, amz_date, credential_scope, aws_hash(canonical_request)])
+
+        # Task 3: http://docs.aws.amazon.com/general/latest/gr/sigv4-calculate-signature.html
+        def aws_hmac(key, msg):
+            return hmac.new(key, msg.encode('utf-8'), hashlib.sha256)
+
+        def aws_hmac_digest(key, msg):
+            return aws_hmac(key, msg).digest()
+
+        def aws_hmac_hexdigest(key, msg):
+            return aws_hmac(key, msg).hexdigest()
+
+        k_signing = ('AWS4' + aws_dict['secret_key']).encode('utf-8')
+        for value in credential_scope_list:
+            k_signing = aws_hmac_digest(k_signing, value)
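+        # Equivalent to the documented derivation chain:
+        # HMAC(HMAC(HMAC(HMAC(b'AWS4' + secret_key, date), region), 'execute-api'), 'aws4_request')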
+
+        signature = aws_hmac_hexdigest(k_signing, string_to_sign)
+
+        # Task 4: http://docs.aws.amazon.com/general/latest/gr/sigv4-add-signature-to-request.html
+        headers['Authorization'] = ', '.join([
+            '%s Credential=%s/%s' % (self._AWS_ALGORITHM, aws_dict['access_key'], credential_scope),
+            'SignedHeaders=%s' % signed_headers,
+            'Signature=%s' % signature,
+        ])
+
+        return self._download_json(
+            'https://%s%s%s' % (self._AWS_PROXY_HOST, aws_dict['uri'], '?' + canonical_querystring if canonical_querystring else ''),
+            video_id, headers=headers)
diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py
new file mode 100644 (file)
index 0000000..b1e20de
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+
+
+class AZMedienIE(InfoExtractor):
+    IE_DESC = 'AZ Medien videos'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?P<host>
+                            telezueri\.ch|
+                            telebaern\.tv|
+                            telem1\.ch
+                        )/
+                        [^/]+/
+                        (?P<id>
+                            [^/]+-(?P<article_id>\d+)
+                        )
+                        (?:
+                            \#video=
+                            (?P<kaltura_id>
+                                [_0-9a-z]+
+                            )
+                        )?
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569',
+        'info_dict': {
+            'id': '1_anruz3wy',
+            'ext': 'mp4',
+            'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen',
+            'uploader_id': 'TVOnline',
+            'upload_date': '20180930',
+            'timestamp': 1538328802,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
+        'only_matching': True
+    }]
+    _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d'
+    _PARTNER_ID = '1719221'
+
+    def _real_extract(self, url):
+        host, display_id, article_id, entry_id = re.match(self._VALID_URL, url).groups()
+
+        if not entry_id:
+            entry_id = self._download_json(
+                self._API_TEMPL % (host, host.split('.')[0]), display_id, query={
+                    'variables': json.dumps({
+                        'contextId': 'NewsArticle:' + article_id,
+                    }),
+                })['data']['context']['mainAsset']['video']['kaltura']['kalturaId']
+
+        return self.url_result(
+            'kaltura:%s:%s' % (self._PARTNER_ID, entry_id),
+            ie=KalturaIE.ie_key(), video_id=entry_id)
diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py
new file mode 100644 (file)
index 0000000..234a661
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unescapeHTML
+
+
+class BaiduVideoIE(InfoExtractor):
+    IE_DESC = '百度视频'
+    _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
+    _TESTS = [{
+        'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
+        'info_dict': {
+            'id': '1069',
+            'title': '中华小当家 TV版国语',
+            'description': 'md5:51be07afe461cf99fa61231421b5397c',
+        },
+        'playlist_count': 52,
+    }, {
+        'url': 'http://v.baidu.com/show/11595.htm?frp=bdbrand',
+        'info_dict': {
+            'id': '11595',
+            'title': 're:^奔跑吧兄弟',
+            'description': 'md5:1bf88bad6d850930f542d51547c089b8',
+        },
+        'playlist_mincount': 12,
+    }]
+
+    def _call_api(self, path, category, playlist_id, note):
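+        # Note: category is appended directly to 'adnative', so e.g. category
+        # 'tvshow' produces a query of 'worktype=adnativetvshow&id=<playlist_id>'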
+        return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % (
+            path, category, playlist_id), playlist_id, note)
+
+    def _real_extract(self, url):
+        category, playlist_id = re.match(self._VALID_URL, url).groups()
+        if category == 'show':
+            category = 'tvshow'
+        if category == 'tv':
+            category = 'tvplay'
+
+        playlist_detail = self._call_api(
+            'xqinfo', category, playlist_id, 'Download playlist JSON metadata')
+
+        playlist_title = playlist_detail['title']
+        playlist_description = unescapeHTML(playlist_detail.get('intro'))
+
+        episodes_detail = self._call_api(
+            'xqsingle', category, playlist_id, 'Download episodes JSON metadata')
+
+        entries = [self.url_result(
+            episode['url'], video_title=episode['title']
+        ) for episode in episodes_detail['videos']]
+
+        return self.playlist_result(
+            entries, playlist_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
new file mode 100644 (file)
index 0000000..b8a57e6
--- /dev/null
@@ -0,0 +1,417 @@
+from __future__ import unicode_literals
+
+import random
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    KNOWN_EXTENSIONS,
+    parse_filesize,
+    str_or_none,
+    try_get,
+    unescapeHTML,
+    update_url_query,
+    unified_strdate,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class BandcampIE(InfoExtractor):
+    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song',
+        'md5': 'c557841d5e50261777a6585648adf439',
+        'info_dict': {
+            'id': '1812978515',
+            'ext': 'mp3',
+            'title': "youtube-dlc  \"'/\\\u00e4\u21ad - youtube-dlc test song \"'/\\\u00e4\u21ad",
+            'duration': 9.8485,
+        },
+        'skip': 'There is a limit of 200 free downloads / month for the test song'
+    }, {
+        # free download
+        'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
+        'md5': '853e35bf34aa1d6fe2615ae612564b36',
+        'info_dict': {
+            'id': '2650410135',
+            'ext': 'aiff',
+            'title': 'Ben Prunty - Lanius (Battle)',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Ben Prunty',
+            'timestamp': 1396508491,
+            'upload_date': '20140403',
+            'release_date': '20140403',
+            'duration': 260.877,
+            'track': 'Lanius (Battle)',
+            'track_number': 1,
+            'track_id': '2650410135',
+            'artist': 'Ben Prunty',
+            'album': 'FTL: Advanced Edition Soundtrack',
+        },
+    }, {
+        # no free download, mp3 128
+        'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
+        'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
+        'info_dict': {
+            'id': '2584466013',
+            'ext': 'mp3',
+            'title': 'Mastodon - Hail to Fire',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Mastodon',
+            'timestamp': 1322005399,
+            'upload_date': '20111122',
+            'release_date': '20040207',
+            'duration': 120.79,
+            'track': 'Hail to Fire',
+            'track_number': 5,
+            'track_id': '2584466013',
+            'artist': 'Mastodon',
+            'album': 'Call of the Mastodon',
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title = mobj.group('title')
+        webpage = self._download_webpage(url, title)
+        thumbnail = self._html_search_meta('og:image', webpage, default=None)
+
+        track_id = None
+        track = None
+        track_number = None
+        duration = None
+
+        formats = []
+        track_info = self._parse_json(
+            self._search_regex(
+                r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
+                webpage, 'track info', default='{}'), title)
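+        # The page embeds track data as JS of roughly this (hypothetical) shape:
+        # trackinfo : [{"file": {"mp3-128": "https://..."}, "title": "...", "duration": ...}],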
+        if track_info:
+            file_ = track_info.get('file')
+            if isinstance(file_, dict):
+                for format_id, format_url in file_.items():
+                    if not url_or_none(format_url):
+                        continue
+                    ext, abr_str = format_id.split('-', 1)
+                    formats.append({
+                        'format_id': format_id,
+                        'url': self._proto_relative_url(format_url, 'http:'),
+                        'ext': ext,
+                        'vcodec': 'none',
+                        'acodec': ext,
+                        'abr': int_or_none(abr_str),
+                    })
+            track = track_info.get('title')
+            track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
+            track_number = int_or_none(track_info.get('track_num'))
+            duration = float_or_none(track_info.get('duration'))
+
+        def extract(key):
+            return self._search_regex(
+                r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
+                webpage, key, default=None, group='value')
+
+        artist = extract('artist')
+        album = extract('album_title')
+        timestamp = unified_timestamp(
+            extract('publish_date') or extract('album_publish_date'))
+        release_date = unified_strdate(extract('album_release_date'))
+
+        download_link = self._search_regex(
+            r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'download link', default=None, group='url')
+        if download_link:
+            track_id = self._search_regex(
+                r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
+                webpage, 'track id')
+
+            download_webpage = self._download_webpage(
+                download_link, track_id, 'Downloading free downloads page')
+
+            blob = self._parse_json(
+                self._search_regex(
+                    r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
+                    'blob', group='blob'),
+                track_id, transform_source=unescapeHTML)
+
+            info = try_get(
+                blob, (lambda x: x['digital_items'][0],
+                       lambda x: x['download_items'][0]), dict)
+            if info:
+                downloads = info.get('downloads')
+                if isinstance(downloads, dict):
+                    if not track:
+                        track = info.get('title')
+                    if not artist:
+                        artist = info.get('artist')
+                    if not thumbnail:
+                        thumbnail = info.get('thumb_url')
+
+                    download_formats = {}
+                    download_formats_list = blob.get('download_formats')
+                    if isinstance(download_formats_list, list):
+                        for f in download_formats_list:
+                            name, ext = f.get('name'), f.get('file_extension')
+                            if all(isinstance(x, compat_str) for x in (name, ext)):
+                                download_formats[name] = ext.strip('.')
+
+                    for format_id, f in downloads.items():
+                        format_url = f.get('url')
+                        if not format_url:
+                            continue
+                        # Stat URL generation algorithm is reverse engineered from
+                        # download_*_bundle_*.js
+                        stat_url = update_url_query(
+                            format_url.replace('/download/', '/statdownload/'), {
+                                '.rand': int(time.time() * 1000 * random.random()),
+                            })
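+                        # e.g. a hypothetical '.../download/track?id=1' becomes
+                        # '.../statdownload/track?id=1&.rand=<random millisecond value>'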
+                        format_id = f.get('encoding_name') or format_id
+                        stat = self._download_json(
+                            stat_url, track_id, 'Downloading %s JSON' % format_id,
+                            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
+                            fatal=False)
+                        if not stat:
+                            continue
+                        retry_url = url_or_none(stat.get('retry_url'))
+                        if not retry_url:
+                            continue
+                        formats.append({
+                            'url': self._proto_relative_url(retry_url, 'http:'),
+                            'ext': download_formats.get(format_id),
+                            'format_id': format_id,
+                            'format_note': f.get('description'),
+                            'filesize': parse_filesize(f.get('size_mb')),
+                            'vcodec': 'none',
+                        })
+
+        self._sort_formats(formats)
+
+        title = '%s - %s' % (artist, track) if artist else track
+
+        if not duration:
+            duration = float_or_none(self._html_search_meta(
+                'duration', webpage, default=None))
+
+        return {
+            'id': track_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'uploader': artist,
+            'timestamp': timestamp,
+            'release_date': release_date,
+            'duration': duration,
+            'track': track,
+            'track_number': track_number,
+            'track_id': track_id,
+            'artist': artist,
+            'album': album,
+            'formats': formats,
+        }
+
+
+class BandcampAlbumIE(InfoExtractor):
+    IE_NAME = 'Bandcamp:album'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
+
+    _TESTS = [{
+        'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
+        'playlist': [
+            {
+                'md5': '39bc1eded3476e927c724321ddf116cf',
+                'info_dict': {
+                    'id': '1353101989',
+                    'ext': 'mp3',
+                    'title': 'Intro',
+                }
+            },
+            {
+                'md5': '1a2c32e2691474643e912cc6cd4bffaa',
+                'info_dict': {
+                    'id': '38097443',
+                    'ext': 'mp3',
+                    'title': 'Kero One - Keep It Alive (Blazo remix)',
+                }
+            },
+        ],
+        'info_dict': {
+            'title': 'Jazz Format Mixtape vol.1',
+            'id': 'jazz-format-mixtape-vol-1',
+            'uploader_id': 'blazo',
+        },
+        'params': {
+            'playlistend': 2
+        },
+        'skip': 'Bandcamp imposes download limits.'
+    }, {
+        'url': 'http://nightbringer.bandcamp.com/album/hierophany-of-the-open-grave',
+        'info_dict': {
+            'title': 'Hierophany of the Open Grave',
+            'uploader_id': 'nightbringer',
+            'id': 'hierophany-of-the-open-grave',
+        },
+        'playlist_mincount': 9,
+    }, {
+        'url': 'http://dotscale.bandcamp.com',
+        'info_dict': {
+            'title': 'Loom',
+            'id': 'dotscale',
+            'uploader_id': 'dotscale',
+        },
+        'playlist_mincount': 7,
+    }, {
+        # with escaped quote in title
+        'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
+        'info_dict': {
+            'title': '"Entropy" EP',
+            'uploader_id': 'jstrecords',
+            'id': 'entropy-ep',
+        },
+        'playlist_mincount': 3,
+    }, {
+        # not all tracks have playable songs
+        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
+        'info_dict': {
+            'id': 'we-are-the-plague',
+            'title': 'WE ARE THE PLAGUE',
+            'uploader_id': 'insulters',
+        },
+        'playlist_count': 2,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
+                else super(BandcampAlbumIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        uploader_id = mobj.group('subdomain')
+        album_id = mobj.group('album_id')
+        playlist_id = album_id or uploader_id
+        webpage = self._download_webpage(url, playlist_id)
+        track_elements = re.findall(
+            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
+        if not track_elements:
+            raise ExtractorError('The page doesn\'t contain any tracks')
+        # Only tracks with duration info have songs
+        entries = [
+            self.url_result(
+                compat_urlparse.urljoin(url, t_path),
+                ie=BandcampIE.ie_key(),
+                video_title=self._search_regex(
+                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+                    elem_content, 'track title', fatal=False))
+            for elem_content, t_path in track_elements
+            if self._html_search_meta('duration', elem_content, default=None)]
+
+        title = self._html_search_regex(
+            r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
+            webpage, 'title', fatal=False)
+        if title:
+            title = title.replace(r'\"', '"')
+        return {
+            '_type': 'playlist',
+            'uploader_id': uploader_id,
+            'id': playlist_id,
+            'title': title,
+            'entries': entries,
+        }
+
+
+class BandcampWeeklyIE(InfoExtractor):
+    IE_NAME = 'Bandcamp:weekly'
+    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://bandcamp.com/?show=224',
+        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
+        'info_dict': {
+            'id': '224',
+            'ext': 'opus',
+            'title': 'BC Weekly April 4th 2017 - Magic Moments',
+            'description': 'md5:5d48150916e8e02d030623a48512c874',
+            'duration': 5829.77,
+            'release_date': '20170404',
+            'series': 'Bandcamp Weekly',
+            'episode': 'Magic Moments',
+            'episode_number': 208,
+            'episode_id': '224',
+        }
+    }, {
+        'url': 'https://bandcamp.com/?blah/blah@&show=228',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        blob = self._parse_json(
+            self._search_regex(
+                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
+                'blob', group='blob'),
+            video_id, transform_source=unescapeHTML)
+
+        show = blob['bcw_show']
+
+        # This is desired because any invalid show id redirects to `bandcamp.com`
+        # which happens to expose the latest Bandcamp Weekly episode.
+        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+
+        formats = []
+        for format_id, format_url in show['audio_stream'].items():
+            if not url_or_none(format_url):
+                continue
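+            # Derive ext from the stream's format_id, e.g. a hypothetical
+            # format_id of 'opus-lo' matches the known extension 'opus'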
+            for known_ext in KNOWN_EXTENSIONS:
+                if known_ext in format_id:
+                    ext = known_ext
+                    break
+            else:
+                ext = None
+            formats.append({
+                'format_id': format_id,
+                'url': format_url,
+                'ext': ext,
+                'vcodec': 'none',
+            })
+        self._sort_formats(formats)
+
+        title = show.get('audio_title') or 'Bandcamp Weekly'
+        subtitle = show.get('subtitle')
+        if subtitle:
+            title += ' - %s' % subtitle
+
+        episode_number = None
+        seq = blob.get('bcw_seq')
+
+        if seq and isinstance(seq, list):
+            try:
+                episode_number = next(
+                    int_or_none(e.get('episode_number'))
+                    for e in seq
+                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
+            except StopIteration:
+                pass
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': show.get('desc') or show.get('short_desc'),
+            'duration': float_or_none(show.get('audio_duration')),
+            'is_live': False,
+            'release_date': unified_strdate(show.get('published_date')),
+            'series': 'Bandcamp Weekly',
+            'episode': show.get('subtitle'),
+            'episode_number': episode_number,
+            'episode_id': compat_str(video_id),
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
new file mode 100644 (file)
index 0000000..002c39c
--- /dev/null
@@ -0,0 +1,1359 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    dict_get,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    parse_duration,
+    parse_iso8601,
+    try_get,
+    unescapeHTML,
+    url_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+from ..compat import (
+    compat_etree_Element,
+    compat_HTTPError,
+    compat_urlparse,
+)
+
+
+class BBCCoUkIE(InfoExtractor):
+    IE_NAME = 'bbc.co.uk'
+    IE_DESC = 'BBC iPlayer'
+    _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?bbc\.co\.uk/
+                        (?:
+                            programmes/(?!articles/)|
+                            iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
+                            music/(?:clips|audiovideo/popular)[/#]|
+                            radio/player/|
+                            sounds/play/|
+                            events/[^/]+/play/[^/]+/
+                        )
+                        (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
+                    ''' % _ID_REGEX
+
+    _LOGIN_URL = 'https://account.bbc.com/signin'
+    _NETRC_MACHINE = 'bbc'
+
+    _MEDIASELECTOR_URLS = [
+        # Provides HQ HLS streams with even better quality than the pc mediaset but fails
+        # with geolocation in some cases even when the programme is not geo-restricted at all (e.g.
+        # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s',
+    ]
+
+    _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection'
+    _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
+
+    _NAMESPACES = (
+        _MEDIASELECTION_NS,
+        _EMP_PLAYLIST_NS,
+    )
+
+    _TESTS = [
+        {
+            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
+            'info_dict': {
+                'id': 'b039d07m',
+                'ext': 'flv',
+                'title': 'Kaleidoscope, Leonard Cohen',
+                'description': 'The Canadian poet and songwriter reflects on his musical career.',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Man in Black: Series 3: The Printed Name',
+                'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
+                'duration': 1800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Episode is no longer available on BBC iPlayer Radio',
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
+            'info_dict': {
+                'id': 'b00yng1d',
+                'ext': 'flv',
+                'title': 'The Voice UK: Series 3: Blind Auditions 5',
+                'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
+                'duration': 5100,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+        },
+        {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
+            'info_dict': {
+                'id': 'b03k3pb7',
+                'ext': 'flv',
+                'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
+                'description': '2. Invasion',
+                'duration': 3600,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
+        }, {
+            'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
+            'info_dict': {
+                'id': 'b04v209v',
+                'ext': 'flv',
+                'title': 'Pete Tong, The Essential New Tune Special',
+                'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
+                'duration': 10800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Episode is no longer available on BBC iPlayer Radio',
+        }, {
+            'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
+            'note': 'Audio',
+            'info_dict': {
+                'id': 'p022h44j',
+                'ext': 'flv',
+                'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
+                'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
+                'duration': 227,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        }, {
+            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
+            'note': 'Video',
+            'info_dict': {
+                'id': 'p025c103',
+                'ext': 'flv',
+                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
+                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
+                'duration': 226,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
+            'info_dict': {
+                'id': 'p02n76xf',
+                'ext': 'flv',
+                'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
+                'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
+                'duration': 3540,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'geolocation',
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
+            'info_dict': {
+                'id': 'b05zmgw1',
+                'ext': 'flv',
+                'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
+                'title': 'Royal Academy Summer Exhibition',
+                'duration': 3540,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'geolocation',
+        }, {
+            # iptv-all mediaset fails with geolocation; however, there is no geo restriction
+            # for this programme at all
+            'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
+            'info_dict': {
+                'id': 'b06rkms3',
+                'ext': 'flv',
+                'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
+                'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Now it\'s really geo-restricted',
+        }, {
+            # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
+            'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
+            'info_dict': {
+                'id': 'p028bfkj',
+                'ext': 'flv',
+                'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
+                'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        }, {
+            'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb',
+            'note': 'Audio',
+            'info_dict': {
+                'id': 'm0007jz9',
+                'ext': 'mp4',
+                'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra',
+                'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.",
+                'duration': 9840,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            }
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
+            'only_matching': True,
+        }, {
+            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
+            'only_matching': True,
+        }, {
+            'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
+            'only_matching': True,
+        }, {
+            'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
+            'only_matching': True,
+        }, {
+            'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
+            'only_matching': True,
+        }, {
+            'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
+            'only_matching': True,
+        }, {
+            'url': 'https://www.bbc.co.uk/programmes/m00005xn',
+            'only_matching': True,
+        }, {
+            'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
+            'only_matching': True,
+        }]
+
+    _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading signin page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password,
+        })
+
+        post_url = urljoin(self._LOGIN_URL, self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+            'post url', default=self._LOGIN_URL, group='url'))
+
+        response, urlh = self._download_webpage_handle(
+            post_url, None, 'Logging in', data=urlencode_postdata(login_form),
+            headers={'Referer': self._LOGIN_URL})
+
+        if self._LOGIN_URL in urlh.geturl():
+            error = clean_html(get_element_by_class('form-message', response))
+            if error:
+                raise ExtractorError(
+                    'Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _real_initialize(self):
+        self._login()
+
+    class MediaSelectionError(Exception):
+        def __init__(self, id):
+            self.id = id
+
+    def _extract_asx_playlist(self, connection, programme_id):
+        asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
+        return [ref.get('href') for ref in asx.findall('./Entry/ref')]
+
+    def _extract_items(self, playlist):
+        return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
+
+    def _findall_ns(self, element, xpath):
+        elements = []
+        for ns in self._NAMESPACES:
+            elements.extend(element.findall(xpath % ns))
+        return elements
+
+    def _extract_medias(self, media_selection):
+        error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS)
+        if error is None:
+            error = media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS)
+        if error is not None:
+            raise BBCCoUkIE.MediaSelectionError(error.get('id'))
+        return self._findall_ns(media_selection, './{%s}media')
+
+    def _extract_connections(self, media):
+        return self._findall_ns(media, './{%s}connection')
+
+    def _get_subtitles(self, media, programme_id):
+        subtitles = {}
+        for connection in self._extract_connections(media):
+            cc_url = url_or_none(connection.get('href'))
+            if not cc_url:
+                continue
+            captions = self._download_xml(
+                cc_url, programme_id, 'Downloading captions', fatal=False)
+            if not isinstance(captions, compat_etree_Element):
+                continue
+            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
+            subtitles[lang] = [
+                {
+                    'url': connection.get('href'),
+                    'ext': 'ttml',
+                },
+            ]
+        return subtitles
+
+    def _raise_extractor_error(self, media_selection_error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, media_selection_error.id),
+            expected=True)
+
+    def _download_media_selector(self, programme_id):
+        last_exception = None
+        for mediaselector_url in self._MEDIASELECTOR_URLS:
+            try:
+                return self._download_media_selector_url(
+                    mediaselector_url % programme_id, programme_id)
+            except BBCCoUkIE.MediaSelectionError as e:
+                if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
+                    last_exception = e
+                    continue
+                self._raise_extractor_error(e)
+        self._raise_extractor_error(last_exception)
+
+    def _download_media_selector_url(self, url, programme_id=None):
+        media_selection = self._download_xml(
+            url, programme_id, 'Downloading media selection XML',
+            expected_status=(403, 404))
+        return self._process_media_selector(media_selection, programme_id)
+
+    def _process_media_selector(self, media_selection, programme_id):
+        formats = []
+        subtitles = None
+        urls = []
+
+        for media in self._extract_medias(media_selection):
+            kind = media.get('kind')
+            if kind in ('video', 'audio'):
+                bitrate = int_or_none(media.get('bitrate'))
+                encoding = media.get('encoding')
+                service = media.get('service')
+                width = int_or_none(media.get('width'))
+                height = int_or_none(media.get('height'))
+                file_size = int_or_none(media.get('media_file_size'))
+                for connection in self._extract_connections(media):
+                    href = connection.get('href')
+                    if href in urls:
+                        continue
+                    if href:
+                        urls.append(href)
+                    conn_kind = connection.get('kind')
+                    protocol = connection.get('protocol')
+                    supplier = connection.get('supplier')
+                    transfer_format = connection.get('transferFormat')
+                    format_id = supplier or conn_kind or protocol
+                    if service:
+                        format_id = '%s_%s' % (service, format_id)
+                    # ASX playlist
+                    if supplier == 'asx':
+                        for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
+                            formats.append({
+                                'url': ref,
+                                'format_id': 'ref%s_%s' % (i, format_id),
+                            })
+                    elif transfer_format == 'dash':
+                        formats.extend(self._extract_mpd_formats(
+                            href, programme_id, mpd_id=format_id, fatal=False))
+                    elif transfer_format == 'hls':
+                        formats.extend(self._extract_m3u8_formats(
+                            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+                            m3u8_id=format_id, fatal=False))
+                        if re.search(self._USP_RE, href):
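+                            # e.g. a hypothetical '/foo.ism.hlsv2.ism/master.m3u8' URL is
+                            # rewritten to '/foo.ism/foo.m3u8' (the USP manifest variant)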
+                            usp_formats = self._extract_m3u8_formats(
+                                re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href),
+                                programme_id, ext='mp4', entry_protocol='m3u8_native',
+                                m3u8_id=format_id, fatal=False)
+                            for f in usp_formats:
+                                if f.get('height') and f['height'] > 720:
+                                    continue
+                                formats.append(f)
+                    elif transfer_format == 'hds':
+                        formats.extend(self._extract_f4m_formats(
+                            href, programme_id, f4m_id=format_id, fatal=False))
+                    else:
+                        if not service and not supplier and bitrate:
+                            format_id += '-%d' % bitrate
+                        fmt = {
+                            'format_id': format_id,
+                            'filesize': file_size,
+                        }
+                        if kind == 'video':
+                            fmt.update({
+                                'width': width,
+                                'height': height,
+                                'tbr': bitrate,
+                                'vcodec': encoding,
+                            })
+                        else:
+                            fmt.update({
+                                'abr': bitrate,
+                                'acodec': encoding,
+                                'vcodec': 'none',
+                            })
+                        if protocol in ('http', 'https'):
+                            # Direct link
+                            fmt.update({
+                                'url': href,
+                            })
+                        elif protocol == 'rtmp':
+                            application = connection.get('application', 'ondemand')
+                            auth_string = connection.get('authString')
+                            identifier = connection.get('identifier')
+                            server = connection.get('server')
+                            fmt.update({
+                                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                                'play_path': identifier,
+                                'app': '%s?%s' % (application, auth_string),
+                                'page_url': 'http://www.bbc.co.uk',
+                                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+                                'rtmp_live': False,
+                                'ext': 'flv',
+                            })
+                        else:
+                            continue
+                        formats.append(fmt)
+            elif kind == 'captions':
+                subtitles = self.extract_subtitles(media, programme_id)
+        return formats, subtitles
+
+    def _download_playlist(self, playlist_id):
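+        # Try the modern JSON playlist endpoint first; a 404 here falls
+        # through to the legacy XML playlist below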
+        try:
+            playlist = self._download_json(
+                'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
+                playlist_id, 'Downloading playlist JSON')
+
+            version = playlist.get('defaultAvailableVersion')
+            if version:
+                smp_config = version['smpConfig']
+                title = smp_config['title']
+                description = smp_config['summary']
+                for item in smp_config['items']:
+                    kind = item['kind']
+                    if kind not in ('programme', 'radioProgramme'):
+                        continue
+                    programme_id = item.get('vpid')
+                    duration = int_or_none(item.get('duration'))
+                    formats, subtitles = self._download_media_selector(programme_id)
+                return programme_id, title, description, duration, formats, subtitles
+        except ExtractorError as ee:
+            if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
+                raise
+
+        # fallback to legacy playlist
+        return self._process_legacy_playlist(playlist_id)
+
+    def _process_legacy_playlist_url(self, url, display_id):
+        playlist = self._download_legacy_playlist_url(url, display_id)
+        return self._extract_from_legacy_playlist(playlist, display_id)
+
+    def _process_legacy_playlist(self, playlist_id):
+        return self._process_legacy_playlist_url(
+            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id)
+
+    def _download_legacy_playlist_url(self, url, playlist_id=None):
+        return self._download_xml(
+            url, playlist_id, 'Downloading legacy playlist XML')
+
+    def _extract_from_legacy_playlist(self, playlist, playlist_id):
+        no_items = playlist.find('./{%s}noItems' % self._EMP_PLAYLIST_NS)
+        if no_items is not None:
+            reason = no_items.get('reason')
+            if reason == 'preAvailability':
+                msg = 'Episode %s is not yet available' % playlist_id
+            elif reason == 'postAvailability':
+                msg = 'Episode %s is no longer available' % playlist_id
+            elif reason == 'noMedia':
+                msg = 'Episode %s is not currently available' % playlist_id
+            else:
+                msg = 'Episode %s is not available: %s' % (playlist_id, reason)
+            raise ExtractorError(msg, expected=True)
+
+        for item in self._extract_items(playlist):
+            kind = item.get('kind')
+            if kind not in ('programme', 'radioProgramme'):
+                continue
+            title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
+            description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
+            description = description_el.text if description_el is not None else None
+
+            def get_programme_id(item):
+                def get_from_attributes(item):
+                    for p in ('identifier', 'group'):
+                        value = item.get(p)
+                        if value and re.match(r'^[pb][\da-z]{7}$', value):
+                            return value
+                value = get_from_attributes(item)
+                if value:
+                    return value
+                mediator = item.find('./{%s}mediator' % self._EMP_PLAYLIST_NS)
+                if mediator is not None:
+                    return get_from_attributes(mediator)
+
+            programme_id = get_programme_id(item)
+            duration = int_or_none(item.get('duration'))
+
+            if programme_id:
+                formats, subtitles = self._download_media_selector(programme_id)
+            else:
+                formats, subtitles = self._process_media_selector(item, playlist_id)
+                programme_id = playlist_id
+
+        return programme_id, title, description, duration, formats, subtitles
+
+    def _real_extract(self, url):
+        group_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, group_id, 'Downloading video page')
+
+        error = self._search_regex(
+            r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<',
+            webpage, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        programme_id = None
+        duration = None
+
+        tviplayer = self._search_regex(
+            r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
+            webpage, 'player', default=None)
+
+        if tviplayer:
+            player = self._parse_json(tviplayer, group_id).get('player', {})
+            duration = int_or_none(player.get('duration'))
+            programme_id = player.get('vpid')
+
+        if not programme_id:
+            programme_id = self._search_regex(
+                r'"vpid"\s*:\s*"(%s)"' % self._ID_REGEX, webpage, 'vpid', fatal=False, default=None)
+
+        if programme_id:
+            formats, subtitles = self._download_media_selector(programme_id)
+            title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+                (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
+                 r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
+            description = self._search_regex(
+                (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
+                 r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
+                webpage, 'description', default=None)
+            if not description:
+                description = self._html_search_meta('description', webpage)
+        else:
+            programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': programme_id,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class BBCIE(BBCCoUkIE):
+    IE_NAME = 'bbc'
+    IE_DESC = 'BBC'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)'
+
+    _MEDIASELECTOR_URLS = [
+        # Provides HQ HLS streams but fails with geolocation in some cases
+        # even when the content is not geo restricted at all
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s',
+        # Provides more formats, namely direct mp4 links, but fails on some
+        # videos with notukerror for non-UK (?) users (e.g.
+        # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s',
+        # Provides fewer formats, but works everywhere for everybody (hopefully)
+        'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s',
+    ]
+
+    _TESTS = [{
+        # article with multiple videos embedded with data-playable containing vpids
+        'url': 'http://www.bbc.com/news/world-europe-32668511',
+        'info_dict': {
+            'id': 'world-europe-32668511',
+            'title': 'Russia stages massive WW2 parade',
+            'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
+        },
+        'playlist_count': 2,
+    }, {
+        # article with multiple videos embedded with data-playable (more videos)
+        'url': 'http://www.bbc.com/news/business-28299555',
+        'info_dict': {
+            'id': 'business-28299555',
+            'title': 'Farnborough Airshow: Video highlights',
+            'description': 'BBC reports and video highlights at the Farnborough Airshow.',
+        },
+        'playlist_count': 9,
+        'skip': 'Save time',
+    }, {
+        # article with multiple videos embedded with `new SMP()`
+        # broken
+        'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
+        'info_dict': {
+            'id': '3662a707-0af9-3149-963f-47bea720b460',
+            'title': 'BUGGER',
+        },
+        'playlist_count': 18,
+    }, {
+        # single video embedded with data-playable containing vpid
+        'url': 'http://www.bbc.com/news/world-europe-32041533',
+        'info_dict': {
+            'id': 'p02mprgb',
+            'ext': 'mp4',
+            'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV',
+            'description': 'md5:2868290467291b37feda7863f7a83f54',
+            'duration': 47,
+            'timestamp': 1427219242,
+            'upload_date': '20150324',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # article with single video embedded with data-playable containing XML playlist
+        # with direct video links as progressiveDownloadUrl (for now these are extracted)
+        # and playlist with f4m and m3u8 as streamingUrl
+        'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
+        'info_dict': {
+            'id': '150615_telabyad_kentin_cogu',
+            'ext': 'mp4',
+            'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
+            'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
+            'timestamp': 1434397334,
+            'upload_date': '20150615',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # single video embedded with data-playable containing XML playlists (regional section)
+        'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
+        'info_dict': {
+            'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
+            'ext': 'mp4',
+            'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
+            'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
+            'timestamp': 1434713142,
+            'upload_date': '20150619',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # single video from video playlist embedded with vxp-playlist-data JSON
+        'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
+        'info_dict': {
+            'id': 'p02w6qjc',
+            'ext': 'mp4',
+            'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+            'duration': 56,
+            'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # single video story with digitalData
+        'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
+        'info_dict': {
+            'id': 'p02q6gc4',
+            'ext': 'flv',
+            'title': 'Sri Lanka’s spicy secret',
+            'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.',
+            'timestamp': 1437674293,
+            'upload_date': '20150723',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # single video story without digitalData
+        'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
+        'info_dict': {
+            'id': 'p018zqqg',
+            'ext': 'mp4',
+            'title': 'Hyundai Santa Fe Sport: Rock star',
+            'description': 'md5:b042a26142c4154a6e472933cf20793d',
+            'timestamp': 1415867444,
+            'upload_date': '20141113',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # single video embedded with Morph
+        'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
+        'info_dict': {
+            'id': 'p041vhd0',
+            'ext': 'mp4',
+            'title': "Nigeria v Japan - Men's First Round",
+            'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
+            'duration': 7980,
+            'uploader': 'BBC Sport',
+            'uploader_id': 'bbc_sport',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Georestricted to UK',
+    }, {
+        # single video with playlist.sxml URL in playlist param
+        'url': 'http://www.bbc.com/sport/0/football/33653409',
+        'info_dict': {
+            'id': 'p02xycnp',
+            'ext': 'mp4',
+            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
+            'description': 'BBC Sport\'s David Ornstein has the latest transfer gossip, including rumours of a Manchester United return for Cristiano Ronaldo.',
+            'duration': 140,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # article with multiple videos embedded with playlist.sxml in playlist param
+        'url': 'http://www.bbc.com/sport/0/football/34475836',
+        'info_dict': {
+            'id': '34475836',
+            'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
+            'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
+        },
+        'playlist_count': 3,
+    }, {
+        # school report article with single video
+        'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+        'info_dict': {
+            'id': '35744779',
+            'title': 'School which breaks down barriers in Jerusalem',
+        },
+        'playlist_count': 1,
+    }, {
+        # single video with playlist URL from weather section
+        'url': 'http://www.bbc.com/weather/features/33601775',
+        'only_matching': True,
+    }, {
+        # custom redirection to www.bbc.com
+        'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
+        'only_matching': True,
+    }, {
+        # single video article embedded with data-media-vpid
+        'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
+        'info_dict': {
+            'id': 'p06556y7',
+            'ext': 'mp4',
+            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
+            'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # window.__PRELOADED_STATE__
+        'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
+        'info_dict': {
+            'id': 'b0b9z4vz',
+            'ext': 'mp4',
+            'title': 'Prom 6: An American in Paris and Turangalila',
+            'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
+            'uploader': 'Radio 3',
+            'uploader_id': 'bbc_radio_three',
+        },
+    }, {
+        'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
+        'info_dict': {
+            'id': 'p06w9tws',
+            'ext': 'mp4',
+            'title': 'md5:2fabf12a726603193a2879a055f72514',
+            'description': 'Learn English words and phrases from this story',
+        },
+        'add_ie': [BBCCoUkIE.ie_key()],
+    }]
+
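+    # Defer to the more specific bbc.co.uk extractors whenever one of them
+    # also matches the URL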
+    @classmethod
+    def suitable(cls, url):
+        EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+        return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
+                else super(BBCIE, cls).suitable(url))
+
+    def _extract_from_media_meta(self, media_meta, video_id):
+        # Direct links to media in media metadata (e.g.
+        # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+        # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
+        source_files = media_meta.get('sourceFiles')
+        if source_files:
+            return [{
+                'url': f['url'],
+                'format_id': format_id,
+                'ext': f.get('encoding'),
+                'tbr': float_or_none(f.get('bitrate'), 1000),
+                'filesize': int_or_none(f.get('filesize')),
+            } for format_id, f in source_files.items() if f.get('url')], []
+
+        programme_id = media_meta.get('externalId')
+        if programme_id:
+            return self._download_media_selector(programme_id)
+
+        # Process playlist.sxml as legacy playlist
+        href = media_meta.get('href')
+        if href:
+            playlist = self._download_legacy_playlist_url(href)
+            _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
+            return formats, subtitles
+
+        return [], []
+
+    def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
+        programme_id, title, description, duration, formats, subtitles = \
+            self._process_legacy_playlist_url(url, playlist_id)
+        self._sort_formats(formats)
+        return {
+            'id': programme_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
+        timestamp = json_ld_info.get('timestamp')
+
+        playlist_title = json_ld_info.get('title')
+        if not playlist_title:
+            playlist_title = self._og_search_title(
+                webpage, default=None) or self._html_search_regex(
+                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+            if playlist_title:
+                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+        playlist_description = json_ld_info.get(
+            'description') or self._og_search_description(webpage, default=None)
+
+        if not timestamp:
+            timestamp = parse_iso8601(self._search_regex(
+                [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
+                 r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
+                 r'"datePublished":\s*"([^"]+)'],
+                webpage, 'date', default=None))
+
+        entries = []
+
+        # article with multiple videos embedded with playlist.sxml (e.g.
+        # http://www.bbc.com/sport/0/football/34475836)
+        playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
+        playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
+        if playlists:
+            entries = [
+                self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
+                for playlist_url in playlists]
+
+        # news article with multiple videos embedded with data-playable
+        data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
+        if data_playables:
+            for _, data_playable_json in data_playables:
+                data_playable = self._parse_json(
+                    unescapeHTML(data_playable_json), playlist_id, fatal=False)
+                if not data_playable:
+                    continue
+                settings = data_playable.get('settings', {})
+                if settings:
+                    # data-playable with video vpid in settings.playlistObject.items (e.g.
+                    # http://www.bbc.com/news/world-us-canada-34473351)
+                    playlist_object = settings.get('playlistObject', {})
+                    if playlist_object:
+                        items = playlist_object.get('items')
+                        if items and isinstance(items, list):
+                            title = playlist_object['title']
+                            description = playlist_object.get('summary')
+                            duration = int_or_none(items[0].get('duration'))
+                            programme_id = items[0].get('vpid')
+                            formats, subtitles = self._download_media_selector(programme_id)
+                            self._sort_formats(formats)
+                            entries.append({
+                                'id': programme_id,
+                                'title': title,
+                                'description': description,
+                                'timestamp': timestamp,
+                                'duration': duration,
+                                'formats': formats,
+                                'subtitles': subtitles,
+                            })
+                    else:
+                        # data-playable without vpid but with playlist.sxml URLs
+                        # in otherSettings.playlist (e.g.
+                        # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
+                        playlist = data_playable.get('otherSettings', {}).get('playlist', {})
+                        if playlist:
+                            entry = None
+                            for key in ('streaming', 'progressiveDownload'):
+                                playlist_url = playlist.get('%sUrl' % key)
+                                if not playlist_url:
+                                    continue
+                                try:
+                                    info = self._extract_from_playlist_sxml(
+                                        playlist_url, playlist_id, timestamp)
+                                    if not entry:
+                                        entry = info
+                                    else:
+                                        entry['title'] = info['title']
+                                        entry['formats'].extend(info['formats'])
+                                except ExtractorError as e:
+                                    # Some playlist URL may fail with 500, at the same time
+                                    # the other one may work fine (e.g.
+                                    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                        continue
+                                    raise
+                            if entry:
+                                self._sort_formats(entry['formats'])
+                                entries.append(entry)
+
+        if entries:
+            return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+        # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
+        group_id = self._search_regex(
+            r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
+            webpage, 'group id', default=None)
+        if group_id:
+            return self.url_result(
+                'https://www.bbc.co.uk/programmes/%s' % group_id,
+                ie=BBCCoUkIE.ie_key())
+
+        # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
+        programme_id = self._search_regex(
+            [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
+             r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
+             r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
+            webpage, 'vpid', default=None)
+
+        if programme_id:
+            formats, subtitles = self._download_media_selector(programme_id)
+            self._sort_formats(formats)
+            # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
+            digital_data = self._parse_json(
+                self._search_regex(
+                    r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
+                programme_id, fatal=False)
+            page_info = digital_data.get('page', {}).get('pageInfo', {})
+            title = page_info.get('pageName') or self._og_search_title(webpage)
+            description = page_info.get('description') or self._og_search_description(webpage)
+            timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
+            return {
+                'id': programme_id,
+                'title': title,
+                'description': description,
+                'timestamp': timestamp,
+                'formats': formats,
+                'subtitles': subtitles,
+            }
+
+        # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
+        # Several setPayload calls may be present but the video seems to
+        # always be related to the first one
+        morph_payload = self._parse_json(
+            self._search_regex(
+                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
+                webpage, 'morph payload', default='{}'),
+            playlist_id, fatal=False)
+        if morph_payload:
+            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
+            for component in components:
+                if not isinstance(component, dict):
+                    continue
+                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
+                if not lead_media:
+                    continue
+                identifiers = lead_media.get('identifiers')
+                if not identifiers or not isinstance(identifiers, dict):
+                    continue
+                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+                if not programme_id:
+                    continue
+                title = lead_media.get('title') or self._og_search_title(webpage)
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                description = lead_media.get('summary')
+                uploader = lead_media.get('masterBrand')
+                uploader_id = lead_media.get('mid')
+                duration = None
+                duration_d = lead_media.get('duration')
+                if isinstance(duration_d, dict):
+                    duration = parse_duration(dict_get(
+                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
+                return {
+                    'id': programme_id,
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'uploader': uploader,
+                    'uploader_id': uploader_id,
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
+
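+        # Radio pages keep programme metadata in window.__PRELOADED_STATE__
+        # (e.g. https://www.bbc.co.uk/radio/play/b0b9z4yl)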
+        preload_state = self._parse_json(self._search_regex(
+            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), playlist_id, fatal=False)
+        if preload_state:
+            current_programme = preload_state.get('programmes', {}).get('current') or {}
+            programme_id = current_programme.get('id')
+            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
+                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                synopses = current_programme.get('synopses') or {}
+                network = current_programme.get('network') or {}
+                duration = int_or_none(
+                    current_programme.get('duration', {}).get('value'))
+                thumbnail = None
+                image_url = current_programme.get('image_url')
+                if image_url:
+                    thumbnail = image_url.replace('{recipe}', '1920x1920')
+                return {
+                    'id': programme_id,
+                    'title': title,
+                    'description': dict_get(synopses, ('long', 'medium', 'short')),
+                    'thumbnail': thumbnail,
+                    'duration': duration,
+                    'uploader': network.get('short_title'),
+                    'uploader_id': network.get('id'),
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
+
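+        # BBC Three pages expose a bbcthreeConfig object holding a bbcMedia
+        # playlist (e.g. https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1)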
+        bbc3_config = self._parse_json(
+            self._search_regex(
+                r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
+                'bbcthree config', default='{}'),
+            playlist_id, transform_source=js_to_json, fatal=False)
+        if bbc3_config:
+            bbc3_playlist = try_get(
+                bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
+                dict)
+            if bbc3_playlist:
+                playlist_title = bbc3_playlist.get('title') or playlist_title
+                thumbnail = bbc3_playlist.get('holdingImageURL')
+                entries = []
+                for bbc3_item in bbc3_playlist['items']:
+                    programme_id = bbc3_item.get('versionID')
+                    if not programme_id:
+                        continue
+                    formats, subtitles = self._download_media_selector(programme_id)
+                    self._sort_formats(formats)
+                    entries.append({
+                        'id': programme_id,
+                        'title': playlist_title,
+                        'thumbnail': thumbnail,
+                        'timestamp': timestamp,
+                        'formats': formats,
+                        'subtitles': subtitles,
+                    })
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
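+        # Parse every JSON object captured by pattern, silently dropping
+        # those that fail to parse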
+        def extract_all(pattern):
+            return list(filter(None, map(
+                lambda s: self._parse_json(s, playlist_id, fatal=False),
+                re.findall(pattern, webpage))))
+
+        # Multiple video article (e.g.
+        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
+        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX
+        entries = []
+        for match in extract_all(r'new\s+SMP\(({.+?})\)'):
+            embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
+            if embed_url and re.match(EMBED_URL, embed_url):
+                entries.append(embed_url)
+        entries.extend(re.findall(
+            r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
+        if entries:
+            return self.playlist_result(
+                [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
+                playlist_id, playlist_title, playlist_description)
+
+        # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
+        medias = extract_all(r"data-media-meta='({[^']+})'")
+
+        if not medias:
+            # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
+            media_asset = self._search_regex(
+                r'mediaAssetPage\.init\(\s*({.+?}), "/',
+                webpage, 'media asset', default=None)
+            if media_asset:
+                media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
+                medias = []
+                for video in media_asset_page.get('videos', {}).values():
+                    medias.extend(video.values())
+
+        if not medias:
+            # Multiple video playlist with single `now playing` entry (e.g.
+            # http://www.bbc.com/news/video_and_audio/must_see/33767813)
+            vxp_playlist = self._parse_json(
+                self._search_regex(
+                    r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
+                    webpage, 'playlist data'),
+                playlist_id)
+            playlist_medias = []
+            for item in vxp_playlist:
+                media = item.get('media')
+                if not media:
+                    continue
+                playlist_medias.append(media)
+                # Download a single video if we find media with an asset id
+                # matching the video id from the URL
+                if item.get('advert', {}).get('assetId') == playlist_id:
+                    medias = [media]
+                    break
+            # Fallback to the whole playlist
+            if not medias:
+                medias = playlist_medias
+
+        entries = []
+        for num, media_meta in enumerate(medias, start=1):
+            formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
+            if not formats:
+                continue
+            self._sort_formats(formats)
+
+            video_id = media_meta.get('externalId')
+            if not video_id:
+                video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num)
+
+            title = media_meta.get('caption')
+            if not title:
+                title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num)
+
+            duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
+
+            images = []
+            for image in media_meta.get('images', {}).values():
+                images.extend(image.values())
+            if 'image' in media_meta:
+                images.append(media_meta['image'])
+
+            thumbnails = [{
+                'url': image.get('href'),
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
+            } for image in images]
+
+            entries.append({
+                'id': video_id,
+                'title': title,
+                'thumbnails': thumbnails,
+                'duration': duration,
+                'timestamp': timestamp,
+                'formats': formats,
+                'subtitles': subtitles,
+            })
+
+        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+
+class BBCCoUkArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
+    IE_NAME = 'bbc.co.uk:article'
+    IE_DESC = 'BBC articles'
+
+    _TEST = {
+        'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
+        'info_dict': {
+            'id': '3jNQLTMrPlYGTBn0WV6M2MS',
+            'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
+            'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
+        },
+        'playlist_count': 4,
+        'add_ie': ['BBCCoUk'],
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage).strip()
+
+        entries = [self.url_result(programme_url) for programme_url in re.findall(
+            r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
+
+        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkPlaylistBaseIE(InfoExtractor):
+    def _entries(self, webpage, url, playlist_id):
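+        # An explicit ?page= in the URL means a specific page was requested,
+        # so pagination is skipped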
+        single_page = 'page' in compat_urlparse.parse_qs(
+            compat_urlparse.urlparse(url).query)
+        for page_num in itertools.count(2):
+            for video_id in re.findall(
+                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
+                yield self.url_result(
+                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
+            if single_page:
+                return
+            next_page = self._search_regex(
+                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
+                webpage, 'next page url', default=None, group='url')
+            if not next_page:
+                break
+            webpage = self._download_webpage(
+                compat_urlparse.urljoin(url, next_page), playlist_id,
+                'Downloading page %d' % page_num, page_num)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        title, description = self._extract_title_and_description(webpage)
+
+        return self.playlist_result(
+            self._entries(webpage, url, playlist_id),
+            playlist_id, title, description)
+
+
+class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
+    IE_NAME = 'bbc.co.uk:iplayer:playlist'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+    _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
+    _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+    _TESTS = [{
+        'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
+        'info_dict': {
+            'id': 'b05rcz9v',
+            'title': 'The Disappearance',
+            'description': 'French thriller serial about a missing teenager.',
+        },
+        'playlist_mincount': 6,
+        'skip': 'This programme is not currently available on BBC iPlayer',
+    }, {
+        # Available for over a year, unlike the 30 days for most other programmes
+        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
+        'info_dict': {
+            'id': 'p02tcc32',
+            'title': 'Bohemian Icons',
+            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
+        },
+        'playlist_mincount': 10,
+    }]
+
+    def _extract_title_and_description(self, webpage):
+        title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
+        description = self._search_regex(
+            r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
+            webpage, 'description', fatal=False, group='value')
+        return title, description
+
+
+class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
+    IE_NAME = 'bbc.co.uk:playlist'
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
+    _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
+    _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
+    _TESTS = [{
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+        'info_dict': {
+            'id': 'b05rcz9v',
+            'title': 'The Disappearance - Clips - BBC Four',
+            'description': 'French thriller serial about a missing teenager.',
+        },
+        'playlist_mincount': 7,
+    }, {
+        # multipage playlist, explicit page
+        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
+        'info_dict': {
+            'id': 'b00mfl7n',
+            'title': 'Frozen Planet - Clips - BBC One',
+            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
+        },
+        'playlist_mincount': 24,
+    }, {
+        # multipage playlist, all pages
+        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
+        'info_dict': {
+            'id': 'b00mfl7n',
+            'title': 'Frozen Planet - Clips - BBC One',
+            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
+        },
+        'playlist_mincount': 142,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
+        'only_matching': True,
+    }]
+
+    def _extract_title_and_description(self, webpage):
+        title = self._og_search_title(webpage, fatal=False)
+        description = self._og_search_description(webpage)
+        return title, description
diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py
new file mode 100644 (file)
index 0000000..86abdae
--- /dev/null
@@ -0,0 +1,194 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+    urljoin,
+)
+
+
+class BeamProBaseIE(InfoExtractor):
+    _API_BASE = 'https://mixer.com/api/v1'
+    _RATINGS = {'family': 0, 'teen': 13, '18+': 18}
+
+    def _extract_channel_info(self, chan):
+        user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id'])
+        return {
+            'uploader': chan.get('token') or try_get(
+                chan, lambda x: x['user']['username'], compat_str),
+            'uploader_id': compat_str(user_id) if user_id else None,
+            'age_limit': self._RATINGS.get(chan.get('audience')),
+        }
+
+
+class BeamProLiveIE(BeamProBaseIE):
+    IE_NAME = 'Mixer:live'
+    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://mixer.com/niterhayven',
+        'info_dict': {
+            'id': '261562',
+            'ext': 'mp4',
+            'title': 'Introducing The Witcher 3 //  The Grind Starts Now!',
+            'description': 'md5:0b161ac080f15fe05d18a07adb44a74d',
+            'thumbnail': r're:https://.*\.jpg$',
+            'timestamp': 1483477281,
+            'upload_date': '20170103',
+            'uploader': 'niterhayven',
+            'uploader_id': '373396',
+            'age_limit': 18,
+            'is_live': True,
+            'view_count': int,
+        },
+        'skip': 'niterhayven is offline',
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE
+
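+    # VOD URLs (with a vod query parameter) are handled by BeamProVodIE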
+    @classmethod
+    def suitable(cls, url):
+        return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        channel_name = self._match_id(url)
+
+        chan = self._download_json(
+            '%s/channels/%s' % (self._API_BASE, channel_name), channel_name)
+
+        if chan.get('online') is False:
+            raise ExtractorError(
+                '{0} is offline'.format(channel_name), expected=True)
+
+        channel_id = chan['id']
+
+        def manifest_url(kind):
+            return self._MANIFEST_URL_TEMPLATE % (channel_id, kind)
+
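+        # Both the HLS (m3u8) and SMIL manifests are tried; either may be
+        # unavailable, hence fatal=False on both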
+        formats = self._extract_m3u8_formats(
+            manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls',
+            fatal=False)
+        formats.extend(self._extract_smil_formats(
+            manifest_url('smil'), channel_name, fatal=False))
+        self._sort_formats(formats)
+
+        info = {
+            'id': compat_str(chan.get('id') or channel_name),
+            'title': self._live_title(chan.get('name') or channel_name),
+            'description': clean_html(chan.get('description')),
+            'thumbnail': try_get(
+                chan, lambda x: x['thumbnail']['url'], compat_str),
+            'timestamp': parse_iso8601(chan.get('updatedAt')),
+            'is_live': True,
+            'view_count': int_or_none(chan.get('viewersTotal')),
+            'formats': formats,
+        }
+        info.update(self._extract_channel_info(chan))
+
+        return info
+
+
+class BeamProVodIE(BeamProBaseIE):
+    IE_NAME = 'Mixer:vod'
+    _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)'
+    _TESTS = [{
+        'url': 'https://mixer.com/willow8714?vod=2259830',
+        'md5': 'b2431e6e8347dc92ebafb565d368b76b',
+        'info_dict': {
+            'id': '2259830',
+            'ext': 'mp4',
+            'title': 'willow8714\'s Channel',
+            'duration': 6828.15,
+            'thumbnail': r're:https://.*source\.png$',
+            'timestamp': 1494046474,
+            'upload_date': '20170506',
+            'uploader': 'willow8714',
+            'uploader_id': '6085379',
+            'age_limit': 13,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw',
+        'only_matching': True,
+    }, {
+        'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_format(vod, vod_type):
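+        # Each VOD entry exposes its media at a well-known filename relative
+        # to baseUrl, as an HLS manifest or a raw mp4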
+        if not vod.get('baseUrl'):
+            return []
+
+        if vod_type == 'hls':
+            filename, protocol = 'manifest.m3u8', 'm3u8_native'
+        elif vod_type == 'raw':
+            filename, protocol = 'source.mp4', 'https'
+        else:
+            assert False
+
+        data = vod.get('data') if isinstance(vod.get('data'), dict) else {}
+
+        format_id = [vod_type]
+        if isinstance(data.get('Height'), compat_str):
+            format_id.append('%sp' % data['Height'])
+
+        return [{
+            'url': urljoin(vod['baseUrl'], filename),
+            'format_id': '-'.join(format_id),
+            'ext': 'mp4',
+            'protocol': protocol,
+            'width': int_or_none(data.get('Width')),
+            'height': int_or_none(data.get('Height')),
+            'fps': int_or_none(data.get('Fps')),
+            'tbr': int_or_none(data.get('Bitrate'), 1000),
+        }]
+
+    def _real_extract(self, url):
+        vod_id = self._match_id(url)
+
+        vod_info = self._download_json(
+            '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id)
+
+        state = vod_info.get('state')
+        if state != 'AVAILABLE':
+            raise ExtractorError(
+                'VOD %s is not available (state: %s)' % (vod_id, state),
+                expected=True)
+
+        formats = []
+        thumbnail_url = None
+
+        for vod in vod_info['vods']:
+            vod_type = vod.get('format')
+            if vod_type in ('hls', 'raw'):
+                formats.extend(self._extract_format(vod, vod_type))
+            elif vod_type == 'thumbnail':
+                thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png')
+
+        self._sort_formats(formats)
+
+        info = {
+            'id': vod_id,
+            'title': vod_info.get('name') or vod_id,
+            'duration': float_or_none(vod_info.get('duration')),
+            'thumbnail': thumbnail_url,
+            'timestamp': parse_iso8601(vod_info.get('createdAt')),
+            'view_count': int_or_none(vod_info.get('viewsTotal')),
+            'formats': formats,
+        }
+        info.update(self._extract_channel_info(vod_info.get('channel') or {}))
+
+        return info
diff --git a/youtube_dl/extractor/beatport.py b/youtube_dl/extractor/beatport.py
new file mode 100644 (file)
index 0000000..e607094
--- /dev/null
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import int_or_none
+
+
+class BeatportIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://beatport.com/track/synesthesia-original-mix/5379371',
+        'md5': 'b3c34d8639a2f6a7f734382358478887',
+        'info_dict': {
+            'id': '5379371',
+            'display_id': 'synesthesia-original-mix',
+            'ext': 'mp4',
+            'title': 'Froxic - Synesthesia (Original Mix)',
+        },
+    }, {
+        'url': 'https://beatport.com/track/love-and-war-original-mix/3756896',
+        'md5': 'e44c3025dfa38c6577fbaeb43da43514',
+        'info_dict': {
+            'id': '3756896',
+            'display_id': 'love-and-war-original-mix',
+            'ext': 'mp3',
+            'title': 'Wolfgang Gartner - Love & War (Original Mix)',
+        },
+    }, {
+        'url': 'https://beatport.com/track/birds-original-mix/4991738',
+        'md5': 'a1fd8e8046de3950fd039304c186c05f',
+        'info_dict': {
+            'id': '4991738',
+            'display_id': 'birds-original-mix',
+            'ext': 'mp4',
+            'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)",
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        track_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        playables = self._parse_json(
+            self._search_regex(
+                r'window\.Playables\s*=\s*({.+?});', webpage,
+                'playables info', flags=re.DOTALL),
+            track_id)
+
+        track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
+
+        title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
+        if track['mix']:
+            title += ' (' + track['mix'] + ')'
+
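+        # Preview streams come in mp3 and mp4 (AAC) variants; the mp4 one
+        # gets higher preference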
+        formats = []
+        for ext, info in track['preview'].items():
+            if not info['url']:
+                continue
+            fmt = {
+                'url': info['url'],
+                'ext': ext,
+                'format_id': ext,
+                'vcodec': 'none',
+            }
+            if ext == 'mp3':
+                fmt['preference'] = 0
+                fmt['acodec'] = 'mp3'
+                fmt['abr'] = 96
+                fmt['asr'] = 44100
+            elif ext == 'mp4':
+                fmt['preference'] = 1
+                fmt['acodec'] = 'aac'
+                fmt['abr'] = 96
+                fmt['asr'] = 44100
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        images = []
+        for name, info in track['images'].items():
+            image_url = info.get('url')
+            if name == 'dynamic' or not image_url:
+                continue
+            image = {
+                'id': name,
+                'url': image_url,
+                'height': int_or_none(info.get('height')),
+                'width': int_or_none(info.get('width')),
+            }
+            images.append(image)
+
+        return {
+            'id': compat_str(track.get('id') or track_id),
+            'display_id': track.get('slug') or display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': images,
+        }
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
new file mode 100644 (file)
index 0000000..5788d13
--- /dev/null
@@ -0,0 +1,116 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class BeegIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)'
+    _TESTS = [{
+        # api/v6 v1
+        'url': 'http://beeg.com/5416503',
+        'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
+        'info_dict': {
+            'id': '5416503',
+            'ext': 'mp4',
+            'title': 'Sultry Striptease',
+            'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
+            'timestamp': 1391813355,
+            'upload_date': '20140207',
+            'duration': 383,
+            'tags': list,
+            'age_limit': 18,
+        }
+    }, {
+        # api/v6 v2
+        'url': 'https://beeg.com/1941093077?t=911-1391',
+        'only_matching': True,
+    }, {
+        # api/v6 v2 w/o t
+        'url': 'https://beeg.com/1277207756',
+        'only_matching': True,
+    }, {
+        'url': 'https://beeg.porn/video/5416503',
+        'only_matching': True,
+    }, {
+        'url': 'https://beeg.porn/5416503',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        beeg_version = self._search_regex(
+            r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
+            default='1546225636701')
+
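+        # Longer (10+ digit) video IDs belong to the v2 API flavour and may
+        # carry start/end times in the 't' query parameter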
+        if len(video_id) >= 10:
+            query = {
+                'v': 2,
+            }
+            qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+            t = qs.get('t', [''])[0].split('-')
+            if len(t) > 1:
+                query.update({
+                    's': t[0],
+                    'e': t[1],
+                })
+        else:
+            query = {'v': 1}
+
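+        # The bare beeg.com host is tried first and allowed to fail; the
+        # api. host is the fatal fallback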
+        for api_path in ('', 'api.'):
+            video = self._download_json(
+                'https://%sbeeg.com/api/v6/%s/video/%s'
+                % (api_path, beeg_version, video_id), video_id,
+                fatal=api_path == 'api.', query=query)
+            if video:
+                break
+
+        formats = []
+        for format_id, video_url in video.items():
+            if not video_url:
+                continue
+            height = self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None)
+            if not height:
+                continue
+            formats.append({
+                'url': self._proto_relative_url(
+                    video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'),
+                'format_id': format_id,
+                'height': int(height),
+            })
+        self._sort_formats(formats)
+
+        title = video['title']
+        video_id = compat_str(video.get('id') or video_id)
+        display_id = video.get('code')
+        description = video.get('desc')
+        series = video.get('ps_name')
+
+        timestamp = unified_timestamp(video.get('date'))
+        duration = int_or_none(video.get('duration'))
+
+        tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'series': series,
+            'timestamp': timestamp,
+            'duration': duration,
+            'tags': tags,
+            'formats': formats,
+            'age_limit': self._rta_search(webpage),
+        }
diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py
new file mode 100644 (file)
index 0000000..9bca853
--- /dev/null
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import url_basename
+
+
+class BehindKinkIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
+    _TEST = {
+        'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
+        'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
+        'info_dict': {
+            'id': '37127',
+            'ext': 'mp4',
+            'title': 'What are you passionate about – Marley Blaze',
+            'description': 'md5:aee8e9611b4ff70186f752975d9b94b4',
+            'upload_date': '20141205',
+            'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._search_regex(
+            r'<source src="([^"]+)"', webpage, 'video URL')
+        video_id = url_basename(video_url).split('_')[0]
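+        # The upload date is encoded in the URL path as /YYYY/MM/DD/.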
+        upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+            'upload_date': upload_date,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py
new file mode 100644 (file)
index 0000000..9f9de96
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class BellMediaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://(?:www\.)?
+        (?P<domain>
+            (?:
+                ctv|
+                tsn|
+                bnn(?:bloomberg)?|
+                thecomedynetwork|
+                discovery|
+                discoveryvelocity|
+                sciencechannel|
+                investigationdiscovery|
+                animalplanet|
+                bravo|
+                mtv|
+                space|
+                etalk|
+                marilyn
+            )\.ca|
+            (?:much|cp24)\.com
+        )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
+    _TESTS = [{
+        'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070',
+        'md5': '36d3ef559cfe8af8efe15922cd3ce950',
+        'info_dict': {
+            'id': '1403070',
+            'ext': 'flv',
+            'title': 'David Cockfield\'s Top Picks',
+            'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3',
+            'upload_date': '20180525',
+            'timestamp': 1527288600,
+        },
+    }, {
+        'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.etalk.ca/video?videoid=663455',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.cp24.com/video?clipId=1982548',
+        'only_matching': True,
+    }]
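+    # Map site domains to their 9c9media brand names; domains without an
+    # entry are passed through unchanged.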
+    _DOMAINS = {
+        'thecomedynetwork': 'comedy',
+        'discoveryvelocity': 'discvel',
+        'sciencechannel': 'discsci',
+        'investigationdiscovery': 'invdisc',
+        'animalplanet': 'aniplan',
+        'etalk': 'ctv',
+        'bnnbloomberg': 'bnn',
+        'marilyn': 'ctv_marilyn',
+    }
+
+    def _real_extract(self, url):
+        domain, video_id = re.match(self._VALID_URL, url).groups()
+        domain = domain.split('.')[0]
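+        # Hand off to the NineCNineMedia extractor via url_transparent so it
+        # resolves the formats and remaining metadata.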
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id),
+            'ie_key': 'NineCNineMedia',
+        }
diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py
new file mode 100644 (file)
index 0000000..d7ceaa8
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+from ..utils import unified_strdate
+
+
+class BetIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
+    _TESTS = [
+        {
+            'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
+            'info_dict': {
+                'id': '07e96bd3-8850-3051-b856-271b457f0ab8',
+                'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
+                'ext': 'flv',
+                'title': 'A Conversation With President Obama',
+                'description': 'President Obama urges persistence in confronting racism and bias.',
+                'duration': 1534,
+                'upload_date': '20141208',
+                'thumbnail': r're:(?i)^https?://.*\.jpg$',
+                'subtitles': {
+                    'en': 'mincount:2',
+                }
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
+            'info_dict': {
+                'id': '9f516bf1-7543-39c4-8076-dd441b459ba9',
+                'display_id': 'justice-for-ferguson-a-community-reacts',
+                'ext': 'flv',
+                'title': 'Justice for Ferguson: A Community Reacts',
+                'description': 'A BET News special.',
+                'duration': 1696,
+                'upload_date': '20141125',
+                'thumbnail': r're:(?i)^https?://.*\.jpg$',
+                'subtitles': {
+                    'en': 'mincount:2',
+                }
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        }
+    ]
+
+    _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"
+
+    def _get_feed_query(self, uri):
+        return {
+            'uuid': uri,
+        }
+
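+    # The MGID used to query the feed is taken from the player's data-uri attribute.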
+    def _extract_mgid(self, webpage):
+        return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+        mgid = self._extract_mgid(webpage)
+        videos_info = self._get_videos_info(mgid)
+
+        info_dict = videos_info['entries'][0]
+
+        upload_date = unified_strdate(self._html_search_meta('date', webpage))
+        description = self._html_search_meta('description', webpage)
+
+        info_dict.update({
+            'display_id': display_id,
+            'description': description,
+            'upload_date': upload_date,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/bfi.py b/youtube_dl/extractor/bfi.py
new file mode 100644 (file)
index 0000000..60c8944
--- /dev/null
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import extract_attributes
+
+
+class BFIPlayerIE(InfoExtractor):
+    IE_NAME = 'bfi:player'
+    _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online'
+    _TEST = {
+        'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online',
+        'md5': 'e8783ebd8e061ec4bc6e9501ed547de8',
+        'info_dict': {
+            'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63',
+            'ext': 'mp4',
+            'title': 'Computer Doctor',
+            'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b',
+        },
+        'skip': 'BFI Player films cannot be played outside of the UK',
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        entries = []
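+        # Each player element carries an Ooyala video id in its data-video-id attribute.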
+        for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage):
+            player_attr = extract_attributes(player_el)
+            ooyala_id = player_attr.get('data-video-id')
+            if not ooyala_id:
+                continue
+            entries.append(self.url_result(
+                'ooyala:' + ooyala_id, 'Ooyala',
+                ooyala_id, player_attr.get('data-label')))
+        return self.playlist_result(entries)
diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py
new file mode 100644 (file)
index 0000000..28e3e59
--- /dev/null
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_unquote,
+)
+
+
+class BigflixIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # 2 formats
+        'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
+        'info_dict': {
+            'id': '16070',
+            'ext': 'mp4',
+            'title': 'Madarasapatinam',
+            'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b',
+            'formats': 'mincount:2',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # multiple formats
+        'url': 'http://www.bigflix.com/Malayalam-movies/Drama-movies/Indian-Rupee/15967',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<div[^>]+class=["\']pagetitle["\'][^>]*>(.+?)</div>',
+            webpage, 'title')
+
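+        # Stream URLs are percent-encoded base64; decode both layers.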
+        def decode_url(quoted_b64_url):
+            return compat_b64decode(compat_urllib_parse_unquote(
+                quoted_b64_url)).decode('utf-8')
+
+        formats = []
+        for height, encoded_url in re.findall(
+                r'ContentURL_(\d{3,4})[pP][^=]+=([^&]+)', webpage):
+            video_url = decode_url(encoded_url)
+            f = {
+                'url': video_url,
+                'format_id': '%sp' % height,
+                'height': int(height),
+            }
+            if video_url.startswith('rtmp'):
+                f['ext'] = 'flv'
+            formats.append(f)
+
+        file_url = self._search_regex(
+            r'file=([^&]+)', webpage, 'video url', default=None)
+        if file_url:
+            video_url = decode_url(file_url)
+            if all(f['url'] != video_url for f in formats):
+                formats.append({
+                    'url': video_url,
+                })
+
+        self._sort_formats(formats)
+
+        description = self._html_search_meta('description', webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py
new file mode 100644 (file)
index 0000000..b8dfbd4
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class BildIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html'
+    IE_DESC = 'Bild.de'
+    _TEST = {
+        'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html',
+        'md5': 'dd495cbd99f2413502a1713a1156ac8a',
+        'info_dict': {
+            'id': '38184146',
+            'ext': 'mp4',
+            'title': 'Das können die  neuen iPads',
+            'description': 'md5:a4058c4fa2a804ab59c00d7244bbf62f',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 196,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_data = self._download_json(
+            url.split('.bild.html')[0] + ',view=json.bild.html', video_id)
+
+        return {
+            'id': video_id,
+            'title': unescapeHTML(video_data['title']).strip(),
+            'description': unescapeHTML(video_data.get('description')),
+            'url': video_data['clipList'][0]['srces'][0]['src'],
+            'thumbnail': video_data.get('poster'),
+            'duration': int_or_none(video_data.get('durationSec')),
+        }
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
new file mode 100644 (file)
index 0000000..d39ee8f
--- /dev/null
@@ -0,0 +1,450 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    smuggle_url,
+    str_or_none,
+    strip_jsonp,
+    unified_timestamp,
+    unsmuggle_url,
+    urlencode_postdata,
+)
+
+
+class BiliBiliIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:(?:www|bangumi)\.)?
+                        bilibili\.(?:tv|com)/
+                        (?:
+                            (?:
+                                video/[aA][vV]|
+                                anime/(?P<anime_id>\d+)/play\#
+                            )(?P<id>\d+)|
+                            video/[bB][vV](?P<id_bv>[^/?#&]+)
+                        )
+                    '''
+
+    _TESTS = [{
+        'url': 'http://www.bilibili.tv/video/av1074402/',
+        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
+        'info_dict': {
+            'id': '1074402',
+            'ext': 'flv',
+            'title': '【金坷垃】金泡沫',
+            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
+            'duration': 308.067,
+            'timestamp': 1398012678,
+            'upload_date': '20140420',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'uploader': '菊子桑',
+            'uploader_id': '156160',
+        },
+    }, {
+        # Tested in BiliBiliBangumiIE
+        'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
+        'only_matching': True,
+    }, {
+        'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
+        'md5': '3f721ad1e75030cc06faf73587cfec57',
+        'info_dict': {
+            'id': '100643',
+            'ext': 'mp4',
+            'title': 'CHAOS;CHILD',
+            'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
+        },
+        'skip': 'Geo-restricted to China',
+    }, {
+        # Title with double quotes
+        'url': 'http://www.bilibili.com/video/av8903802/',
+        'info_dict': {
+            'id': '8903802',
+            'title': '阿滴英文|英文歌分享#6 "Closer',
+            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '8903802_part1',
+                'ext': 'flv',
+                'title': '阿滴英文|英文歌分享#6 "Closer',
+                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+                'uploader': '阿滴英文',
+                'uploader_id': '65880958',
+                'timestamp': 1488382634,
+                'upload_date': '20170301',
+            },
+            'params': {
+                'skip_download': True,  # Test metadata only
+            },
+        }, {
+            'info_dict': {
+                'id': '8903802_part2',
+                'ext': 'flv',
+                'title': '阿滴英文|英文歌分享#6 "Closer',
+                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
+                'uploader': '阿滴英文',
+                'uploader_id': '65880958',
+                'timestamp': 1488382634,
+                'upload_date': '20170301',
+            },
+            'params': {
+                'skip_download': True,  # Test metadata only
+            },
+        }]
+    }, {
+        # new BV video id format
+        'url': 'https://www.bilibili.com/video/BV1JE411F741',
+        'only_matching': True,
+    }]
+
+    _APP_KEY = 'iVGUTjsxvpLeuDCf'
+    _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
+
+    def _report_error(self, result):
+        if 'message' in result:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
+        elif 'code' in result:
+            raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
+        else:
+            raise ExtractorError('Can\'t extract Bangumi episode ID')
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('id_bv')
+        anime_id = mobj.group('anime_id')
+        webpage = self._download_webpage(url, video_id)
+
+        if 'anime/' not in url:
+            cid = self._search_regex(
+                r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
+                default=None
+            ) or compat_parse_qs(self._search_regex(
+                [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
+                 r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
+                 r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
+                webpage, 'player parameters'))['cid'][0]
+        else:
+            if 'no_bangumi_tip' not in smuggled_data:
+                self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dlc with %s' % (
+                    video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
+            headers = {
+                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                'Referer': url
+            }
+            headers.update(self.geo_verification_headers())
+
+            js = self._download_json(
+                'http://bangumi.bilibili.com/web_api/get_source', video_id,
+                data=urlencode_postdata({'episode_id': video_id}),
+                headers=headers)
+            if 'result' not in js:
+                self._report_error(js)
+            cid = js['result']['cid']
+
+        headers = {
+            'Referer': url
+        }
+        headers.update(self.geo_verification_headers())
+
+        entries = []
+
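+        # Try the high-quality rendition first, falling back to plain MP4.
+        # Each playurl request is signed by appending the app secret to the
+        # payload and taking its MD5, i.e. sign = md5(payload + _BILIBILI_KEY).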
+        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
+        for num, rendition in enumerate(RENDITIONS, start=1):
+            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
+            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
+
+            video_info = self._download_json(
+                'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
+                video_id, note='Downloading video info page',
+                headers=headers, fatal=num == len(RENDITIONS))
+
+            if not video_info:
+                continue
+
+            if 'durl' not in video_info:
+                if num < len(RENDITIONS):
+                    continue
+                self._report_error(video_info)
+
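+            # Each durl entry is one part of a (possibly multi-part) video.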
+            for idx, durl in enumerate(video_info['durl']):
+                formats = [{
+                    'url': durl['url'],
+                    'filesize': int_or_none(durl['size']),
+                }]
+                for backup_url in durl.get('backup_url', []):
+                    formats.append({
+                        'url': backup_url,
+                        # backup URLs have lower priorities
+                        'preference': -2 if 'hd.mp4' in backup_url else -3,
+                    })
+
+                for a_format in formats:
+                    a_format.setdefault('http_headers', {}).update({
+                        'Referer': url,
+                    })
+
+                self._sort_formats(formats)
+
+                entries.append({
+                    'id': '%s_part%s' % (video_id, idx),
+                    'duration': float_or_none(durl.get('length'), 1000),
+                    'formats': formats,
+                })
+            break
+
+        title = self._html_search_regex(
+            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+            group='title')
+        description = self._html_search_meta('description', webpage)
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
+            default=None) or self._html_search_meta(
+            'uploadDate', webpage, 'timestamp', default=None))
+        thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
+
+        # TODO: 'view_count' requires deobfuscating JavaScript
+        info = {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'thumbnail': thumbnail,
+            'duration': float_or_none(video_info.get('timelength'), scale=1000),
+        }
+
+        uploader_mobj = re.search(
+            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
+            webpage)
+        if uploader_mobj:
+            info.update({
+                'uploader': uploader_mobj.group('name'),
+                'uploader_id': uploader_mobj.group('id'),
+            })
+        if not info.get('uploader'):
+            info['uploader'] = self._html_search_meta(
+                'author', webpage, 'uploader', default=None)
+
+        for entry in entries:
+            entry.update(info)
+
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            for idx, entry in enumerate(entries):
+                entry['id'] = '%s_part%d' % (video_id, (idx + 1))
+
+            return {
+                '_type': 'multi_video',
+                'id': video_id,
+                'title': title,
+                'description': description,
+                'entries': entries,
+            }
+
+
+class BiliBiliBangumiIE(InfoExtractor):
+    _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'
+
+    IE_NAME = 'bangumi.bilibili.com'
+    IE_DESC = 'BiliBili番剧'
+
+    _TESTS = [{
+        'url': 'http://bangumi.bilibili.com/anime/1869',
+        'info_dict': {
+            'id': '1869',
+            'title': '混沌武士',
+            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
+        },
+        'playlist_count': 26,
+    }, {
+        'url': 'http://bangumi.bilibili.com/anime/1869',
+        'info_dict': {
+            'id': '1869',
+            'title': '混沌武士',
+            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
+        },
+        'playlist': [{
+            'md5': '91da8621454dd58316851c27c68b0c13',
+            'info_dict': {
+                'id': '40062',
+                'ext': 'mp4',
+                'title': '混沌武士',
+                'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
+                'timestamp': 1414538739,
+                'upload_date': '20141028',
+                'episode': '疾风怒涛 Tempestuous Temperaments',
+                'episode_number': 1,
+            },
+        }],
+        'params': {
+            'playlist_items': '1',
+        },
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        bangumi_id = self._match_id(url)
+
+        # Sometimes this API returns a JSONP response
+        season_info = self._download_json(
+            'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
+            bangumi_id, transform_source=strip_jsonp)['result']
+
+        entries = [{
+            '_type': 'url_transparent',
+            'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
+            'ie_key': BiliBiliIE.ie_key(),
+            'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
+            'episode': episode.get('index_title'),
+            'episode_number': int_or_none(episode.get('index')),
+        } for episode in season_info['episodes']]
+
+        entries = sorted(entries, key=lambda entry: entry.get('episode_number'))
+
+        return self.playlist_result(
+            entries, bangumi_id,
+            season_info.get('bangumi_title'), season_info.get('evaluate'))
+
+
+class BilibiliAudioBaseIE(InfoExtractor):
+    def _call_api(self, path, sid, query=None):
+        if not query:
+            query = {'sid': sid}
+        return self._download_json(
+            'https://www.bilibili.com/audio/music-service-c/web/' + path,
+            sid, query=query)['data']
+
+
+class BilibiliAudioIE(BilibiliAudioBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/au(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.bilibili.com/audio/au1003142',
+        'md5': 'fec4987014ec94ef9e666d4d158ad03b',
+        'info_dict': {
+            'id': '1003142',
+            'ext': 'm4a',
+            'title': '【tsukimi】YELLOW / 神山羊',
+            'artist': 'tsukimi',
+            'comment_count': int,
+            'description': 'YELLOW的mp3版!',
+            'duration': 183,
+            'subtitles': {
+                'origin': [{
+                    'ext': 'lrc',
+                }],
+            },
+            'thumbnail': r're:^https?://.+\.jpg',
+            'timestamp': 1564836614,
+            'upload_date': '20190803',
+            'uploader': 'tsukimi-つきみぐー',
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        au_id = self._match_id(url)
+
+        play_data = self._call_api('url', au_id)
+        formats = [{
+            'url': play_data['cdns'][0],
+            'filesize': int_or_none(play_data.get('size')),
+        }]
+
+        song = self._call_api('song/info', au_id)
+        title = song['title']
+        statistic = song.get('statistic') or {}
+
+        subtitles = None
+        lyric = song.get('lyric')
+        if lyric:
+            subtitles = {
+                'origin': [{
+                    'url': lyric,
+                }]
+            }
+
+        return {
+            'id': au_id,
+            'title': title,
+            'formats': formats,
+            'artist': song.get('author'),
+            'comment_count': int_or_none(statistic.get('comment')),
+            'description': song.get('intro'),
+            'duration': int_or_none(song.get('duration')),
+            'subtitles': subtitles,
+            'thumbnail': song.get('cover'),
+            'timestamp': int_or_none(song.get('passtime')),
+            'uploader': song.get('uname'),
+            'view_count': int_or_none(statistic.get('play')),
+        }
+
+
+class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?bilibili\.com/audio/am(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.bilibili.com/audio/am10624',
+        'info_dict': {
+            'id': '10624',
+            'title': '每日新曲推荐(每日11:00更新)',
+            'description': '每天11:00更新,为你推送最新音乐',
+        },
+        'playlist_count': 19,
+    }
+
+    def _real_extract(self, url):
+        am_id = self._match_id(url)
+
+        songs = self._call_api(
+            'song/of-menu', am_id, {'sid': am_id, 'pn': 1, 'ps': 100})['data']
+
+        entries = []
+        for song in songs:
+            sid = str_or_none(song.get('id'))
+            if not sid:
+                continue
+            entries.append(self.url_result(
+                'https://www.bilibili.com/audio/au' + sid,
+                BilibiliAudioIE.ie_key(), sid))
+
+        if entries:
+            album_data = self._call_api('menu/info', am_id) or {}
+            album_title = album_data.get('title')
+            if album_title:
+                for entry in entries:
+                    entry['album'] = album_title
+                return self.playlist_result(
+                    entries, am_id, album_title, album_data.get('intro'))
+
+        return self.playlist_result(entries, am_id)
+
+
+class BiliBiliPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            'http://www.bilibili.tv/video/av%s/' % video_id,
+            ie=BiliBiliIE.ie_key(), video_id=video_id)
diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py
new file mode 100644 (file)
index 0000000..dc86c57
--- /dev/null
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    remove_end,
+)
+
+
+class BioBioChileTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml'
+
+    _TESTS = [{
+        'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml',
+        'md5': '26f51f03cf580265defefb4518faec09',
+        'info_dict': {
+            'id': 'sobre-camaras-y-camarillas-parlamentarias',
+            'ext': 'mp4',
+            'title': 'Sobre Cámaras y camarillas parlamentarias',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Fernando Atria',
+        },
+        'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
+    }, {
+        # different uploader layout
+        'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml',
+        'md5': 'edc2e6b58974c46d5b047dea3c539ff3',
+        'info_dict': {
+            'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades',
+            'ext': 'mp4',
+            'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Piangella Obrador',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
+    }, {
+        'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml',
+        'info_dict': {
+            'id': 'b4xd0LK3SK',
+            'ext': 'mp4',
+            # TODO: fix url_transparent information overriding
+            # 'uploader': 'Juan Pablo Echenique',
+            'title': 'Comentario Oscar Cáceres',
+        },
+        'params': {
+            # empty m3u8 manifest
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        rudo_url = self._search_regex(
+            r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
+            webpage, 'embed URL', None, group='url')
+        if not rudo_url:
+            raise ExtractorError('No videos found')
+
+        title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV')
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        uploader = self._html_search_regex(
+            r'<a[^>]+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>',
+            webpage, 'uploader', fatal=False)
+
+        return {
+            '_type': 'url_transparent',
+            'url': rudo_url,
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+        }
diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py
new file mode 100644 (file)
index 0000000..17ebbb2
--- /dev/null
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .vk import VKIE
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_unquote,
+)
+from ..utils import int_or_none
+
+
+class BIQLEIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
+    _TESTS = [{
+        # Youtube embed
+        'url': 'https://biqle.ru/watch/-115995369_456239081',
+        'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
+        'info_dict': {
+            'id': '8v4f-avW-VI',
+            'ext': 'mp4',
+            'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
+            'description': 'Passe-Partout',
+            'uploader_id': 'mrsimpsonstef3',
+            'uploader': 'Phanolito',
+            'upload_date': '20120822',
+        },
+    }, {
+        'url': 'http://biqle.org/watch/-44781847_168547604',
+        'md5': '7f24e72af1db0edf7c1aaba513174f97',
+        'info_dict': {
+            'id': '-44781847_168547604',
+            'ext': 'mp4',
+            'title': 'Ребенок в шоке от автоматической мойки',
+            'timestamp': 1396633454,
+            'uploader': 'Dmitry Kotov',
+            'upload_date': '20140404',
+            'uploader_id': '47850140',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        embed_url = self._proto_relative_url(self._search_regex(
+            r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>',
+            webpage, 'embed url'))
+        if VKIE.suitable(embed_url):
+            return self.url_result(embed_url, VKIE.ie_key(), video_id)
+
+        embed_page = self._download_webpage(
+            embed_url, video_id, headers={'Referer': url})
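+        # The embed page exposes VK API credentials either via the video_ext
+        # cookie or base64-encoded in the page source; the decoded value
+        # splits into video id, signature and access token.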
+        video_ext = self._get_cookies(embed_url).get('video_ext')
+        if video_ext:
+            video_ext = compat_urllib_parse_unquote(video_ext.value)
+        if not video_ext:
+            video_ext = compat_b64decode(self._search_regex(
+                r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
+                embed_page, 'video_ext')).decode()
+        video_id, sig, _, access_token = video_ext.split(':')
+        item = self._download_json(
+            'https://api.vk.com/method/video.get', video_id,
+            headers={'User-Agent': 'okhttp/3.4.1'}, query={
+                'access_token': access_token,
+                'sig': sig,
+                'v': 5.44,
+                'videos': video_id,
+            })['response']['items'][0]
+        title = item['title']
+
+        formats = []
+        for f_id, f_url in item.get('files', {}).items():
+            if f_id == 'external':
+                return self.url_result(f_url)
+            ext, height = f_id.split('_')
+            formats.append({
+                'format_id': height + 'p',
+                'url': f_url,
+                'height': int_or_none(height),
+                'ext': ext,
+            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for k, v in item.items():
+            if k.startswith('photo_') and v:
+                width = k.replace('photo_', '')
+                thumbnails.append({
+                    'id': width,
+                    'url': v,
+                    'width': int_or_none(width),
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'comment_count': int_or_none(item.get('comments')),
+            'description': item.get('description'),
+            'duration': int_or_none(item.get('duration')),
+            'thumbnails': thumbnails,
+            'timestamp': int_or_none(item.get('date')),
+            'uploader': item.get('owner_id'),
+            'view_count': int_or_none(item.get('views')),
+        }
diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py
new file mode 100644 (file)
index 0000000..0c773e6
--- /dev/null
@@ -0,0 +1,142 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    orderedSet,
+    unified_strdate,
+    urlencode_postdata,
+)
+
+
+class BitChuteIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.bitchute.com/video/szoMrox2JEI/',
+        'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb',
+        'info_dict': {
+            'id': 'szoMrox2JEI',
+            'ext': 'mp4',
+            'title': 'Fuck bitches get money',
+            'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Victoria X Rave',
+            'upload_date': '20170813',
+        },
+    }, {
+        'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://www.bitchute.com/video/%s' % video_id, video_id, headers={
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
+            })
+
+        title = self._html_search_regex(
+            (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
+            webpage, 'title', default=None) or self._html_search_meta(
+            'description', webpage, 'title',
+            default=None) or self._og_search_description(webpage)
+
+        format_urls = []
+        for mobj in re.finditer(
+                r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
+            format_urls.append(mobj.group('url'))
+        format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))
+
+        formats = [
+            {'url': format_url}
+            for format_url in orderedSet(format_urls)]
+
+        if not formats:
+            formats = self._parse_html5_media_entries(
+                url, webpage, video_id)[0]['formats']
+
+        self._check_formats(formats, video_id)
+        self._sort_formats(formats)
+
+        description = self._html_search_regex(
+            r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
+            webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:image:src', webpage, 'thumbnail')
+        uploader = self._html_search_regex(
+            (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>',
+             r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
+            webpage, 'uploader', fatal=False)
+
+        upload_date = unified_strdate(self._search_regex(
+            r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
+            webpage, 'upload date', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
+
+
+class BitChuteChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.bitchute.com/channel/victoriaxrave/',
+        'playlist_mincount': 185,
+        'info_dict': {
+            'id': 'victoriaxrave',
+        },
+    }
+
+    _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
+
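+    # Channel listings are paginated by POSTing to the /extend/ endpoint,
+    # advancing a numeric offset and authenticating with a static CSRF token.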
+    def _entries(self, channel_id):
+        channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
+        offset = 0
+        for page_num in itertools.count(1):
+            data = self._download_json(
+                '%sextend/' % channel_url, channel_id,
+                'Downloading channel page %d' % page_num,
+                data=urlencode_postdata({
+                    'csrfmiddlewaretoken': self._TOKEN,
+                    'name': '',
+                    'offset': offset,
+                }), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                    'Referer': channel_url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                    'Cookie': 'csrftoken=%s' % self._TOKEN,
+                })
+            if data.get('success') is False:
+                break
+            html = data.get('html')
+            if not html:
+                break
+            video_ids = re.findall(
+                r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
+                html)
+            if not video_ids:
+                break
+            offset += len(video_ids)
+            for video_id in video_ids:
+                yield self.url_result(
+                    'https://www.bitchute.com/video/%s' % video_id,
+                    ie=BitChuteIE.ie_key(), video_id=video_id)
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        return self.playlist_result(
+            self._entries(channel_id), playlist_id=channel_id)
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
new file mode 100644 (file)
index 0000000..dc60224
--- /dev/null
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .amp import AMPIE
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class BleacherReportIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football',
+        'md5': 'a3ffc3dc73afdbc2010f02d98f990f20',
+        'info_dict': {
+            'id': '2496438',
+            'ext': 'mp4',
+            'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?',
+            'uploader_id': 3992341,
+            'description': 'CFB, ACC, Florida State',
+            'timestamp': 1434380212,
+            'upload_date': '20150615',
+            'uploader': 'Team Stream Now ',
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
+        'md5': '6a5cd403418c7b01719248ca97fb0692',
+        'info_dict': {
+            'id': '2586817',
+            'ext': 'webm',
+            'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
+            'timestamp': 1446839961,
+            'uploader': 'Sean Fay',
+            'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
+            'uploader_id': 6466954,
+            'upload_date': '20151011',
+        },
+        'add_ie': ['Youtube'],
+    }]
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+
+        article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article']
+
+        thumbnails = []
+        primary_photo = article_data.get('primaryPhoto')
+        if primary_photo:
+            thumbnails = [{
+                'url': primary_photo['url'],
+                'width': primary_photo.get('width'),
+                'height': primary_photo.get('height'),
+            }]
+
+        info = {
+            '_type': 'url_transparent',
+            'id': article_id,
+            'title': article_data['title'],
+            'uploader': article_data.get('author', {}).get('name'),
+            'uploader_id': article_data.get('authorId'),
+            'timestamp': parse_iso8601(article_data.get('createdAt')),
+            'thumbnails': thumbnails,
+            'comment_count': int_or_none(article_data.get('commentsCount')),
+            'view_count': int_or_none(article_data.get('hitCount')),
+        }
+
+        video = article_data.get('video')
+        if video:
+            video_type = video['type']
+            if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'):
+                info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id']
+            elif video_type == 'ooyala.com':
+                info['url'] = 'ooyala:%s' % video['id']
+            elif video_type == 'youtube.com':
+                info['url'] = video['id']
+            elif video_type == 'vine.co':
+                info['url'] = 'https://vine.co/v/%s' % video['id']
+            else:
+                info['url'] = video_type + video['id']
+            return info
+        else:
+            raise ExtractorError('no video in the article', expected=True)
+
+
+class BleacherReportCMSIE(AMPIE):
+    _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
+    _TESTS = [{
+        'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
+        'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
+        'info_dict': {
+            'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
+            'ext': 'flv',
+            'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
+            'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id)
+        info['id'] = video_id
+        return info
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
new file mode 100644 (file)
index 0000000..db5e12b
--- /dev/null
@@ -0,0 +1,86 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    remove_start,
+    int_or_none,
+)
+
+
+class BlinkxIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
+    IE_NAME = 'blinkx'
+
+    _TEST = {
+        'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
+        'md5': '337cf7a344663ec79bf93a526a2e06c7',
+        'info_dict': {
+            'id': 'Da0Gw3xc',
+            'ext': 'mp4',
+            'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
+            'uploader': 'IGN News',
+            'upload_date': '20150217',
+            'timestamp': 1424215740,
+            'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
+            'duration': 47.743333,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        display_id = video_id[:8]
+
+        api_url = ('https://apib4.blinkx.com/api.php?action=play_video&'
+                   + 'video=%s' % video_id)
+        data_json = self._download_webpage(api_url, display_id)
+        data = json.loads(data_json)['api']['results'][0]
+        duration = None
+        thumbnails = []
+        formats = []
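+        # The API lists media by type: 'jpg' entries are thumbnails,
+        # 'original' carries the duration, 'youtube' redirects to YouTube,
+        # and 'flv'/'mp4' entries are playable formats.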
+        for m in data['media']:
+            if m['type'] == 'jpg':
+                thumbnails.append({
+                    'url': m['link'],
+                    'width': int(m['w']),
+                    'height': int(m['h']),
+                })
+            elif m['type'] == 'original':
+                duration = float(m['d'])
+            elif m['type'] == 'youtube':
+                yt_id = m['link']
+                self.to_screen('Youtube video detected: %s' % yt_id)
+                return self.url_result(yt_id, 'Youtube', video_id=yt_id)
+            elif m['type'] in ('flv', 'mp4'):
+                vcodec = remove_start(m['vcodec'], 'ff')
+                acodec = remove_start(m['acodec'], 'ff')
+                vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
+                abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
+                tbr = vbr + abr if vbr and abr else None
+                format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
+                formats.append({
+                    'format_id': format_id,
+                    'url': m['link'],
+                    'vcodec': vcodec,
+                    'acodec': acodec,
+                    'abr': abr,
+                    'vbr': vbr,
+                    'tbr': tbr,
+                    'width': int_or_none(m.get('w')),
+                    'height': int_or_none(m.get('h')),
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': display_id,
+            'fullid': video_id,
+            'title': data['title'],
+            'formats': formats,
+            'uploader': data['channel_name'],
+            'timestamp': data['pubdate_epoch'],
+            'description': data.get('description'),
+            'thumbnails': thumbnails,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
new file mode 100644 (file)
index 0000000..2fbfad1
--- /dev/null
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class BloombergIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2',
+        # The md5 checksum changes
+        'info_dict': {
+            'id': 'qurhIVlJSB6hzkVi229d8g',
+            'ext': 'flv',
+            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
+            'description': 'md5:a8ba0302912d03d246979735c17d2761',
+        },
+        'params': {
+            'format': 'best[format_id^=hds]',
+        },
+    }, {
+        # video ID in BPlayer(...)
+        'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
+        'info_dict': {
+            'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
+            'ext': 'flv',
+            'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
+            'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.',
+        },
+        'params': {
+            'format': 'best[format_id^=hds]',
+        },
+    }, {
+        # data-bmmrid=
+        'url': 'https://www.bloomberg.com/politics/articles/2017-02-08/le-pen-aide-briefed-french-central-banker-on-plan-to-print-money',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bloomberg.com/politics/videos/2015-11-25/karl-rove-on-jeb-bush-s-struggles-stopping-trump',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        name = self._match_id(url)
+        webpage = self._download_webpage(url, name)
+        video_id = self._search_regex(
+            (r'["\']bmmrId["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
+             r'videoId\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
+             r'data-bmmrid=(["\'])(?P<id>(?:(?!\1).)+)\1'),
+            webpage, 'id', group='id', default=None)
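+        # Fall back to the id embedded in the BPlayer(null, {...}) bootstrap call.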
+        if not video_id:
+            bplayer_data = self._parse_json(self._search_regex(
+                r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
+            video_id = bplayer_data['id']
+        title = re.sub(': Video$', '', self._og_search_title(webpage))
+
+        embed_info = self._download_json(
+            'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id)
+        formats = []
+        for stream in embed_info['streams']:
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            if stream['muxing_format'] == 'TS':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                formats.extend(self._extract_f4m_formats(
+                    stream_url, video_id, f4m_id='hds', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py
new file mode 100644 (file)
index 0000000..6017e83
--- /dev/null
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import ExtractorError
+
+
+class BokeCCBaseIE(InfoExtractor):
+    def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
+        player_params_str = self._html_search_regex(
+            r'<(?:script|embed)[^>]+src=(?P<q>["\'])(?:https?:)?//p\.bokecc\.com/(?:player|flash/player\.swf)\?(?P<query>.+?)(?P=q)',
+            webpage, 'player params', group='query')
+
+        player_params = compat_parse_qs(player_params_str)
+
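+        # Quality variants are listed in the playinfo XML; each <quality>
+        # node carries a playurl and a numeric value used as its preference.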
+        info_xml = self._download_xml(
+            'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
+                player_params['siteid'][0], player_params['vid'][0]), video_id)
+
+        formats = [{
+            'format_id': format_id,
+            'url': quality.find('./copy').attrib['playurl'],
+            'preference': int(quality.attrib['value']),
+        } for quality in info_xml.findall('./video/quality')]
+
+        self._sort_formats(formats)
+
+        return formats
+
+
+class BokeCCIE(BokeCCBaseIE):
+    IE_DESC = 'CC视频'
+    _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
+
+    _TESTS = [{
+        'url': 'http://union.bokecc.com/playvideo.bo?vid=E0ABAE9D4F509B189C33DC5901307461&uid=FE644790DE9D154A',
+        'info_dict': {
+            'id': 'FE644790DE9D154A_E0ABAE9D4F509B189C33DC5901307461',
+            'ext': 'flv',
+            'title': 'BokeCC Video',
+        },
+    }]
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+        if not qs.get('vid') or not qs.get('uid'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        return {
+            'id': video_id,
+            'title': 'BokeCC Video',  # no title provided in the webpage
+            'formats': self._extract_bokecc_formats(webpage, video_id),
+        }
diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py
new file mode 100644 (file)
index 0000000..57882fb
--- /dev/null
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    extract_attributes,
+)
+
+
+class BostonGlobeIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?'
+    _TESTS = [
+        {
+            'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html',
+            'md5': '0a62181079c85c2d2b618c9a738aedaf',
+            'info_dict': {
+                'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood',
+                'id': '5320421710001',
+                'ext': 'mp4',
+                'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.',
+                'timestamp': 1486877593,
+                'upload_date': '20170212',
+                'uploader_id': '245991542',
+            },
+        },
+        {
+            # Embedded youtube video; we hand it off to the Generic extractor.
+            'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html',
+            'md5': '582b40327089d5c0c949b3c54b13c24b',
+            'info_dict': {
+                'title': "Who Is Matt Damon's Favorite Batman?",
+                'id': 'ZW1QCnlA6Qc',
+                'ext': 'mp4',
+                'upload_date': '20170217',
+                'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb',
+                'uploader': 'The Late Late Show with James Corden',
+                'uploader_id': 'TheLateLateShow',
+            },
+            'expected_warnings': ['404'],
+        },
+    ]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        page_title = self._og_search_title(webpage, default=None)
+
+        # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject">
+        entries = []
+        for video in re.findall(r'(?i)(<video[^>]+>)', webpage):
+            attrs = extract_attributes(video)
+
+            video_id = attrs.get('data-brightcove-video-id')
+            account_id = attrs.get('data-account')
+            player_id = attrs.get('data-player')
+            embed = attrs.get('data-embed')
+
+            if video_id and account_id and player_id and embed:
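+                # All four data attributes are needed to rebuild the canonical
+                # players.brightcove.net embed URL.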
+                entries.append(
+                    'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
+                    % (account_id, player_id, embed, video_id))
+
+        if len(entries) == 0:
+            return self.url_result(url, 'Generic')
+        elif len(entries) == 1:
+            return self.url_result(entries[0], 'BrightcoveNew')
+        else:
+            return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew')
diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py
new file mode 100644 (file)
index 0000000..0783353
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    determine_ext,
+)
+
+
+class BpbIE(InfoExtractor):
+    IE_DESC = 'Bundeszentrale für politische Bildung'
+    _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P<id>[0-9]+)/'
+
+    _TEST = {
+        'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
+        # md5 fails in Python 2.6 due to a buggy server response and urllib2's mishandling of it
+        'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f',
+        'info_dict': {
+            'id': '297',
+            'ext': 'mp4',
+            'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
+            'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h2 class="white">(.*?)</h2>', webpage, 'title')
+        video_info_dicts = re.findall(
+            r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
+
+        formats = []
+        for video_info in video_info_dicts:
+            video_info = self._parse_json(
+                video_info, video_id, transform_source=js_to_json, fatal=False)
+            if not video_info:
+                continue
+            video_url = video_info.get('src')
+            if not video_url:
+                continue
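+            # The '_high' marker in the file URL is the only quality hint available.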
+            quality = 'high' if '_high' in video_url else 'low'
+            formats.append({
+                'url': video_url,
+                'preference': 10 if quality == 'high' else 0,
+                'format_note': quality,
+                'format_id': '%s-%s' % (quality, determine_ext(video_url)),
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
new file mode 100644 (file)
index 0000000..9bde7f2
--- /dev/null
@@ -0,0 +1,311 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    xpath_element,
+    xpath_text,
+)
+
+
+class BRIE(InfoExtractor):
+    IE_DESC = 'Bayerischer Rundfunk'
+    _VALID_URL = r'(?P<base_url>https?://(?:www\.)?br(?:-klassik)?\.de)/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html'
+
+    _TESTS = [
+        {
+            'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html',
+            'md5': '83a0477cf0b8451027eb566d88b51106',
+            'info_dict': {
+                'id': '48f656ef-287e-486f-be86-459122db22cc',
+                'ext': 'mp4',
+                'title': 'Die böse Überraschung',
+                'description': 'md5:ce9ac81b466ce775b8018f6801b48ac9',
+                'duration': 180,
+                'uploader': 'Reinhard Weber',
+                'upload_date': '20150422',
+            },
+            'skip': '404 not found',
+        },
+        {
+            'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
+            'md5': 'af3a3a4aa43ff0ce6a89504c67f427ef',
+            'info_dict': {
+                'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05',
+                'ext': 'flv',
+                'title': 'Manfred Schreiber ist tot',
+                'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
+                'duration': 26,
+            },
+            'skip': '404 not found',
+        },
+        {
+            'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
+            'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d',
+            'info_dict': {
+                'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b',
+                'ext': 'aac',
+                'title': 'Kurzweilig und sehr bewegend',
+                'description': 'md5:0351996e3283d64adeb38ede91fac54e',
+                'duration': 296,
+            },
+            'skip': '404 not found',
+        },
+        {
+            'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
+            'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
+            'info_dict': {
+                'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
+                'ext': 'mp4',
+                'title': 'Umweltbewusster Häuslebauer',
+                'description': 'md5:d52dae9792d00226348c1dbb13c9bae2',
+                'duration': 116,
+            }
+        },
+        {
+            'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
+            'md5': '23bca295f1650d698f94fc570977dae3',
+            'info_dict': {
+                'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
+                'ext': 'mp4',
+                'title': 'Folge 1 - Metaphysik',
+                'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
+                'duration': 893,
+                'uploader': 'Eva Maria Steimle',
+                'upload_date': '20170208',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        base_url, display_id = re.search(self._VALID_URL, url).groups()
+        page = self._download_webpage(url, display_id)
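+        # The page registers its player via BRavFramework.setup(); the dataURL
+        # argument points to the media XML relative to the base URL.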
+        xml_url = self._search_regex(
+            r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
+        xml = self._download_xml(base_url + xml_url, display_id)
+
+        medias = []
+
+        for xml_media in xml.findall('video') + xml.findall('audio'):
+            media_id = xml_media.get('externalId')
+            media = {
+                'id': media_id,
+                'title': xpath_text(xml_media, 'title', 'title', True),
+                'duration': parse_duration(xpath_text(xml_media, 'duration')),
+                'formats': self._extract_formats(xpath_element(
+                    xml_media, 'assets'), media_id),
+                'thumbnails': self._extract_thumbnails(xpath_element(
+                    xml_media, 'teaserImage/variants'), base_url),
+                'description': xpath_text(xml_media, 'desc'),
+                'webpage_url': xpath_text(xml_media, 'permalink'),
+                'uploader': xpath_text(xml_media, 'author'),
+            }
+            broadcast_date = xpath_text(xml_media, 'broadcastDate')
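+            # broadcastDate comes as DD.MM.YYYY; reversing the components
+            # yields the YYYYMMDD upload_date.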
+            if broadcast_date:
+                media['upload_date'] = ''.join(reversed(broadcast_date.split('.')))
+            medias.append(media)
+
+        if len(medias) > 1:
+            self._downloader.report_warning(
+                'found multiple media entries; please '
+                'report this with the video URL to http://yt-dl.org/bug')
+        if not medias:
+            raise ExtractorError('No media entries found')
+        return medias[0]
+
+    def _extract_formats(self, assets, media_id):
+        formats = []
+        for asset in assets.findall('asset'):
+            format_url = xpath_text(asset, ['downloadUrl', 'url'])
+            asset_type = asset.get('type')
+            if asset_type.startswith('HDS'):
+                formats.extend(self._extract_f4m_formats(
+                    format_url + '?hdcore=3.2.0', media_id, f4m_id='hds', fatal=False))
+            elif asset_type.startswith('HLS'):
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, media_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+            else:
+                format_info = {
+                    'ext': xpath_text(asset, 'mediaType'),
+                    'width': int_or_none(xpath_text(asset, 'frameWidth')),
+                    'height': int_or_none(xpath_text(asset, 'frameHeight')),
+                    'tbr': int_or_none(xpath_text(asset, 'bitrateVideo')),
+                    'abr': int_or_none(xpath_text(asset, 'bitrateAudio')),
+                    'vcodec': xpath_text(asset, 'codecVideo'),
+                    'acodec': xpath_text(asset, 'codecAudio'),
+                    'container': xpath_text(asset, 'mediaType'),
+                    'filesize': int_or_none(xpath_text(asset, 'size')),
+                }
+                format_url = self._proto_relative_url(format_url)
+                if format_url:
+                    http_format_info = format_info.copy()
+                    http_format_info.update({
+                        'url': format_url,
+                        'format_id': 'http-%s' % asset_type,
+                    })
+                    formats.append(http_format_info)
+                server_prefix = xpath_text(asset, 'serverPrefix')
+                if server_prefix:
+                    rtmp_format_info = format_info.copy()
+                    rtmp_format_info.update({
+                        'url': server_prefix,
+                        'play_path': xpath_text(asset, 'fileName'),
+                        'format_id': 'rtmp-%s' % asset_type,
+                    })
+                    formats.append(rtmp_format_info)
+        self._sort_formats(formats)
+        return formats
+
+    def _extract_thumbnails(self, variants, base_url):
+        thumbnails = [{
+            'url': base_url + xpath_text(variant, 'url'),
+            'width': int_or_none(xpath_text(variant, 'width')),
+            'height': int_or_none(xpath_text(variant, 'height')),
+        } for variant in variants.findall('variant') if xpath_text(variant, 'url')]
+        # width/height may be None; treat missing dimensions as 0 when sorting
+        thumbnails.sort(key=lambda x: (x['width'] or 0) * (x['height'] or 0), reverse=True)
+        return thumbnails
+
+
+class BRMediathekIE(InfoExtractor):
+    IE_DESC = 'Bayerischer Rundfunk Mediathek'
+    _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
+
+    _TESTS = [{
+        'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
+        'md5': 'fdc3d485835966d1622587d08ba632ec',
+        'info_dict': {
+            'id': 'av:5a1e6a6e8fce6d001871cc8e',
+            'ext': 'mp4',
+            'title': 'Die Sendung vom 28.11.2017',
+            'description': 'md5:6000cdca5912ab2277e5b7339f201ccc',
+            'timestamp': 1511942766,
+            'upload_date': '20171129',
+        }
+    }]
+
+    def _real_extract(self, url):
+        clip_id = self._match_id(url)
+
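+        # A single GraphQL query fetches the clip metadata together with its
+        # video files, caption files and teaser images.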
+        clip = self._download_json(
+            'https://proxy-base.master.mango.express/graphql',
+            clip_id, data=json.dumps({
+                "query": """{
+  viewer {
+    clip(id: "%s") {
+      title
+      description
+      duration
+      createdAt
+      ageRestriction
+      videoFiles {
+        edges {
+          node {
+            publicLocation
+            fileSize
+            videoProfile {
+              width
+              height
+              bitrate
+              encoding
+            }
+          }
+        }
+      }
+      captionFiles {
+        edges {
+          node {
+            publicLocation
+          }
+        }
+      }
+      teaserImages {
+        edges {
+          node {
+            imageFiles {
+              edges {
+                node {
+                  publicLocation
+                  width
+                  height
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}""" % clip_id}).encode(), headers={
+                'Content-Type': 'application/json',
+            })['data']['viewer']['clip']
+        title = clip['title']
+
+        formats = []
+        for edge in clip.get('videoFiles', {}).get('edges', []):
+            node = edge.get('node', {})
+            n_url = node.get('publicLocation')
+            if not n_url:
+                continue
+            ext = determine_ext(n_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    n_url, clip_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                video_profile = node.get('videoProfile', {})
+                tbr = int_or_none(video_profile.get('bitrate'))
+                format_id = 'http'
+                if tbr:
+                    format_id += '-%d' % tbr
+                formats.append({
+                    'format_id': format_id,
+                    'url': n_url,
+                    'width': int_or_none(video_profile.get('width')),
+                    'height': int_or_none(video_profile.get('height')),
+                    'tbr': tbr,
+                    'filesize': int_or_none(node.get('fileSize')),
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for edge in clip.get('captionFiles', {}).get('edges', []):
+            node = edge.get('node', {})
+            n_url = node.get('publicLocation')
+            if not n_url:
+                continue
+            subtitles.setdefault('de', []).append({
+                'url': n_url,
+            })
+
+        thumbnails = []
+        for edge in clip.get('teaserImages', {}).get('edges', []):
+            for image_edge in edge.get('node', {}).get('imageFiles', {}).get('edges', []):
+                node = image_edge.get('node', {})
+                n_url = node.get('publicLocation')
+                if not n_url:
+                    continue
+                thumbnails.append({
+                    'url': n_url,
+                    'width': int_or_none(node.get('width')),
+                    'height': int_or_none(node.get('height')),
+                })
+
+        return {
+            'id': clip_id,
+            'title': title,
+            'description': clip.get('description'),
+            'duration': int_or_none(clip.get('duration')),
+            'timestamp': parse_iso8601(clip.get('createdAt')),
+            'age_limit': int_or_none(clip.get('ageRestriction')),
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py
new file mode 100644 (file)
index 0000000..b9715df
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    smuggle_url,
+    update_url_query,
+    int_or_none,
+)
+
+
+class BravoTVIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
+        'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
+        'info_dict': {
+            'id': 'epL0pmK1kQlT',
+            'ext': 'mp4',
+            'title': 'The Top Chef Season 16 Winner Is...',
+            'description': 'Find out who takes the title of Top Chef!',
+            'uploader': 'NBCU-BRAV',
+            'upload_date': '20190314',
+            'timestamp': 1552591860,
+        }
+    }, {
+        'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        settings = self._parse_json(self._search_regex(
+            r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
+            display_id)
+        info = {}
+        query = {
+            'mbr': 'true',
+        }
+        account_pid, release_pid = [None] * 2
+        tve = settings.get('ls_tve')
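+        # TV Everywhere (ls_tve) videos are served through theplatform with
+        # Adobe Pass auth; otherwise metadata comes from the shared playlist.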
+        if tve:
+            query['manifest'] = 'm3u'
+            mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage)
+            if mobj:
+                account_pid, tp_path = mobj.groups()
+                release_pid = tp_path.strip('/').split('/')[-1]
+            else:
+                account_pid = 'HNK2IC'
+                tp_path = release_pid = tve['release_pid']
+            if tve.get('entitlement') == 'auth':
+                adobe_pass = settings.get('tve_adobe_auth', {})
+                resource = self._get_mvpd_resource(
+                    adobe_pass.get('adobePassResourceId', 'bravo'),
+                    tve['title'], release_pid, tve.get('rating'))
+                query['auth'] = self._extract_mvpd_auth(
+                    url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource)
+        else:
+            shared_playlist = settings['ls_playlist']
+            account_pid = shared_playlist['account_pid']
+            metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']]
+            tp_path = release_pid = metadata.get('release_pid')
+            if not release_pid:
+                release_pid = metadata['guid']
+                tp_path = 'media/guid/2140479951/' + release_pid
+            info.update({
+                'title': metadata['title'],
+                'description': metadata.get('description'),
+                'season_number': int_or_none(metadata.get('season_num')),
+                'episode_number': int_or_none(metadata.get('episode_num')),
+            })
+            query['switch'] = 'progressive'
+        info.update({
+            '_type': 'url_transparent',
+            'id': release_pid,
+            'url': smuggle_url(update_url_query(
+                'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path),
+                query), {'force_smil_url': True}),
+            'ie_key': 'ThePlatform',
+        })
+        return info
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
new file mode 100644 (file)
index 0000000..68c7cf2
--- /dev/null
@@ -0,0 +1,91 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+    int_or_none,
+    url_or_none,
+)
+
+
+class BreakIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)'
+    _TESTS = [{
+        'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
+        'info_dict': {
+            'id': '2468056',
+            'ext': 'mp4',
+            'title': 'When Girls Act Like D-Bags',
+            'age_limit': 13,
+        },
+    }, {
+        # youtube embed
+        'url': 'http://www.break.com/video/someone-forgot-boat-brakes-work',
+        'info_dict': {
+            'id': 'RrrDLdeL2HQ',
+            'ext': 'mp4',
+            'title': 'Whale Watching Boat Crashing Into San Diego Dock',
+            'description': 'md5:afc1b2772f0a8468be51dd80eb021069',
+            'upload_date': '20160331',
+            'uploader': 'Steve Holden',
+            'uploader_id': 'sdholden07',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.break.com/video/ugc/baby-flex-2773063',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        webpage = self._download_webpage(url, display_id)
+
+        youtube_url = YoutubeIE._extract_url(webpage)
+        if youtube_url:
+            return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
+
+        content = self._parse_json(
+            self._search_regex(
+                r'(?s)content["\']\s*:\s*(\[.+?\])\s*[,\n]', webpage,
+                'content'),
+            display_id)
+
+        formats = []
+        for video in content:
+            video_url = url_or_none(video.get('url'))
+            if not video_url:
+                continue
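+            # Progressive URLs embed their bitrate as '<n>_kbps'.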
+            bitrate = int_or_none(self._search_regex(
+                r'(\d+)_kbps', video_url, 'tbr', default=None))
+            formats.append({
+                'url': video_url,
+                'format_id': 'http-%d' % bitrate if bitrate else 'http',
+                'tbr': bitrate,
+            })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+             r'<h1[^>]*>(?P<value>[^<]+)'), webpage, 'title', group='value')
+
+        def get(key, name):
+            return int_or_none(self._search_regex(
+                r'%s["\']\s*:\s*["\'](\d+)' % key, webpage, name,
+                default=None))
+
+        age_limit = get('ratings', 'age limit')
+        video_id = video_id or get('pid', 'video id') or display_id
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
new file mode 100644 (file)
index 0000000..2aa9f47
--- /dev/null
@@ -0,0 +1,677 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import re
+import struct
+
+from .adobepass import AdobePassIE
+from .common import InfoExtractor
+from ..compat import (
+    compat_etree_fromstring,
+    compat_HTTPError,
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
+    compat_xml_parse_error,
+)
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    ExtractorError,
+    find_xpath_attr,
+    fix_xml_ampersands,
+    float_or_none,
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    parse_iso8601,
+    smuggle_url,
+    str_or_none,
+    unescapeHTML,
+    unsmuggle_url,
+    UnsupportedError,
+    update_url_query,
+    url_or_none,
+)
+
+
+class BrightcoveLegacyIE(InfoExtractor):
+    IE_NAME = 'brightcove:legacy'
+    _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
+
+    _TESTS = [
+        {
+            # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
+            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
+            'md5': '5423e113865d26e40624dce2e4b45d95',
+            'note': 'Test Brightcove downloads and detection in GenericIE',
+            'info_dict': {
+                'id': '2371591881001',
+                'ext': 'mp4',
+                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+                'uploader': '8TV',
+                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
+                'timestamp': 1368213670,
+                'upload_date': '20130510',
+                'uploader_id': '1589608506001',
+            },
+            'skip': 'The player has been deactivated by the content owner',
+        },
+        {
+            # From http://medianetwork.oracle.com/video/player/1785452137001
+            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
+            'info_dict': {
+                'id': '1785452137001',
+                'ext': 'flv',
+                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
+                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
+                'uploader': 'Oracle',
+                'timestamp': 1344975024,
+                'upload_date': '20120814',
+                'uploader_id': '1460825906',
+            },
+            'skip': 'video not playable',
+        },
+        {
+            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
+            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
+            'info_dict': {
+                'id': '2750934548001',
+                'ext': 'mp4',
+                'title': 'This Bracelet Acts as a Personal Thermostat',
+                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
+                # 'uploader': 'Mashable',
+                'timestamp': 1382041798,
+                'upload_date': '20131017',
+                'uploader_id': '1130468786001',
+            },
+        },
+        {
+            # test that the default referer works
+            # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
+            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
+            'info_dict': {
+                'id': '2878862109001',
+                'ext': 'mp4',
+                'title': 'Lost in Motion II',
+                'description': 'md5:363109c02998fee92ec02211bd8000df',
+                'uploader': 'National Ballet of Canada',
+            },
+            'skip': 'Video gone',
+        },
+        {
+            # test flv videos served by akamaihd.net
+            # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
+            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
+            # The md5 checksum changes on each download
+            'info_dict': {
+                'id': '3750436379001',
+                'ext': 'flv',
+                'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
+                'uploader': 'RBTV Old (do not use)',
+                'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
+                'timestamp': 1409122195,
+                'upload_date': '20140827',
+                'uploader_id': '710858724001',
+            },
+            'skip': 'Video gone',
+        },
+        {
+            # playlist with 'videoList'
+            # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
+            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
+            'info_dict': {
+                'title': 'Sealife',
+                'id': '3550319591001',
+            },
+            'playlist_mincount': 7,
+            'skip': 'Unsupported URL',
+        },
+        {
+            # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965)
+            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
+            'info_dict': {
+                'id': '1522758701001',
+                'title': 'Lesson 08',
+            },
+            'playlist_mincount': 10,
+            'skip': 'Unsupported URL',
+        },
+        {
+            # playerID inferred from bcpid
+            # from http://www.un.org/chinese/News/story.asp?NewsID=27724
+            'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
+            'only_matching': True,  # Tested in GenericIE
+        }
+    ]
+
+    @classmethod
+    def _build_brightcove_url(cls, object_str):
+        """
+        Build a Brightcove URL from an XML string containing
+        <object class="BrightcoveExperience">{params}</object>
+        """
+
+        # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553
+        object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
+                            lambda m: m.group(1) + '/>', object_str)
+        # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608
+        object_str = object_str.replace('<--', '<!--')
+        # remove namespace to simplify extraction
+        object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
+        object_str = fix_xml_ampersands(object_str)
+
+        try:
+            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))
+        except compat_xml_parse_error:
+            return
+
+        fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
+        if fv_el is not None:
+            flashvars = dict(
+                (k, v[0])
+                for k, v in compat_parse_qs(fv_el.attrib['value']).items())
+        else:
+            flashvars = {}
+
+        data_url = object_doc.attrib.get('data', '')
+        data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)
+
+        def find_param(name):
+            if name in flashvars:
+                return flashvars[name]
+            node = find_xpath_attr(object_doc, './param', 'name', name)
+            if node is not None:
+                return node.attrib['value']
+            return data_url_params.get(name)
+
+        params = {}
+
+        playerID = find_param('playerID') or find_param('playerId')
+        if playerID is None:
+            raise ExtractorError('Cannot find player ID')
+        params['playerID'] = playerID
+
+        playerKey = find_param('playerKey')
+        # Not all pages define this value
+        if playerKey is not None:
+            params['playerKey'] = playerKey
+        # These fields hold the id of the video
+        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
+        if videoPlayer is not None:
+            if isinstance(videoPlayer, list):
+                videoPlayer = videoPlayer[0]
+            videoPlayer = videoPlayer.strip()
+            # UUID is also possible for videoPlayer (e.g.
+            # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
+            # or http://www8.hp.com/cn/zh/home.html)
+            if not (re.match(
+                    r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
+                    videoPlayer) or videoPlayer.startswith('ref:')):
+                return None
+            params['@videoPlayer'] = videoPlayer
+        linkBase = find_param('linkBaseURL')
+        if linkBase is not None:
+            params['linkBaseURL'] = linkBase
+        return cls._make_brightcove_url(params)
+
+    @classmethod
+    def _build_brightcove_url_from_js(cls, object_js):
+        # The layout of JS is as follows:
+        # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
+        #   // build Brightcove <object /> XML
+        # }
+        m = re.search(
+            r'''(?x)customBC\.createVideo\(
+                .*?                                                  # skipping width and height
+                ["\'](?P<playerID>\d+)["\']\s*,\s*                   # playerID
+                ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s*  # playerKey begins with AQ and is 50 characters
+                                                                     # in length, however it's appended to itself
+                                                                     # in places, so truncate
+                ["\'](?P<videoID>\d+)["\']                           # @videoPlayer
+            ''', object_js)
+        if m:
+            return cls._make_brightcove_url(m.groupdict())
+
+    @classmethod
+    def _make_brightcove_url(cls, params):
+        return update_url_query(
+            'http://c.brightcove.com/services/viewer/htmlFederated', params)
+
+    @classmethod
+    def _extract_brightcove_url(cls, webpage):
+        """Try to extract the brightcove url from the webpage, returns None
+        if it can't be found
+        """
+        urls = cls._extract_brightcove_urls(webpage)
+        return urls[0] if urls else None
+
+    @classmethod
+    def _extract_brightcove_urls(cls, webpage):
+        """Return a list of all Brightcove URLs from the webpage """
+
+        url_m = re.search(
+            r'''(?x)
+                <meta\s+
+                    (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+
+                    content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2
+            ''', webpage)
+        if url_m:
+            url = unescapeHTML(url_m.group('url'))
+            # Some sites don't add it; we can't download with such a URL, for example:
+            # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
+            if 'playerKey' in url or 'videoId' in url or 'idVideo' in url:
+                return [url]
+
+        matches = re.findall(
+            r'''(?sx)<object
+            (?:
+                [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
+                [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
+            ).+?>\s*</object>''',
+            webpage)
+        if matches:
+            return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))
+
+        matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
+        if matches:
+            return list(filter(None, [
+                cls._build_brightcove_url_from_js(custom_bc)
+                for custom_bc in matches]))
+        return [src for _, src in re.findall(
+            r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        # Change the 'videoId' and others field to '@videoPlayer'
+        url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
+        # Change bckey (used by bcove.me urls) to playerKey
+        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
+        mobj = re.match(self._VALID_URL, url)
+        query_str = mobj.group('query')
+        query = compat_urlparse.parse_qs(query_str)
+
+        videoPlayer = query.get('@videoPlayer')
+        if videoPlayer:
+            # We set the original url as the default 'Referer' header
+            referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url)
+            video_id = videoPlayer[0]
+            if 'playerID' not in query:
+                mobj = re.search(r'/bcpid(\d+)', url)
+                if mobj is not None:
+                    query['playerID'] = [mobj.group(1)]
+            publisher_id = query.get('publisherId')
+            if publisher_id and publisher_id[0].isdigit():
+                publisher_id = publisher_id[0]
+            if not publisher_id:
+                player_key = query.get('playerKey')
+                if player_key and ',' in player_key[0]:
+                    player_key = player_key[0]
+                else:
+                    player_id = query.get('playerID')
+                    if player_id and player_id[0].isdigit():
+                        headers = {}
+                        if referer:
+                            headers['Referer'] = referer
+                        player_page = self._download_webpage(
+                            'http://link.brightcove.com/services/player/bcpid' + player_id[0],
+                            video_id, headers=headers, fatal=False)
+                        if player_page:
+                            player_key = self._search_regex(
+                                r'<param\s+name="playerKey"\s+value="([\w~,-]+)"',
+                                player_page, 'player key', fatal=False)
+                if player_key:
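+                    # The second comma-separated token of playerKey is a URL-safe
+                    # base64-encoded big-endian 64-bit publisher id ('~' stands in for '=').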
+                    enc_pub_id = player_key.split(',')[1].replace('~', '=')
+                    publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
+            if publisher_id:
+                brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id)
+                if referer:
+                    brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
+                return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
+        # TODO: figure out if it's possible to extract playlistId from playerKey
+        # elif 'playerKey' in query:
+        #     player_key = query['playerKey']
+        #     return self._get_playlist_info(player_key[0])
+        raise UnsupportedError(url)
+
+
+class BrightcoveNewIE(AdobePassIE):
+    IE_NAME = 'brightcove:new'
+    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
+    _TESTS = [{
+        'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
+        'md5': 'c8100925723840d4b0d243f7025703be',
+        'info_dict': {
+            'id': '4463358922001',
+            'ext': 'mp4',
+            'title': 'Meet the man behind Popcorn Time',
+            'description': 'md5:eac376a4fe366edc70279bfb681aea16',
+            'duration': 165.768,
+            'timestamp': 1441391203,
+            'upload_date': '20150904',
+            'uploader_id': '929656772001',
+            'formats': 'mincount:20',
+        },
+    }, {
+        # with rtmp streams
+        'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001',
+        'info_dict': {
+            'id': '4279049078001',
+            'ext': 'mp4',
+            'title': 'Titansgrave: Chapter 0',
+            'description': 'Titansgrave: Chapter 0',
+            'duration': 1242.058,
+            'timestamp': 1433556729,
+            'upload_date': '20150606',
+            'uploader_id': '4036320279001',
+            'formats': 'mincount:39',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # playlist stream
+        'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001',
+        'info_dict': {
+            'id': '5718313430001',
+            'title': 'No Audio Playlist',
+        },
+        'playlist_count': 7,
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001',
+        'only_matching': True,
+    }, {
+        # ref: prefixed video id
+        'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
+        'only_matching': True,
+    }, {
+        # non numeric ref: prefixed video id
+        'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
+        'only_matching': True,
+    }, {
+        # unavailable video without message but with error_code
+        'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(ie, webpage):
+        urls = BrightcoveNewIE._extract_urls(ie, webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _extract_urls(ie, webpage):
+        # Reference:
+        # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
+        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
+        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+        # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
+        # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+
+        entries = []
+
+        # Look for iframe embeds [1]
+        for _, url in re.findall(
+                r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
+            entries.append(url if url.startswith('http') else 'http:' + url)
+
+        # Look for <video> tags [2] and embed_in_page embeds [3]
+        # [2] looks like:
+        for video, script_tag, account_id, player_id, embed in re.findall(
+                r'''(?isx)
+                    (<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
+                    (?:.*?
+                        (<script[^>]+
+                            src=["\'](?:https?:)?//players\.brightcove\.net/
+                            (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+                        )
+                    )?
+                ''', webpage):
+            attrs = extract_attributes(video)
+
+            # According to examples from [4] it's unclear whether video id
+            # may be optional and what to do when it is
+            video_id = attrs.get('data-video-id')
+            if not video_id:
+                continue
+
+            account_id = account_id or attrs.get('data-account')
+            if not account_id:
+                continue
+
+            player_id = player_id or attrs.get('data-player') or 'default'
+            embed = embed or attrs.get('data-embed') or 'default'
+
+            bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
+                account_id, player_id, embed, video_id)
+
+            # Some brightcove videos may be embedded with video tag only and
+            # without script tag or any mentioning of brightcove at all. Such
+            # embeds are considered ambiguous since they are matched based only
+            # on data-video-id and data-account attributes and in the wild may
+            # not be brightcove embeds at all. Let's check reconstructed
+            # brightcove URLs in case of such embeds and only process valid
+            # ones. By this we ensure there is indeed a brightcove embed.
+            if not script_tag and not ie._is_valid_url(
+                    bc_url, video_id, 'possible brightcove video'):
+                continue
+
+            entries.append(bc_url)
+
+        return entries
+
+    def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
+        title = json_data['name'].strip()
+
+        formats = []
+        for source in json_data.get('sources', []):
+            container = source.get('container')
+            ext = mimetype2ext(source.get('type'))
+            src = source.get('src')
+            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+            if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
+                continue
+            elif ext == 'm3u8' or container == 'M2TS':
+                if not src:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                if not src:
+                    continue
+                formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
+            else:
+                streaming_src = source.get('streaming_src')
+                stream_name, app_name = source.get('stream_name'), source.get('app_name')
+                if not src and not streaming_src and (not stream_name or not app_name):
+                    continue
+                tbr = float_or_none(source.get('avg_bitrate'), 1000)
+                height = int_or_none(source.get('height'))
+                width = int_or_none(source.get('width'))
+                f = {
+                    'tbr': tbr,
+                    'filesize': int_or_none(source.get('size')),
+                    'container': container,
+                    'ext': ext or (container.lower() if container else None),
+                }
+                if width == 0 and height == 0:
+                    f.update({
+                        'vcodec': 'none',
+                    })
+                else:
+                    f.update({
+                        'width': width,
+                        'height': height,
+                        'vcodec': source.get('codec'),
+                    })
+
+                def build_format_id(kind):
+                    format_id = kind
+                    if tbr:
+                        format_id += '-%dk' % int(tbr)
+                    if height:
+                        format_id += '-%dp' % height
+                    return format_id
+
+                if src or streaming_src:
+                    f.update({
+                        'url': src or streaming_src,
+                        'format_id': build_format_id('http' if src else 'http-streaming'),
+                        'source_preference': 0 if src else -1,
+                    })
+                else:
+                    f.update({
+                        'url': app_name,
+                        'play_path': stream_name,
+                        'format_id': build_format_id('rtmp'),
+                    })
+                formats.append(f)
+        if not formats:
+            # for sonyliv.com DRM protected videos
+            s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
+            if s3_source_url:
+                formats.append({
+                    'url': s3_source_url,
+                    'format_id': 'source',
+                })
+
+        errors = json_data.get('errors')
+        if not formats and errors:
+            error = errors[0]
+            raise ExtractorError(
+                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+
+        self._sort_formats(formats)
+
+        for f in formats:
+            f.setdefault('http_headers', {}).update(headers)
+
+        subtitles = {}
+        for text_track in json_data.get('text_tracks', []):
+            if text_track.get('kind') != 'captions':
+                continue
+            text_track_url = url_or_none(text_track.get('src'))
+            if not text_track_url:
+                continue
+            lang = (str_or_none(text_track.get('srclang'))
+                    or str_or_none(text_track.get('label')) or 'en').lower()
+            subtitles.setdefault(lang, []).append({
+                'url': text_track_url,
+            })
+
+        is_live = False
+        duration = float_or_none(json_data.get('duration'), 1000)
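+        # A non-positive duration is treated as an indicator of a live stream.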
+        if duration is not None and duration <= 0:
+            is_live = True
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': clean_html(json_data.get('description')),
+            'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+            'duration': duration,
+            'timestamp': parse_iso8601(json_data.get('published_at')),
+            'uploader_id': json_data.get('account_id'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'tags': json_data.get('tags', []),
+            'is_live': is_live,
+        }
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+            'ip_blocks': smuggled_data.get('geo_ip_blocks'),
+        })
+
+        account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups()
+
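+        # Policy keys are cached per (account, player) pair and only
+        # re-extracted when the API rejects the cached one.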
+        policy_key_id = '%s_%s' % (account_id, player_id)
+        policy_key = self._downloader.cache.load('brightcove', policy_key_id)
+        policy_key_extracted = False
+        store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)
+
+        def extract_policy_key():
+            webpage = self._download_webpage(
+                'http://players.brightcove.net/%s/%s_%s/index.min.js'
+                % (account_id, player_id, embed), video_id)
+
+            policy_key = None
+
+            catalog = self._search_regex(
+                r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
+            if catalog:
+                catalog = self._parse_json(
+                    js_to_json(catalog), video_id, fatal=False)
+                if catalog:
+                    policy_key = catalog.get('policyKey')
+
+            if not policy_key:
+                policy_key = self._search_regex(
+                    r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+                    webpage, 'policy key', group='pk')
+
+            store_pk(policy_key)
+            return policy_key
+
+        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)
+        headers = {}
+        referrer = smuggled_data.get('referrer')
+        if referrer:
+            headers.update({
+                'Referer': referrer,
+                'Origin': re.search(r'https?://[^/]+', referrer).group(0),
+            })
+
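+        # First pass uses the cached policy key (if any); on an
+        # INVALID_POLICY_KEY error the cache is dropped and extraction retried.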
+        for _ in range(2):
+            if not policy_key:
+                policy_key = extract_policy_key()
+                policy_key_extracted = True
+            headers['Accept'] = 'application/json;pk=%s' % policy_key
+            try:
+                json_data = self._download_json(api_url, video_id, headers=headers)
+                break
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
+                    json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
+                    message = json_data.get('message') or json_data['error_code']
+                    if json_data.get('error_subcode') == 'CLIENT_GEO':
+                        self.raise_geo_restricted(msg=message)
+                    elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted:
+                        policy_key = None
+                        store_pk(None)
+                        continue
+                    raise ExtractorError(message, expected=True)
+                raise
+
+        errors = json_data.get('errors')
+        if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
+            custom_fields = json_data['custom_fields']
+            tve_token = self._extract_mvpd_auth(
+                smuggled_data['source_url'], video_id,
+                custom_fields['bcadobepassrequestorid'],
+                custom_fields['bcadobepassresourceid'])
+            json_data = self._download_json(
+                api_url, video_id, headers={
+                    'Accept': 'application/json;pk=%s' % policy_key
+                }, query={
+                    'tveToken': tve_token,
+                })
+
+        if content_type == 'playlist':
+            return self.playlist_result(
+                [self._parse_brightcove_metadata(vid, vid.get('id'), headers)
+                 for vid in json_data.get('videos', []) if vid.get('id')],
+                json_data.get('id'), json_data.get('name'),
+                json_data.get('description'))
+
+        return self._parse_brightcove_metadata(
+            json_data, video_id, headers=headers)
diff --git a/youtube_dl/extractor/businessinsider.py b/youtube_dl/extractor/businessinsider.py
new file mode 100644 (file)
index 0000000..73a57b1
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .jwplatform import JWPlatformIE
+
+
+class BusinessInsiderIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?businessinsider\.(?:com|nl)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://uk.businessinsider.com/how-much-radiation-youre-exposed-to-in-everyday-life-2016-6',
+        'md5': 'ffed3e1e12a6f950aa2f7d83851b497a',
+        'info_dict': {
+            'id': 'cjGDb0X9',
+            'ext': 'mp4',
+            'title': "Bananas give you more radiation exposure than living next to a nuclear power plant",
+            'description': 'md5:0175a3baf200dd8fa658f94cade841b3',
+            'upload_date': '20160611',
+            'timestamp': 1465675620,
+        },
+    }, {
+        'url': 'https://www.businessinsider.nl/5-scientifically-proven-things-make-you-less-attractive-2017-7/',
+        'md5': '43f438dbc6da0b89f5ac42f68529d84a',
+        'info_dict': {
+            'id': '5zJwd4FK',
+            'ext': 'mp4',
+            'title': 'Deze dingen zorgen ervoor dat je minder snel een date scoort',
+            'description': 'md5:2af8975825d38a4fed24717bbe51db49',
+            'upload_date': '20170705',
+            'timestamp': 1499270528,
+        },
+    }, {
+        'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        jwplatform_id = self._search_regex(
+            (r'data-media-id=["\']([a-zA-Z0-9]{8})',
+             r'id=["\']jwplayer_([a-zA-Z0-9]{8})',
+             r'id["\']?\s*:\s*["\']?([a-zA-Z0-9]{8})',
+             r'(?:jwplatform\.com/players/|jwplayer_)([a-zA-Z0-9]{8})'),
+            webpage, 'jwplatform id')
+        return self.url_result(
+            'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(),
+            video_id=video_id)
diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py
new file mode 100644 (file)
index 0000000..ec41109
--- /dev/null
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from .facebook import FacebookIE
+
+
+class BuzzFeedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
+        'info_dict': {
+            'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
+            'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
+            'description': 'Rambro!',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'aVCR29aE_OQ',
+                'ext': 'mp4',
+                'title': 'Angry Ram destroys a punching bag..',
+                'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
+                'upload_date': '20141024',
+                'uploader_id': 'Buddhanz1',
+                'uploader': 'Angry Ram',
+            }
+        }]
+    }, {
+        'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
+        'params': {
+            'skip_download': True,  # we already have enough YouTube download tests
+        },
+        'info_dict': {
+            'id': 'look-at-this-cute-dog-omg',
+            'description': 're:Munchkin the Teddy Bear is back ?!',
+            'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': 'mVmBL8B-In0',
+                'ext': 'mp4',
+                'title': 're:Munchkin the Teddy Bear gets her exercise',
+                'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
+                'upload_date': '20141124',
+                'uploader_id': 'CindysMunchkin',
+                'uploader': 're:^Munchkin the',
+            },
+        }]
+    }, {
+        'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
+        'info_dict': {
+            'id': 'the-most-adorable-crash-landing-ever',
+            'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
+            'description': 'This gosling knows how to stick a landing.',
+        },
+        'playlist': [{
+            'md5': '763ca415512f91ca62e4621086900a23',
+            'info_dict': {
+                'id': '971793786185728',
+                'ext': 'mp4',
+                'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
+                'uploader': 'Calgary Outdoor Centre-University of Calgary',
+            },
+        }],
+        'add_ie': ['Facebook'],
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        all_buckets = re.findall(
+            r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
+            webpage)
+
+        entries = []
+        for bd_json in all_buckets:
+            bd = json.loads(bd_json)
+            video = bd.get('video') or bd.get('progload_video')
+            if not video:
+                continue
+            entries.append(self.url_result(video['url']))
+
+        facebook_urls = FacebookIE._extract_urls(webpage)
+        entries.extend([
+            self.url_result(facebook_url)
+            for facebook_url in facebook_urls])
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
new file mode 100644 (file)
index 0000000..0b11bf1
--- /dev/null
@@ -0,0 +1,117 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    merge_dicts,
+    parse_duration,
+    url_or_none,
+)
+
+
+class BYUtvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?'
+    _TESTS = [{
+        # ooyalaVOD
+        'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
+        'info_dict': {
+            'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH',
+            'display_id': 'studio-c-season-5-episode-5',
+            'ext': 'mp4',
+            'title': 'Season 5 Episode 5',
+            'description': 'md5:1d31dc18ef4f075b28f6a65937d22c65',
+            'thumbnail': r're:^https?://.*',
+            'duration': 1486.486,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        # dvr
+        'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2',
+        'info_dict': {
+            'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451',
+            'display_id': 'byu-softball-pacific-vs-byu-41219---game-2',
+            'ext': 'mp4',
+            'title': 'Pacific vs. BYU (4/12/19)',
+            'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3',
+            'duration': 11645,
+        },
+        'params': {
+            'skip_download': True
+        },
+    }, {
+        'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.byutv.org/player/27741493-dc83-40b0-8420-e7ae38a2ae98/byu-football-toledo-vs-byu-93016?listid=4fe0fee5-0d3c-4a29-b725-e4948627f472&listindex=0&q=toledo',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        video = self._download_json(
+            'https://api.byutv.org/api3/catalog/getvideosforcontent',
+            display_id, query={
+                'contentid': video_id,
+                'channel': 'byutv',
+                'x-byutv-context': 'web$US',
+            }, headers={
+                'x-byutv-context': 'web$US',
+                'x-byutv-platformkey': 'xsaaw9c7y5',
+            })
+
+        ep = video.get('ooyalaVOD')
+        if ep:
+            return {
+                '_type': 'url_transparent',
+                'ie_key': 'Ooyala',
+                'url': 'ooyala:%s' % ep['providerId'],
+                'id': video_id,
+                'display_id': display_id,
+                'title': ep.get('title'),
+                'description': ep.get('description'),
+                'thumbnail': ep.get('imageThumbnail'),
+            }
+
+        info = {}
+        formats = []
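+        # non-Ooyala responses map top-level keys (used as format ids) to stream dicts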
+        for format_id, ep in video.items():
+            if not isinstance(ep, dict):
+                continue
+            video_url = url_or_none(ep.get('videoUrl'))
+            if not video_url:
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    video_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': format_id,
+                })
+            # merge_dicts() returns a new dict rather than mutating its
+            # arguments, so assign the result back to accumulate metadata
+            info = merge_dicts(info, {
+                'title': ep.get('title'),
+                'description': ep.get('description'),
+                'thumbnail': ep.get('imageThumbnail'),
+                'duration': parse_duration(ep.get('length')),
+            })
+        self._sort_formats(formats)
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': display_id,
+            'formats': formats,
+        })
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
new file mode 100644 (file)
index 0000000..cac8fdc
--- /dev/null
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class C56IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
+    IE_NAME = '56.com'
+    _TESTS = [{
+        'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
+        'md5': 'e59995ac63d0457783ea05f93f12a866',
+        'info_dict': {
+            'id': '93440716',
+            'ext': 'flv',
+            'title': '网事知多少 第32期:车怒',
+            'duration': 283.813,
+        },
+    }, {
+        'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
+        'md5': '',
+        'info_dict': {
+            'id': '82247482',
+            'title': '爱的诅咒之杜鹃花开',
+        },
+        'playlist_count': 7,
+        'add_ie': ['Sohu'],
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        text_id = mobj.group('textid')
+
+        webpage = self._download_webpage(url, text_id)
+        sohu_video_info_str = self._search_regex(
+            r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
+        if sohu_video_info_str:
+            sohu_video_info = self._parse_json(
+                sohu_video_info_str, text_id, transform_source=js_to_json)
+            return self.url_result(sohu_video_info['url'], 'Sohu')
+
+        page = self._download_json(
+            'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
+
+        info = page['info']
+
+        formats = [
+            {
+                'format_id': f['type'],
+                'filesize': int(f['filesize']),
+                'url': f['url']
+            } for f in info['rfiles']
+        ]
+        self._sort_formats(formats)
+
+        return {
+            'id': info['vid'],
+            'title': info['Subject'],
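+            # duration is reported in milliseconds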
+            'duration': int(info['duration']) / 1000.0,
+            'formats': formats,
+            'thumbnail': info.get('bimg') or info.get('img'),
+        }
diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py
new file mode 100644 (file)
index 0000000..8f0c6c5
--- /dev/null
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlencode,
+    compat_urlparse,
+)
+from ..utils import (
+    clean_html,
+    parse_duration,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class CamdemyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
+    _TESTS = [{
+        # single file
+        'url': 'http://www.camdemy.com/media/5181/',
+        'md5': '5a5562b6a98b37873119102e052e311b',
+        'info_dict': {
+            'id': '5181',
+            'ext': 'mp4',
+            'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'creator': 'ss11spring',
+            'duration': 1591,
+            'upload_date': '20130114',
+            'view_count': int,
+        }
+    }, {
+        # With non-empty description
+        # webpage returns "No permission or not login"
+        'url': 'http://www.camdemy.com/media/13885',
+        'md5': '4576a3bb2581f86c61044822adbd1249',
+        'info_dict': {
+            'id': '13885',
+            'ext': 'mp4',
+            'title': 'EverCam + Camdemy QuickStart',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
+            'creator': 'evercam',
+            'duration': 318,
+        }
+    }, {
+        # External source (YouTube)
+        'url': 'http://www.camdemy.com/media/14842',
+        'info_dict': {
+            'id': '2vsYQzNIsJo',
+            'ext': 'mp4',
+            'title': 'Excel 2013 Tutorial - How to add Password Protection',
+            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
+            'upload_date': '20130211',
+            'uploader': 'Hun Kim',
+            'uploader_id': 'hunkimtutorials',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        src_from = self._html_search_regex(
+            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
+            webpage, 'external source', default=None, group='url')
+        if src_from:
+            return self.url_result(src_from)
+
+        oembed_obj = self._download_json(
+            'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
+
+        title = oembed_obj['title']
+        thumb_url = oembed_obj['thumbnail_url']
+        video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
+        file_list_doc = self._download_xml(
+            compat_urlparse.urljoin(video_folder, 'fileList.xml'),
+            video_id, 'Downloading filelist XML')
+        file_name = file_list_doc.find('./video/item/fileName').text
+        video_url = compat_urlparse.urljoin(video_folder, file_name)
+
+        # Some URLs return "No permission or not login" in the webpage despite the
+        # video being freely available via the oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
+        upload_date = unified_strdate(self._search_regex(
+            r'>published on ([^<]+)<', webpage,
+            'upload date', default=None))
+        view_count = str_to_int(self._search_regex(
+            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
+            webpage, 'view count', default=None))
+        description = self._html_search_meta(
+            'description', webpage, default=None) or clean_html(
+            oembed_obj.get('description'))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumb_url,
+            'description': description,
+            'creator': oembed_obj.get('author_name'),
+            'duration': parse_duration(oembed_obj.get('duration')),
+            'upload_date': upload_date,
+            'view_count': view_count,
+        }
+
+
+class CamdemyFolderIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
+    _TESTS = [{
+        # the media links on this folder page carry a trailing slash
+        'url': 'http://www.camdemy.com/folder/450',
+        'info_dict': {
+            'id': '450',
+            'title': '信號與系統 2012 & 2011 (Signals and Systems)',
+        },
+        'playlist_mincount': 145
+    }, {
+        # media links without a trailing slash,
+        # and the folder spans multiple pages
+        'url': 'http://www.camdemy.com/folder/853',
+        'info_dict': {
+            'id': '853',
+            'title': '科學計算 - 使用 Matlab'
+        },
+        'playlist_mincount': 20
+    }, {
+        # with a displayMode parameter, to test the code that rewrites query parameters
+        'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
+        'info_dict': {
+            'id': '853',
+            'title': '科學計算 - 使用 Matlab'
+        },
+        'playlist_mincount': 20
+    }]
+
+    def _real_extract(self, url):
+        folder_id = self._match_id(url)
+
+        # Add displayMode=list so that all links are displayed on a single page
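+        # (e.g. ?displayMode=defaultOrderByOrg becomes ?displayMode=list)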
+        parsed_url = list(compat_urlparse.urlparse(url))
+        query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
+        query.update({'displayMode': 'list'})
+        parsed_url[4] = compat_urllib_parse_urlencode(query)
+        final_url = compat_urlparse.urlunparse(parsed_url)
+
+        page = self._download_webpage(final_url, folder_id)
+        matches = re.findall(r"href='(/media/\d+/?)'", page)
+
+        entries = [self.url_result('http://www.camdemy.com' + media_path)
+                   for media_path in matches]
+
+        folder_title = self._html_search_meta('keywords', page)
+
+        return self.playlist_result(entries, folder_id, folder_title)
diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py
new file mode 100644 (file)
index 0000000..1eb81b7
--- /dev/null
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    url_or_none,
+)
+
+
+class CamModelsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.cammodels.com/cam/AutumnKnight/',
+        'only_matching': True,
+        'age_limit': 18
+    }]
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, user_id, headers=self.geo_verification_headers())
+
+        manifest_root = self._html_search_regex(
+            r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None)
+
+        if not manifest_root:
+            ERRORS = (
+                ("I'm offline, but let's stay connected", 'This user is currently offline'),
+                ('in a private show', 'This user is in a private show'),
+                ('is currently performing LIVE', 'This model is currently performing live'),
+            )
+            for pattern, message in ERRORS:
+                if pattern in webpage:
+                    error = message
+                    expected = True
+                    break
+            else:
+                error = 'Unable to find manifest URL root'
+                expected = False
+            raise ExtractorError(error, expected=expected)
+
+        manifest = self._download_json(
+            '%s%s.json' % (manifest_root, user_id), user_id)
+
+        formats = []
+        for format_id, format_dict in manifest['formats'].items():
+            if not isinstance(format_dict, dict):
+                continue
+            encodings = format_dict.get('encodings')
+            if not isinstance(encodings, list):
+                continue
+            vcodec = format_dict.get('videoCodec')
+            acodec = format_dict.get('audioCodec')
+            for media in encodings:
+                if not isinstance(media, dict):
+                    continue
+                media_url = url_or_none(media.get('location'))
+                if not media_url:
+                    continue
+
+                format_id_list = [format_id]
+                height = int_or_none(media.get('videoHeight'))
+                if height is not None:
+                    format_id_list.append('%dp' % height)
+                f = {
+                    'url': media_url,
+                    'format_id': '-'.join(format_id_list),
+                    'width': int_or_none(media.get('videoWidth')),
+                    'height': height,
+                    'vbr': int_or_none(media.get('videoKbps')),
+                    'abr': int_or_none(media.get('audioKbps')),
+                    'fps': int_or_none(media.get('fps')),
+                    'vcodec': vcodec,
+                    'acodec': acodec,
+                }
+                if 'rtmp' in format_id:
+                    f['ext'] = 'flv'
+                elif 'hls' in format_id:
+                    f.update({
+                        'ext': 'mp4',
+                        # the HLS streams skip fragments, so prefer RTMP
+                        'preference': -1,
+                    })
+                else:
+                    continue
+                formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': user_id,
+            'title': self._live_title(user_id),
+            'is_live': True,
+            'formats': formats,
+            'age_limit': 18
+        }
diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py
new file mode 100644 (file)
index 0000000..b3be3bd
--- /dev/null
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class CamTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female',
+        'info_dict': {
+            'id': '42ad3956-dd5b-445a-8313-803ea6079fac',
+            'display_id': 'minafay-030618-1136-chaturbate-female',
+            'ext': 'mp4',
+            'title': 'minafay-030618-1136-chaturbate-female',
+            'duration': 1274,
+            'timestamp': 1528018608,
+            'upload_date': '20180603',
+            'age_limit': 18
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    _API_BASE = 'https://api.camtube.co'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        token = self._download_json(
+            '%s/rpc/session/new' % self._API_BASE, display_id,
+            'Downloading session token')['token']
+
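+        # the recordings endpoint expects the session token as a cookie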
+        self._set_cookie('api.camtube.co', 'session', token)
+
+        video = self._download_json(
+            '%s/recordings/%s' % (self._API_BASE, display_id), display_id,
+            headers={'Referer': url})
+
+        video_id = video['uuid']
+        timestamp = unified_timestamp(video.get('createdAt'))
+        duration = int_or_none(video.get('duration'))
+        view_count = int_or_none(video.get('viewCount'))
+        like_count = int_or_none(video.get('likeCount'))
+        creator = video.get('stageName')
+
+        formats = [{
+            'url': '%s/recordings/%s/manifest.m3u8'
+                   % (self._API_BASE, video_id),
+            'format_id': 'hls',
+            'ext': 'mp4',
+            'protocol': 'm3u8_native',
+        }]
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': display_id,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'creator': creator,
+            'formats': formats,
+            'age_limit': 18
+        }
diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py
new file mode 100644 (file)
index 0000000..bbc5205
--- /dev/null
@@ -0,0 +1,89 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    unified_strdate,
+)
+
+
+class CamWithHerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)'
+
+    _TESTS = [{
+        'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=',
+        'info_dict': {
+            'id': '5644',
+            'ext': 'flv',
+            'title': 'Periscope Tease',
+            'description': 'In the clouds teasing on periscope to my favorite song',
+            'duration': 240,
+            'view_count': int,
+            'comment_count': int,
+            'uploader': 'MileenaK',
+            'upload_date': '20160322',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937',
+        'only_matching': True,
+    }, {
+        'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=',
+        'only_matching': True,
+    }, {
+        'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        flv_id = self._html_search_regex(
+            r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id')
+
+        # Video URL construction algorithm is reverse-engineered from cwhplayer.swf
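+        # (numeric ids above 2010 are served as mp4:<id>.mp4, older ones as bare flv ids)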
+        rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % (
+            ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id)
+
+        title = self._html_search_regex(
+            r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title')
+        description = self._html_search_regex(
+            r'>Description:</span>(.+?)</div>', webpage, 'description', default=None)
+
+        runtime = self._search_regex(
+            r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None)
+        if runtime:
+            runtime = re.sub(r'[\s-]', '', runtime)
+        duration = parse_duration(runtime)
+        view_count = int_or_none(self._search_regex(
+            r'Views\s*:\s*(\d+)', webpage, 'view count', default=None))
+        comment_count = int_or_none(self._search_regex(
+            r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None))
+
+        uploader = self._search_regex(
+            r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None)
+        upload_date = unified_strdate(self._search_regex(
+            r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None))
+
+        return {
+            'id': flv_id,
+            'url': rtmp_url,
+            'ext': 'flv',
+            'no_resume': True,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'age_limit': 18
+        }
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
new file mode 100644 (file)
index 0000000..407cc80
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class Canalc2IE(InfoExtractor):
+    IE_NAME = 'canalc2.tv'
+    _VALID_URL = r'https?://(?:(?:www\.)?canalc2\.tv/video/|archives-canalc2\.u-strasbg\.fr/video\.asp\?.*\bidVideo=)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.canalc2.tv/video/12163',
+        'md5': '060158428b650f896c542dfbb3d6487f',
+        'info_dict': {
+            'id': '12163',
+            'ext': 'mp4',
+            'title': 'Terrasses du Numérique',
+            'duration': 122,
+        },
+    }, {
+        'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.canalc2.tv/video/%s' % video_id, video_id)
+
+        title = self._html_search_regex(
+            r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.+?)</h3>',
+            webpage, 'title')
+
+        formats = []
+        for _, video_url in re.findall(r'file\s*=\s*(["\'])(.+?)\1', webpage):
+            if video_url.startswith('rtmp://'):
+                rtmp = re.search(
+                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
+                formats.append({
+                    'url': rtmp.group('url'),
+                    'format_id': 'rtmp',
+                    'ext': 'flv',
+                    'app': rtmp.group('app'),
+                    'play_path': rtmp.group('play_path'),
+                    'page_url': url,
+                })
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': 'http',
+                })
+
+        if formats:
+            info = {
+                'formats': formats,
+            }
+        else:
+            info = self._parse_html5_media_entries(url, webpage, url)[0]
+
+        self._sort_formats(info['formats'])
+
+        info.update({
+            'id': video_id,
+            'title': title,
+            'duration': parse_duration(self._search_regex(
+                r'id=["\']video_duree["\'][^>]*>([^<]+)',
+                webpage, 'duration', fatal=False)),
+        })
+        return info
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
new file mode 100644 (file)
index 0000000..51c11cb
--- /dev/null
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    # ExtractorError,
+    # HEADRequest,
+    int_or_none,
+    qualities,
+    unified_strdate,
+)
+
+
+class CanalplusIE(InfoExtractor):
+    IE_DESC = 'mycanal.fr and piwiplus.fr'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>mycanal|piwiplus)\.fr/(?:[^/]+/)*(?P<display_id>[^?/]+)(?:\.html\?.*\bvid=|/p/)(?P<id>\d+)'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
+    _SITE_ID_MAP = {
+        'mycanal': 'cplus',
+        'piwiplus': 'teletoon',
+    }
+
+    # Geo bypass only works for direct mp4 URLs
+    _GEO_COUNTRIES = ['FR']
+
+    _TESTS = [{
+        'url': 'https://www.mycanal.fr/d17-emissions/lolywood/p/1397061',
+        'info_dict': {
+            'id': '1397061',
+            'display_id': 'lolywood',
+            'ext': 'mp4',
+            'title': 'Euro 2016 : Je préfère te prévenir - Lolywood - Episode 34',
+            'description': 'md5:7d97039d455cb29cdba0d652a0efaa5e',
+            'upload_date': '20160602',
+        },
+    }, {
+        # geo restricted, bypassed
+        'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
+        'info_dict': {
+            'id': '1108190',
+            'display_id': 'pid1405-le-labyrinthe-boing-super-ranger',
+            'ext': 'mp4',
+            'title': 'BOING SUPER RANGER - Ep : Le labyrinthe',
+            'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
+            'upload_date': '20140724',
+        },
+        'expected_warnings': ['HTTP Error 403: Forbidden'],
+    }]
+
+    def _real_extract(self, url):
+        site, display_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        site_id = self._SITE_ID_MAP[site]
+
+        info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id)
+        video_data = self._download_json(info_url, video_id, 'Downloading video JSON')
+
+        if isinstance(video_data, list):
+            video_data = [video for video in video_data if video.get('ID') == video_id][0]
+        media = video_data['MEDIA']
+        infos = video_data['INFOS']
+
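+        # qualities() ranks format ids by their position in this list, so HD sorts highest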
+        preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD'])
+
+        # _, fmt_url = next(iter(media['VIDEOS'].items()))
+        # if '/geo' in fmt_url.lower():
+        #     response = self._request_webpage(
+        #         HEADRequest(fmt_url), video_id,
+        #         'Checking if the video is georestricted')
+        #     if '/blocage' in response.geturl():
+        #         raise ExtractorError(
+        #             'The video is not available in your country',
+        #             expected=True)
+
+        formats = []
+        for format_id, format_url in media['VIDEOS'].items():
+            if not format_url:
+                continue
+            if format_id == 'HLS':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
+            elif format_id == 'HDS':
+                formats.extend(self._extract_f4m_formats(
+                    format_url + '?hdcore=2.11.3', video_id, f4m_id=format_id, fatal=False))
+            else:
+                formats.append({
+                    # the secret extracted from the ya() function in http://player.canalplus.fr/common/js/canalPlayer.js
+                    'url': format_url + '?secret=pqzerjlsmdkjfoiuerhsdlfknaes',
+                    'format_id': format_id,
+                    'preference': preference(format_id),
+                })
+        self._sort_formats(formats)
+
+        thumbnails = [{
+            'id': image_id,
+            'url': image_url,
+        } for image_id, image_url in media.get('images', {}).items()]
+
+        titrage = infos['TITRAGE']
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': '%s - %s' % (titrage['TITRE'],
+                                  titrage['SOUS_TITRE']),
+            'upload_date': unified_strdate(infos.get('PUBLICATION', {}).get('DATE')),
+            'thumbnails': thumbnails,
+            'description': infos.get('DESCRIPTION'),
+            'duration': int_or_none(infos.get('DURATION')),
+            'view_count': int_or_none(infos.get('NB_VUES')),
+            'like_count': int_or_none(infos.get('NB_LIKES')),
+            'comment_count': int_or_none(infos.get('NB_COMMENTS')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
new file mode 100644 (file)
index 0000000..8667a0d
--- /dev/null
@@ -0,0 +1,368 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from .gigya import GigyaBaseIE
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    strip_or_none,
+    float_or_none,
+    int_or_none,
+    merge_dicts,
+    parse_iso8601,
+    str_or_none,
+    url_or_none,
+)
+
+
+class CanvasIE(InfoExtractor):
+    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+        'md5': '68993eda72ef62386a15ea2cf3c93107',
+        'info_dict': {
+            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+            'ext': 'mp4',
+            'title': 'Nachtwacht: De Greystook',
+            'description': 'Nachtwacht: De Greystook',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1468.04,
+        },
+        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
+    }, {
+        'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
+        'only_matching': True,
+    }]
+    _HLS_ENTRY_PROTOCOLS_MAP = {
+        'HLS': 'm3u8_native',
+        'HLS_AES': 'm3u8',
+    }
+    _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site_id, video_id = mobj.group('site_id'), mobj.group('id')
+
+        # Old API endpoint, serves more formats but may fail for some videos
+        data = self._download_json(
+            'https://mediazone.vrt.be/api/v1/%s/assets/%s'
+            % (site_id, video_id), video_id, 'Downloading asset JSON',
+            'Unable to download asset JSON', fatal=False)
+
+        # New API endpoint
+        if not data:
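+            # an empty POST body appears to be enough to obtain a player token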
+            token = self._download_json(
+                '%s/tokens' % self._REST_API_BASE, video_id,
+                'Downloading token', data=b'',
+                headers={'Content-Type': 'application/json'})['vrtPlayerToken']
+            data = self._download_json(
+                '%s/videos/%s' % (self._REST_API_BASE, video_id),
+                video_id, 'Downloading video JSON', fatal=False, query={
+                    'vrtPlayerToken': token,
+                    'client': '%s@PROD' % site_id,
+                }, expected_status=400)
+            message = data.get('message')
+            if message and not data.get('title'):
+                if data.get('code') == 'AUTHENTICATION_REQUIRED':
+                    self.raise_login_required(message)
+                raise ExtractorError(message, expected=True)
+
+        title = data['title']
+        description = data.get('description')
+
+        formats = []
+        for target in data['targetUrls']:
+            format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
+            if not format_url or not format_type:
+                continue
+            format_type = format_type.upper()
+            if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
+                    m3u8_id=format_type, fatal=False))
+            elif format_type == 'HDS':
+                formats.extend(self._extract_f4m_formats(
+                    format_url, video_id, f4m_id=format_type, fatal=False))
+            elif format_type == 'MPEG_DASH':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id=format_type, fatal=False))
+            elif format_type == 'HSS':
+                formats.extend(self._extract_ism_formats(
+                    format_url, video_id, ism_id='mss', fatal=False))
+            else:
+                formats.append({
+                    'format_id': format_type,
+                    'url': format_url,
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitle_urls = data.get('subtitleUrls')
+        if isinstance(subtitle_urls, list):
+            for subtitle in subtitle_urls:
+                subtitle_url = subtitle.get('url')
+                if subtitle_url and subtitle.get('type') == 'CLOSED':
+                    subtitles.setdefault('nl', []).append({'url': subtitle_url})
+
+        return {
+            'id': video_id,
+            'display_id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'duration': float_or_none(data.get('duration'), 1000),
+            'thumbnail': data.get('posterImageUrl'),
+            'subtitles': subtitles,
+        }
+
+
+class CanvasEenIE(InfoExtractor):
+    IE_DESC = 'canvas.be and een.be'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
+        'md5': 'ed66976748d12350b118455979cca293',
+        'info_dict': {
+            'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
+            'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
+            'ext': 'flv',
+            'title': 'De afspraak veilt voor de Warmste Week',
+            'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 49.02,
+        },
+        'expected_warnings': ['is not a supported codec'],
+    }, {
+        # with subtitles
+        'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
+        'info_dict': {
+            'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
+            'display_id': 'pieter-0167',
+            'ext': 'mp4',
+            'title': 'Pieter 0167',
+            'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2553.08,
+            'subtitles': {
+                'nl': [{
+                    'ext': 'vtt',
+                }],
+            },
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Pagina niet gevonden',
+    }, {
+        'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
+        'info_dict': {
+            'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
+            'display_id': 'emma-pakt-thilly-aan',
+            'ext': 'mp4',
+            'title': 'Emma pakt Thilly aan',
+            'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 118.24,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['is not a supported codec'],
+    }, {
+        'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site_id, display_id = mobj.group('site_id'), mobj.group('id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = strip_or_none(self._search_regex(
+            r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
+            webpage, 'title', default=None) or self._og_search_title(
+            webpage, default=None))
+
+        video_id = self._html_search_regex(
+            r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
+            group='id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
+            'ie_key': CanvasIE.ie_key(),
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+        }
+
+
+class VrtNUIE(GigyaBaseIE):
+    IE_DESC = 'VrtNU.be'
+    _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        # Available via old API endpoint
+        'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
+        'info_dict': {
+            'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
+            'ext': 'mp4',
+            'title': 'De zwarte weduwe',
+            'description': 'md5:db1227b0f318c849ba5eab1fef895ee4',
+            'duration': 1457.04,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'season': 'Season 1',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'skip': 'This video is only available for registered users',
+        'params': {
+            'username': '<snip>',
+            'password': '<snip>',
+        },
+        'expected_warnings': ['is not a supported codec'],
+    }, {
+        # Only available via new API endpoint
+        'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
+        'info_dict': {
+            'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
+            'ext': 'mp4',
+            'title': 'Aflevering 5',
+            'description': 'Wie valt door de mand tijdens een missie?',
+            'duration': 2967.06,
+            'season': 'Season 1',
+            'season_number': 1,
+            'episode_number': 5,
+        },
+        'skip': 'This video is only available for registered users',
+        'params': {
+            'username': '<snip>',
+            'password': '<snip>',
+        },
+        'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
+    }]
+    _NETRC_MACHINE = 'vrtnu'
+    _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
+    _CONTEXT_ID = 'R3595707040'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        auth_data = {
+            'APIKey': self._APIKEY,
+            'targetEnv': 'jssdk',
+            'loginID': username,
+            'password': password,
+            'authMode': 'cookie',
+        }
+
+        auth_info = self._gigya_login(auth_data)
+
+        # Sometimes authentication fails for no apparent reason; retry up to 3 times
+        login_attempt = 1
+        while login_attempt <= 3:
+            try:
+                # When requesting a token, no actual token is returned, but the
+                # necessary cookies are set.
+                self._request_webpage(
+                    'https://token.vrt.be',
+                    None, note='Requesting a token', errnote='Could not get a token',
+                    headers={
+                        'Content-Type': 'application/json',
+                        'Referer': 'https://www.vrt.be/vrtnu/',
+                    },
+                    data=json.dumps({
+                        'uid': auth_info['UID'],
+                        'uidsig': auth_info['UIDSignature'],
+                        'ts': auth_info['signatureTimestamp'],
+                        'email': auth_info['profile']['email'],
+                    }).encode('utf-8'))
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    login_attempt += 1
+                    self.report_warning('Authentication failed')
+                    self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
+                else:
+                    raise e
+            else:
+                break
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage, urlh = self._download_webpage_handle(url, display_id)
+
+        info = self._search_json_ld(webpage, display_id, default={})
+
+        # title is optional here since it may be extracted by the extractor
+        # that this request is delegated to
+        title = strip_or_none(self._html_search_regex(
+            r'(?ms)<h1 class="content__heading">(.+?)</h1>',
+            webpage, 'title', default=None))
+
+        description = self._html_search_regex(
+            r'(?ms)<div class="content__description">(.+?)</div>',
+            webpage, 'description', default=None)
+
+        season = self._html_search_regex(
+            [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
+                    <span>seizoen\ (.+?)</span>\s*
+                </div>''',
+             r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
+            webpage, 'season', default=None)
+
+        season_number = int_or_none(season)
+
+        episode_number = int_or_none(self._html_search_regex(
+            r'''(?xms)<div\ class="content__episode">\s*
+                    <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
+                </div>''',
+            webpage, 'episode_number', default=None))
+
+        release_date = parse_iso8601(self._html_search_regex(
+            r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
+            webpage, 'release_date', default=None))
+
+        # If there's a ? or a # in the URL, remove it and everything after it
+        clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
+        securevideo_url = clean_url + '.mssecurevideo.json'
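+        # e.g. .../vrtnu/a-z/<show>/1/<episode>/ -> .../vrtnu/a-z/<show>/1/<episode>.mssecurevideo.json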
+
+        try:
+            video = self._download_json(securevideo_url, display_id)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                self.raise_login_required()
+            raise
+
+        # We are dealing with a '../<show>.relevant' URL
+        redirect_url = video.get('url')
+        if redirect_url:
+            return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
+
+        # There is only one entry, but with an unknown key, so just get
+        # the first one
+        video_id = list(video.values())[0].get('videoid')
+
+        return merge_dicts(info, {
+            '_type': 'url_transparent',
+            'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
+            'ie_key': CanvasIE.ie_key(),
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'season': season,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'release_date': release_date,
+        })
diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py
new file mode 100644 (file)
index 0000000..b57b86a
--- /dev/null
@@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    try_get,
+)
+
+from .videomore import VideomoreIE
+
+
+class CarambaTVIE(InfoExtractor):
+    _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://video1.carambatv.ru/v/191910501',
+        'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a',
+        'info_dict': {
+            'id': '191910501',
+            'ext': 'mp4',
+            'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2678.31,
+        },
+    }, {
+        'url': 'carambatv:191910501',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id,
+            video_id)
+
+        title = video['title']
+
+        base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id
+
+        formats = [{
+            'url': base_url + f['fn'],
+            'height': int_or_none(f.get('height')),
+            'format_id': '%sp' % f['height'] if f.get('height') else None,
+        } for f in video['qualities'] if f.get('fn')]
+        self._sort_formats(formats)
+
+        thumbnail = video.get('splash')
+        duration = float_or_none(try_get(
+            video, lambda x: x['annotations'][0]['end_time'], compat_str))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class CarambaTVPageIE(InfoExtractor):
+    _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
+        'md5': 'a49fb0ec2ad66503eeb46aac237d3c86',
+        'info_dict': {
+            'id': '475222',
+            'ext': 'flv',
+            'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
+            'thumbnail': r're:^https?://.*\.jpg',
+            # duration reported by videomore is incorrect
+            'duration': int,
+        },
+        'add_ie': [VideomoreIE.ie_key()],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        videomore_url = VideomoreIE._extract_url(webpage)
+        if not videomore_url:
+            videomore_id = self._search_regex(
+                r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id',
+                default=None)
+            if videomore_id:
+                videomore_url = 'videomore:%s' % videomore_id
+        if videomore_url:
+            title = self._og_search_title(webpage)
+            return {
+                '_type': 'url_transparent',
+                'url': videomore_url,
+                'ie_key': VideomoreIE.ie_key(),
+                'title': title,
+            }
+
+        video_url = self._og_search_property('video:iframe', webpage, default=None)
+
+        if not video_url:
+            video_id = self._search_regex(
+                r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)',
+                webpage, 'video id')
+            video_url = 'carambatv:%s' % video_id
+
+        return self.url_result(video_url, CarambaTVIE.ie_key())
diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py
new file mode 100644 (file)
index 0000000..48b3361
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .turner import TurnerBaseIE
+from ..utils import int_or_none
+
+
+class CartoonNetworkIE(TurnerBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html'
+    _TEST = {
+        'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html',
+        'info_dict': {
+            'id': '6e3375097f63874ebccec7ef677c1c3845fa850e',
+            'ext': 'mp4',
+            'title': 'How to Draw Upgrade',
+            'description': 'md5:2061d83776db7e8be4879684eefe8c0f',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
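+        # The player metadata lives in inline JS, assigned either on
+        # _cnglobal.currentVideo.<field> or on video_metadata.content_<field>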
+        def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False):
+            metadata_re = ''
+            if content_re:
+                metadata_re = r'|video_metadata\.content_' + content_re
+            return self._search_regex(
+                r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re),
+                webpage, name, fatal=fatal)
+
+        media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True)
+        title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True)
+
+        info = self._extract_ngtv_info(
+            media_id, {'networkId': 'cartoonnetwork'}, {
+                'url': url,
+                'site_name': 'CartoonNetwork',
+                'auth_required': find_field('authType', 'auth type') != 'unauth',
+            })
+
+        series = find_field(
+            'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage)
+        info.update({
+            'id': media_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._html_search_meta('description', webpage),
+            'series': series,
+            'episode': title,
+        })
+
+        for field in ('season', 'episode'):
+            field_name = field + 'Number'
+            info[field + '_number'] = int_or_none(find_field(
+                field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage))
+
+        return info
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
new file mode 100644 (file)
index 0000000..fd5ec60
--- /dev/null
@@ -0,0 +1,497 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import json
+import re
+from xml.sax.saxutils import escape
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_HTTPError,
+)
+from ..utils import (
+    js_to_json,
+    smuggle_url,
+    try_get,
+    xpath_text,
+    xpath_element,
+    xpath_with_ns,
+    find_xpath_attr,
+    orderedSet,
+    parse_duration,
+    parse_iso8601,
+    parse_age_limit,
+    strip_or_none,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class CBCIE(InfoExtractor):
+    IE_NAME = 'cbc.ca'
+    _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
+    _TESTS = [{
+        # with mediaId
+        'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
+        'md5': '97e24d09672fc4cf56256d6faa6c25bc',
+        'info_dict': {
+            'id': '2682904050',
+            'ext': 'mp4',
+            'title': 'Don Cherry – All-Stars',
+            'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
+            'timestamp': 1454463000,
+            'upload_date': '20160203',
+            'uploader': 'CBCC-NEW',
+        },
+        'skip': 'Geo-restricted to Canada',
+    }, {
+        # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com
+        'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4',
+        'md5': '162adfa070274b144f4fdc3c3b8207db',
+        'info_dict': {
+            'id': '2414435309',
+            'ext': 'mp4',
+            'title': '22 Minutes Update: What Not To Wear Quebec',
+            'description': "This week's latest Canadian top political story is What Not To Wear Quebec.",
+            'upload_date': '20131025',
+            'uploader': 'CBCC-NEW',
+            'timestamp': 1382717907,
+        },
+    }, {
+        # with clipId, feed only available via tpfeed.cbc.ca
+        'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
+        'md5': '0274a90b51a9b4971fe005c63f592f12',
+        'info_dict': {
+            'id': '2487345465',
+            'ext': 'mp4',
+            'title': 'Robin Williams freestyles on 90 Minutes Live',
+            'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
+            'upload_date': '19780210',
+            'uploader': 'CBCC-NEW',
+            'timestamp': 255977160,
+        },
+    }, {
+        # multiple iframes
+        'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
+        'playlist': [{
+            'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
+            'info_dict': {
+                'id': '2680832926',
+                'ext': 'mp4',
+                'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
+                'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
+                'upload_date': '20160201',
+                'timestamp': 1454342820,
+                'uploader': 'CBCC-NEW',
+            },
+        }, {
+            'md5': '415a0e3f586113894174dfb31aa5bb1a',
+            'info_dict': {
+                'id': '2658915080',
+                'ext': 'mp4',
+                'title': 'Fly like an eagle!',
+                'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
+                'upload_date': '20150315',
+                'timestamp': 1426443984,
+                'uploader': 'CBCC-NEW',
+            },
+        }],
+        'skip': 'Geo-restricted to Canada',
+    }, {
+        # multiple CBC.APP.Caffeine.initInstance(...)
+        'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238',
+        'info_dict': {
+            'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
+            'id': 'dog-indoor-exercise-winter-1.3928238',
+            'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
+        },
+        'playlist_mincount': 6,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url)
+
+    def _extract_player_init(self, player_init, display_id):
+        player_info = self._parse_json(player_init, display_id, js_to_json)
+        media_id = player_info.get('mediaId')
+        if not media_id:
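+            # Older player configs carry only a clipId; resolve it to a
+            # ThePlatform mediaId via tpfeed.cbc.ca, falling back to
+            # feed.theplatform.com.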
+            clip_id = player_info['clipId']
+            feed = self._download_json(
+                'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id,
+                clip_id, fatal=False)
+            if feed:
+                media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str)
+            if not media_id:
+                media_id = self._download_json(
+                    'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
+                    clip_id)['entries'][0]['id'].split('/')[-1]
+        return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title', default=None) or self._html_search_regex(
+                r'<title>([^<]+)</title>', webpage, 'title', fatal=False)
+        entries = [
+            self._extract_player_init(player_init, display_id)
+            for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
+        media_ids = []
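+        # Collect media ids from iframe embeds, player containers and
+        # inline guid declarations.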
+        for media_id_re in (
+                r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
+                r'<div[^>]+\bid=["\']player-(\d+)',
+                r'guid["\']\s*:\s*["\'](\d+)'):
+            media_ids.extend(re.findall(media_id_re, webpage))
+        entries.extend([
+            self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
+            for media_id in orderedSet(media_ids)])
+        return self.playlist_result(
+            entries, display_id, strip_or_none(title),
+            self._og_search_description(webpage))
+
+
+class CBCPlayerIE(InfoExtractor):
+    IE_NAME = 'cbc.ca:player'
+    _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.cbc.ca/player/play/2683190193',
+        'md5': '64d25f841ddf4ddb28a235338af32e2c',
+        'info_dict': {
+            'id': '2683190193',
+            'ext': 'mp4',
+            'title': 'Gerry Runs a Sweat Shop',
+            'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
+            'timestamp': 1455071400,
+            'upload_date': '20160210',
+            'uploader': 'CBCC-NEW',
+        },
+        'skip': 'Geo-restricted to Canada',
+    }, {
+        # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
+        'url': 'http://www.cbc.ca/player/play/2657631896',
+        'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
+        'info_dict': {
+            'id': '2657631896',
+            'ext': 'mp3',
+            'title': 'CBC Montreal is organizing its first ever community hackathon!',
+            'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
+            'timestamp': 1425704400,
+            'upload_date': '20150307',
+            'uploader': 'CBCC-NEW',
+        },
+    }, {
+        'url': 'http://www.cbc.ca/player/play/2164402062',
+        'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
+        'info_dict': {
+            'id': '2164402062',
+            'ext': 'mp4',
+            'title': 'Cancer survivor four times over',
+            'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
+            'timestamp': 1320410746,
+            'upload_date': '20111104',
+            'uploader': 'CBCC-NEW',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
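+        # Hand off to ThePlatform; the smuggled force_smil_url flag appears
+        # to make it treat the link as a direct SMIL release URL rather
+        # than a web page.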
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(
+                'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, {
+                    'force_smil_url': True
+                }),
+            'id': video_id,
+        }
+
+
+class CBCWatchBaseIE(InfoExtractor):
+    _device_id = None
+    _device_token = None
+    _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/'
+    _NS_MAP = {
+        'media': 'http://search.yahoo.com/mrss/',
+        'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
+    }
+    _GEO_COUNTRIES = ['CA']
+    _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login'
+    _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token'
+    _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
+    _NETRC_MACHINE = 'cbcwatch'
+
+    def _signature(self, email, password):
+        data = json.dumps({
+            'email': email,
+            'password': password,
+        }).encode()
+        headers = {'content-type': 'application/json'}
+        query = {'apikey': self._API_KEY}
+        resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query)
+        access_token = resp['access_token']
+
+        # Exchange the LoginRadius access token for a JWT signature
+        query = {
+            'access_token': access_token,
+            'apikey': self._API_KEY,
+            'jwtapp': 'jwt',
+        }
+        resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query)
+        return resp['signature']
+
+    def _call_api(self, path, video_id):
+        url = path if path.startswith('http') else self._API_BASE_URL + path
+        for _ in range(2):
+            try:
+                result = self._download_xml(url, video_id, headers={
+                    'X-Clearleap-DeviceId': self._device_id,
+                    'X-Clearleap-DeviceToken': self._device_token,
+                })
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    # Device token has expired; re-acquire it and retry
+                    self._register_device()
+                    continue
+                raise
+            else:
+                break
+        error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage')
+        if error_message:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message))
+        return result
+
+    def _real_initialize(self):
+        if self._valid_device_token():
+            return
+        device = self._downloader.cache.load(
+            'cbcwatch', self._cache_device_key()) or {}
+        self._device_id, self._device_token = device.get('id'), device.get('token')
+        if self._valid_device_token():
+            return
+        self._register_device()
+
+    def _valid_device_token(self):
+        return self._device_id and self._device_token
+
+    def _cache_device_key(self):
+        email, _ = self._get_login_info()
+        return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device'
+
+    def _register_device(self):
+        result = self._download_xml(
+            self._API_BASE_URL + 'device/register',
+            None, 'Acquiring device token',
+            data=b'<device><type>web</type></device>')
+        self._device_id = xpath_text(result, 'deviceId', fatal=True)
+        email, password = self._get_login_info()
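+        # With credentials, trade a LoginRadius JWT signature for an
+        # account-bound device token; otherwise keep the anonymous
+        # deviceToken issued at registration.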
+        if email and password:
+            signature = self._signature(email, password)
+            data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
+                escape(signature), escape(self._device_id)).encode()
+            url = self._API_BASE_URL + 'device/login'
+            result = self._download_xml(
+                url, None, data=data,
+                headers={'content-type': 'application/xml'})
+            self._device_token = xpath_text(result, 'token', fatal=True)
+        else:
+            self._device_token = xpath_text(result, 'deviceToken', fatal=True)
+        self._downloader.cache.store(
+            'cbcwatch', self._cache_device_key(), {
+                'id': self._device_id,
+                'token': self._device_token,
+            })
+
+    def _parse_rss_feed(self, rss):
+        channel = xpath_element(rss, 'channel', fatal=True)
+
+        def _add_ns(path):
+            return xpath_with_ns(path, self._NS_MAP)
+
+        entries = []
+        for item in channel.findall('item'):
+            guid = xpath_text(item, 'guid', fatal=True)
+            title = xpath_text(item, 'title', fatal=True)
+
+            media_group = xpath_element(item, _add_ns('media:group'), fatal=True)
+            content = xpath_element(media_group, _add_ns('media:content'), fatal=True)
+            content_url = content.attrib['url']
+
+            thumbnails = []
+            for thumbnail in media_group.findall(_add_ns('media:thumbnail')):
+                thumbnail_url = thumbnail.get('url')
+                if not thumbnail_url:
+                    continue
+                thumbnails.append({
+                    'id': thumbnail.get('profile'),
+                    'url': thumbnail_url,
+                    'width': int_or_none(thumbnail.get('width')),
+                    'height': int_or_none(thumbnail.get('height')),
+                })
+
+            timestamp = None
+            release_date = find_xpath_attr(
+                item, _add_ns('media:credit'), 'role', 'releaseDate')
+            if release_date is not None:
+                timestamp = parse_iso8601(release_date.text)
+
+            entries.append({
+                '_type': 'url_transparent',
+                'url': content_url,
+                'id': guid,
+                'title': title,
+                'description': xpath_text(item, 'description'),
+                'timestamp': timestamp,
+                'duration': int_or_none(content.get('duration')),
+                'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))),
+                'episode': xpath_text(item, _add_ns('clearleap:episode')),
+                'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))),
+                'series': xpath_text(item, _add_ns('clearleap:series')),
+                'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))),
+                'thumbnails': thumbnails,
+                'ie_key': 'CBCWatchVideo',
+            })
+
+        return self.playlist_result(
+            entries, xpath_text(channel, 'guid'),
+            xpath_text(channel, 'title'),
+            xpath_text(channel, 'description'))
+
+
+class CBCWatchVideoIE(CBCWatchBaseIE):
+    IE_NAME = 'cbc.ca:watch:video'
+    _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TEST = {
+        # geo-restricted to Canada, bypassable
+        'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        result = self._call_api(url, video_id)
+
+        m3u8_url = xpath_text(result, 'url', fatal=True)
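+        # Rewriting the playlist filename to match its parent directory
+        # seems to yield the multi-bitrate master playlist; try that first
+        # and fall back to the URL the API returned.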
+        formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False)
+        if len(formats) < 2:
+            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+        for f in formats:
+            format_id = f.get('format_id')
+            if format_id.startswith('AAC'):
+                f['acodec'] = 'aac'
+            elif format_id.startswith('AC3'):
+                f['acodec'] = 'ac-3'
+        self._sort_formats(formats)
+
+        info = {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+        }
+
+        rss = xpath_element(result, 'rss')
+        if rss:
+            info.update(self._parse_rss_feed(rss)['entries'][0])
+            del info['url']
+            del info['_type']
+            del info['ie_key']
+        return info
+
+
+class CBCWatchIE(CBCWatchBaseIE):
+    IE_NAME = 'cbc.ca:watch'
+    _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
+    _TESTS = [{
+        # geo-restricted to Canada, bypassable
+        'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
+        'info_dict': {
+            'id': '9673749a-5e77-484c-8b62-a1092a6b5168',
+            'ext': 'mp4',
+            'title': 'Customer (Dis)Service',
+            'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87',
+            'upload_date': '20160219',
+            'timestamp': 1455840000,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+            'format': 'bestvideo',
+        },
+    }, {
+        # geo-restricted to Canada, bypassable
+        'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',
+        'info_dict': {
+            'id': '1ed4b385-cd84-49cf-95f0-80f004680057',
+            'title': 'Arthur',
+            'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
+        },
+        'playlist_mincount': 30,
+    }, {
+        'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        rss = self._call_api('web/browse/' + video_id, video_id)
+        return self._parse_rss_feed(rss)
+
+
+class CBCOlympicsIE(InfoExtractor):
+    IE_NAME = 'cbc.ca:olympics'
+    _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._hidden_inputs(webpage)['videoId']
+        video_doc = self._download_xml(
+            'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id)
+        title = xpath_text(video_doc, 'title', fatal=True)
+        is_live = xpath_text(video_doc, 'kind') == 'Live'
+        if is_live:
+            title = self._live_title(title)
+
+        formats = []
+        for video_source in video_doc.findall('videoSources/videoSource'):
+            uri = xpath_text(video_source, 'uri')
+            if not uri:
+                continue
+            tokenize = self._download_json(
+                'https://olympics.cbc.ca/api/api-akamai/tokenize',
+                video_id, data=json.dumps({
+                    'VideoSource': uri,
+                }).encode(), headers={
+                    'Content-Type': 'application/json',
+                    'Referer': url,
+                    # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js
+                    'Cookie': '_dvp=TK:C0ObxjerU',  # AKAMAI CDN cookie
+                }, fatal=False)
+            if not tokenize:
+                continue
+            content_url = tokenize['ContentUrl']
+            video_source_format = video_source.get('format')
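+            # 'IIS' sources are Smooth Streaming manifests; everything else
+            # is treated as HLS.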
+            if video_source_format == 'IIS':
+                formats.extend(self._extract_ism_formats(
+                    content_url, video_id, ism_id=video_source_format, fatal=False))
+            else:
+                formats.extend(self._extract_m3u8_formats(
+                    content_url, video_id, 'mp4',
+                    'm3u8' if is_live else 'm3u8_native',
+                    m3u8_id=video_source_format, fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': xpath_text(video_doc, 'description'),
+            'thumbnail': xpath_text(video_doc, 'thumbnailUrl'),
+            'duration': parse_duration(xpath_text(video_doc, 'duration')),
+            'formats': formats,
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
new file mode 100644 (file)
index 0000000..4a19a73
--- /dev/null
@@ -0,0 +1,112 @@
+from __future__ import unicode_literals
+
+from .theplatform import ThePlatformFeedIE
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    find_xpath_attr,
+    xpath_element,
+    xpath_text,
+    update_url_query,
+)
+
+
+class CBSBaseIE(ThePlatformFeedIE):
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        subtitles = {}
+        for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]:
+            cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k)
+            if cc_e is not None:
+                cc_url = cc_e.get('value')
+                if cc_url:
+                    subtitles.setdefault(subtitles_lang, []).append({
+                        'ext': ext,
+                        'url': cc_url,
+                    })
+        return subtitles
+
+
+class CBSIE(CBSBaseIE):
+    _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
+
+    _TESTS = [{
+        'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
+        'info_dict': {
+            'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
+            'ext': 'mp4',
+            'title': 'Connect Chat feat. Garth Brooks',
+            'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
+            'duration': 1495,
+            'timestamp': 1385585425,
+            'upload_date': '20131127',
+            'uploader': 'CBSI-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Blocked outside the US',
+    }, {
+        'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
+        'only_matching': True,
+    }]
+
+    def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
+        items_data = self._download_xml(
+            'http://can.cbs.com/thunder/player/videoPlayerService.php',
+            content_id, query={'partner': site, 'contentId': content_id})
+        video_data = xpath_element(items_data, './/item')
+        title = xpath_text(video_data, 'videoTitle', 'title', True)
+        tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id)
+        tp_release_url = 'http://link.theplatform.com/s/' + tp_path
+
+        asset_types = []
+        subtitles = {}
+        formats = []
+        last_e = None
+        for item in items_data.findall('.//item'):
+            asset_type = xpath_text(item, 'assetType')
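+            # Skip duplicate asset types as well as DRM-protected ones
+            # (FairPlay HLS, CENC DASH), which cannot be decrypted.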
+            if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type:
+                continue
+            asset_types.append(asset_type)
+            query = {
+                'mbr': 'true',
+                'assetTypes': asset_type,
+            }
+            if asset_type.startswith('HLS') or asset_type in ('OnceURL', 'StreamPack'):
+                query['formats'] = 'MPEG4,M3U'
+            elif asset_type in ('RTMP', 'WIFI', '3G'):
+                query['formats'] = 'MPEG4,FLV'
+            try:
+                tp_formats, tp_subtitles = self._extract_theplatform_smil(
+                    update_url_query(tp_release_url, query), content_id,
+                    'Downloading %s SMIL data' % asset_type)
+            except ExtractorError as e:
+                last_e = e
+                continue
+            formats.extend(tp_formats)
+            subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+        if last_e and not formats:
+            raise last_e
+        self._sort_formats(formats)
+
+        info = self._extract_theplatform_metadata(tp_path, content_id)
+        info.update({
+            'id': content_id,
+            'title': title,
+            'series': xpath_text(video_data, 'seriesTitle'),
+            'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
+            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
+            'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000),
+            'thumbnail': xpath_text(video_data, 'previewImageURL'),
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+        return info
+
+    def _real_extract(self, url):
+        content_id = self._match_id(url)
+        return self._extract_video_info(content_id)
diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py
new file mode 100644 (file)
index 0000000..6596e98
--- /dev/null
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .cbs import CBSIE
+from ..utils import int_or_none
+
+
+class CBSInteractiveIE(CBSIE):
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)'
+    _TESTS = [{
+        'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
+        'info_dict': {
+            'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00',
+            'display_id': 'hands-on-with-microsofts-windows-8-1-update',
+            'ext': 'mp4',
+            'title': 'Hands-on with Microsoft Windows 8.1 Update',
+            'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
+            'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',
+            'uploader': 'Sarah Mitroff',
+            'duration': 70,
+            'timestamp': 1396479627,
+            'upload_date': '20140402',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
+        'md5': 'f11d27b2fa18597fbf92444d2a9ed386',
+        'info_dict': {
+            'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK',
+            'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187',
+            'ext': 'mp4',
+            'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)',
+            'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f',
+            'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
+            'uploader': 'Ashley Esqueda',
+            'duration': 1482,
+            'timestamp': 1433289889,
+            'upload_date': '20150603',
+        },
+    }, {
+        'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/',
+        'info_dict': {
+            'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt',
+            'display_id': 'video-keeping-android-smartphones-and-tablets-secure',
+            'ext': 'mp4',
+            'title': 'Video: Keeping Android smartphones and tablets secure',
+            'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.',
+            'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0',
+            'uploader': 'Adrian Kingsley-Hughes',
+            'duration': 731,
+            'timestamp': 1449129925,
+            'upload_date': '20151203',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/',
+        'only_matching': True,
+    }]
+
+    MPX_ACCOUNTS = {
+        'cnet': 2198311517,
+        'zdnet': 2387448114,
+    }
+
+    def _real_extract(self, url):
+        site, display_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, display_id)
+
+        data_json = self._html_search_regex(
+            r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'",
+            webpage, 'data json')
+        data = self._parse_json(data_json, display_id)
+        vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0]
+
+        video_id = vdata['mpxRefId']
+
+        title = vdata['title']
+        author = vdata.get('author')
+        if author:
+            uploader = '%s %s' % (author['firstName'], author['lastName'])
+            uploader_id = author.get('id')
+        else:
+            uploader = None
+            uploader_id = None
+
+        info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site])
+        info.update({
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'duration': int_or_none(vdata.get('duration')),
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+        })
+        return info
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
new file mode 100644 (file)
index 0000000..90852a9
--- /dev/null
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .anvato import AnvatoIE
+from .sendtonews import SendtoNewsIE
+from ..compat import compat_urlparse
+from ..utils import (
+    parse_iso8601,
+    unified_timestamp,
+)
+
+
+class CBSLocalIE(AnvatoIE):
+    _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)'
+
+    _TESTS = [{
+        # Anvato backend
+        'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
+        'md5': 'f0ee3081e3843f575fccef901199b212',
+        'info_dict': {
+            'id': '3401037',
+            'ext': 'mp4',
+            'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
+            'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': 1463440500,
+            'upload_date': '20160516',
+            'uploader': 'CBS',
+            'subtitles': {
+                'en': 'mincount:5',
+            },
+            'categories': [
+                'Stations\\Spoken Word\\KCBSTV',
+                'Syndication\\MSN',
+                'Syndication\\NDN',
+                'Syndication\\AOL',
+                'Syndication\\Yahoo',
+                'Syndication\\Tribune',
+                'Syndication\\Curb.tv',
+                'Content\\News'
+            ],
+            'tags': ['CBS 2 News Evening'],
+        },
+    }, {
+        # SendtoNews embed
+        'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
+        'info_dict': {
+            'id': 'GxfCe0Zo7D-175909-5588',
+        },
+        'playlist_count': 9,
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/',
+        'info_dict': {
+            'id': '3580809',
+            'ext': 'mp4',
+            'title': 'A Very Blue Anniversary',
+            'description': 'CBS2’s Cindy Hsu has more.',
+            'thumbnail': 're:^https?://.*',
+            'timestamp': int,
+            'upload_date': r're:^\d{8}$',
+            'uploader': 'CBS',
+            'subtitles': {
+                'en': 'mincount:5',
+            },
+            'categories': [
+                'Stations\\Spoken Word\\WCBSTV',
+                'Syndication\\AOL',
+                'Syndication\\MSN',
+                'Syndication\\NDN',
+                'Syndication\\Yahoo',
+                'Content\\News',
+                'Content\\News\\Local News',
+            ],
+            'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'],
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        sendtonews_url = SendtoNewsIE._extract_url(webpage)
+        if sendtonews_url:
+            return self.url_result(
+                compat_urlparse.urljoin(url, sendtonews_url),
+                ie=SendtoNewsIE.ie_key())
+
+        info_dict = self._extract_anvato_videos(webpage, display_id)
+
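+        # Prefer the human-readable post date from the page; fall back to
+        # the uploadDate meta tag.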
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'class="(?:entry|post)-date"[^>]*>([^<]+)', webpage,
+            'released date', default=None)) or parse_iso8601(
+            self._html_search_meta('uploadDate', webpage))
+
+        info_dict.update({
+            'display_id': display_id,
+            'timestamp': timestamp,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
new file mode 100644 (file)
index 0000000..345debc
--- /dev/null
@@ -0,0 +1,147 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import zlib
+
+from .common import InfoExtractor
+from .cbs import CBSIE
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    parse_duration,
+)
+
+
+class CBSNewsEmbedIE(CBSIE):
+    IE_NAME = 'cbsnews:embed'
+    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)'
+    _TESTS = [{
+        'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
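+        # The URL fragment is a percent-encoded, base64-encoded,
+        # raw-deflate-compressed (hence the negative wbits) JSON blob
+        # describing the embedded video.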
+        item = self._parse_json(zlib.decompress(compat_b64decode(
+            compat_urllib_parse_unquote(self._match_id(url))),
+            -zlib.MAX_WBITS), None)['video']['items'][0]
+        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
+
+
+class CBSNewsIE(CBSIE):
+    IE_NAME = 'cbsnews'
+    IE_DESC = 'CBS News'
+    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)'
+
+    _TESTS = [
+        {
+            # 60 minutes
+            'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/',
+            'info_dict': {
+                'id': 'Y_nf_aEg6WwO9OLAq0MpKaPgfnBUxfW4',
+                'ext': 'flv',
+                'title': 'Artificial Intelligence, real-life applications',
+                'description': 'md5:a7aaf27f1b4777244de8b0b442289304',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 317,
+                'uploader': 'CBSI-NEW',
+                'timestamp': 1476046464,
+                'upload_date': '20161009',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
+            'info_dict': {
+                'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y',
+                'ext': 'mp4',
+                'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
+                'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
+                'upload_date': '20140404',
+                'timestamp': 1396650660,
+                'uploader': 'CBSI-NEW',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 205,
+                'subtitles': {
+                    'en': [{
+                        'ext': 'ttml',
+                    }],
+                },
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            # 48 hours
+            'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/',
+            'info_dict': {
+                'title': 'Cold as Ice',
+                'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?',
+            },
+            'playlist_mincount': 7,
+        },
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        entries = []
+        for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage):
+            entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key()))
+        if entries:
+            return self.playlist_result(
+                entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage),
+                playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage))
+
+        item = self._parse_json(self._html_search_regex(
+            r'CBSNEWS\.defaultPayload\s*=\s*({.+})',
+            webpage, 'video JSON info'), display_id)['items'][0]
+        return self._extract_video_info(item['mpxRefId'], 'cbsnews')
+
+
+class CBSNewsLiveVideoIE(InfoExtractor):
+    IE_NAME = 'cbsnews:livevideo'
+    IE_DESC = 'CBS News Live Videos'
+    _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[^/?#]+)'
+
+    # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples
+    _TEST = {
+        'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
+        'info_dict': {
+            'id': 'clinton-sanders-prepare-to-face-off-in-nh',
+            'ext': 'mp4',
+            'title': 'Clinton, Sanders Prepare To Face Off In NH',
+            'duration': 334,
+        },
+        'skip': 'Video gone',
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        video_info = self._download_json(
+            'http://feeds.cbsn.cbsnews.com/rundown/story', display_id, query={
+                'device': 'desktop',
+                'dvr_slug': display_id,
+            })
+
+        formats = self._extract_akamai_formats(video_info['url'], display_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': display_id,
+            'display_id': display_id,
+            'title': video_info['headline'],
+            'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
+            'duration': parse_duration(video_info.get('segmentDur')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
new file mode 100644 (file)
index 0000000..83b7647
--- /dev/null
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+from .cbs import CBSBaseIE
+
+
+class CBSSportsIE(CBSBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/',
+        'info_dict': {
+            'id': '1214315075735',
+            'ext': 'mp4',
+            'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder',
+            'description': 'md5:df6f48622612c2d6bd2e295ddef58def',
+            'timestamp': 1524111457,
+            'upload_date': '20180419',
+            'uploader': 'CBSI-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/',
+        'only_matching': True,
+    }]
+
+    def _extract_video_info(self, filter_query, video_id):
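+        # 'dJ5BDC' appears to be the ThePlatform account id shared across
+        # CBS properties and 'VxxJg8Ymh8sE' the CBS Sports feed id.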
+        return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'],
+            webpage, 'video id')
+        return self._extract_video_info('byId=%s' % video_id, video_id)
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
new file mode 100644 (file)
index 0000000..36e6dff
--- /dev/null
@@ -0,0 +1,111 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    try_get,
+    url_or_none,
+)
+
+
+class CCCIE(InfoExtractor):
+    IE_NAME = 'media.ccc.de'
+    _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/v/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
+        'md5': '3a1eda8f3a29515d27f5adb967d7e740',
+        'info_dict': {
+            'id': '1839',
+            'ext': 'mp4',
+            'title': 'Introduction to Processor Design',
+            'creator': 'byterazor',
+            'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20131228',
+            'timestamp': 1388188800,
+            'duration': 3710,
+            'tags': list,
+        }
+    }, {
+        'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        event_id = self._search_regex(r"data-id='(\d+)'", webpage, 'event id')
+        event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id)
+
+        formats = []
+        for recording in event_data.get('recordings', []):
+            recording_url = recording.get('recording_url')
+            if not recording_url:
+                continue
+            language = recording.get('language')
+            folder = recording.get('folder')
+            format_id = None
+            if language:
+                format_id = language
+            if folder:
+                if language:
+                    format_id += '-' + folder
+                else:
+                    format_id = folder
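+            # Infer the codec from the folder name (e.g. 'h264-hd', 'mp3',
+            # 'opus'); anything else is left undetermined.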
+            vcodec = 'h264' if 'h264' in (folder or '') else (
+                'none' if folder in ('mp3', 'opus') else None
+            )
+            formats.append({
+                'format_id': format_id,
+                'url': recording_url,
+                'width': int_or_none(recording.get('width')),
+                'height': int_or_none(recording.get('height')),
+                'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024),
+                'language': language,
+                'vcodec': vcodec,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': event_id,
+            'display_id': display_id,
+            'title': event_data['title'],
+            'creator': try_get(event_data, lambda x: ', '.join(x['persons'])),
+            'description': event_data.get('description'),
+            'thumbnail': event_data.get('thumb_url'),
+            'timestamp': parse_iso8601(event_data.get('date')),
+            'duration': int_or_none(event_data.get('length')),
+            'tags': event_data.get('tags'),
+            'formats': formats,
+        }
+
+
+class CCCPlaylistIE(InfoExtractor):
+    IE_NAME = 'media.ccc.de:lists'
+    _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/c/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://media.ccc.de/c/30c3',
+        'info_dict': {
+            'title': '30C3',
+            'id': '30c3',
+        },
+        'playlist_count': 135,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url).lower()
+
+        conf = self._download_json(
+            'https://media.ccc.de/public/conferences/' + playlist_id,
+            playlist_id)
+
+        entries = []
+        for e in conf['events']:
+            event_url = url_or_none(e.get('frontend_link'))
+            if event_url:
+                entries.append(self.url_result(event_url, ie=CCCIE.ie_key()))
+
+        return self.playlist_result(entries, playlist_id, conf.get('title'))
diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py
new file mode 100644 (file)
index 0000000..544647f
--- /dev/null
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    parse_resolution,
+    url_or_none,
+)
+
+
+class CCMAIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/',
+        'md5': '7296ca43977c8ea4469e719c609b0871',
+        'info_dict': {
+            'id': '5630208',
+            'ext': 'mp4',
+            'title': 'L\'espot de La Marató de TV3',
+            'description': 'md5:f12987f320e2f6e988e9908e4fe97765',
+            'timestamp': 1470918540,
+            'upload_date': '20160811',
+        }
+    }, {
+        'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/',
+        'md5': 'fa3e38f269329a278271276330261425',
+        'info_dict': {
+            'id': '943685',
+            'ext': 'mp3',
+            'title': 'El Consell de Savis analitza el derbi',
+            'description': 'md5:e2a3648145f3241cb9c6b4b624033e53',
+            'upload_date': '20171205',
+            'timestamp': 1512507300,
+        }
+    }]
+
+    def _real_extract(self, url):
+        media_type, media_id = re.match(self._VALID_URL, url).groups()
+
+        media = self._download_json(
+            'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={
+                'media': media_type,
+                'idint': media_id,
+            })
+
+        formats = []
+        media_url = media['media']['url']
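+        # 'url' is a list of labelled renditions for video media or a
+        # single URL for audio-only media.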
+        if isinstance(media_url, list):
+            for format_ in media_url:
+                format_url = url_or_none(format_.get('file'))
+                if not format_url:
+                    continue
+                label = format_.get('label')
+                f = parse_resolution(label)
+                f.update({
+                    'url': format_url,
+                    'format_id': label,
+                })
+                formats.append(f)
+        else:
+            formats.append({
+                'url': media_url,
+                'vcodec': 'none' if media_type == 'audio' else None,
+            })
+        self._sort_formats(formats)
+
+        informacio = media['informacio']
+        title = informacio['titol']
+        durada = informacio.get('durada', {})
+        duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text'))
+        timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc'))
+
+        subtitles = {}
+        subtitols = media.get('subtitols', {})
+        if subtitols:
+            sub_url = subtitols.get('url')
+            if sub_url:
+                subtitles.setdefault(
+                    subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({
+                        'url': sub_url,
+                    })
+
+        thumbnails = []
+        imatges = media.get('imatges', {})
+        if imatges:
+            thumbnail_url = imatges.get('url')
+            if thumbnail_url:
+                thumbnails = [{
+                    'url': thumbnail_url,
+                    'width': int_or_none(imatges.get('amplada')),
+                    'height': int_or_none(imatges.get('alcada')),
+                }]
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': clean_html(informacio.get('descripcio')),
+            'duration': duration,
+            'timestamp': timestamp,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cctv.py b/youtube_dl/extractor/cctv.py
new file mode 100644 (file)
index 0000000..c76f361
--- /dev/null
@@ -0,0 +1,191 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class CCTVIE(InfoExtractor):
+    IE_DESC = '央视网'
+    _VALID_URL = r'https?://(?:(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)|(?:www\.)?ncpa-classic\.com)/(?:[^/]+/)*?(?P<id>[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)'
+    _TESTS = [{
+        # fo.addVariable("videoCenterId","id")
+        'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml',
+        'md5': 'd61ec00a493e09da810bf406a078f691',
+        'info_dict': {
+            'id': '5ecdbeab623f4973b40ff25f18b174e8',
+            'ext': 'mp4',
+            'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)',
+            'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95',
+            'duration': 98,
+            'uploader': 'songjunjie',
+            'timestamp': 1455279956,
+            'upload_date': '20160212',
+        },
+    }, {
+        # var guid = "id"
+        'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml',
+        'info_dict': {
+            'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae',
+            'ext': 'mp4',
+            'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)',
+            'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。',
+            'duration': 37,
+            'uploader': 'shujun',
+            'timestamp': 1454677291,
+            'upload_date': '20160205',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # changePlayer('id')
+        'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml',
+        'info_dict': {
+            'id': '4bb9bb4db7a6471ba85fdeda5af0381e',
+            'ext': 'mp4',
+            'title': 'NHnews008 ANNUAL POLITICAL SEASON',
+            'description': 'Four Comprehensives',
+            'duration': 60,
+            'uploader': 'zhangyunlei',
+            'timestamp': 1425385521,
+            'upload_date': '20150303',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # loadvideo('id')
+        'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml',
+        'info_dict': {
+            'id': 'b15f009ff45c43968b9af583fc2e04b2',
+            'ext': 'mp4',
+            'title': 'Путь,усыпанный космеями Серия 1',
+            'description': 'Путь, усыпанный космеями',
+            'duration': 2645,
+            'uploader': 'renxue',
+            'timestamp': 1477479241,
+            'upload_date': '20161026',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # var initMyAray = 'id'
+        'url': 'http://www.ncpa-classic.com/2013/05/22/VIDE1369219508996867.shtml',
+        'info_dict': {
+            'id': 'a194cfa7f18c426b823d876668325946',
+            'ext': 'mp4',
+            'title': '小泽征尔音乐塾 音乐梦想无国界',
+            'duration': 2173,
+            'timestamp': 1369248264,
+            'upload_date': '20130522',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # var ids = ["id"]
+        'url': 'http://www.ncpa-classic.com/clt/more/416/index.shtml',
+        'info_dict': {
+            'id': 'a8606119a4884588a79d81c02abecc16',
+            'ext': 'mp3',
+            'title': '来自维也纳的新年贺礼',
+            'description': 'md5:f13764ae8dd484e84dd4b39d5bcba2a7',
+            'duration': 1578,
+            'uploader': 'djy',
+            'timestamp': 1482942419,
+            'upload_date': '20161228',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44',
+        'only_matching': True,
+    }, {
+        'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
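+        # The video GUID is embedded under one of several JS variable names (see the test comments above).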
+        video_id = self._search_regex(
+            [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)',
+             r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)',
+             r'changePlayer\s*\(\s*["\']([\da-fA-F]+)',
+             r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)',
+             r'var\s+initMyAray\s*=\s*["\']([\da-fA-F]+)',
+             r'var\s+ids\s*=\s*\[["\']([\da-fA-F]+)'],
+            webpage, 'video id')
+
+        data = self._download_json(
+            'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id,
+            query={
+                'pid': video_id,
+                'url': url,
+                'idl': 32,
+                'idlr': 32,
+                'modifyed': 'false',
+            })
+
+        title = data['title']
+
+        formats = []
+
+        video = data.get('video')
+        if isinstance(video, dict):
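+            # 'lowChapters' appears to hold the low-quality HTTP rendition and 'chapters' the higher one; enumerate() supplies the relative quality.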
+            for quality, chapters_key in enumerate(('lowChapters', 'chapters')):
+                video_url = try_get(
+                    video, lambda x: x[chapters_key][0]['url'], compat_str)
+                if video_url:
+                    formats.append({
+                        'url': video_url,
+                        'format_id': 'http',
+                        'quality': quality,
+                        'preference': -1,
+                    })
+
+        hls_url = try_get(data, lambda x: x['hls_url'], compat_str)
+        if hls_url:
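+            # Strip the maxbr parameter, which otherwise seems to cap the master playlist at a single bitrate.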
+            hls_url = re.sub(r'maxbr=\d+&?', '', hls_url)
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        self._sort_formats(formats)
+
+        uploader = data.get('editer_name')
+        description = self._html_search_meta(
+            'description', webpage, default=None)
+        timestamp = unified_timestamp(data.get('f_pgmtime'))
+        duration = float_or_none(try_get(video, lambda x: x['totalLength']))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
new file mode 100644 (file)
index 0000000..0c3af23
--- /dev/null
@@ -0,0 +1,185 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import codecs
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    multipart_encode,
+    parse_duration,
+    random_birthday,
+    urljoin,
+)
+
+
+class CDAIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
+    _BASE_URL = 'http://www.cda.pl/'
+    _TESTS = [{
+        'url': 'http://www.cda.pl/video/5749950c',
+        'md5': '6f844bf51b15f31fae165365707ae970',
+        'info_dict': {
+            'id': '5749950c',
+            'ext': 'mp4',
+            'height': 720,
+            'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
+            'description': 'md5:269ccd135d550da90d1662651fcb9772',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'average_rating': float,
+            'duration': 39,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.cda.pl/video/57413289',
+        'md5': 'a88828770a8310fc00be6c95faf7f4d5',
+        'info_dict': {
+            'id': '57413289',
+            'ext': 'mp4',
+            'title': 'Lądowanie na lotnisku na Maderze',
+            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'crash404',
+            'view_count': int,
+            'average_rating': float,
+            'duration': 137,
+            'age_limit': 0,
+        }
+    }, {
+        # Age-restricted
+        'url': 'http://www.cda.pl/video/1273454c4',
+        'info_dict': {
+            'id': '1273454c4',
+            'ext': 'mp4',
+            'title': 'Bronson (2008) napisy HD 1080p',
+            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+            'height': 1080,
+            'uploader': 'boniek61',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 5554,
+            'age_limit': 18,
+            'view_count': int,
+            'average_rating': float,
+        },
+    }, {
+        'url': 'http://ebd.cda.pl/0x0/5749950c',
+        'only_matching': True,
+    }]
+
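+    # The age gate accepts a POSTed random birth date; 'rok', 'miesiac' and 'dzien' are Polish for year, month and day.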
+    def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+        form_data = random_birthday('rok', 'miesiac', 'dzien')
+        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+        data, content_type = multipart_encode(form_data)
+        return self._download_webpage(
+            urljoin(url, '/a/validatebirth'), video_id, *args,
+            data=data, headers={
+                'Referer': url,
+                'Content-Type': content_type,
+            }, **kwargs)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        self._set_cookie('cda.pl', 'cda.player', 'html5')
+        webpage = self._download_webpage(
+            self._BASE_URL + '/video/' + video_id, video_id)
+
+        if 'Ten film jest dostępny dla użytkowników premium' in webpage:
+            raise ExtractorError('This video is only available for premium users.', expected=True)
+
+        need_confirm_age = False
+        if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+                                   webpage, 'birthday validate form', default=None):
+            webpage = self._download_age_confirm_page(
+                url, video_id, note='Confirming age')
+            need_confirm_age = True
+
+        formats = []
+
+        uploader = self._search_regex(r'''(?x)
+            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
+            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
+            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
+        ''', webpage, 'uploader', default=None, group='uploader')
+        view_count = self._search_regex(
+            r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
+            'view_count', default=None)
+        average_rating = self._search_regex(
+            r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
+            webpage, 'rating', fatal=False, group='rating_value')
+
+        info_dict = {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'uploader': uploader,
+            'view_count': int_or_none(view_count),
+            'average_rating': float_or_none(average_rating),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'formats': formats,
+            'duration': None,
+            'age_limit': 18 if need_confirm_age else 0,
+        }
+
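+        # Each quality page embeds a player_data JSON blob; parse it and append the described format.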
+        def extract_format(page, version):
+            json_str = self._html_search_regex(
+                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
+                '%s player_json' % version, fatal=False, group='player_data')
+            if not json_str:
+                return
+            player_data = self._parse_json(
+                json_str, '%s player_data' % version, fatal=False)
+            if not player_data:
+                return
+            video = player_data.get('video')
+            if not video or 'file' not in video:
+                self.report_warning('Unable to extract %s version information' % version)
+                return
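+            # 'uggc' is ROT13 for 'http': such file URLs are obfuscated and must be decoded first.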
+            if video['file'].startswith('uggc'):
+                video['file'] = codecs.decode(video['file'], 'rot_13')
+                if video['file'].endswith('adc.mp4'):
+                    video['file'] = video['file'].replace('adc.mp4', '.mp4')
+            f = {
+                'url': video['file'],
+            }
+            m = re.search(
+                r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
+                page)
+            if m:
+                f.update({
+                    'format_id': m.group('format_id'),
+                    'height': int(m.group('height')),
+                })
+            info_dict['formats'].append(f)
+            if not info_dict['duration']:
+                info_dict['duration'] = parse_duration(video.get('duration'))
+
+        extract_format(webpage, 'default')
+
+        for href, resolution in re.findall(
+                r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
+                webpage):
+            if need_confirm_age:
+                handler = self._download_age_confirm_page
+            else:
+                handler = self._download_webpage
+
+            webpage = handler(
+                self._BASE_URL + href, video_id,
+                'Downloading %s version information' % resolution, fatal=False)
+            if not webpage:
+                # Report the warning manually because an empty page is
+                # returned when an invalid version is requested.
+                self.report_warning('Unable to download %s version information' % resolution)
+                continue
+
+            extract_format(webpage, resolution)
+
+        self._sort_formats(formats)
+
+        return info_dict
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
new file mode 100644 (file)
index 0000000..7cb4efb
--- /dev/null
@@ -0,0 +1,290 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    sanitized_Request,
+    unescapeHTML,
+    update_url_query,
+    urlencode_postdata,
+    USER_AGENTS,
+)
+
+
+class CeskaTelevizeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
+        'info_dict': {
+            'id': '61924494877246241',
+            'ext': 'mp4',
+            'title': 'Hyde Park Civilizace: Život v Grónsku',
+            'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 3350,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
+        'info_dict': {
+            'id': '61924494877028507',
+            'ext': 'mp4',
+            'title': 'Hyde Park Civilizace: Bonus 01 - En',
+            'description': 'English Subtittles',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 81.3,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # live stream
+        'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
+        'info_dict': {
+            'id': 402,
+            'ext': 'mp4',
+            'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'is_live': True,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Georestricted to Czech Republic',
+    }, {
+        'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
+        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
+            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+        type_ = None
+        episode_id = None
+
+        playlist = self._parse_json(
+            self._search_regex(
+                r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
+                default='{}'), playlist_id)
+        if playlist:
+            type_ = playlist.get('type')
+            episode_id = playlist.get('id')
+
+        if not type_:
+            type_ = self._html_search_regex(
+                r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
+                webpage, 'type')
+        if not episode_id:
+            episode_id = self._html_search_regex(
+                r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
+                webpage, 'episode_id')
+
+        data = {
+            'playlist[0][type]': type_,
+            'playlist[0][id]': episode_id,
+            'requestUrl': compat_urllib_parse_urlparse(url).path,
+            'requestSource': 'iVysilani',
+        }
+
+        entries = []
+
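+        # Fetch the playlist twice (default UA, then Safari); each pass may return different stream URLs, and the second pass merges its formats into the existing entries.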
+        for user_agent in (None, USER_AGENTS['Safari']):
+            req = sanitized_Request(
+                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+                data=urlencode_postdata(data))
+
+            req.add_header('Content-type', 'application/x-www-form-urlencoded')
+            req.add_header('x-addr', '127.0.0.1')
+            req.add_header('X-Requested-With', 'XMLHttpRequest')
+            if user_agent:
+                req.add_header('User-Agent', user_agent)
+            req.add_header('Referer', url)
+
+            playlistpage = self._download_json(req, playlist_id, fatal=False)
+
+            if not playlistpage:
+                continue
+
+            playlist_url = playlistpage['url']
+            if playlist_url == 'error_region':
+                raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+            req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
+            req.add_header('Referer', url)
+
+            playlist_title = self._og_search_title(webpage, default=None)
+            playlist_description = self._og_search_description(webpage, default=None)
+
+            playlist = self._download_json(req, playlist_id, fatal=False)
+            if not playlist:
+                continue
+
+            playlist = playlist.get('playlist')
+            if not isinstance(playlist, list):
+                continue
+
+            playlist_len = len(playlist)
+
+            for num, item in enumerate(playlist):
+                is_live = item.get('type') == 'LIVE'
+                formats = []
+                for format_id, stream_url in item.get('streamUrls', {}).items():
+                    if 'drmOnly=true' in stream_url:
+                        continue
+                    if 'playerType=flash' in stream_url:
+                        stream_formats = self._extract_m3u8_formats(
+                            stream_url, playlist_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls-%s' % format_id, fatal=False)
+                    else:
+                        stream_formats = self._extract_mpd_formats(
+                            stream_url, playlist_id,
+                            mpd_id='dash-%s' % format_id, fatal=False)
+                    # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031
+                    if format_id == 'audioDescription':
+                        for f in stream_formats:
+                            f['source_preference'] = -10
+                    formats.extend(stream_formats)
+
+                if user_agent and len(entries) == playlist_len:
+                    entries[num]['formats'].extend(formats)
+                    continue
+
+                item_id = item.get('id') or item['assetId']
+                title = item['title']
+
+                duration = float_or_none(item.get('duration'))
+                thumbnail = item.get('previewImageUrl')
+
+                subtitles = {}
+                if item.get('type') == 'VOD':
+                    subs = item.get('subtitles')
+                    if subs:
+                        subtitles = self.extract_subtitles(episode_id, subs)
+
+                if playlist_len == 1:
+                    final_title = playlist_title or title
+                    if is_live:
+                        final_title = self._live_title(final_title)
+                else:
+                    final_title = '%s (%s)' % (playlist_title, title)
+
+                entries.append({
+                    'id': item_id,
+                    'title': final_title,
+                    'description': playlist_description if playlist_len == 1 else None,
+                    'thumbnail': thumbnail,
+                    'duration': duration,
+                    'formats': formats,
+                    'subtitles': subtitles,
+                    'is_live': is_live,
+                })
+
+        for e in entries:
+            self._sort_formats(e['formats'])
+
+        return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+
+    def _get_subtitles(self, episode_id, subs):
+        original_subtitles = self._download_webpage(
+            subs[0]['url'], episode_id, 'Downloading subtitles')
+        srt_subs = self._fix_subtitles(original_subtitles)
+        return {
+            'cs': [{
+                'ext': 'srt',
+                'data': srt_subs,
+            }]
+        }
+
+    @staticmethod
+    def _fix_subtitles(subtitles):
+        """ Convert millisecond-based subtitles to SRT """
+
+        def _msectotimecode(msec):
+            """ Helper utility to convert milliseconds to timecode """
+            components = []
+            for divider in [1000, 60, 60, 100]:
+                components.append(msec % divider)
+                msec //= divider
+            return '{3:02}:{2:02}:{1:02},{0:03}'.format(*components)
+
+        def _fix_subtitle(subtitle):
+            for line in subtitle.splitlines():
+                m = re.match(r'^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$', line)
+                if m:
+                    yield m.group(1)
+                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:])
+                    yield '{0} --> {1}'.format(start, stop)
+                else:
+                    yield line
+
+        return '\r\n'.join(_fix_subtitle(subtitles))
+
+
+class CeskaTelevizePoradyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
+    _TESTS = [{
+        # video with 18+ caution trailer
+        'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
+        'info_dict': {
+            'id': '215562210900007-bogotart',
+            'title': 'Queer: Bogotart',
+            'description': 'Alternativní průvodce současným queer světem',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '61924494876844842',
+                'ext': 'mp4',
+                'title': 'Queer: Bogotart (Varování 18+)',
+                'duration': 10.2,
+            },
+        }, {
+            'info_dict': {
+                'id': '61924494877068022',
+                'ext': 'mp4',
+                'title': 'Queer: Bogotart (Queer)',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'duration': 1558.3,
+            },
+        }],
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # iframe embed
+        'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        data_url = update_url_query(unescapeHTML(self._search_regex(
+            (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+             r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'),
+            webpage, 'iframe player url', group='url')), query={
+                'autoStart': 'true',
+        })
+
+        return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
new file mode 100644 (file)
index 0000000..09cacf6
--- /dev/null
@@ -0,0 +1,263 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    qualities,
+    unescapeHTML,
+)
+
+
+class Channel9IE(InfoExtractor):
+    IE_DESC = 'Channel 9'
+    IE_NAME = 'channel9'
+    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+
+    _TESTS = [{
+        'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+        'md5': '32083d4eaf1946db6d454313f44510ca',
+        'info_dict': {
+            'id': '6c413323-383a-49dc-88f9-a22800cab024',
+            'ext': 'wmv',
+            'title': 'Developer Kick-Off Session: Stuff We Love',
+            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',
+            'duration': 4576,
+            'thumbnail': r're:https?://.*\.jpg',
+            'timestamp': 1377717420,
+            'upload_date': '20130828',
+            'session_code': 'KOS002',
+            'session_room': 'Arena 1A',
+            'session_speakers': 'count:5',
+        },
+    }, {
+        'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',
+        'info_dict': {
+            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024',
+            'ext': 'wmv',
+            'title': 'Self-service BI with Power BI - nuclear testing',
+            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',
+            'duration': 1540,
+            'thumbnail': r're:https?://.*\.jpg',
+            'timestamp': 1386381991,
+            'upload_date': '20131207',
+            'authors': ['Mike Wilmot'],
+        },
+    }, {
+        # low quality mp4 is best
+        'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+        'info_dict': {
+            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',
+            'ext': 'mp4',
+            'title': 'Ranges for the Standard Library',
+            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',
+            'duration': 5646,
+            'thumbnail': r're:https?://.*\.jpg',
+            'upload_date': '20150930',
+            'timestamp': 1443640735,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
+        'info_dict': {
+            'id': 'Events/DEVintersection/DEVintersection-2016',
+            'title': 'DEVintersection 2016 Orlando Sessions',
+        },
+        'playlist_mincount': 14,
+    }, {
+        'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
+        'only_matching': True,
+    }, {
+        'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
+        'only_matching': True,
+    }]
+
+    _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b',
+            webpage)
+
+    def _extract_list(self, video_id, rss_url=None):
+        if not rss_url:
+            rss_url = self._RSS_URL % video_id
+        rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
+        entries = [self.url_result(session_url.text, 'Channel9')
+                   for session_url in rss.findall('./channel/item/link')]
+        title_text = rss.find('./channel/title').text
+        return self.playlist_result(entries, video_id, title_text)
+
+    def _real_extract(self, url):
+        content_path, rss = re.match(self._VALID_URL, url).groups()
+
+        if rss:
+            return self._extract_list(content_path, url)
+
+        webpage = self._download_webpage(
+            url, content_path, 'Downloading web page')
+
+        episode_data = self._search_regex(
+            r"data-episode='([^']+)'", webpage, 'episode data', default=None)
+        if episode_data:
+            episode_data = self._parse_json(unescapeHTML(
+                episode_data), content_path)
+            content_id = episode_data['contentId']
+            is_session = '/Sessions(' in episode_data['api']
+            content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,'
+            if is_session:
+                content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers'
+            else:
+                content_url += 'Authors,Body&$expand=Authors'
+            content_data = self._download_json(content_url, content_id)
+            title = content_data['Title']
+
+            QUALITIES = (
+                'mp3',
+                'wmv', 'mp4',
+                'wmv-low', 'mp4-low',
+                'wmv-mid', 'mp4-mid',
+                'wmv-high', 'mp4-high',
+            )
+
+            quality_key = qualities(QUALITIES)
+
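+            # '_Source.' URLs are presumably the original uploads, so rank them above every listed quality.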
+            def quality(quality_id, format_url):
+                return (len(QUALITIES) if '_Source.' in format_url
+                        else quality_key(quality_id))
+
+            formats = []
+            urls = set()
+
+            SITE_QUALITIES = {
+                'MP3': 'mp3',
+                'MP4': 'mp4',
+                'Low Quality WMV': 'wmv-low',
+                'Low Quality MP4': 'mp4-low',
+                'Mid Quality WMV': 'wmv-mid',
+                'Mid Quality MP4': 'mp4-mid',
+                'High Quality WMV': 'wmv-high',
+                'High Quality MP4': 'mp4-high',
+            }
+
+            formats_select = self._search_regex(
+                r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage,
+                'formats select', default=None)
+            if formats_select:
+                for mobj in re.finditer(
+                        r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<',
+                        formats_select):
+                    format_url = mobj.group('url')
+                    if format_url in urls:
+                        continue
+                    urls.add(format_url)
+                    format_id = mobj.group('format')
+                    quality_id = SITE_QUALITIES.get(format_id, format_id)
+                    formats.append({
+                        'url': format_url,
+                        'format_id': quality_id,
+                        'quality': quality(quality_id, format_url),
+                        'vcodec': 'none' if quality_id == 'mp3' else None,
+                    })
+
+            API_QUALITIES = {
+                'VideoMP4Low': 'mp4-low',
+                'VideoWMV': 'wmv-mid',
+                'VideoMP4Medium': 'mp4-mid',
+                'VideoMP4High': 'mp4-high',
+                'VideoWMVHQ': 'wmv-high',
+            }
+
+            for format_id, q in API_QUALITIES.items():
+                q_url = content_data.get(format_id)
+                if not q_url or q_url in urls:
+                    continue
+                urls.add(q_url)
+                formats.append({
+                    'url': q_url,
+                    'format_id': q,
+                    'quality': quality(q, q_url),
+                })
+
+            self._sort_formats(formats)
+
+            slides = content_data.get('Slides')
+            zip_file = content_data.get('ZipFile')
+
+            if not formats and not slides and not zip_file:
+                raise ExtractorError(
+                    'No recording, slides or zip file available for %s' % content_path)
+
+            subtitles = {}
+            for caption in content_data.get('Captions', []):
+                caption_url = caption.get('Url')
+                if not caption_url:
+                    continue
+                subtitles.setdefault(caption.get('Language', 'en'), []).append({
+                    'url': caption_url,
+                    'ext': 'vtt',
+                })
+
+            common = {
+                'id': content_id,
+                'title': title,
+                'description': clean_html(content_data.get('Description') or content_data.get('Body')),
+                'thumbnail': content_data.get('VideoPlayerPreviewImage'),
+                'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
+                'timestamp': parse_iso8601(content_data.get('PublishedDate')),
+                'avg_rating': int_or_none(content_data.get('Rating')),
+                'rating_count': int_or_none(content_data.get('RatingCount')),
+                'view_count': int_or_none(content_data.get('Views')),
+                'comment_count': int_or_none(content_data.get('CommentCount')),
+                'subtitles': subtitles,
+            }
+            if is_session:
+                speakers = []
+                for s in content_data.get('Speakers', []):
+                    speaker_name = s.get('FullName')
+                    if not speaker_name:
+                        continue
+                    speakers.append(speaker_name)
+
+                common.update({
+                    'session_code': content_data.get('Code'),
+                    'session_room': content_data.get('Room'),
+                    'session_speakers': speakers,
+                })
+            else:
+                authors = []
+                for a in content_data.get('Authors', []):
+                    author_name = a.get('DisplayName')
+                    if not author_name:
+                        continue
+                    authors.append(author_name)
+                common['authors'] = authors
+
+            contents = []
+
+            if slides:
+                d = common.copy()
+                d.update({'title': title + '-Slides', 'url': slides})
+                contents.append(d)
+
+            if zip_file:
+                d = common.copy()
+                d.update({'title': title + '-Zip', 'url': zip_file})
+                contents.append(d)
+
+            if formats:
+                d = common.copy()
+                d.update({'title': title, 'formats': formats})
+                contents.append(d)
+            return self.playlist_result(contents)
+        else:
+            return self._extract_list(content_path)
diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py
new file mode 100644 (file)
index 0000000..42c9af2
--- /dev/null
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class CharlieRoseIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://charlierose.com/videos/27996',
+        'md5': 'fda41d49e67d4ce7c2411fd2c4702e09',
+        'info_dict': {
+            'id': '27996',
+            'ext': 'mp4',
+            'title': 'Remembering Zaha Hadid',
+            'thumbnail': r're:^https?://.*\.jpg\?\d+',
+            'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.',
+            'subtitles': {
+                'en': [{
+                    'ext': 'vtt',
+                }],
+            },
+        },
+    }, {
+        'url': 'https://charlierose.com/videos/27996',
+        'only_matching': True,
+    }, {
+        'url': 'https://charlierose.com/episodes/30887?autoplay=true',
+        'only_matching': True,
+    }]
+
+    _PLAYER_BASE = 'https://charlierose.com/video/player/%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id)
+
+        title = remove_end(self._og_search_title(webpage), ' - Charlie Rose')
+
+        info_dict = self._parse_html5_media_entries(
+            self._PLAYER_BASE % video_id, webpage, video_id,
+            m3u8_entry_protocol='m3u8_native')[0]
+
+        self._sort_formats(info_dict['formats'])
+        self._remove_duplicate_formats(info_dict['formats'])
+
+        info_dict.update({
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
new file mode 100644 (file)
index 0000000..a459dcb
--- /dev/null
@@ -0,0 +1,111 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    lowercase_escape,
+    url_or_none,
+)
+
+
+class ChaturbateIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?chaturbate\.com/(?:fullvideo/?\?.*?\bb=)?(?P<id>[^/?&#]+)'
+    _TESTS = [{
+        'url': 'https://www.chaturbate.com/siswet19/',
+        'info_dict': {
+            'id': 'siswet19',
+            'ext': 'mp4',
+            'title': 're:^siswet19 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'age_limit': 18,
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Room is offline',
+    }, {
+        'url': 'https://chaturbate.com/fullvideo/?b=caylin',
+        'only_matching': True,
+    }, {
+        'url': 'https://en.chaturbate.com/siswet19/',
+        'only_matching': True,
+    }]
+
+    _ROOM_OFFLINE = 'Room is currently offline'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://chaturbate.com/%s/' % video_id, video_id,
+            headers=self.geo_verification_headers())
+
+        found_m3u8_urls = []
+
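+        # The room state is embedded as an initialRoomDossier JS object; its hls_source, when present, is the live m3u8 URL.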
+        data = self._parse_json(
+            self._search_regex(
+                r'initialRoomDossier\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+                webpage, 'data', default='{}', group='value'),
+            video_id, transform_source=lowercase_escape, fatal=False)
+        if data:
+            m3u8_url = url_or_none(data.get('hls_source'))
+            if m3u8_url:
+                found_m3u8_urls.append(m3u8_url)
+
+        if not found_m3u8_urls:
+            for m in re.finditer(
+                    r'(\\u002[27])(?P<url>http.+?\.m3u8.*?)\1', webpage):
+                found_m3u8_urls.append(lowercase_escape(m.group('url')))
+
+        if not found_m3u8_urls:
+            for m in re.finditer(
+                    r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
+                found_m3u8_urls.append(m.group('url'))
+
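+        # For each URL found, queue both the '_fast' variant and the plain playlist.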
+        m3u8_urls = []
+        for found_m3u8_url in found_m3u8_urls:
+            m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '')
+            for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url):
+                if m3u8_url not in m3u8_urls:
+                    m3u8_urls.append(m3u8_url)
+
+        if not m3u8_urls:
+            error = self._search_regex(
+                [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
+                 r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'],
+                webpage, 'error', group='error', default=None)
+            if not error:
+                if any(p in webpage for p in (
+                        self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')):
+                    error = self._ROOM_OFFLINE
+            if error:
+                raise ExtractorError(error, expected=True)
+            raise ExtractorError('Unable to find stream URL')
+
+        formats = []
+        for m3u8_url in m3u8_urls:
+            for known_id in ('fast', 'slow'):
+                if '_%s' % known_id in m3u8_url:
+                    m3u8_id = known_id
+                    break
+            else:
+                m3u8_id = None
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, ext='mp4',
+                # ffmpeg skips segments for fast m3u8
+                preference=-10 if m3u8_id == 'fast' else None,
+                m3u8_id=m3u8_id, fatal=False, live=True))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._live_title(video_id),
+            'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id,
+            'age_limit': self._rta_search(webpage),
+            'is_live': True,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
new file mode 100644 (file)
index 0000000..5aac212
--- /dev/null
@@ -0,0 +1,96 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..compat import compat_b64decode
+from ..utils import (
+    clean_html,
+    ExtractorError
+)
+
+
+class ChilloutzoneIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?chilloutzone\.net/video/(?P<id>[\w-]+)\.html'
+    _TESTS = [{
+        'url': 'http://www.chilloutzone.net/video/enemene-meck-alle-katzen-weg.html',
+        'md5': 'a76f3457e813ea0037e5244f509e66d1',
+        'info_dict': {
+            'id': 'enemene-meck-alle-katzen-weg',
+            'ext': 'mp4',
+            'title': 'Enemene Meck - Alle Katzen weg',
+            'description': 'Ist das der Umkehrschluss des Niesenden Panda-Babys?',
+        },
+    }, {
+        'note': 'Video hosted at YouTube',
+        'url': 'http://www.chilloutzone.net/video/eine-sekunde-bevor.html',
+        'info_dict': {
+            'id': '1YVQaAgHyRU',
+            'ext': 'mp4',
+            'title': '16 Photos Taken 1 Second Before Disaster',
+            'description': 'md5:58a8fcf6a459fe0a08f54140f0ad1814',
+            'uploader': 'BuzzFeedVideo',
+            'uploader_id': 'BuzzFeedVideo',
+            'upload_date': '20131105',
+        },
+    }, {
+        'note': 'Video hosted at Vimeo',
+        'url': 'http://www.chilloutzone.net/video/icon-blending.html',
+        'md5': '2645c678b8dc4fefcc0e1b60db18dac1',
+        'info_dict': {
+            'id': '85523671',
+            'ext': 'mp4',
+            'title': 'The Sunday Times - Icons',
+            'description': 're:(?s)^Watch the making of - makingoficons.com.{300,}',
+            'uploader': 'Us',
+            'uploader_id': 'usfilms',
+            'upload_date': '20140131'
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        base64_video_info = self._html_search_regex(
+            r'var cozVidData = "(.+?)";', webpage, 'video data')
+        decoded_video_info = compat_b64decode(base64_video_info).decode('utf-8')
+        video_info_dict = json.loads(decoded_video_info)
+
+        # get video information from dict
+        video_url = video_info_dict['mediaUrl']
+        description = clean_html(video_info_dict.get('description'))
+        title = video_info_dict['title']
+        native_platform = video_info_dict['nativePlatform']
+        native_video_id = video_info_dict['nativeVideoId']
+        source_priority = video_info_dict['sourcePriority']
+
+        # If nativePlatform is None, fall back to an embedded YouTube player
+        if native_platform is None:
+            youtube_url = YoutubeIE._extract_url(webpage)
+            if youtube_url:
+                return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
+
+        # Otherwise decide between the native source (e.g. YouTube or Vimeo)
+        # and the site's own CDN
+        if source_priority == 'native':
+            if native_platform == 'youtube':
+                return self.url_result(native_video_id, ie='Youtube')
+            if native_platform == 'vimeo':
+                return self.url_result(
+                    'http://vimeo.com/' + native_video_id, ie='Vimeo')
+
+        if not video_url:
+            raise ExtractorError('No video found')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': title,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py
new file mode 100644 (file)
index 0000000..8d75cdf
--- /dev/null
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_b64decode
+from ..utils import parse_duration
+
+
+class ChirbitIE(InfoExtractor):
+    IE_NAME = 'chirbit'
+    _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://chirb.it/be2abG',
+        'info_dict': {
+            'id': 'be2abG',
+            'ext': 'mp3',
+            'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
+            'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
+            'duration': 306,
+            'uploader': 'Gerryaudio',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
+        'only_matching': True,
+    }, {
+        'url': 'https://chirb.it/wp/MN58c2',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://chirb.it/%s' % audio_id, audio_id)
+
+        data_fd = self._search_regex(
+            r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'data fd', group='url')
+
+        # Reverse engineered from https://chirb.it/js/chirbit.player.js (look
+        # for soundURL)
+        audio_url = compat_b64decode(data_fd[::-1]).decode('utf-8')
+
+        title = self._search_regex(
+            r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
+        description = self._search_regex(
+            r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
+            webpage, 'description', default=None)
+        duration = parse_duration(self._search_regex(
+            r'class=["\']c-length["\'][^>]*>([^<]+)',
+            webpage, 'duration', fatal=False))
+        uploader = self._search_regex(
+            r'id=["\']chirbit-username["\'][^>]*>([^<]+)',
+            webpage, 'uploader', fatal=False)
+
+        return {
+            'id': audio_id,
+            'url': audio_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'uploader': uploader,
+        }
+
+
+class ChirbitProfileIE(InfoExtractor):
+    IE_NAME = 'chirbit:profile'
+    _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://chirbit.com/ScarletBeauty',
+        'info_dict': {
+            'id': 'ScarletBeauty',
+        },
+        'playlist_mincount': 3,
+    }
+
+    def _real_extract(self, url):
+        profile_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, profile_id)
+
+        entries = [
+            self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
+            for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
+
+        return self.playlist_result(entries, profile_id)
diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py
new file mode 100644 (file)
index 0000000..b861d54
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+    xpath_text,
+)
+
+
+class CinchcastIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
+        'info_dict': {
+            'id': '5258197',
+            'ext': 'mp3',
+            'title': 'Train Your Brain to Up Your Game with Coach Mandy',
+            'upload_date': '20130816',
+        },
+    }, {
+        # The actual test is run by the generic extractor; look for undergroundwellness
+        'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        doc = self._download_xml(
+            'http://www.blogtalkradio.com/playerasset/mrss?assetType=single&assetId=%s' % video_id,
+            video_id)
+
+        item = doc.find('.//item')
+        title = xpath_text(item, './title', fatal=True)
+        date_str = xpath_text(
+            item, './{http://developer.longtailvideo.com/trac/}date')
+        upload_date = unified_strdate(date_str, day_first=False)
+        # duration is present but wrong
+        formats = [{
+            'format_id': 'main',
+            'url': item.find('./{http://search.yahoo.com/mrss/}content').attrib['url'],
+        }]
+        backup_url = xpath_text(
+            item, './{http://developer.longtailvideo.com/trac/}backupContent')
+        if backup_url:
+            formats.append({
+                'preference': 2,  # seems to be more reliable
+                'format_id': 'backup',
+                'url': backup_url,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cinemax.py b/youtube_dl/extractor/cinemax.py
new file mode 100644 (file)
index 0000000..7f89d33
--- /dev/null
@@ -0,0 +1,29 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .hbo import HBOBaseIE
+
+
+class CinemaxIE(HBOBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P<path>[^/]+/video/[0-9a-z-]+-(?P<id>\d+))'
+    _TESTS = [{
+        'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903',
+        'md5': '82e0734bba8aa7ef526c9dd00cf35a05',
+        'info_dict': {
+            'id': '20126903',
+            'ext': 'mp4',
+            'title': 'S1 Ep 1: Recap',
+        },
+        'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'],
+    }, {
+        'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        path, video_id = re.match(self._VALID_URL, url).groups()
+        info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id)
+        info['id'] = video_id
+        return info
diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py
new file mode 100644 (file)
index 0000000..da404e4
--- /dev/null
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    clean_html,
+    float_or_none,
+    int_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class CiscoLiveBaseIE(InfoExtractor):
+    # These appear to be constant across all Cisco Live presentations
+    # and are not tied to any user session or event
+    RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s'
+    RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz'
+    RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye'
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s'
+
+    HEADERS = {
+        'Origin': 'https://ciscolive.cisco.com',
+        'rfApiProfileId': RAINFOCUS_API_PROFILE_ID,
+        'rfWidgetId': RAINFOCUS_WIDGET_ID,
+    }
+
+    def _call_api(self, ep, rf_id, query, referrer, note=None):
+        headers = self.HEADERS.copy()
+        headers['Referer'] = referrer
+        return self._download_json(
+            self.RAINFOCUS_API_URL % ep, rf_id, note=note,
+            data=urlencode_postdata(query), headers=headers)
+
+    def _parse_rf_item(self, rf_item):
+        event_name = rf_item.get('eventName')
+        title = rf_item['title']
+        description = clean_html(rf_item.get('abstract'))
+        presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName'])
+        bc_id = rf_item['videos'][0]['url']
+        bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id
+        duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length']))
+        location = try_get(rf_item, lambda x: x['times'][0]['room'])
+
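+        # The reported length appears to be in minutes; convert it to seconds.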
+        if duration:
+            duration = duration * 60
+
+        return {
+            '_type': 'url_transparent',
+            'url': bc_url,
+            'ie_key': 'BrightcoveNew',
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'creator': presenter_name,
+            'location': location,
+            'series': event_name,
+        }
+
+
+class CiscoLiveSessionIE(CiscoLiveBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P<id>[^/?&]+)'
+    _TESTS = [{
+        'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs',
+        'md5': 'c98acf395ed9c9f766941c70f5352e22',
+        'info_dict': {
+            'id': '5803694304001',
+            'ext': 'mp4',
+            'title': '13 Smart Automations to Monitor Your Cisco IOS Network',
+            'description': 'md5:ec4a436019e09a918dec17714803f7cc',
+            'timestamp': 1530305395,
+            'upload_date': '20180629',
+            'uploader_id': '5647924234001',
+            'location': '16B Mezz.',
+        },
+    }, {
+        'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        rf_id = self._match_id(url)
+        rf_result = self._call_api('session', rf_id, {'id': rf_id}, url)
+        return self._parse_rf_item(rf_result['items'][0])
+
+
+class CiscoLiveSearchIE(CiscoLiveBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)'
+    _TESTS = [{
+        'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/',
+        'info_dict': {
+            'title': 'Search query',
+        },
+        'playlist_count': 5,
+    }, {
+        'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url)
+
+    @staticmethod
+    def _check_bc_id_exists(rf_item):
+        return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None
+
+    def _entries(self, query, url):
+        query['size'] = 50
+        query['from'] = 0
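+        # Page through the search API, advancing 'from' until the reported total is exhausted.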
+        for page_num in itertools.count(1):
+            results = self._call_api(
+                'search', None, query, url,
+                'Downloading search JSON page %d' % page_num)
+            sl = try_get(results, lambda x: x['sectionList'][0], dict)
+            if sl:
+                results = sl
+            items = results.get('items')
+            if not items or not isinstance(items, list):
+                break
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                if not self._check_bc_id_exists(item):
+                    continue
+                yield self._parse_rf_item(item)
+            size = int_or_none(results.get('size'))
+            if size is not None:
+                query['size'] = size
+            total = int_or_none(results.get('total'))
+            if total is not None and query['from'] + query['size'] > total:
+                break
+            query['from'] += query['size']
+
+    def _real_extract(self, url):
+        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        query['type'] = 'session'
+        return self.playlist_result(
+            self._entries(query, url), playlist_title='Search query')
diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py
new file mode 100644 (file)
index 0000000..505bdbe
--- /dev/null
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unescapeHTML,
+)
+
+
+class CJSWIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620',
+        'md5': 'cee14d40f1e9433632c56e3d14977120',
+        'info_dict': {
+            'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41',
+            'ext': 'mp3',
+            'title': 'Freshly Squeezed – Episode June 20, 2017',
+            'description': 'md5:c967d63366c3898a80d0c7b0ff337202',
+            'series': 'Freshly Squeezed',
+            'episode_id': '20170620',
+        },
+    }, {
+        # no description
+        'url': 'http://cjsw.com/program/road-pops/episode/20170707/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        program, episode_id = mobj.group('program', 'id')
+        audio_id = '%s/%s' % (program, episode_id)
+
+        webpage = self._download_webpage(url, episode_id)
+
+        title = unescapeHTML(self._search_regex(
+            (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)',
+             r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='title'))
+
+        audio_url = self._search_regex(
+            r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'audio url', group='url')
+
+        audio_id = self._search_regex(
+            r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3',
+            audio_url, 'audio id', default=audio_id)
+
+        formats = [{
+            'url': audio_url,
+            'ext': determine_ext(audio_url, 'mp3'),
+            'vcodec': 'none',
+        }]
+
+        description = self._html_search_regex(
+            r'<p>(?P<description>.+?)</p>', webpage, 'description',
+            default=None)
+        series = self._search_regex(
+            r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage,
+            'series', default=program, group='name')
+
+        return {
+            'id': audio_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'series': series,
+            'episode_id': episode_id,
+        }
diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py
new file mode 100644 (file)
index 0000000..f2ca7a3
--- /dev/null
@@ -0,0 +1,79 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    url_or_none,
+)
+
+
+class CliphunterIE(InfoExtractor):
+    IE_NAME = 'cliphunter'
+
+    _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/
+        (?P<id>[0-9]+)/
+        (?P<seo>.+?)(?:$|[#\?])
+    '''
+    _TESTS = [{
+        'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo',
+        'md5': 'b7c9bbd4eb3a226ab91093714dcaa480',
+        'info_dict': {
+            'id': '1012420',
+            'ext': 'flv',
+            'title': 'Fun Jynx Maze solo',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'age_limit': 18,
+        },
+        'skip': 'Video gone',
+    }, {
+        'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz',
+        'md5': '55a723c67bfc6da6b0cfa00d55da8a27',
+        'info_dict': {
+            'id': '2019449',
+            'ext': 'mp4',
+            'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'age_limit': 18,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = self._search_regex(
+            r'mediaTitle = "([^"]+)"', webpage, 'title')
+
+        gexo_files = self._parse_json(
+            self._search_regex(
+                r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'),
+            video_id)
+
+        formats = []
+        for format_id, f in gexo_files.items():
+            video_url = url_or_none(f.get('url'))
+            if not video_url:
+                continue
+            fmt = f.get('fmt')
+            height = f.get('h')
+            format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': int_or_none(f.get('w')),
+                'height': int_or_none(height),
+                'tbr': int_or_none(f.get('br')),
+            })
+        self._sort_formats(formats)
+
+        thumbnail = self._search_regex(
+            r"var\s+mov_thumb\s*=\s*'([^']+)';",
+            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'age_limit': self._rta_search(webpage),
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py
new file mode 100644 (file)
index 0000000..a1a7a77
--- /dev/null
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    qualities,
+)
+
+
+class ClippitIE(InfoExtractor):
+
+    _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)'
+    _TEST = {
+        'url': 'https://www.clippituser.tv/c/evmgm',
+        'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09',
+        'info_dict': {
+            'id': 'evmgm',
+            'ext': 'mp4',
+            'title': 'Bye bye Brutus. #BattleBots  - Clippit',
+            'uploader': 'lizllove',
+            'uploader_url': 'https://www.clippituser.tv/p/lizllove',
+            'timestamp': 1472183818,
+            'upload_date': '20160826',
+            'description': 'BattleBots | ABC',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title')
+
+        FORMATS = ('sd', 'hd')
+        quality = qualities(FORMATS)
+        formats = []
+        for format_id in FORMATS:
+            url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id,
+                                          webpage, 'url', fatal=False)
+            if not url:
+                continue
+            match = re.search(r'/(?P<height>\d+)\.mp4', url)
+            formats.append({
+                'url': url,
+                'format_id': format_id,
+                'quality': quality(format_id),
+                'height': int(match.group('height')) if match else None,
+            })
+
+        uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n',
+                                           webpage, 'uploader', fatal=False)
+        uploader_url = ('https://www.clippituser.tv/p/' + uploader
+                        if uploader else None)
+
+        timestamp = self._html_search_regex(r'datetime="(.+?)"',
+                                            webpage, 'date', fatal=False)
+        thumbnail = self._html_search_regex(r'data-image="(.+?)"',
+                                            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+            'timestamp': parse_iso8601(timestamp),
+            'description': self._og_search_description(webpage),
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py
new file mode 100644 (file)
index 0000000..d55b26d
--- /dev/null
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .onet import OnetBaseIE
+
+
+class ClipRsIE(OnetBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
+    _TEST = {
+        'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
+        'md5': 'c412d57815ba07b56f9edc7b5d6a14e5',
+        'info_dict': {
+            'id': '1488842.1399140381',
+            'ext': 'mp4',
+            'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli',
+            'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026',
+            'duration': 229,
+            'timestamp': 1459850243,
+            'upload_date': '20160405',
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        mvp_id = self._search_mvp_id(webpage)
+
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict['display_id'] = display_id
+
+        return info_dict
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
new file mode 100644 (file)
index 0000000..6cdb42f
--- /dev/null
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    find_xpath_attr,
+    fix_xml_ampersands
+)
+
+
+class ClipsyndicateIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+        'md5': '4d7d549451bad625e0ff3d7bd56d776c',
+        'info_dict': {
+            'id': '4629301',
+            'ext': 'mp4',
+            'title': 'Brick Briscoe',
+            'duration': 612,
+            'thumbnail': r're:^https?://.+\.jpg',
+        },
+    }, {
+        'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        js_player = self._download_webpage(
+            'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
+            video_id, 'Downloading player')
+        # it includes a required token
+        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars')
+
+        pdoc = self._download_xml(
+            'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
+            video_id, 'Downloading video info',
+            transform_source=fix_xml_ampersands)
+
+        track_doc = pdoc.find('trackList/track')
+
+        def find_param(name):
+            node = find_xpath_attr(track_doc, './/param', 'name', name)
+            if node is not None:
+                return node.attrib['value']
+
+        return {
+            'id': video_id,
+            'title': find_param('title'),
+            'url': track_doc.find('location').text,
+            'thumbnail': find_param('thumbnail'),
+            'duration': int(find_param('duration')),
+        }
diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py
new file mode 100644 (file)
index 0000000..26243d5
--- /dev/null
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class CloserToTruthIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
+        'info_dict': {
+            'id': '0_zof1ktre',
+            'display_id': 'solutions-the-mind-body-problem',
+            'ext': 'mov',
+            'title': 'Solutions to the Mind-Body Problem?',
+            'upload_date': '20140221',
+            'timestamp': 1392956007,
+            'uploader_id': 'CTTXML'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
+        'info_dict': {
+            'id': '0_iuxai6g6',
+            'display_id': 'how-do-brains-work',
+            'ext': 'mov',
+            'title': 'How do Brains Work?',
+            'upload_date': '20140221',
+            'timestamp': 1392956024,
+            'uploader_id': 'CTTXML'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://closertotruth.com/interviews/1725',
+        'info_dict': {
+            'id': '1725',
+            'title': 'AyaFr-002',
+        },
+        'playlist_mincount': 2,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        partner_id = self._search_regex(
+            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
+            webpage, 'kaltura partner_id')
+
+        title = self._search_regex(
+            r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
+
+        select = self._search_regex(
+            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
+            webpage, 'select version', default=None)
+        if select:
+            entry_ids = set()
+            entries = []
+            for mobj in re.finditer(
+                    r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
+                    webpage):
+                entry_id = mobj.group('id')
+                if entry_id in entry_ids:
+                    continue
+                entry_ids.add(entry_id)
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+                    'ie_key': 'Kaltura',
+                    'title': mobj.group('title'),
+                })
+            if entries:
+                return self.playlist_result(entries, display_id, title)
+
+        entry_id = self._search_regex(
+            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
+            webpage, 'kaltura entry_id', group='id')
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+            'ie_key': 'Kaltura',
+            'title': title
+        }
diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py
new file mode 100644 (file)
index 0000000..2fdcfbb
--- /dev/null
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import re
+
+from .common import InfoExtractor
+
+
+class CloudflareStreamIE(InfoExtractor):
+    _DOMAIN_RE = r'(?:cloudflarestream\.com|(?:videodelivery|bytehighway)\.net)'
+    _EMBED_RE = r'embed\.%s/embed/[^/]+\.js\?.*?\bvideo=' % _DOMAIN_RE
+    _ID_RE = r'[\da-f]{32}|[\w-]+\.[\w-]+\.[\w-]+'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:watch\.)?%s/|
+                            %s
+                        )
+                        (?P<id>%s)
+                    ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE)
+    _TESTS = [{
+        'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717',
+        'info_dict': {
+            'id': '31c9291ab41fac05471db4e73aa11717',
+            'ext': 'mp4',
+            'title': '31c9291ab41fac05471db4e73aa11717',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://watch.cloudflarestream.com/9df17203414fd1db3e3ed74abbe936c1',
+        'only_matching': True,
+    }, {
+        'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd',
+        'only_matching': True,
+    }, {
+        'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE),
+                webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net'
+        base_url = 'https://%s/%s/' % (domain, video_id)
+        if '.' in video_id:
+            video_id = self._parse_json(base64.urlsafe_b64decode(
+                video_id.split('.')[1]), video_id)['sub']
+        manifest_base_url = base_url + 'manifest/video.'
+
+        formats = self._extract_m3u8_formats(
+            manifest_base_url + 'm3u8', video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False)
+        formats.extend(self._extract_mpd_formats(
+            manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'thumbnail': base_url + 'thumbnails/thumbnail.jpg',
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
new file mode 100644 (file)
index 0000000..85ca20e
--- /dev/null
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    str_to_int,
+    unified_strdate,
+)
+
+
+class CloudyIE(InfoExtractor):
+    IE_DESC = 'cloudy.ec'
+    _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.cloudy.ec/v/af511e2527aac',
+        'md5': '29832b05028ead1b58be86bf319397ca',
+        'info_dict': {
+            'id': 'af511e2527aac',
+            'ext': 'mp4',
+            'title': 'Funny Cats and Animals Compilation june 2013',
+            'upload_date': '20130913',
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://www.cloudy.ec/embed.php', video_id, query={
+                'id': video_id,
+                'playerPage': 1,
+                'autoplay': 1,
+            })
+
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+
+        webpage = self._download_webpage(
+            'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False)
+
+        if webpage:
+            info.update({
+                'title': self._search_regex(
+                    r'<h\d[^>]*>([^<]+)<', webpage, 'title'),
+                'upload_date': unified_strdate(self._search_regex(
+                    r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage,
+                    'upload date', fatal=False)),
+                'view_count': str_to_int(self._search_regex(
+                    r'([\d,.]+) views<', webpage, 'view count', fatal=False)),
+            })
+
+        if not info.get('title'):
+            info['title'] = video_id
+
+        info['id'] = video_id
+
+        return info
diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py
new file mode 100644 (file)
index 0000000..98f9cb5
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    qualities,
+)
+
+
+class ClubicIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
+        'md5': '1592b694ba586036efac1776b0b43cd3',
+        'info_dict': {
+            'id': '448474',
+            'ext': 'mp4',
+            'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité',
+            'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',
+            'thumbnail': r're:^http://img\.clubic\.com/.*\.jpg$',
+        }
+    }, {
+        'url': 'http://www.clubic.com/video/video-clubic-week-2-0-apple-iphone-6s-et-plus-mais-surtout-le-pencil-469792.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id
+        player_page = self._download_webpage(player_url, video_id)
+
+        config = self._parse_json(self._search_regex(
+            r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page,
+            'configuration'), video_id)
+
+        video_info = config['videoInfo']
+        sources = config['sources']
+        quality_order = qualities(['sd', 'hq'])
+
+        formats = [{
+            'format_id': src['streamQuality'],
+            'url': src['src'],
+            'quality': quality_order(src['streamQuality']),
+        } for src in sources]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_info['title'],
+            'formats': formats,
+            'description': clean_html(video_info.get('description')),
+            'thumbnail': config.get('poster'),
+        }
diff --git a/youtube_dl/extractor/clyp.py b/youtube_dl/extractor/clyp.py
new file mode 100644 (file)
index 0000000..06d04de
--- /dev/null
@@ -0,0 +1,82 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    float_or_none,
+    unified_timestamp,
+)
+
+
+class ClypIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)'
+    _TESTS = [{
+        'url': 'https://clyp.it/ojz2wfah',
+        'md5': '1d4961036c41247ecfdcc439c0cddcbb',
+        'info_dict': {
+            'id': 'ojz2wfah',
+            'ext': 'mp3',
+            'title': 'Krisson80 - bits wip wip',
+            'description': '#Krisson80BitsWipWip #chiptune\n#wip',
+            'duration': 263.21,
+            'timestamp': 1443515251,
+            'upload_date': '20150929',
+        },
+    }, {
+        'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d',
+        'info_dict': {
+            'id': 'b04p1odi',
+            'ext': 'mp3',
+            'title': 'GJ! (Reward Edit)',
+            'description': 'Metal Resistance (THE ONE edition)',
+            'duration': 177.789,
+            'timestamp': 1528241278,
+            'upload_date': '20180605',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        token = qs.get('token', [None])[0]
+
+        query = {}
+        if token:
+            query['token'] = token
+
+        metadata = self._download_json(
+            'https://api.clyp.it/%s' % audio_id, audio_id, query=query)
+
+        formats = []
+        for secure in ('', 'Secure'):
+            for ext in ('Ogg', 'Mp3'):
+                format_id = '%s%s' % (secure, ext)
+                format_url = metadata.get('%sUrl' % format_id)
+                if format_url:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'vcodec': 'none',
+                    })
+        self._sort_formats(formats)
+
+        title = metadata['Title']
+        description = metadata.get('Description')
+        duration = float_or_none(metadata.get('Duration'))
+        timestamp = unified_timestamp(metadata.get('DateCreated'))
+
+        return {
+            'id': audio_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
new file mode 100644 (file)
index 0000000..e701fbe
--- /dev/null
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVIE
+
+
+class CMTIE(MTVIE):
+    IE_NAME = 'cmt.com'
+    _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
+        'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2',
+        'info_dict': {
+            'id': '989124',
+            'ext': 'mp4',
+            'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
+            'description': 'Blame It All On My Roots',
+        },
+        'skip': 'Video not available',
+    }, {
+        'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908',
+        'md5': 'e61a801ca4a183a466c08bd98dccbb1c',
+        'info_dict': {
+            'id': '1504699',
+            'ext': 'mp4',
+            'title': 'Still The King Ep. 109 in 3 Minutes',
+            'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9.',
+            'timestamp': 1469421000.0,
+            'upload_date': '20160725',
+        },
+    }, {
+        'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes',
+        'only_matching': True,
+    }]
+
+    def _extract_mgid(self, webpage):
+        mgid = self._search_regex(
+            r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1',
+            webpage, 'mgid', group='mgid', default=None)
+        if not mgid:
+            mgid = self._extract_triforce_mgid(webpage)
+        return mgid
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        mgid = self._extract_mgid(webpage)
+        return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py
new file mode 100644 (file)
index 0000000..6889b0f
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class CNBCIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://video.cnbc.com/gallery/?video=3000503714',
+        'info_dict': {
+            'id': '3000503714',
+            'ext': 'mp4',
+            'title': 'Fighting zombies is big business',
+            'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e',
+            'timestamp': 1459332000,
+            'upload_date': '20160330',
+            'uploader': 'NBCU-CNBC',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(
+                'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id,
+                {'force_smil_url': True}),
+            'id': video_id,
+        }
+
+
+class CNBCVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+    _TEST = {
+        'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
+        'info_dict': {
+            'id': '7000031301',
+            'ext': 'mp4',
+            'title': "Trump: I don't necessarily agree with raising rates",
+            'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3',
+            'timestamp': 1531958400,
+            'upload_date': '20180719',
+            'uploader': 'NBCU-CNBC',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            r'content_id["\']\s*:\s*["\'](\d+)', webpage, 'video id')
+        return self.url_result(
+            'http://video.cnbc.com/gallery/?video=%s' % video_id,
+            CNBCIE.ie_key())
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
new file mode 100644 (file)
index 0000000..774b710
--- /dev/null
@@ -0,0 +1,144 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .turner import TurnerBaseIE
+from ..utils import url_basename
+
+
+class CNNIE(TurnerBaseIE):
+    _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
+
+    _TESTS = [{
+        'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
+        'md5': '3e6121ea48df7e2259fe73a0628605c4',
+        'info_dict': {
+            'id': 'sports/2013/06/09/nadal-1-on-1.cnn',
+            'ext': 'mp4',
+            'title': 'Nadal wins 8th French Open title',
+            'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.',
+            'duration': 135,
+            'upload_date': '20130609',
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29',
+        'md5': 'b5cc60c60a3477d185af8f19a2a26f4e',
+        'info_dict': {
+            'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology',
+            'ext': 'mp4',
+            'title': "Student's epic speech stuns new freshmen",
+            'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"",
+            'upload_date': '20130821',
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html',
+        'md5': 'f14d02ebd264df951feb2400e2c25a1b',
+        'info_dict': {
+            'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln',
+            'ext': 'mp4',
+            'title': 'Nashville Ep. 1: Hand crafted skateboards',
+            'description': 'md5:e7223a503315c9f150acac52e76de086',
+            'upload_date': '20141222',
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
+        'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
+        'info_dict': {
+            'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
+            'ext': 'mp4',
+            'title': '5 stunning stats about Netflix',
+            'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
+            'upload_date': '20160819',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
+        'only_matching': True,
+    }, {
+        'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
+        'only_matching': True,
+    }, {
+        'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
+        'only_matching': True,
+    }]
+
+    _CONFIG = {
+        # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
+        'edition': {
+            'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
+            'media_src': 'http://pmd.cdn.turner.com/cnn/big',
+        },
+        # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
+        'money': {
+            'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
+            'media_src': 'http://ht3.cdn.turner.com/money/big',
+        },
+    }
+
+    def _extract_timestamp(self, video_data):
+        # TODO: fix timestamp extraction
+        return None
+
+    def _real_extract(self, url):
+        sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
+        if sub_domain not in ('money', 'edition'):
+            sub_domain = 'edition'
+        config = self._CONFIG[sub_domain]
+        return self._extract_cvp_info(
+            config['data_src'] % path, page_title, {
+                'default': {
+                    'media_src': config['media_src'],
+                }
+            })
+
+
+class CNNBlogsIE(InfoExtractor):
+    _VALID_URL = r'https?://[^\.]+\.blogs\.cnn\.com/.+'
+    _TEST = {
+        'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/',
+        'md5': '3e56f97b0b6ffb4b79f4ea0749551084',
+        'info_dict': {
+            'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn',
+            'ext': 'mp4',
+            'title': 'Criminalizing journalism?',
+            'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.',
+            'upload_date': '20140209',
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+        'add_ie': ['CNN'],
+    }
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, url_basename(url))
+        cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
+        return self.url_result(cnn_url, CNNIE.ie_key())
+
+
+class CNNArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
+    _TEST = {
+        'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
+        'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
+        'info_dict': {
+            'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn',
+            'ext': 'mp4',
+            'title': 'Obama: Cyberattack not an act of war',
+            'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b',
+            'upload_date': '20141221',
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+        'add_ie': ['CNN'],
+    }
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, url_basename(url))
+        cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
+        return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
new file mode 100644 (file)
index 0000000..d08b909
--- /dev/null
@@ -0,0 +1,142 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+from .common import InfoExtractor
+
+
+class ComedyCentralIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
+        (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes)))
+        /(?P<title>.*)'''
+    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+        'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
+        'info_dict': {
+            'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            'ext': 'mp4',
+            'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother',
+            'description': 'After a certain point, breastfeeding becomes c**kblocking.',
+            'timestamp': 1376798400,
+            'upload_date': '20130818',
+        },
+    }, {
+        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
+        'only_matching': True,
+    }]
+
+
+class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
+        (?:full-episodes|shows(?=/[^/]+/full-episodes))
+        /(?P<id>[^?]+)'''
+    _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028',
+        'info_dict': {
+            'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."',
+            'title': 'November 28, 2016 - Ryan Speedo Green',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1')
+        videos_info = self._get_videos_info(mgid)
+        return videos_info
+
+
+class ToshIE(MTVServicesInfoExtractor):
+    IE_DESC = 'Tosh.0'
+    _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)'
+    _FEED_URL = 'http://tosh.cc.com/feeds/mrss'
+
+    _TESTS = [{
+        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
+        'info_dict': {
+            'description': 'Tosh asked fans to share their summer plans.',
+            'title': 'Twitter Users Share Summer Plans',
+        },
+        'playlist': [{
+            'md5': 'f269e88114c1805bb6d7653fecea9e06',
+            'info_dict': {
+                'id': '90498ec2-ed00-11e0-aca6-0026b9414f30',
+                'ext': 'mp4',
+                'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans',
+                'description': 'Tosh asked fans to share their summer plans.',
+                'thumbnail': r're:^https?://.*\.jpg',
+                # It really is reported to be published in the year 2077
+                'upload_date': '20770610',
+                'timestamp': 3390510600,
+                'subtitles': {
+                    'en': 'mincount:3',
+                },
+            },
+        }]
+    }, {
+        'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp',
+        'only_matching': True,
+    }]
+
+
+class ComedyCentralTVIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4',
+        'info_dict': {
+            'id': 'local_playlist-f99b626bdfe13568579a',
+            'ext': 'flv',
+            'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.comedycentral.tv/shows/1074-workaholics',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        mrss_url = self._search_regex(
+            r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'mrss url', group='url')
+
+        return self._get_videos_info_from_url(mrss_url, video_id)
+
+
+class ComedyCentralShortnameIE(InfoExtractor):
+    _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$'
+    _TESTS = [{
+        'url': ':tds',
+        'only_matching': True,
+    }, {
+        'url': ':thedailyshow',
+        'only_matching': True,
+    }, {
+        'url': ':theopposition',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        shortcut_map = {
+            'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
+            'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
+            'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes',
+        }
+        return self.url_result(shortcut_map[video_id])
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
new file mode 100644 (file)
index 0000000..c1ea5d8
--- /dev/null
@@ -0,0 +1,3013 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import datetime
+import hashlib
+import json
+import netrc
+import os
+import random
+import re
+import socket
+import sys
+import time
+import math
+
+from ..compat import (
+    compat_cookiejar_Cookie,
+    compat_cookies,
+    compat_etree_Element,
+    compat_etree_fromstring,
+    compat_getpass,
+    compat_integer_types,
+    compat_http_client,
+    compat_os_name,
+    compat_str,
+    compat_urllib_error,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlencode,
+    compat_urllib_request,
+    compat_urlparse,
+    compat_xml_parse_error,
+)
+from ..downloader.f4m import (
+    get_base_url,
+    remove_encrypted_media,
+)
+from ..utils import (
+    NO_DEFAULT,
+    age_restricted,
+    base_url,
+    bug_reports_message,
+    clean_html,
+    compiled_regex_type,
+    determine_ext,
+    determine_protocol,
+    dict_get,
+    error_to_compat_str,
+    ExtractorError,
+    extract_attributes,
+    fix_xml_ampersands,
+    float_or_none,
+    GeoRestrictedError,
+    GeoUtils,
+    int_or_none,
+    js_to_json,
+    JSON_LD_RE,
+    mimetype2ext,
+    orderedSet,
+    parse_bitrate,
+    parse_codecs,
+    parse_duration,
+    parse_iso8601,
+    parse_m3u8_attributes,
+    parse_resolution,
+    RegexNotFoundError,
+    sanitized_Request,
+    sanitize_filename,
+    str_or_none,
+    strip_or_none,
+    unescapeHTML,
+    unified_strdate,
+    unified_timestamp,
+    update_Request,
+    update_url_query,
+    urljoin,
+    url_basename,
+    url_or_none,
+    xpath_element,
+    xpath_text,
+    xpath_with_ns,
+)
+
+
+class InfoExtractor(object):
+    """Information Extractor class.
+
+    Information extractors are the classes that, given a URL, extract
+    information about the video (or videos) the URL refers to. This
+    information includes the real video URL, the video title, author and
+    others. The information is stored in a dictionary which is then
+    passed to the YoutubeDL. The YoutubeDL processes this
+    information possibly downloading the video to the file system, among
+    other possible outcomes.
+
+    The type field determines the type of the result.
+    By far the most common value (and the default if _type is missing) is
+    "video", which indicates a single video.
+
+    For a video, the dictionaries must include the following fields:
+
+    id:             Video identifier.
+    title:          Video title, unescaped.
+
+    Additionally, it must contain either a formats entry or a url one:
+
+    formats:        A list of dictionaries for each format available, ordered
+                    from worst to best quality.
+
+                    Potential fields:
+                    * url        The mandatory URL representing the media:
+                                   for plain file media - HTTP URL of this file,
+                                   for RTMP - RTMP URL,
+                                   for HLS - URL of the M3U8 media playlist,
+                                   for HDS - URL of the F4M manifest,
+                                   for DASH
+                                     - HTTP URL to plain file media (in case of
+                                       unfragmented media)
+                                     - URL of the MPD manifest or base URL
+                                       representing the media if MPD manifest
+                                       is parsed from a string (in case of
+                                       fragmented media)
+                                   for MSS - URL of the ISM manifest.
+                    * manifest_url
+                                 The URL of the manifest file in case of
+                                 fragmented media:
+                                   for HLS - URL of the M3U8 master playlist,
+                                   for HDS - URL of the F4M manifest,
+                                   for DASH - URL of the MPD manifest,
+                                   for MSS - URL of the ISM manifest.
+                    * ext        Will be calculated from URL if missing
+                    * format     A human-readable description of the format
+                                 ("mp4 container with h264/opus").
+                                 Calculated from the format_id, width, height,
+                                 and format_note fields if missing.
+                    * format_id  A short description of the format
+                                 ("mp4_h264_opus" or "19").
+                                 Technically optional, but strongly recommended.
+                    * format_note Additional info about the format
+                                 ("3D" or "DASH video")
+                    * width      Width of the video, if known
+                    * height     Height of the video, if known
+                    * resolution Textual description of width and height
+                    * tbr        Average bitrate of audio and video in KBit/s
+                    * abr        Average audio bitrate in KBit/s
+                    * acodec     Name of the audio codec in use
+                    * asr        Audio sampling rate in Hertz
+                    * vbr        Average video bitrate in KBit/s
+                    * fps        Frame rate
+                    * vcodec     Name of the video codec in use
+                    * container  Name of the container format
+                    * filesize   The number of bytes, if known in advance
+                    * filesize_approx  An estimate for the number of bytes
+                    * player_url SWF Player URL (used for rtmpdump).
+                    * protocol   The protocol that will be used for the actual
+                                 download, lower-case.
+                                 "http", "https", "rtsp", "rtmp", "rtmpe",
+                                 "m3u8", "m3u8_native" or "http_dash_segments".
+                    * fragment_base_url
+                                 Base URL for fragments. Each fragment's path
+                                 value (if present) will be relative to
+                                 this URL.
+                    * fragments  A list of fragments of a fragmented media.
+                                 Each fragment entry must contain either a url
+                                 or a path. If a url is present, it should be
+                                 used by the client. Otherwise, both path and
+                                 fragment_base_url must be present. Here is
+                                 the list of all potential fields:
+                                 * "url" - fragment's URL
+                                 * "path" - fragment's path relative to
+                                            fragment_base_url
+                                 * "duration" (optional, int or float)
+                                 * "filesize" (optional, int)
+                    * preference Order number of this format. If this field is
+                                 present and not None, the formats get sorted
+                                 by this field, regardless of all other values.
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
+                                 < -1000 to hide the format (if there is
+                                    another one which is strictly better)
+                    * language   Language code, e.g. "de" or "en-US".
+                    * language_preference  Is this in the language mentioned in
+                                 the URL?
+                                 10 if it's what the URL is about,
+                                 -1 for default (don't know),
+                                 -10 otherwise, other values reserved for now.
+                    * quality    Order number of the video quality of this
+                                 format, irrespective of the file format.
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
+                    * source_preference  Order number for this video source
+                                  (quality takes higher priority)
+                                 -1 for default (order by other properties),
+                                 -2 or smaller for less than default.
+                    * http_headers  A dictionary of additional HTTP headers
+                                 to add to the request.
+                    * stretched_ratio  If given and not 1, indicates that the
+                                 video's pixels are not square.
+                                 width : height ratio as float.
+                    * no_resume  The server does not support resuming the
+                                 (HTTP or RTMP) download. Boolean.
+                    * downloader_options  A dictionary of downloader options as
+                                 described in FileDownloader
+
+    url:            Final video URL.
+    ext:            Video filename extension.
+    format:         The video format, defaults to ext (used for --get-format)
+    player_url:     SWF Player URL (used for rtmpdump).
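+
+    As a rough, purely illustrative sketch (all values invented), a minimal
+    single-video result can therefore take either of these two shapes:
+
+        {'id': '42', 'title': 'Some title', 'url': 'https://example.com/v.mp4'}
+
+        {'id': '42', 'title': 'Some title', 'formats': [{
+            'url': 'https://example.com/v.mp4',
+            'format_id': 'hd',
+            'height': 720,
+        }]}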
+
+    The following fields are optional:
+
+    alt_title:      A secondary title of the video.
+    display_id:     An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats"
+    thumbnails:     A list of dictionaries, with the following entries:
+                        * "id" (optional, string) - Thumbnail format ID
+                        * "url"
+                        * "preference" (optional, int) - quality of the image
+                        * "width" (optional, int)
+                        * "height" (optional, int)
+                        * "resolution" (optional, string "{width}x{height}",
+                                        deprecated)
+                        * "filesize" (optional, int)
+    thumbnail:      Full URL to a video thumbnail image.
+    description:    Full video description.
+    uploader:       Full name of the video uploader.
+    license:        License name the video is licensed under.
+    creator:        The creator of the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
+    timestamp:      UNIX timestamp of the moment the video became available.
+    upload_date:    Video upload date (YYYYMMDD).
+                    If not explicitly set, calculated from timestamp.
+    uploader_id:    Nickname or id of the video uploader.
+    uploader_url:   Full URL to a personal webpage of the video uploader.
+    channel:        Full name of the channel the video is uploaded on.
+                    Note that channel fields may or may not repeat uploader
+                    fields. This depends on a particular extractor.
+    channel_id:     Id of the channel.
+    channel_url:    Full URL to a channel webpage.
+    location:       Physical location where the video was filmed.
+    subtitles:      The available subtitles as a dictionary in the format
+                    {tag: subformats}. "tag" is usually a language code, and
+                    "subformats" is a list sorted from lower to higher
+                    preference, each element is a dictionary with the "ext"
+                    entry and one of:
+                        * "data": The subtitles file contents
+                        * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
+                    (see the example sketch after this list)
+    automatic_captions: Like 'subtitles', used by the YoutubeIE for
+                    automatically generated captions
+    duration:       Length of the video in seconds, as an integer or float.
+    view_count:     How many users have watched the video on the platform.
+    like_count:     Number of positive ratings of the video
+    dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
+    average_rating: Average rating given by users; the scale used depends on the webpage
+    comment_count:  Number of comments on the video
+    comments:       A list of comments, each with one or more of the following
+                    properties (all but one of text or html optional):
+                        * "author" - human-readable name of the comment author
+                        * "author_id" - user ID of the comment author
+                        * "id" - Comment ID
+                        * "html" - Comment as HTML
+                        * "text" - Plain text of the comment
+                        * "timestamp" - UNIX timestamp of comment
+                        * "parent" - ID of the comment this one is replying to.
+                                     Set to "root" to indicate that this is a
+                                     comment to the original video.
+    age_limit:      Age restriction for the video, as an integer (years)
+    webpage_url:    The URL to the video webpage; if given to youtube-dlc, it
+                    should allow getting the same result again. (It will be set
+                    by YoutubeDL if it's missing.)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
+    is_live:        True, False, or None (=unknown). Whether this video is a
+                    live stream that goes on instead of a fixed-length video.
+    start_time:     Time in seconds where the reproduction should start, as
+                    specified in the URL.
+    end_time:       Time in seconds where the reproduction should end, as
+                    specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)
+
+    The following fields should only be used when the video belongs to some logical
+    chapter or section:
+
+    chapter:        Name or title of the chapter the video belongs to.
+    chapter_number: Number of the chapter the video belongs to, as an integer.
+    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
+
+    The following fields should only be used when the video is an episode of some
+    series, programme or podcast:
+
+    series:         Title of the series or programme the video episode belongs to.
+    season:         Title of the season the video episode belongs to.
+    season_number:  Number of the season the video episode belongs to, as an integer.
+    season_id:      Id of the season the video episode belongs to, as a unicode string.
+    episode:        Title of the video episode. Unlike mandatory video title field,
+                    this field should denote the exact title of the video episode
+                    without any kind of decoration.
+    episode_number: Number of the video episode within a season, as an integer.
+    episode_id:     Id of the video episode, as a unicode string.
+
+    The following fields should only be used when the media is a track or a part of
+    a music album:
+
+    track:          Title of the track.
+    track_number:   Number of the track within an album or a disc, as an integer.
+    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
+                    as a unicode string.
+    artist:         Artist(s) of the track.
+    genre:          Genre(s) of the track.
+    album:          Title of the album the track belongs to.
+    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
+    album_artist:   List of all artists who appeared on the album (e.g.
+                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
+                    and compilations).
+    disc_number:    Number of the disc or other physical medium the track belongs to,
+                    as an integer.
+    release_year:   Year (YYYY) when the album was released.
+
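+    To illustrate the shape of the structured optional fields above (the
+    subtitles and chapters entries in particular; all values invented), a
+    result might for instance contain:
+
+        'subtitles': {'en': [{'ext': 'vtt', 'url': 'https://example.com/en.vtt'}]},
+        'chapters': [{'start_time': 0, 'end_time': 60, 'title': 'Intro'}],
+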
+    Unless mentioned otherwise, the fields should be Unicode strings.
+
+    Unless mentioned otherwise, None is equivalent to absence of information.
+
+
+    _type "playlist" indicates multiple videos.
+    There must be a key "entries", which is a list, an iterable, or a PagedList
+    object, each element of which is a valid dictionary by this specification.
+
+    Additionally, playlists can have "id", "title", "description", "uploader",
+    "uploader_id", "uploader_url" attributes with the same semantics as videos
+    (see above).
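+
+    A minimal sketch of a playlist result (the entries and field values are
+    illustrative only):
+
+        {
+            '_type': 'playlist',
+            'id': '42',
+            'title': 'Some playlist',
+            'entries': [entry1, entry2],
+        }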
+
+
+    _type "multi_video" indicates that there are multiple videos that
+    form a single show, for examples multiple acts of an opera or TV episode.
+    It must have an entries key like a playlist and contain all the keys
+    required for a video at the same time.
+
+
+    _type "url" indicates that the video must be extracted from another
+    location, possibly by a different extractor. Its only required key is:
+    "url" - the next URL to extract.
+    The key "ie_key" can be set to the class name (minus the trailing "IE",
+    e.g. "Youtube") if the extractor class is known in advance.
+    Additionally, the dictionary may have any properties of the resolved entity
+    known in advance, for example "title" if the title of the referred video is
+    known ahead of time.
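+
+    A minimal sketch of a "url" result (the URL and ie_key value are
+    illustrative only):
+
+        {
+            '_type': 'url',
+            'url': 'https://example.com/some/video',
+            'ie_key': 'Generic',
+        }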
+
+
+    _type "url_transparent" entities have the same specification as "url", but
+    indicate that the given additional information is more precise than the one
+    associated with the resolved URL.
+    This is useful when a site employs a video service that hosts the video and
+    its technical metadata, but that video service does not embed a useful
+    title, description etc.
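+
+    A minimal sketch of a "url_transparent" result (all values are
+    illustrative only):
+
+        {
+            '_type': 'url_transparent',
+            'url': 'https://player.example.com/embed/123',
+            'title': 'Title taken from the embedding page',
+        }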
+
+
+    Subclasses of this one should re-define the _real_initialize() and
+    _real_extract() methods and define a _VALID_URL regexp.
+    In most cases, they should also be added to the list of extractors.
+
+    The _GEO_BYPASS attribute may be set to False in order to disable
+    geo restriction bypass mechanisms for a particular extractor.
+    Note that this does not disable explicit geo restriction bypass based on
+    the country code provided with geo_bypass_country.
+
+    The _GEO_COUNTRIES attribute may contain a list of presumably geo
+    unrestricted countries for this extractor. One of these countries will be
+    used by the geo restriction bypass mechanism right away in order to bypass
+    geo restriction, provided the mechanism is not disabled.
+
+    The _GEO_IP_BLOCKS attribute may contain a list of presumably geo
+    unrestricted IP blocks in CIDR notation for this extractor. One of these
+    IP blocks will be used by the geo restriction bypass mechanism similarly
+    to _GEO_COUNTRIES.
+
+    Finally, the _WORKING attribute should be set to False for broken IEs
+    in order to warn the users and skip the tests.
+    """
+
+    _ready = False
+    _downloader = None
+    _x_forwarded_for_ip = None
+    _GEO_BYPASS = True
+    _GEO_COUNTRIES = None
+    _GEO_IP_BLOCKS = None
+    _WORKING = True
+
+    def __init__(self, downloader=None):
+        """Constructor. Receives an optional downloader."""
+        self._ready = False
+        self._x_forwarded_for_ip = None
+        self.set_downloader(downloader)
+
+    @classmethod
+    def suitable(cls, url):
+        """Receives a URL and returns True if suitable for this IE."""
+
+        # This does not use has/getattr intentionally - we want to know whether
+        # we have cached the regexp for *this* class, whereas getattr would also
+        # match the superclass
+        if '_VALID_URL_RE' not in cls.__dict__:
+            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+        return cls._VALID_URL_RE.match(url) is not None
+
+    @classmethod
+    def _match_id(cls, url):
+        if '_VALID_URL_RE' not in cls.__dict__:
+            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+        m = cls._VALID_URL_RE.match(url)
+        assert m
+        return compat_str(m.group('id'))
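+
+    # For example, with a hypothetical pattern (not taken from any real
+    # extractor):
+    #     _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)'
+    # _match_id('https://example.com/watch/42') would then return '42'.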
+
+    @classmethod
+    def working(cls):
+        """Getter method for _WORKING."""
+        return cls._WORKING
+
+    def initialize(self):
+        """Initializes an instance (authentication, etc)."""
+        self._initialize_geo_bypass({
+            'countries': self._GEO_COUNTRIES,
+            'ip_blocks': self._GEO_IP_BLOCKS,
+        })
+        if not self._ready:
+            self._real_initialize()
+            self._ready = True
+
+    def _initialize_geo_bypass(self, geo_bypass_context):
+        """
+        Initialize geo restriction bypass mechanism.
+
+        This method is used to initialize the geo bypass mechanism based on
+        faking the X-Forwarded-For HTTP header. A random country from the
+        provided country list is selected and a random IP belonging to this
+        country is generated. This IP will be passed as the X-Forwarded-For
+        HTTP header in all subsequent HTTP requests.
+
+        This method will be used for initial geo bypass mechanism initialization
+        during the instance initialization with _GEO_COUNTRIES and
+        _GEO_IP_BLOCKS.
+
+        You may also manually call it from an extractor's code if geo bypass
+        information is not available beforehand (e.g. obtained during
+        extraction) or for some other reason. In this case you should pass
+        this information in the geo bypass context passed as the first
+        argument. It may contain the following fields:
+
+        countries:  List of geo unrestricted countries (similar
+                    to _GEO_COUNTRIES)
+        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
+                    (similar to _GEO_IP_BLOCKS)
+
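+        For example (an illustrative call; the country codes and the IP
+        block are made up, and 192.0.2.0/24 is a documentation range):
+
+            self._initialize_geo_bypass({
+                'countries': ['DE', 'AT', 'CH'],
+                'ip_blocks': ['192.0.2.0/24'],
+            })
+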
+        """
+        if not self._x_forwarded_for_ip:
+
+            # Geo bypass mechanism is explicitly disabled by user
+            if not self._downloader.params.get('geo_bypass', True):
+                return
+
+            if not geo_bypass_context:
+                geo_bypass_context = {}
+
+            # Backward compatibility: previously _initialize_geo_bypass
+            # expected a list of countries, some 3rd party code may still use
+            # it this way
+            if isinstance(geo_bypass_context, (list, tuple)):
+                geo_bypass_context = {
+                    'countries': geo_bypass_context,
+                }
+
+            # The whole point of the geo bypass mechanism is to fake the IP
+            # via the X-Forwarded-For HTTP header based on some IP block or
+            # country code.
+
+            # Path 1: bypassing based on IP block in CIDR notation
+
+            # Explicit IP block specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
+
+            # Otherwise use random IP block from geo bypass context but only
+            # if extractor is known as geo bypassable
+            if not ip_block:
+                ip_blocks = geo_bypass_context.get('ip_blocks')
+                if self._GEO_BYPASS and ip_blocks:
+                    ip_block = random.choice(ip_blocks)
+
+            if ip_block:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+                if self._downloader.params.get('verbose', False):
+                    self._downloader.to_screen(
+                        '[debug] Using fake IP %s as X-Forwarded-For.'
+                        % self._x_forwarded_for_ip)
+                return
+
+            # Path 2: bypassing based on country code
+
+            # Explicit country code specified by user, use it right away
+            # regardless of whether extractor is geo bypassable or not
+            country = self._downloader.params.get('geo_bypass_country', None)
+
+            # Otherwise use random country code from geo bypass context but
+            # only if extractor is known as geo bypassable
+            if not country:
+                countries = geo_bypass_context.get('countries')
+                if self._GEO_BYPASS and countries:
+                    country = random.choice(countries)
+
+            if country:
+                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
+                if self._downloader.params.get('verbose', False):
+                    self._downloader.to_screen(
+                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
+                        % (self._x_forwarded_for_ip, country.upper()))
+
+    def extract(self, url):
+        """Extracts URL information and returns it in list of dicts."""
+        try:
+            for _ in range(2):
+                try:
+                    self.initialize()
+                    ie_result = self._real_extract(url)
+                    if self._x_forwarded_for_ip:
+                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
+                    return ie_result
+                except GeoRestrictedError as e:
+                    if self.__maybe_fake_ip_and_retry(e.countries):
+                        continue
+                    raise
+        except ExtractorError:
+            raise
+        except compat_http_client.IncompleteRead as e:
+            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
+        except (KeyError, StopIteration) as e:
+            raise ExtractorError('An extractor error has occurred.', cause=e)
+
+    def __maybe_fake_ip_and_retry(self, countries):
+        if (not self._downloader.params.get('geo_bypass_country', None)
+                and self._GEO_BYPASS
+                and self._downloader.params.get('geo_bypass', True)
+                and not self._x_forwarded_for_ip
+                and countries):
+            country_code = random.choice(countries)
+            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
+            if self._x_forwarded_for_ip:
+                self.report_warning(
+                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
+                    % (self._x_forwarded_for_ip, country_code.upper()))
+                return True
+        return False
+
+    def set_downloader(self, downloader):
+        """Sets the downloader for this IE."""
+        self._downloader = downloader
+
+    def _real_initialize(self):
+        """Real initialization process. Redefine in subclasses."""
+        pass
+
+    def _real_extract(self, url):
+        """Real extraction process. Redefine in subclasses."""
+        pass
+
+    @classmethod
+    def ie_key(cls):
+        """A string for getting the InfoExtractor with get_info_extractor"""
+        return compat_str(cls.__name__[:-2])
+
+    @property
+    def IE_NAME(self):
+        return compat_str(type(self).__name__[:-2])
+
+    @staticmethod
+    def __can_accept_status_code(err, expected_status):
+        assert isinstance(err, compat_urllib_error.HTTPError)
+        if expected_status is None:
+            return False
+        if isinstance(expected_status, compat_integer_types):
+            return err.code == expected_status
+        elif isinstance(expected_status, (list, tuple)):
+            return err.code in expected_status
+        elif callable(expected_status):
+            return expected_status(err.code) is True
+        else:
+            assert False
+
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
+        """
+        Return the response handle.
+
+        See _download_webpage docstring for arguments specification.
+        """
+        if note is None:
+            self.report_download_webpage(video_id)
+        elif note is not False:
+            if video_id is None:
+                self.to_screen('%s' % (note,))
+            else:
+                self.to_screen('%s: %s' % (video_id, note))
+
+        # Some sites check the X-Forwarded-For HTTP header in order to figure
+        # out the origin of the client behind a proxy. This allows bypassing
+        # geo restriction by faking this header's value to an IP that belongs
+        # to some geo unrestricted country. We will do so once we encounter
+        # any geo restriction error.
+        if self._x_forwarded_for_ip:
+            if 'X-Forwarded-For' not in headers:
+                headers['X-Forwarded-For'] = self._x_forwarded_for_ip
+
+        if isinstance(url_or_request, compat_urllib_request.Request):
+            url_or_request = update_Request(
+                url_or_request, data=data, headers=headers, query=query)
+        else:
+            if query:
+                url_or_request = update_url_query(url_or_request, query)
+            if data is not None or headers:
+                url_or_request = sanitized_Request(url_or_request, data, headers)
+        try:
+            return self._downloader.urlopen(url_or_request)
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            if isinstance(err, compat_urllib_error.HTTPError):
+                if self.__can_accept_status_code(err, expected_status):
+                    # Retain reference to error to prevent file object from
+                    # being closed before it can be read. Works around the
+                    # effects of <https://bugs.python.org/issue15002>
+                    # introduced in Python 3.4.1.
+                    err.fp._error = err
+                    return err.fp
+
+            if errnote is False:
+                return False
+            if errnote is None:
+                errnote = 'Unable to download webpage'
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
+            if fatal:
+                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
+            else:
+                self._downloader.report_warning(errmsg)
+                return False
+
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+        """
+        Return a tuple (page content as a string, URL handle).
+
+        See _download_webpage docstring for arguments specification.
+        """
+        # Strip hashes from the URL (#1038)
+        if isinstance(url_or_request, (compat_str, str)):
+            url_or_request = url_or_request.partition('#')[0]
+
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
+        if urlh is False:
+            assert not fatal
+            return False
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        return (content, urlh)
+
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
+        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
+        if m:
+            encoding = m.group(1)
+        else:
+            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
+                          webpage_bytes[:1024])
+            if m:
+                encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
+            else:
+                encoding = 'utf-8'
+
+        return encoding
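+
+    # For example, a Content-Type of 'text/html; charset=iso-8859-1' would
+    # yield 'iso-8859-1' (an illustrative header value, not from a real site).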
+
+    def __check_blocked(self, content):
+        first_block = content[:512]
+        if ('<title>Access to this site is blocked</title>' in content
+                and 'Websense' in first_block):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                'Websense information URL', default=None)
+            if blocked_iframe:
+                msg += ' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+        if '<title>The URL you requested has been blocked</title>' in first_block:
+            msg = (
+                'Access to this webpage has been blocked by Indian censorship. '
+                'Use a VPN or proxy server (with --proxy) to route around it.')
+            block_msg = self._html_search_regex(
+                r'</h1><p>(.*?)</p>',
+                content, 'block message', default=None)
+            if block_msg:
+                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+            raise ExtractorError(msg, expected=True)
+        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
+                and 'blocklist.rkn.gov.ru' in content):
+            raise ExtractorError(
+                'Access to this webpage has been blocked by decision of the Russian government. '
+                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+                expected=True)
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
+        if self._downloader.params.get('dump_intermediate_pages', False):
+            self.to_screen('Dumping request to ' + urlh.geturl())
+            dump = base64.b64encode(webpage_bytes).decode('ascii')
+            self._downloader.to_screen(dump)
+        if self._downloader.params.get('write_pages', False):
+            basen = '%s_%s' % (video_id, urlh.geturl())
+            if len(basen) > 240:
+                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+                basen = basen[:240 - len(h)] + h
+            raw_filename = basen + '.dump'
+            filename = sanitize_filename(raw_filename, restricted=True)
+            self.to_screen('Saving request to ' + filename)
+            # Working around MAX_PATH limitation on Windows (see
+            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+            if compat_os_name == 'nt':
+                absfilepath = os.path.abspath(filename)
+                if len(absfilepath) > 259:
+                    filename = '\\\\?\\' + absfilepath
+            with open(filename, 'wb') as outf:
+                outf.write(webpage_bytes)
+
+        try:
+            content = webpage_bytes.decode(encoding, 'replace')
+        except LookupError:
+            content = webpage_bytes.decode('utf-8', 'replace')
+
+        self.__check_blocked(content)
+
+        return content
+
+    def _download_webpage(
+            self, url_or_request, video_id, note=None, errnote=None,
+            fatal=True, tries=1, timeout=5, encoding=None, data=None,
+            headers={}, query={}, expected_status=None):
+        """
+        Return the data of the page as a string.
+
+        Arguments:
+        url_or_request -- plain text URL as a string or
+            a compat_urllib_request.Request object
+        video_id -- Video/playlist/item identifier (string)
+
+        Keyword arguments:
+        note -- note printed before downloading (string)
+        errnote -- note printed in case of an error (string)
+        fatal -- flag denoting whether an error should be considered fatal,
+            i.e. whether it should cause ExtractorError to be raised;
+            otherwise a warning will be reported and extraction will continue
+        tries -- number of tries
+        timeout -- sleep interval between tries
+        encoding -- encoding used to decode the page content, guessed
+            automatically when not explicitly specified
+        data -- POST data (bytes)
+        headers -- HTTP headers (dict)
+        query -- URL query (dict)
+        expected_status -- allows accepting failed HTTP requests (non-2xx
+            status codes) by explicitly specifying a set of accepted status
+            codes. Can be any of the following entities:
+                - an integer type specifying an exact failed status code to
+                  accept
+                - a list or a tuple of integer types specifying a list of
+                  failed status codes to accept
+                - a callable accepting an actual failed status code and
+                  returning True if it should be accepted
+            Note that this argument does not affect success status codes (2xx)
+            which are always accepted.
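+
+        For example, to also accept a 404 response and still read its body
+        (an illustrative call; the URL and video_id come from the caller):
+
+            webpage = self._download_webpage(
+                url, video_id, expected_status=404)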
+        """
+
+        success = False
+        try_count = 0
+        while success is False:
+            try:
+                res = self._download_webpage_handle(
+                    url_or_request, video_id, note, errnote, fatal,
+                    encoding=encoding, data=data, headers=headers, query=query,
+                    expected_status=expected_status)
+                success = True
+            except compat_http_client.IncompleteRead as e:
+                try_count += 1
+                if try_count >= tries:
+                    raise e
+                self._sleep(timeout, video_id)
+        if res is False:
+            return res
+        else:
+            content, _ = res
+            return content
+
+    def _download_xml_handle(
+            self, url_or_request, video_id, note='Downloading XML',
+            errnote='Unable to download XML', transform_source=None,
+            fatal=True, encoding=None, data=None, headers={}, query={},
+            expected_status=None):
+        """
+        Return a tuple (XML as a compat_etree_Element, URL handle).
+
+        See _download_webpage docstring for arguments specification.
+        """
+        res = self._download_webpage_handle(
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding, data=data, headers=headers, query=query,
+            expected_status=expected_status)
+        if res is False:
+            return res
+        xml_string, urlh = res
+        return self._parse_xml(
+            xml_string, video_id, transform_source=transform_source,
+            fatal=fatal), urlh
+
+    def _download_xml(
+            self, url_or_request, video_id,
+            note='Downloading XML', errnote='Unable to download XML',
+            transform_source=None, fatal=True, encoding=None,
+            data=None, headers={}, query={}, expected_status=None):
+        """
+        Return the XML as a compat_etree_Element.
+
+        See _download_webpage docstring for arguments specification.
+        """
+        res = self._download_xml_handle(
+            url_or_request, video_id, note=note, errnote=errnote,
+            transform_source=transform_source, fatal=fatal, encoding=encoding,
+            data=data, headers=headers, query=query,
+            expected_status=expected_status)
+        return res if res is False else res[0]
+
+    def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
+        if transform_source:
+            xml_string = transform_source(xml_string)
+        try:
+            return compat_etree_fromstring(xml_string.encode('utf-8'))
+        except compat_xml_parse_error as ve:
+            errmsg = '%s: Failed to parse XML ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
+
+    def _download_json_handle(
+            self, url_or_request, video_id, note='Downloading JSON metadata',
+            errnote='Unable to download JSON metadata', transform_source=None,
+            fatal=True, encoding=None, data=None, headers={}, query={},
+            expected_status=None):
+        """
+        Return a tuple (JSON object, URL handle).
+
+        See _download_webpage docstring for arguments specification.
+        """
+        res = self._download_webpage_handle(
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding, data=data, headers=headers, query=query,
+            expected_status=expected_status)
+        if res is False:
+            return res
+        json_string, urlh = res
+        return self._parse_json(
+            json_string, video_id, transform_source=transform_source,
+            fatal=fatal), urlh
+
+    def _download_json(
+            self, url_or_request, video_id, note='Downloading JSON metadata',
+            errnote='Unable to download JSON metadata', transform_source=None,
+            fatal=True, encoding=None, data=None, headers={}, query={},
+            expected_status=None):
+        """
+        Return the JSON object as a dict.
+
+        See _download_webpage docstring for arguments specification.
+        """
+        res = self._download_json_handle(
+            url_or_request, video_id, note=note, errnote=errnote,
+            transform_source=transform_source, fatal=fatal, encoding=encoding,
+            data=data, headers=headers, query=query,
+            expected_status=expected_status)
+        return res if res is False else res[0]
+
+    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
+        if transform_source:
+            json_string = transform_source(json_string)
+        try:
+            return json.loads(json_string)
+        except ValueError as ve:
+            errmsg = '%s: Failed to parse JSON ' % video_id
+            if fatal:
+                raise ExtractorError(errmsg, cause=ve)
+            else:
+                self.report_warning(errmsg + str(ve))
+
+    def report_warning(self, msg, video_id=None):
+        idstr = '' if video_id is None else '%s: ' % video_id
+        self._downloader.report_warning(
+            '[%s] %s%s' % (self.IE_NAME, idstr, msg))
+
+    def to_screen(self, msg):
+        """Print msg to screen, prefixing it with '[ie_name]'"""
+        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
+
+    def report_extraction(self, id_or_name):
+        """Report information extraction."""
+        self.to_screen('%s: Extracting information' % id_or_name)
+
+    def report_download_webpage(self, video_id):
+        """Report webpage download."""
+        self.to_screen('%s: Downloading webpage' % video_id)
+
+    def report_age_confirmation(self):
+        """Report attempt to confirm age."""
+        self.to_screen('Confirming age')
+
+    def report_login(self):
+        """Report attempt to log in."""
+        self.to_screen('Logging in')
+
+    @staticmethod
+    def raise_login_required(msg='This video is only available for registered users'):
+        raise ExtractorError(
+            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+            expected=True)
+
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
+        raise GeoRestrictedError(msg, countries=countries)
+
+    # Methods for following #608
+    @staticmethod
+    def url_result(url, ie=None, video_id=None, video_title=None):
+        """Returns a URL that points to a page that should be processed"""
+        # TODO: ie should be the class used for getting the info
+        video_info = {'_type': 'url',
+                      'url': url,
+                      'ie_key': ie}
+        if video_id is not None:
+            video_info['id'] = video_id
+        if video_title is not None:
+            video_info['title'] = video_title
+        return video_info
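+
+    # Illustrative usage from an extractor's _real_extract (the URL and ie
+    # values are made up):
+    #     return self.url_result('https://example.com/video/42', ie='Generic')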
+
+    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
+        urls = orderedSet(
+            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
+            for m in matches)
+        return self.playlist_result(
+            urls, playlist_id=playlist_id, playlist_title=playlist_title)
+
+    @staticmethod
+    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
+        """Returns a playlist"""
+        video_info = {'_type': 'playlist',
+                      'entries': entries}
+        if playlist_id:
+            video_info['id'] = playlist_id
+        if playlist_title:
+            video_info['title'] = playlist_title
+        if playlist_description:
+            video_info['description'] = playlist_description
+        return video_info
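+
+    # Illustrative usage (entries would typically be built via url_result or
+    # be full info dicts):
+    #     return self.playlist_result(entries, playlist_id, playlist_title)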
+
+    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
+        """
+        Perform a regex search on the given string, using a single pattern or
+        a list of patterns, returning the first matching group.
+        In case of failure, return a default value, report a warning or raise
+        a RegexNotFoundError, depending on fatal, specifying the field name.
+        """
+        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+            mobj = re.search(pattern, string, flags)
+        else:
+            for p in pattern:
+                mobj = re.search(p, string, flags)
+                if mobj:
+                    break
+
+        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
+            _name = '\033[0;34m%s\033[0m' % name
+        else:
+            _name = name
+
+        if mobj:
+            if group is None:
+                # return the first matching group
+                return next(g for g in mobj.groups() if g is not None)
+            else:
+                return mobj.group(group)
+        elif default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            raise RegexNotFoundError('Unable to extract %s' % _name)
+        else:
+            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
+            return None
+
+    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
+        """
+        Like _search_regex, but strips HTML tags and unescapes entities.
+        """
+        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
+        if res:
+            return clean_html(res).strip()
+        else:
+            return res
+
+    def _get_netrc_login_info(self, netrc_machine=None):
+        username = None
+        password = None
+        netrc_machine = netrc_machine or self._NETRC_MACHINE
+
+        if self._downloader.params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(netrc_machine)
+                if info is not None:
+                    username = info[0]
+                    password = info[2]
+                else:
+                    raise netrc.NetrcParseError(
+                        'No authenticators for %s' % netrc_machine)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning(
+                    'parsing .netrc: %s' % error_to_compat_str(err))
+
+        return username, password
+
+    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
+        """
+        Get the login info as (username, password).
+        First look for manually specified credentials using username_option
+        and password_option as keys in the params dictionary. If no such
+        credentials are available, look in the netrc file using the
+        netrc_machine or _NETRC_MACHINE value.
+        If there's no info available, return (None, None).
+        """
+        if self._downloader is None:
+            return (None, None)
+
+        downloader_params = self._downloader.params
+
+        # Attempt to use provided username and password or .netrc data
+        if downloader_params.get(username_option) is not None:
+            username = downloader_params[username_option]
+            password = downloader_params[password_option]
+        else:
+            username, password = self._get_netrc_login_info(netrc_machine)
+
+        return username, password
+
+    def _get_tfa_info(self, note='two-factor verification code'):
+        """
+        Get the two-factor authentication info.
+        TODO: asking the user will be required for SMS/phone verification;
+        currently this just uses the command line option.
+        If there's no info available, return None.
+        """
+        if self._downloader is None:
+            return None
+        downloader_params = self._downloader.params
+
+        if downloader_params.get('twofactor') is not None:
+            return downloader_params['twofactor']
+
+        return compat_getpass('Type %s and press [Return]: ' % note)
+
+    # Helper functions for extracting OpenGraph info
+    @staticmethod
+    def _og_regexes(prop):
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
+        template = r'<meta[^>]+?%s[^>]+?%s'
+        return [
+            template % (property_re, content_re),
+            template % (content_re, property_re),
+        ]
+
+    @staticmethod
+    def _meta_regex(prop):
+        return r'''(?isx)<meta
+                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
+                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
+    def _og_search_property(self, prop, html, name=None, **kargs):
+        if not isinstance(prop, (list, tuple)):
+            prop = [prop]
+        if name is None:
+            name = 'OpenGraph %s' % prop[0]
+        og_regexes = []
+        for p in prop:
+            og_regexes.extend(self._og_regexes(p))
+        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
+        if escaped is None:
+            return None
+        return unescapeHTML(escaped)
+
+    def _og_search_thumbnail(self, html, **kargs):
+        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
+
+    def _og_search_description(self, html, **kargs):
+        return self._og_search_property('description', html, fatal=False, **kargs)
+
+    def _og_search_title(self, html, **kargs):
+        return self._og_search_property('title', html, **kargs)
+
+    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
+        regexes = self._og_regexes('video') + self._og_regexes('video:url')
+        if secure:
+            regexes = self._og_regexes('video:secure_url') + regexes
+        return self._html_search_regex(regexes, html, name, **kargs)
+
+    def _og_search_url(self, html, **kargs):
+        return self._og_search_property('url', html, **kargs)
+
+    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+        if not isinstance(name, (list, tuple)):
+            name = [name]
+        if display_name is None:
+            display_name = name[0]
+        return self._html_search_regex(
+            [self._meta_regex(n) for n in name],
+            html, display_name, fatal=fatal, group='content', **kwargs)
+
+    def _dc_search_uploader(self, html):
+        return self._html_search_meta('dc.creator', html, 'uploader')
+
+    def _rta_search(self, html):
+        # See http://www.rtalabel.org/index.php?content=howtofaq#single
+        if re.search(r'(?ix)<meta\s+name="rating"\s+'
+                     r'     content="RTA-5042-1996-1400-1577-RTA"',
+                     html):
+            return 18
+        return 0
+
+    def _media_rating_search(self, html):
+        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+        rating = self._html_search_meta('rating', html)
+
+        if not rating:
+            return None
+
+        RATING_TABLE = {
+            'safe for kids': 0,
+            'general': 8,
+            '14 years': 14,
+            'mature': 17,
+            'restricted': 19,
+        }
+        return RATING_TABLE.get(rating.lower())
+
+    def _family_friendly_search(self, html):
+        # See http://schema.org/VideoObject
+        family_friendly = self._html_search_meta(
+            'isFamilyFriendly', html, default=None)
+
+        if not family_friendly:
+            return None
+
+        RATING_TABLE = {
+            '1': 0,
+            'true': 0,
+            '0': 18,
+            'false': 18,
+        }
+        return RATING_TABLE.get(family_friendly.lower())
+
+    def _twitter_search_player(self, html):
+        return self._html_search_meta('twitter:player', html,
+                                      'twitter card player')
+
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
+        json_ld_list = list(re.finditer(JSON_LD_RE, html))
+        default = kwargs.get('default', NO_DEFAULT)
+        # JSON-LD may be malformed and thus `fatal` should be respected.
+        # At the same time, passing `default` implies `fatal=False` for
+        # _search_regex, so let's simulate the same behavior here as well.
+        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
+        json_ld = []
+        for mobj in json_ld_list:
+            json_ld_item = self._parse_json(
+                mobj.group('json_ld'), video_id, fatal=fatal)
+            if not json_ld_item:
+                continue
+            if isinstance(json_ld_item, dict):
+                json_ld.append(json_ld_item)
+            elif isinstance(json_ld_item, (list, tuple)):
+                json_ld.extend(json_ld_item)
+        if json_ld:
+            json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+        if json_ld:
+            return json_ld
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            raise RegexNotFoundError('Unable to extract JSON-LD')
+        else:
+            self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+            return {}
+
+    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
+        if isinstance(json_ld, compat_str):
+            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
+        if not json_ld:
+            return {}
+        info = {}
+        if not isinstance(json_ld, (list, tuple, dict)):
+            return info
+        if isinstance(json_ld, dict):
+            json_ld = [json_ld]
+
+        INTERACTION_TYPE_MAP = {
+            'CommentAction': 'comment',
+            'AgreeAction': 'like',
+            'DisagreeAction': 'dislike',
+            'LikeAction': 'like',
+            'DislikeAction': 'dislike',
+            'ListenAction': 'view',
+            'WatchAction': 'view',
+            'ViewAction': 'view',
+        }
+
+        def extract_interaction_statistic(e):
+            interaction_statistic = e.get('interactionStatistic')
+            if not isinstance(interaction_statistic, list):
+                return
+            for is_e in interaction_statistic:
+                if not isinstance(is_e, dict):
+                    continue
+                if is_e.get('@type') != 'InteractionCounter':
+                    continue
+                interaction_type = is_e.get('interactionType')
+                if not isinstance(interaction_type, compat_str):
+                    continue
+                interaction_count = int_or_none(is_e.get('userInteractionCount'))
+                if interaction_count is None:
+                    continue
+                count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
+                if not count_kind:
+                    continue
+                count_key = '%s_count' % count_kind
+                if info.get(count_key) is not None:
+                    continue
+                info[count_key] = interaction_count
+
+        def extract_video_object(e):
+            assert e['@type'] == 'VideoObject'
+            info.update({
+                'url': url_or_none(e.get('contentUrl')),
+                'title': unescapeHTML(e.get('name')),
+                'description': unescapeHTML(e.get('description')),
+                'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
+                'duration': parse_duration(e.get('duration')),
+                'timestamp': unified_timestamp(e.get('uploadDate')),
+                'filesize': float_or_none(e.get('contentSize')),
+                'tbr': int_or_none(e.get('bitrate')),
+                'width': int_or_none(e.get('width')),
+                'height': int_or_none(e.get('height')),
+                'view_count': int_or_none(e.get('interactionCount')),
+            })
+            extract_interaction_statistic(e)
+
+        for e in json_ld:
+            if '@context' in e:
+                item_type = e.get('@type')
+                if expected_type is not None and expected_type != item_type:
+                    continue
+                if item_type in ('TVEpisode', 'Episode'):
+                    episode_name = unescapeHTML(e.get('name'))
+                    info.update({
+                        'episode': episode_name,
+                        'episode_number': int_or_none(e.get('episodeNumber')),
+                        'description': unescapeHTML(e.get('description')),
+                    })
+                    if not info.get('title') and episode_name:
+                        info['title'] = episode_name
+                    part_of_season = e.get('partOfSeason')
+                    if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
+                        info.update({
+                            'season': unescapeHTML(part_of_season.get('name')),
+                            'season_number': int_or_none(part_of_season.get('seasonNumber')),
+                        })
+                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
+                    if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
+                        info['series'] = unescapeHTML(part_of_series.get('name'))
+                elif item_type == 'Movie':
+                    info.update({
+                        'title': unescapeHTML(e.get('name')),
+                        'description': unescapeHTML(e.get('description')),
+                        'duration': parse_duration(e.get('duration')),
+                        'timestamp': unified_timestamp(e.get('dateCreated')),
+                    })
+                elif item_type in ('Article', 'NewsArticle'):
+                    info.update({
+                        'timestamp': parse_iso8601(e.get('datePublished')),
+                        'title': unescapeHTML(e.get('headline')),
+                        'description': unescapeHTML(e.get('articleBody')),
+                    })
+                elif item_type == 'VideoObject':
+                    extract_video_object(e)
+                    if expected_type is None:
+                        continue
+                    else:
+                        break
+                video = e.get('video')
+                if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+                    extract_video_object(video)
+                if expected_type is None:
+                    continue
+                else:
+                    break
+        return dict((k, v) for k, v in info.items() if v is not None)
+
+    @staticmethod
+    def _hidden_inputs(html):
+        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+        hidden_inputs = {}
+        for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
+            attrs = extract_attributes(input_el)
+            if not attrs:
+                continue
+            if attrs.get('type') not in ('hidden', 'submit'):
+                continue
+            name = attrs.get('name') or attrs.get('id')
+            value = attrs.get('value')
+            if name and value is not None:
+                hidden_inputs[name] = value
+        return hidden_inputs
+
+    def _form_hidden_inputs(self, form_id, html):
+        form = self._search_regex(
+            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            html, '%s form' % form_id, group='form')
+        return self._hidden_inputs(form)
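+
+    # Illustrative sketch (the form id and field are made up): given
+    #     <form id="login"><input type="hidden" name="token" value="abc"></form>
+    # _form_hidden_inputs('login', webpage) would return {'token': 'abc'}.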
+
+    def _sort_formats(self, formats, field_preference=None):
+        if not formats:
+            raise ExtractorError('No video formats found')
+
+        for f in formats:
+            # Automatically determine tbr when missing based on abr and vbr (improves
+            # formats sorting in some cases)
+            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
+                f['tbr'] = f['abr'] + f['vbr']
+
+        def _formats_key(f):
+            # TODO remove the following workaround
+            from ..utils import determine_ext
+            if not f.get('ext') and 'url' in f:
+                f['ext'] = determine_ext(f['url'])
+
+            if isinstance(field_preference, (list, tuple)):
+                return tuple(
+                    f.get(field)
+                    if f.get(field) is not None
+                    else ('' if field == 'format_id' else -1)
+                    for field in field_preference)
+
+            preference = f.get('preference')
+            if preference is None:
+                preference = 0
+                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
+                    preference -= 0.5
+
+            protocol = f.get('protocol') or determine_protocol(f)
+            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
+
+            if f.get('vcodec') == 'none':  # audio only
+                preference -= 50
+                if self._downloader.params.get('prefer_free_formats'):
+                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
+                else:
+                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
+                ext_preference = 0
+                try:
+                    audio_ext_preference = ORDER.index(f['ext'])
+                except ValueError:
+                    audio_ext_preference = -1
+            else:
+                if f.get('acodec') == 'none':  # video only
+                    preference -= 40
+                if self._downloader.params.get('prefer_free_formats'):
+                    ORDER = ['flv', 'mp4', 'webm']
+                else:
+                    ORDER = ['webm', 'flv', 'mp4']
+                try:
+                    ext_preference = ORDER.index(f['ext'])
+                except ValueError:
+                    ext_preference = -1
+                audio_ext_preference = 0
+
+            return (
+                preference,
+                f.get('language_preference') if f.get('language_preference') is not None else -1,
+                f.get('quality') if f.get('quality') is not None else -1,
+                f.get('tbr') if f.get('tbr') is not None else -1,
+                f.get('filesize') if f.get('filesize') is not None else -1,
+                f.get('vbr') if f.get('vbr') is not None else -1,
+                f.get('height') if f.get('height') is not None else -1,
+                f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
+                ext_preference,
+                f.get('abr') if f.get('abr') is not None else -1,
+                audio_ext_preference,
+                f.get('fps') if f.get('fps') is not None else -1,
+                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+                f.get('source_preference') if f.get('source_preference') is not None else -1,
+                f.get('format_id') if f.get('format_id') is not None else '',
+            )
+        formats.sort(key=_formats_key)
+
+    def _check_formats(self, formats, video_id):
+        if formats:
+            formats[:] = filter(
+                lambda f: self._is_valid_url(
+                    f['url'], video_id,
+                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
+                formats)
+
+    @staticmethod
+    def _remove_duplicate_formats(formats):
+        format_urls = set()
+        unique_formats = []
+        for f in formats:
+            if f['url'] not in format_urls:
+                format_urls.add(f['url'])
+                unique_formats.append(f)
+        formats[:] = unique_formats
+
+    def _is_valid_url(self, url, video_id, item='video', headers={}):
+        url = self._proto_relative_url(url, scheme='http:')
+        # For now, assume non-HTTP(S) URLs are always valid
+        if not (url.startswith('http://') or url.startswith('https://')):
+            return True
+        try:
+            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
+            return True
+        except ExtractorError:
+            self.to_screen(
+                '%s: %s URL is invalid, skipping' % (video_id, item))
+            return False
+
+    def http_scheme(self):
+        """ Either "http:" or "https:", depending on the user's preferences """
+        return (
+            'http:'
+            if self._downloader.params.get('prefer_insecure', False)
+            else 'https:')
+
+    def _proto_relative_url(self, url, scheme=None):
+        if url is None:
+            return url
+        if url.startswith('//'):
+            if scheme is None:
+                scheme = self.http_scheme()
+            return scheme + url
+        else:
+            return url
+
+    def _sleep(self, timeout, video_id, msg_template=None):
+        if msg_template is None:
+            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
+        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
+        self.to_screen(msg)
+        time.sleep(timeout)
+
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
+        manifest = self._download_xml(
+            manifest_url, video_id, 'Downloading f4m manifest',
+            'Unable to download f4m manifest',
+            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
+            transform_source=transform_source,
+            fatal=fatal, data=data, headers=headers, query=query)
+
+        if manifest is False:
+            return []
+
+        return self._parse_f4m_formats(
+            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
+
+    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                           fatal=True, m3u8_id=None):
+        if not isinstance(manifest, compat_etree_Element) and not fatal:
+            return []
+
+        # currently youtube-dlc cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
+        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
+        if akamai_pv is not None and ';' in akamai_pv.text:
+            playerVerificationChallenge = akamai_pv.text.split(';')[0]
+            if playerVerificationChallenge.strip() != '':
+                return []
+
+        formats = []
+        manifest_version = '1.0'
+        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
+        if not media_nodes:
+            manifest_version = '2.0'
+            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        # Remove unsupported DRM protected media from final formats
+        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
+        media_nodes = remove_encrypted_media(media_nodes)
+        if not media_nodes:
+            return formats
+
+        manifest_base_url = get_base_url(manifest)
+
+        bootstrap_info = xpath_element(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
+            'bootstrap info', default=None)
+
+        vcodec = None
+        mime_type = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
+            'mime type', default=None)
+        if mime_type and mime_type.startswith('audio/'):
+            vcodec = 'none'
+
+        for i, media_el in enumerate(media_nodes):
+            tbr = int_or_none(media_el.attrib.get('bitrate'))
+            width = int_or_none(media_el.attrib.get('width'))
+            height = int_or_none(media_el.attrib.get('height'))
+            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+            # If <bootstrapInfo> is present, the specified f4m is a
+            # stream-level manifest, and only set-level manifests may refer to
+            # external resources.  See section 11.4 and section 4 of F4M spec
+            if bootstrap_info is None:
+                media_url = None
+                # @href is introduced in 2.0, see section 11.6 of F4M spec
+                if manifest_version == '2.0':
+                    media_url = media_el.attrib.get('href')
+                if media_url is None:
+                    media_url = media_el.attrib.get('url')
+                if not media_url:
+                    continue
+                manifest_url = (
+                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
+                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
+                # If media_url is itself an f4m manifest, do the recursive
+                # extraction, since bitrates in the parent manifest (this one)
+                # and the media_url manifest may differ, making it impossible
+                # to resolve the format by the requested bitrate in the f4m
+                # downloader
+                ext = determine_ext(manifest_url)
+                if ext == 'f4m':
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+                        transform_source=transform_source, fatal=fatal)
+                    # Sometimes a stream-level manifest contains a single media entry that
+                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
+                    # At the same time the parent's media entry in the set-level manifest
+                    # may contain it. We will copy it from the parent in such cases.
+                    if len(f4m_formats) == 1:
+                        f = f4m_formats[0]
+                        f.update({
+                            'tbr': f.get('tbr') or tbr,
+                            'width': f.get('width') or width,
+                            'height': f.get('height') or height,
+                            'format_id': f.get('format_id') if not tbr else format_id,
+                            'vcodec': vcodec,
+                        })
+                    formats.extend(f4m_formats)
+                    continue
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        manifest_url, video_id, 'mp4', preference=preference,
+                        m3u8_id=m3u8_id, fatal=fatal))
+                    continue
+            formats.append({
+                'format_id': format_id,
+                'url': manifest_url,
+                'manifest_url': manifest_url,
+                'ext': 'flv' if bootstrap_info is not None else None,
+                'protocol': 'f4m',
+                'tbr': tbr,
+                'width': width,
+                'height': height,
+                'vcodec': vcodec,
+                'preference': preference,
+            })
+        return formats
+
+    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
+        return {
+            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+            'url': m3u8_url,
+            'ext': ext,
+            'protocol': 'm3u8',
+            'preference': preference - 100 if preference else -100,
+            'resolution': 'multiple',
+            'format_note': 'Quality selection URL',
+        }
+
+    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+                              entry_protocol='m3u8', preference=None,
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True, live=False, data=None, headers={},
+                              query={}):
+        res = self._download_webpage_handle(
+            m3u8_url, video_id,
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information',
+            fatal=fatal, data=data, headers=headers, query=query)
+
+        if res is False:
+            return []
+
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
+
+        return self._parse_m3u8_formats(
+            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+            preference=preference, m3u8_id=m3u8_id, live=live)
+
+    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+                            entry_protocol='m3u8', preference=None,
+                            m3u8_id=None, live=False):
+        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
+            return []
+
+        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
+            return []
+
+        formats = []
+
+        format_url = lambda u: (
+            u
+            if re.match(r'^https?://', u)
+            else compat_urlparse.urljoin(m3u8_url, u))
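+        # e.g. with m3u8_url 'https://example.com/hls/master.m3u8' (hypothetical),
+        # format_url('low/index.m3u8') yields 'https://example.com/hls/low/index.m3u8',
+        # while absolute URLs pass through unchanged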
+
+        # References:
+        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
+        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
+
+        # We should try extracting formats only from master playlists [1, 4.3.4],
+        # i.e. playlists that describe the available qualities. On the other hand,
+        # media playlists [1, 4.3.3] should be returned as is since they contain
+        # just the media without quality renditions.
+        # Fortunately, a master playlist can be easily distinguished from a media
+        # playlist by the availability of particular tags. As per [1, 4.3.3, 4.3.4]
+        # master playlist tags MUST NOT appear in a media playlist and vice versa.
+        # As per [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
+        # media playlist and MUST NOT appear in a master playlist, so we can
+        # reliably detect a media playlist with this criterion.
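+        # An illustrative minimal media playlist (returned as is below):
+        #   #EXTM3U
+        #   #EXT-X-TARGETDURATION:10
+        #   #EXTINF:10,
+        #   segment0.ts
+        # A master playlist instead carries #EXT-X-STREAM-INF/#EXT-X-MEDIA
+        # entries that point to such media playlists.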
+
+        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
+            return [{
+                'url': m3u8_url,
+                'format_id': m3u8_id,
+                'ext': ext,
+                'protocol': entry_protocol,
+                'preference': preference,
+            }]
+
+        groups = {}
+        last_stream_inf = {}
+
+        def extract_media(x_media_line):
+            media = parse_m3u8_attributes(x_media_line)
+            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
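+            # An illustrative (hypothetical) tag this parses:
+            #   #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aud",NAME="English",URI="en/audio.m3u8"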
+            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+            if not (media_type and group_id and name):
+                return
+            groups.setdefault(group_id, []).append(media)
+            if media_type not in ('VIDEO', 'AUDIO'):
+                return
+            media_url = media.get('URI')
+            if media_url:
+                format_id = []
+                for v in (m3u8_id, group_id, name):
+                    if v:
+                        format_id.append(v)
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'url': format_url(media_url),
+                    'manifest_url': m3u8_url,
+                    'language': media.get('LANGUAGE'),
+                    'ext': ext,
+                    'protocol': entry_protocol,
+                    'preference': preference,
+                }
+                if media_type == 'AUDIO':
+                    f['vcodec'] = 'none'
+                formats.append(f)
+
+        def build_stream_name():
+            # Although the specification does not mention a NAME attribute for the
+            # EXT-X-STREAM-INF tag, it may still be present (see [1]
+            # or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+            stream_name = last_stream_inf.get('NAME')
+            if stream_name:
+                return stream_name
+            # If there is no NAME in EXT-X-STREAM-INF, it will be obtained
+            # from the corresponding rendition group
+            stream_group_id = last_stream_inf.get('VIDEO')
+            if not stream_group_id:
+                return
+            stream_group = groups.get(stream_group_id)
+            if not stream_group:
+                return stream_group_id
+            rendition = stream_group[0]
+            return rendition.get('NAME') or stream_group_id
+
+        # Parse EXT-X-MEDIA tags before EXT-X-STREAM-INF so that video-only
+        # formats can be detected even when EXT-X-STREAM-INF tags precede
+        # EXT-X-MEDIA tags in the HLS manifest, as in [3].
+        for line in m3u8_doc.splitlines():
+            if line.startswith('#EXT-X-MEDIA:'):
+                extract_media(line)
+
+        for line in m3u8_doc.splitlines():
+            if line.startswith('#EXT-X-STREAM-INF:'):
+                last_stream_inf = parse_m3u8_attributes(line)
+            elif line.startswith('#') or not line.strip():
+                continue
+            else:
+                tbr = float_or_none(
+                    last_stream_inf.get('AVERAGE-BANDWIDTH')
+                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
+                format_id = []
+                if m3u8_id:
+                    format_id.append(m3u8_id)
+                stream_name = build_stream_name()
+                # The bandwidth of live streams may vary over time, making
+                # format_id unpredictable, so it's better to keep the provided
+                # format_id intact.
+                if not live:
+                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
+                manifest_url = format_url(line.strip())
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'url': manifest_url,
+                    'manifest_url': m3u8_url,
+                    'tbr': tbr,
+                    'ext': ext,
+                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
+                    'protocol': entry_protocol,
+                    'preference': preference,
+                }
+                resolution = last_stream_inf.get('RESOLUTION')
+                if resolution:
+                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+                    if mobj:
+                        f['width'] = int(mobj.group('width'))
+                        f['height'] = int(mobj.group('height'))
+                # Unified Streaming Platform
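+                # USP-style (hypothetical) URLs encode bitrates in the path, e.g.
+                # '...ism/video-audio_eng=128000-video=2200000.m3u8' (sometimes
+                # percent-encoded as %3D), from which abr/vbr in kbps are derived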
+                mobj = re.search(
+                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+                if mobj:
+                    abr, vbr = mobj.groups()
+                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+                    f.update({
+                        'vbr': vbr,
+                        'abr': abr,
+                    })
+                codecs = parse_codecs(last_stream_inf.get('CODECS'))
+                f.update(codecs)
+                audio_group_id = last_stream_inf.get('AUDIO')
+                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+                # references a rendition group MUST have a CODECS attribute.
+                # However, this is not always respected, for example, [2]
+                # contains EXT-X-STREAM-INF tag which references AUDIO
+                # rendition group but does not have CODECS and despite
+                # referencing an audio group it represents a complete
+                # (with audio and video) format. So, for such cases we will
+                # ignore references to rendition groups and treat them
+                # as complete formats.
+                if audio_group_id and codecs and f.get('vcodec') != 'none':
+                    audio_group = groups.get(audio_group_id)
+                    if audio_group and audio_group[0].get('URI'):
+                        # TODO: update acodec for audio only formats with
+                        # the same GROUP-ID
+                        f['acodec'] = 'none'
+                formats.append(f)
+
+                # for DailyMotion
+                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
+                if progressive_uri:
+                    http_f = f.copy()
+                    del http_f['manifest_url']
+                    http_f.update({
+                        'format_id': f['format_id'].replace('hls-', 'http-'),
+                        'protocol': 'http',
+                        'url': progressive_uri,
+                    })
+                    formats.append(http_f)
+
+                last_stream_inf = {}
+        return formats
+
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
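+        # e.g. _xpath_ns('./head/meta', 'urn:example') yields
+        # './{urn:example}head/{urn:example}meta'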
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
+
+        if smil is False:
+            assert not fatal
+            return []
+
+        namespace = self._parse_smil_namespace(smil)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._parse_smil_namespace(smil)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        upload_date = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_namespace(self, smil):
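+        # ElementTree exposes a namespaced root tag as e.g.
+        # '{http://www.w3.org/ns/SMIL}smil'; this extracts the namespace URI part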
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
+
+        formats = []
+        rtmp_count = 0
+        http_count = 0
+        m3u8_count = 0
+
+        srcs = []
+        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+        for medium in media:
+            src = medium.get('src')
+            if not src or src in srcs:
+                continue
+            srcs.append(src)
+
+            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
+            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
+            width = int_or_none(medium.get('width'))
+            height = int_or_none(medium.get('height'))
+            proto = medium.get('proto')
+            ext = medium.get('ext')
+            src_ext = determine_ext(src)
+            streamer = medium.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
+                rtmp_count += 1
+                formats.append({
+                    'url': streamer,
+                    'play_path': src,
+                    'ext': 'flv',
+                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                if transform_rtmp_url:
+                    streamer, src = transform_rtmp_url(streamer, src)
+                    formats[-1].update({
+                        'url': streamer,
+                        'play_path': src,
+                    })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+            src_url = src_url.strip()
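+            # e.g. (hypothetical) base 'http://cdn.example.com/smil/' joined with
+            # src 'video-1500k.mp4' yields 'http://cdn.example.com/smil/video-1500k.mp4'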
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if len(m3u8_formats) == 1:
+                    m3u8_count += 1
+                    m3u8_formats[0].update({
+                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
+                        'tbr': bitrate,
+                        'width': width,
+                        'height': height,
+                    })
+                formats.extend(m3u8_formats)
+            elif src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse_urlencode(f4m_params)
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
+            elif src_ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    src_url, video_id, mpd_id='dash', fatal=False))
+            elif re.search(r'\.ism/[Mm]anifest', src_url):
+                formats.extend(self._extract_ism_formats(
+                    src_url, video_id, ism_id='mss', fatal=False))
+            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+
+        return formats
+
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        urls = []
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src or src in urls:
+                continue
+            urls.append(src)
+            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
+
+    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
+        xspf = self._download_xml(
+            xspf_url, playlist_id, 'Downloading xspf playlist',
+            'Unable to download xspf manifest', fatal=fatal)
+        if xspf is False:
+            return []
+        return self._parse_xspf(
+            xspf, playlist_id, xspf_url=xspf_url,
+            xspf_base_url=base_url(xspf_url))
+
+    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        entries = []
+        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+            title = xpath_text(
+                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+            description = xpath_text(
+                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+            thumbnail = xpath_text(
+                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+            duration = float_or_none(
+                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+            formats = []
+            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
+                format_url = urljoin(xspf_base_url, location.text)
+                if not format_url:
+                    continue
+                formats.append({
+                    'url': format_url,
+                    'manifest_url': xspf_url,
+                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+                })
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': playlist_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+            })
+        return entries
+
+    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
+        res = self._download_xml_handle(
+            mpd_url, video_id,
+            note=note or 'Downloading MPD manifest',
+            errnote=errnote or 'Failed to download MPD manifest',
+            fatal=fatal, data=data, headers=headers, query=query)
+        if res is False:
+            return []
+        mpd_doc, urlh = res
+        if mpd_doc is None:
+            return []
+        mpd_base_url = base_url(urlh.geturl())
+
+        return self._parse_mpd_formats(
+            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
+            formats_dict=formats_dict, mpd_url=mpd_url)
+
+    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
+        """
+        Parse formats from MPD manifest.
+        References:
+         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
+            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
+         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+        """
+        if mpd_doc.get('type') == 'dynamic':
+            return []
+
+        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
+
+        def _add_ns(path):
+            return self._xpath_ns(path, namespace)
+
+        def is_drm_protected(element):
+            return element.find(_add_ns('ContentProtection')) is not None
+
+        def extract_multisegment_info(element, ms_parent_info):
+            ms_info = ms_parent_info.copy()
+
+            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
+            # common attributes and elements. We will only extract what is
+            # relevant for us.
+            def extract_common(source):
+                segment_timeline = source.find(_add_ns('SegmentTimeline'))
+                if segment_timeline is not None:
+                    s_e = segment_timeline.findall(_add_ns('S'))
+                    if s_e:
+                        ms_info['total_number'] = 0
+                        ms_info['s'] = []
+                        for s in s_e:
+                            r = int(s.get('r', 0))
+                            ms_info['total_number'] += 1 + r
+                            ms_info['s'].append({
+                                't': int(s.get('t', 0)),
+                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+                                'd': int(s.attrib['d']),
+                                'r': r,
+                            })
+                start_number = source.get('startNumber')
+                if start_number:
+                    ms_info['start_number'] = int(start_number)
+                timescale = source.get('timescale')
+                if timescale:
+                    ms_info['timescale'] = int(timescale)
+                segment_duration = source.get('duration')
+                if segment_duration:
+                    ms_info['segment_duration'] = float(segment_duration)
+
+            def extract_Initialization(source):
+                initialization = source.find(_add_ns('Initialization'))
+                if initialization is not None:
+                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
+
+            segment_list = element.find(_add_ns('SegmentList'))
+            if segment_list is not None:
+                extract_common(segment_list)
+                extract_Initialization(segment_list)
+                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
+                if segment_urls_e:
+                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
+            else:
+                segment_template = element.find(_add_ns('SegmentTemplate'))
+                if segment_template is not None:
+                    extract_common(segment_template)
+                    media = segment_template.get('media')
+                    if media:
+                        ms_info['media'] = media
+                    initialization = segment_template.get('initialization')
+                    if initialization:
+                        ms_info['initialization'] = initialization
+                    else:
+                        extract_Initialization(segment_template)
+            return ms_info
+
+        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+        formats = []
+        for period in mpd_doc.findall(_add_ns('Period')):
+            period_duration = parse_duration(period.get('duration')) or mpd_duration
+            period_ms_info = extract_multisegment_info(period, {
+                'start_number': 1,
+                'timescale': 1,
+            })
+            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
+                if is_drm_protected(adaptation_set):
+                    continue
+                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
+                for representation in adaptation_set.findall(_add_ns('Representation')):
+                    if is_drm_protected(representation):
+                        continue
+                    representation_attrib = adaptation_set.attrib.copy()
+                    representation_attrib.update(representation.attrib)
+                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
+                    mime_type = representation_attrib['mimeType']
+                    content_type = mime_type.split('/')[0]
+                    if content_type == 'text':
+                        # TODO implement WebVTT downloading
+                        pass
+                    elif content_type in ('video', 'audio'):
+                        base_url = ''
+                        for element in (representation, adaptation_set, period, mpd_doc):
+                            base_url_e = element.find(_add_ns('BaseURL'))
+                            if base_url_e is not None:
+                                base_url = base_url_e.text + base_url
+                                if re.match(r'^https?://', base_url):
+                                    break
+                        if mpd_base_url and not re.match(r'^https?://', base_url):
+                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+                                mpd_base_url += '/'
+                            base_url = mpd_base_url + base_url
+                        representation_id = representation_attrib.get('id')
+                        lang = representation_attrib.get('lang')
+                        url_el = representation.find(_add_ns('BaseURL'))
+                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+                        f = {
+                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+                            'manifest_url': mpd_url,
+                            'ext': mimetype2ext(mime_type),
+                            'width': int_or_none(representation_attrib.get('width')),
+                            'height': int_or_none(representation_attrib.get('height')),
+                            'tbr': float_or_none(bandwidth, 1000),
+                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
+                            'fps': int_or_none(representation_attrib.get('frameRate')),
+                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
+                            'format_note': 'DASH %s' % content_type,
+                            'filesize': filesize,
+                            'container': mimetype2ext(mime_type) + '_dash',
+                        }
+                        f.update(parse_codecs(representation_attrib.get('codecs')))
+                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
+
+                        def prepare_template(template_name, identifiers):
+                            tmpl = representation_ms_info[template_name]
+                            # First off, % characters outside $...$ templates
+                            # must be escaped by doubling for proper processing
+                            # by the % string formatting operator used further on (see
+                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
+                            t = ''
+                            in_template = False
+                            for c in tmpl:
+                                t += c
+                                if c == '$':
+                                    in_template = not in_template
+                                elif c == '%' and not in_template:
+                                    t += c
+                            # Next, $...$ templates are translated to their
+                            # %(...) counterparts to be used with % operator
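+                            # e.g. (hypothetical) 'seg-$RepresentationID$-$Number%05d$.m4s'
+                            # becomes 'seg-video1-%(Number)05d.m4s', which the %
+                            # operator later renders as 'seg-video1-00042.m4s'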
+                            t = t.replace('$RepresentationID$', representation_id)
+                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
+                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+                            t = t.replace('$$', '$')
+                            return t
+
+                        # @initialization is a regular template just like @media, so
+                        # it should be handled the same way (see
+                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
+                        if 'initialization' in representation_ms_info:
+                            initialization_template = prepare_template(
+                                'initialization',
+                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+                                # $Time$ shall not be included for @initialization thus
+                                # only $Bandwidth$ remains
+                                ('Bandwidth', ))
+                            representation_ms_info['initialization_url'] = initialization_template % {
+                                'Bandwidth': bandwidth,
+                            }
+
+                        def location_key(location):
+                            return 'url' if re.match(r'^https?://', location) else 'path'
+
+                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+                            media_location_key = location_key(media_template)
+
+                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+                            # can't be used at the same time
+                            if '%(Number' in media_template and 's' not in representation_ms_info:
+                                segment_duration = None
+                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
+                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
+                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+                                representation_ms_info['fragments'] = [{
+                                    media_location_key: media_template % {
+                                        'Number': segment_number,
+                                        'Bandwidth': bandwidth,
+                                    },
+                                    'duration': segment_duration,
+                                } for segment_number in range(
+                                    representation_ms_info['start_number'],
+                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                            else:
+                                # $Number*$ or $Time$ in media template with S list available
+                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
+                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
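+                                # e.g. a timeline entry <S t="0" d="90000" r="2"/>
+                                # with timescale 90000 expands below into three
+                                # one-second fragments (the segment plus r repeats)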
+                                representation_ms_info['fragments'] = []
+                                segment_time = 0
+                                segment_d = None
+                                segment_number = representation_ms_info['start_number']
+
+                                def add_segment_url():
+                                    segment_url = media_template % {
+                                        'Time': segment_time,
+                                        'Bandwidth': bandwidth,
+                                        'Number': segment_number,
+                                    }
+                                    representation_ms_info['fragments'].append({
+                                        media_location_key: segment_url,
+                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                    })
+
+                                for num, s in enumerate(representation_ms_info['s']):
+                                    segment_time = s.get('t') or segment_time
+                                    segment_d = s['d']
+                                    add_segment_url()
+                                    segment_number += 1
+                                    for r in range(s.get('r', 0)):
+                                        segment_time += segment_d
+                                        add_segment_url()
+                                        segment_number += 1
+                                    segment_time += segment_d
+                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
+                            # No media template
+                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
+                            # or any YouTube dashsegments video
+                            fragments = []
+                            segment_index = 0
+                            timescale = representation_ms_info['timescale']
+                            for s in representation_ms_info['s']:
+                                duration = float_or_none(s['d'], timescale)
+                                for r in range(s.get('r', 0) + 1):
+                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
+                                    fragments.append({
+                                        location_key(segment_uri): segment_uri,
+                                        'duration': duration,
+                                    })
+                                    segment_index += 1
+                            representation_ms_info['fragments'] = fragments
+                        elif 'segment_urls' in representation_ms_info:
+                            # Segment URLs with no SegmentTimeline
+                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
+                            # https://github.com/ytdl-org/youtube-dl/pull/14844
+                            fragments = []
+                            segment_duration = float_or_none(
+                                representation_ms_info['segment_duration'],
+                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
+                            for segment_url in representation_ms_info['segment_urls']:
+                                fragment = {
+                                    location_key(segment_url): segment_url,
+                                }
+                                if segment_duration:
+                                    fragment['duration'] = segment_duration
+                                fragments.append(fragment)
+                            representation_ms_info['fragments'] = fragments
+                        # If a fragments key is available then we correctly recognized fragmented media.
+                        # Otherwise we will assume unfragmented media with direct access. Technically, such
+                        # an assumption is not necessarily correct since we may simply have no support for
+                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
+                        if 'fragments' in representation_ms_info:
+                            f.update({
+                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
+                                'url': mpd_url or base_url,
+                                'fragment_base_url': base_url,
+                                'fragments': [],
+                                'protocol': 'http_dash_segments',
+                            })
+                            if 'initialization_url' in representation_ms_info:
+                                initialization_url = representation_ms_info['initialization_url']
+                                if not f.get('url'):
+                                    f['url'] = initialization_url
+                                f['fragments'].append({location_key(initialization_url): initialization_url})
+                            f['fragments'].extend(representation_ms_info['fragments'])
+                        else:
+                            # Assuming direct URL to unfragmented media.
+                            f['url'] = base_url
+
+                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
+                        # is not necessarily unique within a Period thus formats with
+                        # the same `format_id` are quite possible. There are numerous examples
+                        # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
+                        # https://github.com/ytdl-org/youtube-dl/issues/13919)
+                        full_info = formats_dict.get(representation_id, {}).copy()
+                        full_info.update(f)
+                        formats.append(full_info)
+                    else:
+                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
+        return formats
+
+    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+        res = self._download_xml_handle(
+            ism_url, video_id,
+            note=note or 'Downloading ISM manifest',
+            errnote=errnote or 'Failed to download ISM manifest',
+            fatal=fatal, data=data, headers=headers, query=query)
+        if res is False:
+            return []
+        ism_doc, urlh = res
+        if ism_doc is None:
+            return []
+
+        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
+
+    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+        """
+        Parse formats from ISM manifest.
+        References:
+         1. [MS-SSTR]: Smooth Streaming Protocol,
+            https://msdn.microsoft.com/en-us/library/ff469518.aspx
+        """
+        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
+            return []
+
+        duration = int(ism_doc.attrib['Duration'])
+        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
+
+        formats = []
+        for stream in ism_doc.findall('StreamIndex'):
+            stream_type = stream.get('Type')
+            if stream_type not in ('video', 'audio'):
+                continue
+            url_pattern = stream.attrib['Url']
+            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
+            stream_name = stream.get('Name')
+            for track in stream.findall('QualityLevel'):
+                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
+                # TODO: add support for WVC1 and WMAP
+                if fourcc not in ('H264', 'AVC1', 'AACL'):
+                    self.report_warning('%s is not a supported codec' % fourcc)
+                    continue
+                tbr = int(track.attrib['Bitrate']) // 1000
+                # [1] does not mention the Width and Height attributes. However,
+                # they're often present while MaxWidth and MaxHeight are
+                # missing, so they should be used as fallbacks
+                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
+                sampling_rate = int_or_none(track.get('SamplingRate'))
+
+                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
+                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
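+                # e.g. a (hypothetical) Url pattern
+                # 'QualityLevels({bitrate})/Fragments(video={start time})' becomes
+                # 'QualityLevels(1000000)/Fragments(video={start time})' here,
+                # with {start time} substituted per fragment below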
+
+                fragments = []
+                fragment_ctx = {
+                    'time': 0,
+                }
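+                # e.g. <c t="0" d="20000000" r="4"/> with the default TimeScale of
+                # 10000000 is expanded below into four consecutive 2-second fragments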
+                stream_fragments = stream.findall('c')
+                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
+                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
+                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
+                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
+                    if not fragment_ctx['duration']:
+                        try:
+                            next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
+                        except IndexError:
+                            next_fragment_time = duration
+                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
+                    for _ in range(fragment_repeat):
+                        fragments.append({
+                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
+                            'duration': fragment_ctx['duration'] / stream_timescale,
+                        })
+                        fragment_ctx['time'] += fragment_ctx['duration']
+
+                format_id = []
+                if ism_id:
+                    format_id.append(ism_id)
+                if stream_name:
+                    format_id.append(stream_name)
+                format_id.append(compat_str(tbr))
+
+                formats.append({
+                    'format_id': '-'.join(format_id),
+                    'url': ism_url,
+                    'manifest_url': ism_url,
+                    'ext': 'ismv' if stream_type == 'video' else 'isma',
+                    'width': width,
+                    'height': height,
+                    'tbr': tbr,
+                    'asr': sampling_rate,
+                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
+                    'acodec': 'none' if stream_type == 'video' else fourcc,
+                    'protocol': 'ism',
+                    'fragments': fragments,
+                    '_download_params': {
+                        'duration': duration,
+                        'timescale': stream_timescale,
+                        'width': width or 0,
+                        'height': height or 0,
+                        'fourcc': fourcc,
+                        'codec_private_data': track.get('CodecPrivateData'),
+                        'sampling_rate': sampling_rate,
+                        'channels': int_or_none(track.get('Channels', 2)),
+                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
+                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
+                    },
+                })
+        return formats
+
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
+        def absolute_url(item_url):
+            return urljoin(base_url, item_url)
+
+        def parse_content_type(content_type):
+            if not content_type:
+                return {}
+            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+            if ctr:
+                mimetype, codecs = ctr.groups()
+                f = parse_codecs(codecs)
+                f['ext'] = mimetype2ext(mimetype)
+                return f
+            return {}
+
+        def _media_formats(src, cur_media_type, type_info={}):
+            full_url = absolute_url(src)
+            ext = type_info.get('ext') or determine_ext(full_url)
+            if ext == 'm3u8':
+                is_plain_url = False
+                formats = self._extract_m3u8_formats(
+                    full_url, video_id, ext='mp4',
+                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
+                    preference=preference, fatal=False)
+            elif ext == 'mpd':
+                is_plain_url = False
+                formats = self._extract_mpd_formats(
+                    full_url, video_id, mpd_id=mpd_id, fatal=False)
+            else:
+                is_plain_url = True
+                formats = [{
+                    'url': full_url,
+                    'vcodec': 'none' if cur_media_type == 'audio' else None,
+                }]
+            return is_plain_url, formats
+
+        entries = []
+        # amp-video and amp-audio are very similar to their HTML5 counterparts
+        # so we will include them right here (see
+        # https://www.ampproject.org/docs/reference/components/amp-video)
+        media_tags = [(media_tag, media_type, '')
+                      for media_tag, media_type
+                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
+        media_tags.extend(re.findall(
+            # We only allow video|audio followed by a whitespace or '>'.
+            # Allowing more characters may result in a significant slowdown (see
+            # https://github.com/ytdl-org/youtube-dl/issues/11979; example URL:
+            # http://www.porntrex.com/maps/videositemap.xml).
+            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
+        for media_tag, media_type, media_content in media_tags:
+            media_info = {
+                'formats': [],
+                'subtitles': {},
+            }
+            media_attributes = extract_attributes(media_tag)
+            src = strip_or_none(media_attributes.get('src'))
+            if src:
+                _, formats = _media_formats(src, media_type)
+                media_info['formats'].extend(formats)
+            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
+            if media_content:
+                for source_tag in re.findall(r'<source[^>]+>', media_content):
+                    s_attr = extract_attributes(source_tag)
+                    # data-video-src and data-src are non-standard but seen
+                    # several times in the wild
+                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
+                    if not src:
+                        continue
+                    f = parse_content_type(s_attr.get('type'))
+                    is_plain_url, formats = _media_formats(src, media_type, f)
+                    if is_plain_url:
+                        # width, height, res, label and title attributes are
+                        # all non-standard but seen several times in the wild
+                        labels = [
+                            s_attr.get(lbl)
+                            for lbl in ('label', 'title')
+                            if str_or_none(s_attr.get(lbl))
+                        ]
+                        width = int_or_none(s_attr.get('width'))
+                        height = (int_or_none(s_attr.get('height'))
+                                  or int_or_none(s_attr.get('res')))
+                        if not width or not height:
+                            for lbl in labels:
+                                resolution = parse_resolution(lbl)
+                                if not resolution:
+                                    continue
+                                width = width or resolution.get('width')
+                                height = height or resolution.get('height')
+                        for lbl in labels:
+                            tbr = parse_bitrate(lbl)
+                            if tbr:
+                                break
+                        else:
+                            tbr = None
+                        f.update({
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'format_id': s_attr.get('label') or s_attr.get('title'),
+                        })
+                        f.update(formats[0])
+                        media_info['formats'].append(f)
+                    else:
+                        media_info['formats'].extend(formats)
+                for track_tag in re.findall(r'<track[^>]+>', media_content):
+                    track_attributes = extract_attributes(track_tag)
+                    kind = track_attributes.get('kind')
+                    if not kind or kind in ('subtitles', 'captions'):
+                        src = strip_or_none(track_attributes.get('src'))
+                        if not src:
+                            continue
+                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+                        media_info['subtitles'].setdefault(lang, []).append({
+                            'url': absolute_url(src),
+                        })
+            for f in media_info['formats']:
+                f.setdefault('http_headers', {})['Referer'] = base_url
+            if media_info['formats'] or media_info['subtitles']:
+                entries.append(media_info)
+        return entries
+
+    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+        formats = []
+        hdcore_sign = 'hdcore=3.7.0'
+        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
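+        # e.g. (hypothetical) 'https://ex-vh.akamaihd.net/i/path/master.m3u8'
+        # is rewritten to 'https://ex-vh.akamaihd.net/z/path/manifest.f4m'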
+        hds_host = hosts.get('hds')
+        if hds_host:
+            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
+        if 'hdcore=' not in f4m_url:
+            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
+        f4m_formats = self._extract_f4m_formats(
+            f4m_url, video_id, f4m_id='hds', fatal=False)
+        for entry in f4m_formats:
+            entry.update({'extra_param_to_segment_url': hdcore_sign})
+        formats.extend(f4m_formats)
+        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        hls_host = hosts.get('hls')
+        if hls_host:
+            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
+        formats.extend(self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', 'm3u8_native',
+            m3u8_id='hls', fatal=False))
+        return formats
+
+    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
+        query = compat_urlparse.urlparse(url).query
+        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
+        mobj = re.search(
+            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
+        url_base = mobj.group('url')
+        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
+        formats = []
+
+        def manifest_url(manifest):
+            m_url = '%s/%s' % (http_base_url, manifest)
+            if query:
+                m_url += '?%s' % query
+            return m_url
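+        # e.g. for (hypothetical) 'rtmp://example.com/vod/mp4:clip.mp4/playlist.m3u8',
+        # url_base is '//example.com/vod/mp4:clip.mp4' and manifest_url('playlist.m3u8')
+        # yields 'http://example.com/vod/mp4:clip.mp4/playlist.m3u8'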
+
+        if 'm3u8' not in skip_protocols:
+            formats.extend(self._extract_m3u8_formats(
+                manifest_url('playlist.m3u8'), video_id, 'mp4',
+                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+        if 'f4m' not in skip_protocols:
+            formats.extend(self._extract_f4m_formats(
+                manifest_url('manifest.f4m'),
+                video_id, f4m_id='hds', fatal=False))
+        if 'dash' not in skip_protocols:
+            formats.extend(self._extract_mpd_formats(
+                manifest_url('manifest.mpd'),
+                video_id, mpd_id='dash', fatal=False))
+        if re.search(r'(?:/smil:|\.smil)', url_base):
+            if 'smil' not in skip_protocols:
+                rtmp_formats = self._extract_smil_formats(
+                    manifest_url('jwplayer.smil'),
+                    video_id, fatal=False)
+                for rtmp_format in rtmp_formats:
+                    rtsp_format = rtmp_format.copy()
+                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+                    del rtsp_format['play_path']
+                    del rtsp_format['ext']
+                    rtsp_format.update({
+                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                        'protocol': 'rtsp',
+                    })
+                    formats.extend([rtmp_format, rtsp_format])
+        else:
+            for protocol in ('rtmp', 'rtsp'):
+                if protocol not in skip_protocols:
+                    formats.append({
+                        'url': '%s:%s' % (protocol, url_base),
+                        'format_id': protocol,
+                        'protocol': protocol,
+                    })
+        return formats
+
+    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
+        mobj = re.search(
+            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+            webpage)
+        if mobj:
+            try:
+                jwplayer_data = self._parse_json(mobj.group('options'),
+                                                 video_id=video_id,
+                                                 transform_source=transform_source)
+            except ExtractorError:
+                pass
+            else:
+                if isinstance(jwplayer_data, dict):
+                    return jwplayer_data
+
+    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+        jwplayer_data = self._find_jwplayer_data(
+            webpage, video_id, transform_source=js_to_json)
+        return self._parse_jwplayer_data(
+            jwplayer_data, video_id, *args, **kwargs)
+
+    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
+                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        # JWPlayer backward compatibility: flattened playlists
+        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
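+        # e.g. a flattened (hypothetical) config {'file': 'video.mp4'} is
+        # normalized to {'playlist': [{'file': 'video.mp4'}]} below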
+        if 'playlist' not in jwplayer_data:
+            jwplayer_data = {'playlist': [jwplayer_data]}
+
+        entries = []
+
+        # JWPlayer backward compatibility: single playlist item
+        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
+        if not isinstance(jwplayer_data['playlist'], list):
+            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+
+        for video_data in jwplayer_data['playlist']:
+            # JWPlayer backward compatibility: flattened sources
+            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
+            if 'sources' not in video_data:
+                video_data['sources'] = [video_data]
+
+            this_video_id = video_id or video_data['mediaid']
+
+            formats = self._parse_jwplayer_formats(
+                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
+                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
+
+            subtitles = {}
+            tracks = video_data.get('tracks')
+            if tracks and isinstance(tracks, list):
+                for track in tracks:
+                    if not isinstance(track, dict):
+                        continue
+                    track_kind = track.get('kind')
+                    if not track_kind or not isinstance(track_kind, compat_str):
+                        continue
+                    if track_kind.lower() not in ('captions', 'subtitles'):
+                        continue
+                    track_url = urljoin(base_url, track.get('file'))
+                    if not track_url:
+                        continue
+                    subtitles.setdefault(track.get('label') or 'en', []).append({
+                        'url': self._proto_relative_url(track_url)
+                    })
+
+            entry = {
+                'id': this_video_id,
+                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
+                'description': clean_html(video_data.get('description')),
+                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
+                'timestamp': int_or_none(video_data.get('pubdate')),
+                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
+                'subtitles': subtitles,
+            }
+            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
+            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
+                entry.update({
+                    '_type': 'url_transparent',
+                    'url': formats[0]['url'],
+                })
+            else:
+                self._sort_formats(formats)
+                entry['formats'] = formats
+            entries.append(entry)
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            return self.playlist_result(entries)
+
+    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
+                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+        urls = []
+        formats = []
+        for source in jwplayer_sources_data:
+            if not isinstance(source, dict):
+                continue
+            source_url = urljoin(
+                base_url, self._proto_relative_url(source.get('file')))
+            if not source_url or source_url in urls:
+                continue
+            urls.append(source_url)
+            source_type = source.get('type') or ''
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if source_type == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=m3u8_id, fatal=False))
+            elif source_type == 'dash' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    source_url, video_id, mpd_id=mpd_id, fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    source_url, video_id, fatal=False))
+            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+            elif source_type.startswith('audio') or ext in (
+                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+                formats.append({
+                    'url': source_url,
+                    'vcodec': 'none',
+                    'ext': ext,
+                })
+            else:
+                height = int_or_none(source.get('height'))
+                if height is None:
+                    # Often no height is provided but there is a label in a
+                    # format like "1080p", "720p SD", or just 1080.
+                    height = int_or_none(self._search_regex(
+                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
+                        'height', default=None))
+                a_format = {
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'height': height,
+                    'tbr': int_or_none(source.get('bitrate')),
+                    'ext': ext,
+                }
+                if source_url.startswith('rtmp'):
+                    a_format['ext'] = 'flv'
+                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+                    # of jwplayer.flash.swf
+                    rtmp_url_parts = re.split(
+                        r'((?:mp4|mp3|flv):)', source_url, 1)
+                    if len(rtmp_url_parts) == 3:
+                        rtmp_url, prefix, play_path = rtmp_url_parts
+                        a_format.update({
+                            'url': rtmp_url,
+                            'play_path': prefix + play_path,
+                        })
+                    if rtmp_params:
+                        a_format.update(rtmp_params)
+                formats.append(a_format)
+        return formats
+
+    def _live_title(self, name):
+        """ Generate the title for a live video """
+        now = datetime.datetime.now()
+        now_str = now.strftime('%Y-%m-%d %H:%M')
+        return name + ' ' + now_str
+
+    def _int(self, v, name, fatal=False, **kwargs):
+        res = int_or_none(v, **kwargs)
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+
+    def _float(self, v, name, fatal=False, **kwargs):
+        res = float_or_none(v, **kwargs)
+        if res is None:
+            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+            if fatal:
+                raise ExtractorError(msg)
+            else:
+                self._downloader.report_warning(msg)
+        return res
+
+    def _set_cookie(self, domain, name, value, expire_time=None, port=None,
+                    path='/', secure=False, discard=False, rest={}, **kwargs):
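+        # Positional arguments mirror compat_cookiejar_Cookie (i.e.
+        # http.cookiejar.Cookie): version, name, value, port, port_specified,
+        # domain, domain_specified, domain_initial_dot, path, path_specified,
+        # secure, expires, discard, comment, comment_url, rest.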
+        cookie = compat_cookiejar_Cookie(
+            0, name, value, port, port is not None, domain, True,
+            domain.startswith('.'), path, True, secure, expire_time,
+            discard, None, None, rest)
+        self._downloader.cookiejar.set_cookie(cookie)
+
+    def _get_cookies(self, url):
+        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        req = sanitized_Request(url)
+        self._downloader.cookiejar.add_cookie_header(req)
+        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
+    def _apply_first_set_cookie_header(self, url_handle, cookie):
+        """
+        Apply first Set-Cookie header instead of the last. Experimental.
+
+        Some sites (e.g. [1-3]) may serve two cookies under the same name
+        in the Set-Cookie header and expect the first (old) one to be set
+        rather than the second (new) one. However, per RFC 6265 the newer
+        cookie is the one that ends up in the cookie store, which is what
+        actually happens. We work around this by manually resetting the
+        cookie to the first one.
+        1. https://new.vk.com/
+        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
+        3. https://learning.oreilly.com/
+        """
+        for header, cookies in url_handle.headers.items():
+            if header.lower() != 'set-cookie':
+                continue
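+            # Python 3 exposes headers as latin-1-decoded str; round-trip
+            # through bytes so UTF-8 encoded cookie values decode intact.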
+            if sys.version_info[0] >= 3:
+                cookies = cookies.encode('iso-8859-1')
+            cookies = cookies.decode('utf-8')
+            cookie_value = re.search(
+                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
+            if cookie_value:
+                value, domain = cookie_value.groups()
+                self._set_cookie(domain, cookie, value)
+                break
+
+    def get_testcases(self, include_onlymatching=False):
+        t = getattr(self, '_TEST', None)
+        if t:
+            assert not hasattr(self, '_TESTS'), \
+                '%s has _TEST and _TESTS' % type(self).__name__
+            tests = [t]
+        else:
+            tests = getattr(self, '_TESTS', [])
+        for t in tests:
+            if not include_onlymatching and t.get('only_matching', False):
+                continue
+            t['name'] = type(self).__name__[:-len('IE')]
+            yield t
+
+    def is_suitable(self, age_limit):
+        """ Test whether the extractor is generally suitable for the given
+        age limit (i.e. pornographic sites are not, all others usually are) """
+
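+        # Suitable if at least one test case is not age restricted; an
+        # extractor with no test cases at all is also considered suitable.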
+        any_restricted = False
+        for tc in self.get_testcases(include_onlymatching=False):
+            if tc.get('playlist', []):
+                tc = tc['playlist'][0]
+            is_restricted = age_restricted(
+                tc.get('info_dict', {}).get('age_limit'), age_limit)
+            if not is_restricted:
+                return True
+            any_restricted = any_restricted or is_restricted
+        return not any_restricted
+
+    def extract_subtitles(self, *args, **kwargs):
+        if (self._downloader.params.get('writesubtitles', False)
+                or self._downloader.params.get('listsubtitles')):
+            return self._get_subtitles(*args, **kwargs)
+        return {}
+
+    def _get_subtitles(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    @staticmethod
+    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+        """ Merge subtitle items for one language. Items with duplicated URLs
+        will be dropped. """
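+        # e.g. merging [{'url': 'a.vtt'}] with [{'url': 'a.vtt'}, {'url': 'b.vtt'}]
+        # yields [{'url': 'a.vtt'}, {'url': 'b.vtt'}]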
+        list1_urls = set([item['url'] for item in subtitle_list1])
+        ret = list(subtitle_list1)
+        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        return ret
+
+    @classmethod
+    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+        """ Merge two subtitle dictionaries, language by language. """
+        ret = dict(subtitle_dict1)
+        for lang in subtitle_dict2:
+            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+        return ret
+
+    def extract_automatic_captions(self, *args, **kwargs):
+        if (self._downloader.params.get('writeautomaticsub', False)
+                or self._downloader.params.get('listsubtitles')):
+            return self._get_automatic_captions(*args, **kwargs)
+        return {}
+
+    def _get_automatic_captions(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def mark_watched(self, *args, **kwargs):
+        if (self._downloader.params.get('mark_watched', False)
+                and (self._get_login_info()[0] is not None
+                     or self._downloader.params.get('cookiefile') is not None)):
+            self._mark_watched(*args, **kwargs)
+
+    def _mark_watched(self, *args, **kwargs):
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    def geo_verification_headers(self):
+        headers = {}
+        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+        if geo_verification_proxy:
+            headers['Ytdl-request-proxy'] = geo_verification_proxy
+        return headers
+
+    def _generic_id(self, url):
+        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
+
+    def _generic_title(self, url):
+        return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
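+    # Both helpers reduce e.g. 'https://example.com/media/My%20Clip.mp4'
+    # (a made-up URL) to 'My Clip'.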
+
+
+class SearchInfoExtractor(InfoExtractor):
+    """
+    Base class for paged search queries extractors.
+    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
+    Instances should define _SEARCH_KEY and _MAX_RESULTS.
+    """
+
+    @classmethod
+    def _make_valid_url(cls):
+        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
+
+    @classmethod
+    def suitable(cls, url):
+        return re.match(cls._make_valid_url(), url) is not None
+
+    def _real_extract(self, query):
+        mobj = re.match(self._make_valid_url(), query)
+        if mobj is None:
+            raise ExtractorError('Invalid search query "%s"' % query)
+
+        prefix = mobj.group('prefix')
+        query = mobj.group('query')
+        if prefix == '':
+            return self._get_n_results(query, 1)
+        elif prefix == 'all':
+            return self._get_n_results(query, self._MAX_RESULTS)
+        else:
+            n = int(prefix)
+            if n <= 0:
+                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
+            elif n > self._MAX_RESULTS:
+                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
+                n = self._MAX_RESULTS
+            return self._get_n_results(query, n)
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+        raise NotImplementedError('This method must be implemented by subclasses')
+
+    @property
+    def SEARCH_KEY(self):
+        return self._SEARCH_KEY
diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py
new file mode 100644 (file)
index 0000000..933b89e
--- /dev/null
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+import sys
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class CommonMistakesIE(InfoExtractor):
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'''(?x)
+        (?:url|URL)$
+    '''
+
+    _TESTS = [{
+        'url': 'url',
+        'only_matching': True,
+    }, {
+        'url': 'URL',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        msg = (
+            'You\'ve asked youtube-dlc to download the URL "%s". '
+            'That doesn\'t make any sense. '
+            'Simply remove the parameter in your command or configuration.'
+        ) % url
+        if not self._downloader.params.get('verbose'):
+            msg += ' Add -v to the command line to see what arguments and configuration youtube-dlc got.'
+        raise ExtractorError(msg, expected=True)
+
+
+class UnicodeBOMIE(InfoExtractor):
+    IE_DESC = False
+    _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$'
+
+    # Disable test for Python 3.0-3.2 since BOM handling is broken in re
+    # in these versions (see https://github.com/ytdl-org/youtube-dl/issues/9751)
+    _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{
+        'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        real_url = self._match_id(url)
+        self.report_warning(
+            'Your URL starts with a Byte Order Mark (BOM). '
+            'Removing the BOM and looking for "%s" ...' % real_url)
+        return self.url_result(real_url)
diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py
new file mode 100644 (file)
index 0000000..d98331a
--- /dev/null
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urlparse,
+)
+
+
+class RtmpIE(InfoExtractor):
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'(?i)rtmp[est]?://.+'
+
+    _TESTS = [{
+        'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4',
+        'only_matching': True,
+    }, {
+        'url': 'rtmp://edge.live.hitbox.tv/live/dimak',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._generic_id(url)
+        title = self._generic_title(url)
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': [{
+                'url': url,
+                'ext': 'flv',
+                'format_id': compat_urlparse.urlparse(url).scheme,
+            }],
+        }
+
+
+class MmsIE(InfoExtractor):
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'(?i)mms://.+'
+
+    _TEST = {
+        # Direct MMS link
+        'url': 'mms://kentro.kaist.ac.kr/200907/MilesReid(0709).wmv',
+        'info_dict': {
+            'id': 'MilesReid(0709)',
+            'ext': 'wmv',
+            'title': 'MilesReid(0709)',
+        },
+        'params': {
+            'skip_download': True,  # rtsp downloads, requiring mplayer or mpv
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._generic_id(url)
+        title = self._generic_title(url)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': url,
+        }
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
new file mode 100644 (file)
index 0000000..ed278fe
--- /dev/null
@@ -0,0 +1,232 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    orderedSet,
+    parse_iso8601,
+)
+
+
+class CondeNastIE(InfoExtractor):
+    """
+    Condé Nast is a media group; some of its sites use a custom HTML5
+    player that works the same way on all of them.
+    """
+
+    # The keys are the supported sites and the values are the names shown
+    # to the user and in the extractor description.
+    _SITES = {
+        'allure': 'Allure',
+        'architecturaldigest': 'Architectural Digest',
+        'arstechnica': 'Ars Technica',
+        'bonappetit': 'Bon Appétit',
+        'brides': 'Brides',
+        'cnevids': 'Condé Nast',
+        'cntraveler': 'Condé Nast Traveler',
+        'details': 'Details',
+        'epicurious': 'Epicurious',
+        'glamour': 'Glamour',
+        'golfdigest': 'Golf Digest',
+        'gq': 'GQ',
+        'newyorker': 'The New Yorker',
+        'self': 'SELF',
+        'teenvogue': 'Teen Vogue',
+        'vanityfair': 'Vanity Fair',
+        'vogue': 'Vogue',
+        'wired': 'WIRED',
+        'wmagazine': 'W Magazine',
+    }
+
+    _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/
+        (?:
+            (?:
+                embed(?:js)?|
+                (?:script|inline)/video
+            )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?|
+            (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
+        )''' % '|'.join(_SITES.keys())
+    IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
+
+    EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
+
+    _TESTS = [{
+        'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
+        'md5': '1921f713ed48aabd715691f774c451f7',
+        'info_dict': {
+            'id': '5171b343c2b4c00dd0c1ccb3',
+            'ext': 'mp4',
+            'title': '3D Printed Speakers Lit With LED',
+            'description': 'Check out these beautiful 3D printed LED speakers.  You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
+            'uploader': 'wired',
+            'upload_date': '20130314',
+            'timestamp': 1363219200,
+        }
+    }, {
+        'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series',
+        'info_dict': {
+            'id': '58d1865bfd2e6126e2000015',
+            'ext': 'mp4',
+            'title': 'The Only True Surprise? Trump’s an Idiot',
+            'uploader': 'gq',
+            'upload_date': '20170321',
+            'timestamp': 1490126427,
+        },
+    }, {
+        # JS embed
+        'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',
+        'md5': 'f1a6f9cafb7083bab74a710f65d08999',
+        'info_dict': {
+            'id': '55f9cf8b61646d1acf00000c',
+            'ext': 'mp4',
+            'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
+            'uploader': 'arstechnica',
+            'upload_date': '20150916',
+            'timestamp': 1442434955,
+        }
+    }, {
+        'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
+        'only_matching': True,
+    }, {
+        'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js',
+        'only_matching': True,
+    }]
+
+    def _extract_series(self, url, webpage):
+        title = self._html_search_regex(
+            r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>',
+            webpage, 'series title')
+        url_object = compat_urllib_parse_urlparse(url)
+        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
+        m_paths = re.finditer(
+            r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
+        paths = orderedSet(m.group(1) for m in m_paths)
+        build_url = lambda path: compat_urlparse.urljoin(base_url, path)
+        entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
+        return self.playlist_result(entries, playlist_title=title)
+
+    def _extract_video_params(self, webpage, display_id):
+        query = self._parse_json(
+            self._search_regex(
+                r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params',
+                default='{}'),
+            display_id, transform_source=js_to_json, fatal=False)
+        if query:
+            query['videoId'] = self._search_regex(
+                r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)',
+                webpage, 'video id', default=None)
+        else:
+            params = extract_attributes(self._search_regex(
+                r'(<[^>]+data-js="video-player"[^>]+>)',
+                webpage, 'player params element'))
+            query.update({
+                'videoId': params['data-video'],
+                'playerId': params['data-player'],
+                'target': params['id'],
+            })
+        return query
+
+    def _extract_video(self, params):
+        video_id = params['videoId']
+
+        video_info = None
+
+        # New API path
+        query = params.copy()
+        query['embedType'] = 'inline'
+        info_page = self._download_json(
+            'http://player.cnevids.com/embed-api.json', video_id,
+            'Downloading embed info', fatal=False, query=query)
+
+        # Old fallbacks
+        if not info_page:
+            if params.get('playerId'):
+                info_page = self._download_json(
+                    'http://player.cnevids.com/player/video.js', video_id,
+                    'Downloading video info', fatal=False, query=params)
+        if info_page:
+            video_info = info_page.get('video')
+        if not video_info and params.get('playerId'):
+            # loader.js requires a playerId; otherwise fall back to the
+            # inline variant so only one legacy endpoint is requested.
+            info_page = self._download_webpage(
+                'http://player.cnevids.com/player/loader.js',
+                video_id, 'Downloading loader info', query=params)
+        elif not video_info:
+            info_page = self._download_webpage(
+                'https://player.cnevids.com/inline/video/%s.js' % video_id,
+                video_id, 'Downloading inline info', query={
+                    'target': params.get('target', 'embedplayer')
+                })
+
+        if not video_info:
+            video_info = self._parse_json(
+                self._search_regex(
+                    r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
+                video_id, transform_source=js_to_json)['video']
+
+        title = video_info['title']
+
+        formats = []
+        for fdata in video_info['sources']:
+            src = fdata.get('src')
+            if not src:
+                continue
+            ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                continue
+            quality = fdata.get('quality')
+            formats.append({
+                'format_id': ext + ('-%s' % quality if quality else ''),
+                'url': src,
+                'ext': ext,
+                'quality': 1 if quality == 'high' else 0,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'thumbnail': video_info.get('poster_frame'),
+            'uploader': video_info.get('brand'),
+            'duration': int_or_none(video_info.get('duration')),
+            'tags': video_info.get('tags'),
+            'series': video_info.get('series_title'),
+            'season': video_info.get('season_title'),
+            'timestamp': parse_iso8601(video_info.get('premiere_date')),
+            'categories': video_info.get('categories'),
+        }
+
+    def _real_extract(self, url):
+        video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups()
+
+        if video_id:
+            return self._extract_video({
+                'videoId': video_id,
+                'playerId': player_id,
+                'target': target,
+            })
+
+        webpage = self._download_webpage(url, display_id)
+
+        if url_type == 'series':
+            return self._extract_series(url, webpage)
+        else:
+            params = self._extract_video_params(webpage, display_id)
+            info = self._search_json_ld(
+                webpage, display_id, fatal=False)
+            info.update(self._extract_video(params))
+            return info
diff --git a/youtube_dl/extractor/contv.py b/youtube_dl/extractor/contv.py
new file mode 100644 (file)
index 0000000..84b462d
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+)
+
+
+class CONtvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?contv\.com/details-movie/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://www.contv.com/details-movie/CEG10022949/days-of-thrills-&-laughter',
+        'info_dict': {
+            'id': 'CEG10022949',
+            'ext': 'mp4',
+            'title': 'Days Of Thrills & Laughter',
+            'description': 'md5:5d6b3d0b1829bb93eb72898c734802eb',
+            'upload_date': '20180703',
+            'timestamp': 1530634789.61,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.contv.com/details-movie/CLIP-show_fotld_bts/fight-of-the-living-dead:-behind-the-scenes-bites',
+        'info_dict': {
+            'id': 'CLIP-show_fotld_bts',
+            'title': 'Fight of the Living Dead: Behind the Scenes Bites',
+        },
+        'playlist_mincount': 7,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        details = self._download_json(
+            'http://metax.contv.live.junctiontv.net/metax/2.5/details/' + video_id,
+            video_id, query={'device': 'web'})
+
+        if details.get('type') == 'episodic':
+            seasons = self._download_json(
+                'http://metax.contv.live.junctiontv.net/metax/2.5/seriesfeed/json/' + video_id,
+                video_id)
+            entries = []
+            for season in seasons:
+                for episode in season.get('episodes', []):
+                    episode_id = episode.get('id')
+                    if not episode_id:
+                        continue
+                    entries.append(self.url_result(
+                        'https://www.contv.com/details-movie/' + episode_id,
+                        CONtvIE.ie_key(), episode_id))
+            return self.playlist_result(entries, video_id, details.get('title'))
+
+        m_details = details['details']
+        title = details['title']
+
+        formats = []
+
+        media_hls_url = m_details.get('media_hls_url')
+        if media_hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                media_hls_url, video_id, 'mp4',
+                m3u8_id='hls', fatal=False))
+
+        media_mp4_url = m_details.get('media_mp4_url')
+        if media_mp4_url:
+            formats.append({
+                'format_id': 'http',
+                'url': media_mp4_url,
+            })
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        captions = m_details.get('captions') or {}
+        for caption_url in captions.values():
+            subtitles.setdefault('en', []).append({
+                'url': caption_url
+            })
+
+        thumbnails = []
+        for image in m_details.get('images', []):
+            image_url = image.get('url')
+            if not image_url:
+                continue
+            thumbnails.append({
+                'url': image_url,
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
+            })
+
+        description = None
+        for p in ('large_', 'medium_', 'small_', ''):
+            d = m_details.get(p + 'description')
+            if d:
+                description = d
+                break
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'description': description,
+            'timestamp': float_or_none(details.get('metax_added_on'), 1000),
+            'subtitles': subtitles,
+            'duration': float_or_none(m_details.get('duration'), 1000),
+            'view_count': int_or_none(details.get('num_watched')),
+            'like_count': int_or_none(details.get('num_fav')),
+            'categories': details.get('category'),
+            'tags': details.get('tags'),
+            'season_number': int_or_none(details.get('season')),
+            'episode_number': int_or_none(details.get('episode')),
+            'release_year': int_or_none(details.get('pub_year')),
+        }
diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py
new file mode 100644 (file)
index 0000000..e11aadf
--- /dev/null
@@ -0,0 +1,160 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .theplatform import ThePlatformFeedIE
+from ..utils import (
+    dict_get,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+)
+
+
+class CorusIE(ThePlatformFeedIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?P<domain>
+                            (?:
+                                globaltv|
+                                etcanada|
+                                seriesplus|
+                                wnetwork|
+                                ytv
+                            )\.com|
+                            (?:
+                                hgtv|
+                                foodnetwork|
+                                slice|
+                                history|
+                                showcase|
+                                bigbrothercanada|
+                                abcspark|
+                                disney(?:channel|lachaine)
+                            )\.ca
+                        )
+                        /(?:[^/]+/)*
+                        (?:
+                            video\.html\?.*?\bv=|
+                            videos?/(?:[^/]+/)*(?:[a-z0-9-]+-)?
+                        )
+                        (?P<id>
+                            [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}|
+                            (?:[A-Z]{4})?\d{12,20}
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/',
+        'info_dict': {
+            'id': '870923331648',
+            'ext': 'mp4',
+            'title': 'Movie Night Popcorn with Bryan',
+            'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.',
+            'upload_date': '20170206',
+            'timestamp': 1486392197,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to parse JSON'],
+    }, {
+        'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753',
+        'only_matching': True,
+    }, {
+        'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.bigbrothercanada.ca/video/1457812035894/',
+        'only_matching': True
+    }, {
+        'url': 'https://www.bigbrothercanada.ca/video/big-brother-canada-704/1457812035894/',
+        'only_matching': True
+    }, {
+        'url': 'https://www.seriesplus.com/emissions/dre-mary-mort-sur-ordonnance/videos/deux-coeurs-battant/SERP0055626330000200/',
+        'only_matching': True
+    }, {
+        'url': 'https://www.disneychannel.ca/shows/gabby-duran-the-unsittables/video/crybaby-duran-clip/2f557eec-0588-11ea-ae2b-e2c6776b770e/',
+        'only_matching': True
+    }]
+    _GEO_BYPASS = False
+    _SITE_MAP = {
+        'globaltv': 'series',
+        'etcanada': 'series',
+        'foodnetwork': 'food',
+        'bigbrothercanada': 'series',
+        'disneychannel': 'disneyen',
+        'disneylachaine': 'disneyfr',
+    }
+
+    def _real_extract(self, url):
+        domain, video_id = re.match(self._VALID_URL, url).groups()
+        site = domain.split('.')[0]
+        path = self._SITE_MAP.get(site, site)
+        if path != 'series':
+            path = 'migration/' + path
+        video = self._download_json(
+            'https://globalcontent.corusappservices.com/templates/%s/playlist/' % path,
+            video_id, query={'byId': video_id},
+            headers={'Accept': 'application/json'})[0]
+        title = video['title']
+
+        formats = []
+        for source in video.get('sources', []):
+            smil_url = source.get('file')
+            if not smil_url:
+                continue
+            source_type = source.get('type')
+            note = 'Downloading%s smil file' % (' ' + source_type if source_type else '')
+            resp = self._download_webpage(
+                smil_url, video_id, note, fatal=False,
+                headers=self.geo_verification_headers())
+            if not resp:
+                continue
+            error = self._parse_json(resp, video_id, fatal=False)
+            if error:
+                if error.get('exception') == 'GeoLocationBlocked':
+                    self.raise_geo_restricted(countries=['CA'])
+                raise ExtractorError(error['description'])
+            smil = self._parse_xml(resp, video_id, fatal=False)
+            if smil is None:
+                continue
+            namespace = self._parse_smil_namespace(smil)
+            formats.extend(self._parse_smil_formats(
+                smil, smil_url, video_id, namespace))
+        if not formats and video.get('drm'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for track in video.get('tracks', []):
+            track_url = track.get('file')
+            if not track_url:
+                continue
+            lang = 'fr' if site in ('disneylachaine', 'seriesplus') else 'en'
+            subtitles.setdefault(lang, []).append({'url': track_url})
+
+        metadata = video.get('metadata') or {}
+        get_number = lambda x: int_or_none(video.get('pl1$' + x) or metadata.get(x + 'Number'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': dict_get(video, ('defaultThumbnailUrl', 'thumbnail', 'image')),
+            'description': video.get('description'),
+            'timestamp': int_or_none(video.get('availableDate'), 1000),
+            'subtitles': subtitles,
+            'duration': float_or_none(metadata.get('duration')),
+            'series': dict_get(video, ('show', 'pl1$show')),
+            'season_number': get_number('season'),
+            'episode_number': get_number('episode'),
+        }
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
new file mode 100644 (file)
index 0000000..6ea03e6
--- /dev/null
@@ -0,0 +1,140 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    qualities,
+)
+
+
+class CoubIE(InfoExtractor):
+    _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)'
+
+    _TESTS = [{
+        'url': 'http://coub.com/view/5u5n1',
+        'info_dict': {
+            'id': '5u5n1',
+            'ext': 'mp4',
+            'title': 'The Matrix Moonwalk',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 4.6,
+            'timestamp': 1428527772,
+            'upload_date': '20150408',
+            'uploader': 'Artyom Loskutnikov',
+            'uploader_id': 'artyom.loskutnikov',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4',
+        'only_matching': True,
+    }, {
+        'url': 'coub:5u5n1',
+        'only_matching': True,
+    }, {
+        # longer video id
+        'url': 'http://coub.com/view/237d5l5h',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        coub = self._download_json(
+            'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id)
+
+        if coub.get('error'):
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, coub['error']), expected=True)
+
+        title = coub['title']
+
+        file_versions = coub['file_versions']
+
+        QUALITIES = ('low', 'med', 'high')
+
+        MOBILE = 'mobile'
+        IPHONE = 'iphone'
+        HTML5 = 'html5'
+
+        SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5)
+
+        quality_key = qualities(QUALITIES)
+        preference_key = qualities(SOURCE_PREFERENCE)
+
+        formats = []
+
+        for kind, items in file_versions.get(HTML5, {}).items():
+            if kind not in ('video', 'audio'):
+                continue
+            if not isinstance(items, dict):
+                continue
+            for quality, item in items.items():
+                if not isinstance(item, dict):
+                    continue
+                item_url = item.get('url')
+                if not item_url:
+                    continue
+                formats.append({
+                    'url': item_url,
+                    'format_id': '%s-%s-%s' % (HTML5, kind, quality),
+                    'filesize': int_or_none(item.get('size')),
+                    'vcodec': 'none' if kind == 'audio' else None,
+                    'quality': quality_key(quality),
+                    'preference': preference_key(HTML5),
+                })
+
+        iphone_url = file_versions.get(IPHONE, {}).get('url')
+        if iphone_url:
+            formats.append({
+                'url': iphone_url,
+                'format_id': IPHONE,
+                'preference': preference_key(IPHONE),
+            })
+
+        mobile_url = file_versions.get(MOBILE, {}).get('audio_url')
+        if mobile_url:
+            formats.append({
+                'url': mobile_url,
+                'format_id': '%s-audio' % MOBILE,
+                'preference': preference_key(MOBILE),
+            })
+
+        self._sort_formats(formats)
+
+        thumbnail = coub.get('picture')
+        duration = float_or_none(coub.get('duration'))
+        timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at'))
+        uploader = coub.get('channel', {}).get('title')
+        uploader_id = coub.get('channel', {}).get('permalink')
+
+        view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
+        like_count = int_or_none(coub.get('likes_count'))
+        repost_count = int_or_none(coub.get('recoubs_count'))
+
+        age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
+        if age_restricted is not None:
+            age_limit = 18 if age_restricted is True else 0
+        else:
+            age_limit = None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'repost_count': repost_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py
new file mode 100644 (file)
index 0000000..f77a68e
--- /dev/null
@@ -0,0 +1,90 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+    parse_iso8601,
+    str_to_int,
+)
+
+
+class CrackedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
+    _TESTS = [{
+        'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
+        'md5': '89b90b9824e3806ca95072c4d78f13f7',
+        'info_dict': {
+            'id': '19070',
+            'ext': 'mp4',
+            'title': 'If Animal Actors Got E! True Hollywood Stories',
+            'timestamp': 1404954000,
+            'upload_date': '20140710',
+        }
+    }, {
+        # youtube embed
+        'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
+        'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
+        'info_dict': {
+            'id': 'EjI00A3rZD0',
+            'ext': 'mp4',
+            'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
+            'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
+            'upload_date': '20140725',
+            'uploader_id': 'Cracked',
+            'uploader': 'Cracked',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        youtube_url = YoutubeIE._extract_url(webpage)
+        if youtube_url:
+            return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
+
+        video_url = self._html_search_regex(
+            [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
+            webpage, 'video URL')
+
+        title = self._search_regex(
+            [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
+            webpage, 'title')
+
+        description = self._search_regex(
+            r'name="?(?:og:)?description"?\s+content="([^"]+)"',
+            webpage, 'description', default=None)
+
+        timestamp = self._html_search_regex(
+            r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)
+        if timestamp:
+            timestamp = parse_iso8601(timestamp[:-6])
+
+        view_count = str_to_int(self._html_search_regex(
+            r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
+            webpage, 'view count', fatal=False))
+        comment_count = str_to_int(self._html_search_regex(
+            r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
+            webpage, 'comment count', fatal=False))
+
+        m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url)
+        if m:
+            width = int(m.group('width'))
+            height = int(m.group('height'))
+        else:
+            width = height = None
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'height': height,
+            'width': width,
+        }
diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py
new file mode 100644 (file)
index 0000000..49bf3a4
--- /dev/null
@@ -0,0 +1,200 @@
+# coding: utf-8
+from __future__ import unicode_literals, division
+
+import hashlib
+import hmac
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    parse_age_limit,
+    parse_duration,
+    url_or_none,
+    ExtractorError
+)
+
+
+class CrackleIE(InfoExtractor):
+    _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
+    _TESTS = [{
+        # geo restricted to CA
+        'url': 'https://www.crackle.com/andromeda/2502343',
+        'info_dict': {
+            'id': '2502343',
+            'ext': 'mp4',
+            'title': 'Under The Night',
+            'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a',
+            'duration': 2583,
+            'view_count': int,
+            'average_rating': 0,
+            'age_limit': 14,
+            'genre': 'Action, Sci-Fi',
+            'creator': 'Allan Kroeker',
+            'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe',
+            'release_year': 2000,
+            'series': 'Andromeda',
+            'episode': 'Under The Night',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.sonycrackle.com/andromeda/2502343',
+        'only_matching': True,
+    }]
+
+    _MEDIA_FILE_SLOTS = {
+        '360p.mp4': {
+            'width': 640,
+            'height': 360,
+        },
+        '480p.mp4': {
+            'width': 768,
+            'height': 432,
+        },
+        '480p_1mbps.mp4': {
+            'width': 852,
+            'height': 480,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        country_code = self._downloader.params.get('geo_bypass_country', None)
+        countries = [country_code] if country_code else (
+            'US', 'AU', 'CA', 'AS', 'FM', 'GU', 'MP', 'PR', 'PW', 'MH', 'VI')
+
+        last_e = None
+
+        for country in countries:
+            try:
+                # Authorization generation algorithm is reverse engineered from:
+                # https://www.sonycrackle.com/static/js/main.ea93451f.chunk.js
+                media_detail_url = 'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s?disableProtocols=true' % (video_id, country)
+                timestamp = time.strftime('%Y%m%d%H%M', time.gmtime())
+                h = hmac.new(b'IGSLUQCBDFHEOIFM', '|'.join([media_detail_url, timestamp]).encode(), hashlib.sha1).hexdigest().upper()
+                media = self._download_json(
+                    media_detail_url, video_id, 'Downloading media JSON as %s' % country,
+                    'Unable to download media JSON', headers={
+                        'Accept': 'application/json',
+                        'Authorization': '|'.join([h, timestamp, '117', '1']),
+                    })
+            except ExtractorError as e:
+                # 401 means geo restriction, trying next country
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    last_e = e
+                    continue
+                raise
+
+            media_urls = media.get('MediaURLs')
+            if not media_urls or not isinstance(media_urls, list):
+                continue
+
+            title = media['Title']
+
+            formats = []
+            for e in media['MediaURLs']:
+                if e.get('UseDRM') is True:
+                    continue
+                format_url = url_or_none(e.get('Path'))
+                if not format_url:
+                    continue
+                ext = determine_ext(format_url)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        format_url, video_id, mpd_id='dash', fatal=False))
+                elif format_url.endswith('.ism/Manifest'):
+                    formats.extend(self._extract_ism_formats(
+                        format_url, video_id, ism_id='mss', fatal=False))
+                else:
+                    mfs_path = e.get('Type')
+                    mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path)
+                    if not mfs_info:
+                        continue
+                    formats.append({
+                        'url': format_url,
+                        'format_id': 'http-' + mfs_path.split('.')[0],
+                        'width': mfs_info['width'],
+                        'height': mfs_info['height'],
+                    })
+            self._sort_formats(formats)
+
+            description = media.get('Description')
+            duration = int_or_none(media.get(
+                'DurationInSeconds')) or parse_duration(media.get('Duration'))
+            view_count = int_or_none(media.get('CountViews'))
+            average_rating = float_or_none(media.get('UserRating'))
+            age_limit = parse_age_limit(media.get('Rating'))
+            genre = media.get('Genre')
+            release_year = int_or_none(media.get('ReleaseYear'))
+            creator = media.get('Directors')
+            artist = media.get('Cast')
+
+            if media.get('MediaTypeDisplayValue') == 'Full Episode':
+                series = media.get('ShowName')
+                episode = title
+                season_number = int_or_none(media.get('Season'))
+                episode_number = int_or_none(media.get('Episode'))
+            else:
+                series = episode = season_number = episode_number = None
+
+            subtitles = {}
+            cc_files = media.get('ClosedCaptionFiles')
+            if isinstance(cc_files, list):
+                for cc_file in cc_files:
+                    if not isinstance(cc_file, dict):
+                        continue
+                    cc_url = url_or_none(cc_file.get('Path'))
+                    if not cc_url:
+                        continue
+                    lang = cc_file.get('Locale') or 'en'
+                    subtitles.setdefault(lang, []).append({'url': cc_url})
+
+            thumbnails = []
+            images = media.get('Images')
+            if isinstance(images, dict):
+                for image_key, image_url in images.items():
+                    mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key)
+                    if not mobj:
+                        continue
+                    thumbnails.append({
+                        'url': image_url,
+                        'width': int(mobj.group(1)),
+                        'height': int(mobj.group(2)),
+                    })
+
+            return {
+                'id': video_id,
+                'title': title,
+                'description': description,
+                'duration': duration,
+                'view_count': view_count,
+                'average_rating': average_rating,
+                'age_limit': age_limit,
+                'genre': genre,
+                'creator': creator,
+                'artist': artist,
+                'release_year': release_year,
+                'series': series,
+                'episode': episode,
+                'season_number': season_number,
+                'episode_number': episode_number,
+                'thumbnails': thumbnails,
+                'subtitles': subtitles,
+                'formats': formats,
+            }
+
+        raise last_e
diff --git a/youtube_dl/extractor/crooksandliars.py b/youtube_dl/extractor/crooksandliars.py
new file mode 100644 (file)
index 0000000..7fb782d
--- /dev/null
@@ -0,0 +1,60 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    qualities,
+)
+
+
+class CrooksAndLiarsIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)'
+    _TESTS = [{
+        'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi',
+        'info_dict': {
+            'id': '8RUoRhRi',
+            'ext': 'mp4',
+            'title': 'Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!',
+            'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1428207000,
+            'upload_date': '20150405',
+            'uploader': 'Heather',
+            'duration': 236,
+        }
+    }, {
+        'url': 'http://embed.crooksandliars.com/v/MTE3MjUtMzQ2MzA',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://embed.crooksandliars.com/embed/%s' % video_id, video_id)
+
+        manifest = self._parse_json(
+            self._search_regex(
+                r'var\s+manifest\s*=\s*({.+?})\n', webpage, 'manifest JSON'),
+            video_id)
+
+        quality = qualities(('webm_low', 'mp4_low', 'webm_high', 'mp4_high'))
+
+        formats = [{
+            'url': item['url'],
+            'format_id': item['type'],
+            'quality': quality(item['type']),
+        } for item in manifest['flavors'] if item['mime'].startswith('video/')]
+        self._sort_formats(formats)
+
+        return {
+            'url': url,
+            'id': video_id,
+            'title': manifest['title'],
+            'description': manifest.get('description'),
+            'thumbnail': self._proto_relative_url(manifest.get('poster')),
+            'timestamp': int_or_none(manifest.get('created')),
+            'uploader': manifest.get('author'),
+            'duration': int_or_none(manifest.get('duration')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
new file mode 100644 (file)
index 0000000..bc2d1fa
--- /dev/null
@@ -0,0 +1,686 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+import zlib
+
+from hashlib import sha1
+from math import pow, sqrt, floor
+from .common import InfoExtractor
+from .vrv import VRVIE
+from ..compat import (
+    compat_b64decode,
+    compat_etree_Element,
+    compat_etree_fromstring,
+    compat_str,
+    compat_urllib_parse_urlencode,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    bytes_to_intlist,
+    extract_attributes,
+    float_or_none,
+    intlist_to_bytes,
+    int_or_none,
+    lowercase_escape,
+    merge_dicts,
+    remove_end,
+    sanitized_Request,
+    urlencode_postdata,
+    xpath_text,
+)
+from ..aes import (
+    aes_cbc_decrypt,
+)
+
+
+class CrunchyrollBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.crunchyroll.com/login'
+    _LOGIN_FORM = 'login_form'
+    _NETRC_MACHINE = 'crunchyroll'
+
+    def _call_rpc_api(self, method, video_id, note=None, data=None):
+        data = data or {}
+        data['req'] = 'RpcApi' + method
+        data = compat_urllib_parse_urlencode(data).encode('utf-8')
+        return self._download_xml(
+            'https://www.crunchyroll.com/xml/',
+            video_id, note, fatal=False, data=data, headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        def is_logged(webpage):
+            return 'href="/logout"' in webpage
+
+        # Already logged in
+        if is_logged(login_page):
+            return
+
+        login_form_str = self._search_regex(
+            r'(?P<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM,
+            login_page, 'login form', group='form')
+
+        post_url = extract_attributes(login_form_str).get('action')
+        if not post_url:
+            post_url = self._LOGIN_URL
+        elif not post_url.startswith('http'):
+            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+        login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page)
+
+        login_form.update({
+            'login_form[name]': username,
+            'login_form[password]': password,
+        })
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in', 'Wrong login info',
+            data=urlencode_postdata(login_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        # Successful login
+        if is_logged(response):
+            return
+
+        error = self._html_search_regex(
+            '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>',
+            response, 'error message', default=None)
+        if error:
+            raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+        raise ExtractorError('Unable to log in')
+
+    def _real_initialize(self):
+        self._login()
+
+    @staticmethod
+    def _add_skip_wall(url):
+        parsed_url = compat_urlparse.urlparse(url)
+        qs = compat_urlparse.parse_qs(parsed_url.query)
+        # Always force skip_wall to bypass the maturity wall, namely the 18+
+        # confirmation message:
+        # > This content may be inappropriate for some people.
+        # > Are you sure you want to continue?
+        # since it is not disabled by default in Crunchyroll account settings.
+        # See https://github.com/ytdl-org/youtube-dl/issues/7202.
+        qs['skip_wall'] = ['1']
+        return compat_urlparse.urlunparse(
+            parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+
+class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
+    IE_NAME = 'crunchyroll'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _TESTS = [{
+        'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
+        'info_dict': {
+            'id': '645513',
+            'ext': 'mp4',
+            'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
+            'description': 'md5:2d17137920c64f2f49981a7797d275ef',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Yomiuri Telecasting Corporation (YTV)',
+            'upload_date': '20131013',
+            'url': 're:(?!.*&amp)',
+        },
+        'params': {
+            # rtmp
+            'skip_download': True,
+        },
+        'skip': 'Video gone',
+    }, {
+        'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
+        'info_dict': {
+            'id': '589804',
+            'ext': 'flv',
+            'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
+            'description': 'md5:2fbc01f90b87e8e9137296f37b461c12',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Danny Choo Network',
+            'upload_date': '20120213',
+        },
+        'params': {
+            # rtmp
+            'skip_download': True,
+        },
+        'skip': 'Video gone',
+    }, {
+        'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409',
+        'info_dict': {
+            'id': '702409',
+            'ext': 'mp4',
+            'title': compat_str,
+            'description': compat_str,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Re:Zero Partners',
+            'timestamp': 1462098900,
+            'upload_date': '20160501',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589',
+        'info_dict': {
+            'id': '727589',
+            'ext': 'mp4',
+            'title': compat_str,
+            'description': compat_str,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Kadokawa Pictures Inc.',
+            'timestamp': 1484130900,
+            'upload_date': '20170111',
+            'series': compat_str,
+            'season': "KONOSUBA -God's blessing on this wonderful world! 2",
+            'season_number': 2,
+            'episode': 'Give Me Deliverance From This Judicial Injustice!',
+            'episode_number': 1,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
+        'only_matching': True,
+    }, {
+        # geo-restricted (US), 18+ maturity wall, non-premium available
+        'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
+        'only_matching': True,
+    }, {
+        # A description with double quotes
+        'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080',
+        'info_dict': {
+            'id': '535080',
+            'ext': 'mp4',
+            'title': compat_str,
+            'description': compat_str,
+            'uploader': 'Marvelous AQL Inc.',
+            'timestamp': 1255512600,
+            'upload_date': '20091014',
+        },
+        'params': {
+            # Just test metadata extraction
+            'skip_download': True,
+        },
+    }, {
+        # make sure we can extract an uploader name that's not a link
+        'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899',
+        'info_dict': {
+            'id': '606899',
+            'ext': 'mp4',
+            'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors',
+            'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"',
+            'uploader': 'Geneon Entertainment',
+            'upload_date': '20120717',
+        },
+        'params': {
+            # just test metadata extraction
+            'skip_download': True,
+        },
+        'skip': 'Video gone',
+    }, {
+        # A video with a vastly different season name compared to the series name
+        'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532',
+        'info_dict': {
+            'id': '590532',
+            'ext': 'mp4',
+            'title': compat_str,
+            'description': compat_str,
+            'uploader': 'TV TOKYO',
+            'timestamp': 1330956000,
+            'upload_date': '20120305',
+            'series': 'Nyarko-san: Another Crawling Chaos',
+            'season': 'Haiyoru! Nyaruani (ONA)',
+        },
+        'params': {
+            # Just test metadata extraction
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-723735',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921',
+        'only_matching': True,
+    }]
+
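+    # Maps the requested height to the (video_quality, video_format) id pair
+    # expected by the RpcApi calls below.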
+    _FORMAT_IDS = {
+        '360': ('60', '106'),
+        '480': ('61', '106'),
+        '720': ('62', '106'),
+        '1080': ('80', '108'),
+    }
+
+    def _download_webpage(self, url_or_request, *args, **kwargs):
+        request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
+                   else sanitized_Request(url_or_request))
+        # Accept-Language must explicitly accept any language to avoid issues
+        # like https://github.com/ytdl-org/youtube-dl/issues/6797.
+        # Along with the IP address, Crunchyroll uses Accept-Language to decide
+        # whether georestriction should be imposed; it appears to take only the
+        # first language, ignoring priorities, and to require it to match the IP.
+        # This also breaks Crunchyroll in georestricted regions for browsers that
+        # don't place the locale language first in the header; accepting any
+        # language works around the issue.
+        request.add_header('Accept-Language', '*')
+        return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs)
+
+    def _decrypt_subtitles(self, data, iv, id):
+        data = bytes_to_intlist(compat_b64decode(data))
+        iv = bytes_to_intlist(compat_b64decode(iv))
+        id = int(id)
+
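+        # Key derivation, apparently mirroring the site player's obfuscation:
+        # a Fibonacci-style sequence (mod 97, offset by 33) builds a byte
+        # prefix, e.g. obfuscate_key_aux(3, 97, (1, 2)) -> [36, 38, 41]; the
+        # media id is mixed with XORs and shifts, and the SHA-1 digest of
+        # prefix + mixed id, zero-padded from 160 to 256 bits, becomes the
+        # AES-CBC key.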
+        def obfuscate_key_aux(count, modulo, start):
+            output = list(start)
+            for _ in range(count):
+                output.append(output[-1] + output[-2])
+            # cut off start values
+            output = output[2:]
+            output = list(map(lambda x: x % modulo + 33, output))
+            return output
+
+        def obfuscate_key(key):
+            num1 = int(floor(pow(2, 25) * sqrt(6.9)))
+            num2 = (num1 ^ key) << 5
+            num3 = key ^ num1
+            num4 = num3 ^ (num3 >> 3) ^ num2
+            prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
+            shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
+            # Extend the 160-bit hash to 256 bits
+            return shaHash + [0] * 12
+
+        key = obfuscate_key(id)
+
+        decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
+        return zlib.decompress(decrypted_data)
+
+    def _convert_subtitles_to_srt(self, sub_root):
+        output = ''
+
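+        # The decrypted XML uses '.' as the decimal separator in timestamps
+        # and '\N' for line breaks; SRT expects ',' and literal newlines.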
+        for i, event in enumerate(sub_root.findall('./events/event'), 1):
+            start = event.attrib['start'].replace('.', ',')
+            end = event.attrib['end'].replace('.', ',')
+            text = event.attrib['text'].replace('\\N', '\n')
+            output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
+        return output
+
+    def _convert_subtitles_to_ass(self, sub_root):
+
+        def ass_bool(strvalue):
+            assvalue = '0'
+            if strvalue == '1':
+                assvalue = '-1'
+            return assvalue
+
+        output = '[Script Info]\n'
+        output += 'Title: %s\n' % sub_root.attrib['title']
+        output += 'ScriptType: v4.00+\n'
+        output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style']
+        output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x']
+        output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y']
+        output += """
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+"""
+        for style in sub_root.findall('./styles/style'):
+            output += 'Style: ' + style.attrib['name']
+            output += ',' + style.attrib['font_name']
+            output += ',' + style.attrib['font_size']
+            output += ',' + style.attrib['primary_colour']
+            output += ',' + style.attrib['secondary_colour']
+            output += ',' + style.attrib['outline_colour']
+            output += ',' + style.attrib['back_colour']
+            output += ',' + ass_bool(style.attrib['bold'])
+            output += ',' + ass_bool(style.attrib['italic'])
+            output += ',' + ass_bool(style.attrib['underline'])
+            output += ',' + ass_bool(style.attrib['strikeout'])
+            output += ',' + style.attrib['scale_x']
+            output += ',' + style.attrib['scale_y']
+            output += ',' + style.attrib['spacing']
+            output += ',' + style.attrib['angle']
+            output += ',' + style.attrib['border_style']
+            output += ',' + style.attrib['outline']
+            output += ',' + style.attrib['shadow']
+            output += ',' + style.attrib['alignment']
+            output += ',' + style.attrib['margin_l']
+            output += ',' + style.attrib['margin_r']
+            output += ',' + style.attrib['margin_v']
+            output += ',' + style.attrib['encoding']
+            output += '\n'
+
+        output += """
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+"""
+        for event in sub_root.findall('./events/event'):
+            output += 'Dialogue: 0'
+            output += ',' + event.attrib['start']
+            output += ',' + event.attrib['end']
+            output += ',' + event.attrib['style']
+            output += ',' + event.attrib['name']
+            output += ',' + event.attrib['margin_l']
+            output += ',' + event.attrib['margin_r']
+            output += ',' + event.attrib['margin_v']
+            output += ',' + event.attrib['effect']
+            output += ',' + event.attrib['text']
+            output += '\n'
+
+        return output
+
+    def _extract_subtitles(self, subtitle):
+        sub_root = compat_etree_fromstring(subtitle)
+        return [{
+            'ext': 'srt',
+            'data': self._convert_subtitles_to_srt(sub_root),
+        }, {
+            'ext': 'ass',
+            'data': self._convert_subtitles_to_ass(sub_root),
+        }]
+
+    def _get_subtitles(self, video_id, webpage):
+        subtitles = {}
+        for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage):
+            sub_doc = self._call_rpc_api(
+                'Subtitle_GetXml', video_id,
+                'Downloading subtitles for ' + sub_name, data={
+                    'subtitle_script_id': sub_id,
+                })
+            if not isinstance(sub_doc, compat_etree_Element):
+                continue
+            sid = sub_doc.get('id')
+            iv = xpath_text(sub_doc, 'iv', 'subtitle iv')
+            data = xpath_text(sub_doc, 'data', 'subtitle data')
+            if not sid or not iv or not data:
+                continue
+            subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8')
+            lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
+            if not lang_code:
+                continue
+            subtitles[lang_code] = self._extract_subtitles(subtitle)
+        return subtitles
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+
+        if mobj.group('prefix') == 'm':
+            mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
+            webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
+        else:
+            webpage_url = 'http://www.' + mobj.group('url')
+
+        webpage = self._download_webpage(
+            self._add_skip_wall(webpage_url), video_id,
+            headers=self.geo_verification_headers())
+        note_m = self._html_search_regex(
+            r'<div class="showmedia-trailer-notice">(.+?)</div>',
+            webpage, 'trailer-notice', default='')
+        if note_m:
+            raise ExtractorError(note_m)
+
+        mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
+        if mobj:
+            msg = json.loads(mobj.group('msg'))
+            if msg.get('type') == 'error':
+                raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
+
+        if 'To view this, please log in to verify you are 18 or older.' in webpage:
+            self.raise_login_required()
+
+        media = self._parse_json(self._search_regex(
+            r'vilos\.config\.media\s*=\s*({.+?});',
+            webpage, 'vilos media', default='{}'), video_id)
+        media_metadata = media.get('metadata') or {}
+
+        language = self._search_regex(
+            r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1',
+            webpage, 'language', default=None, group='lang')
+
+        video_title = self._html_search_regex(
+            (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>',
+             r'<title>(.+?),\s+-\s+.+? Crunchyroll'),
+            webpage, 'video_title', default=None)
+        if not video_title:
+            video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage))
+        video_title = re.sub(r' {2,}', ' ', video_title)
+        video_description = (self._parse_json(self._html_search_regex(
+            r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
+            webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
+        if video_description:
+            video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
+        video_uploader = self._html_search_regex(
+            # try looking for both an uploader that's a link and one that's not
+            [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
+            webpage, 'video_uploader', default=False)
+
+        formats = []
+        for stream in media.get('streams', []):
+            audio_lang = stream.get('audio_lang')
+            hardsub_lang = stream.get('hardsub_lang')
+            vrv_formats = self._extract_vrv_formats(
+                stream.get('url'), video_id, stream.get('format'),
+                audio_lang, hardsub_lang)
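+            # Prefer raw (non-hardsubbed) streams, and boost streams whose
+            # audio or hardsub language matches the page locale.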
+            for f in vrv_formats:
+                if not hardsub_lang:
+                    f['preference'] = 1
+                language_preference = 0
+                if audio_lang == language:
+                    language_preference += 1
+                if hardsub_lang == language:
+                    language_preference += 1
+                if language_preference:
+                    f['language_preference'] = language_preference
+            formats.extend(vrv_formats)
+        if not formats:
+            available_fmts = []
+            for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+                attrs = extract_attributes(a)
+                href = attrs.get('href')
+                if href and '/freetrial' in href:
+                    continue
+                available_fmts.append(fmt)
+            if not available_fmts:
+                for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+                    available_fmts = re.findall(p, webpage)
+                    if available_fmts:
+                        break
+            if not available_fmts:
+                available_fmts = self._FORMAT_IDS.keys()
+            video_encode_ids = []
+
+            for fmt in available_fmts:
+                stream_quality, stream_format = self._FORMAT_IDS[fmt]
+                video_format = fmt + 'p'
+                stream_infos = []
+                streamdata = self._call_rpc_api(
+                    'VideoPlayer_GetStandardConfig', video_id,
+                    'Downloading media info for %s' % video_format, data={
+                        'media_id': video_id,
+                        'video_format': stream_format,
+                        'video_quality': stream_quality,
+                        'current_page': url,
+                    })
+                if isinstance(streamdata, compat_etree_Element):
+                    stream_info = streamdata.find('./{default}preload/stream_info')
+                    if stream_info is not None:
+                        stream_infos.append(stream_info)
+                stream_info = self._call_rpc_api(
+                    'VideoEncode_GetStreamInfo', video_id,
+                    'Downloading stream info for %s' % video_format, data={
+                        'media_id': video_id,
+                        'video_format': stream_format,
+                        'video_encode_quality': stream_quality,
+                    })
+                if isinstance(stream_info, compat_etree_Element):
+                    stream_infos.append(stream_info)
+                for stream_info in stream_infos:
+                    video_encode_id = xpath_text(stream_info, './video_encode_id')
+                    if video_encode_id in video_encode_ids:
+                        continue
+                    video_encode_ids.append(video_encode_id)
+
+                    video_file = xpath_text(stream_info, './file')
+                    if not video_file:
+                        continue
+                    if video_file.startswith('http'):
+                        formats.extend(self._extract_m3u8_formats(
+                            video_file, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id='hls', fatal=False))
+                        continue
+
+                    video_url = xpath_text(stream_info, './host')
+                    if not video_url:
+                        continue
+                    metadata = stream_info.find('./metadata')
+                    format_info = {
+                        'format': video_format,
+                        'height': int_or_none(xpath_text(metadata, './height')),
+                        'width': int_or_none(xpath_text(metadata, './width')),
+                    }
+
+                    if '.fplive.net/' in video_url:
+                        video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+                        parsed_video_url = compat_urlparse.urlparse(video_url)
+                        direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+                            netloc='v.lvlt.crcdn.net',
+                            path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
+                        if self._is_valid_url(direct_video_url, video_id, video_format):
+                            format_info.update({
+                                'format_id': 'http-' + video_format,
+                                'url': direct_video_url,
+                            })
+                            formats.append(format_info)
+                            continue
+
+                    format_info.update({
+                        'format_id': 'rtmp-' + video_format,
+                        'url': video_url,
+                        'play_path': video_file,
+                        'ext': 'flv',
+                    })
+                    formats.append(format_info)
+        self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps'))
+
+        metadata = self._call_rpc_api(
+            'VideoPlayer_GetMediaMetadata', video_id,
+            note='Downloading media info', data={
+                'media_id': video_id,
+            })
+
+        subtitles = {}
+        for subtitle in media.get('subtitles', []):
+            subtitle_url = subtitle.get('url')
+            if not subtitle_url:
+                continue
+            subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
+                'url': subtitle_url,
+                'ext': subtitle.get('format', 'ass'),
+            })
+        if not subtitles:
+            subtitles = self.extract_subtitles(video_id, webpage)
+
+        # The webpage provides more accurate data than series_title from the XML
+        series = self._html_search_regex(
+            r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
+            webpage, 'series', fatal=False)
+
+        season = episode = episode_number = duration = thumbnail = None
+
+        if isinstance(metadata, compat_etree_Element):
+            season = xpath_text(metadata, 'series_title')
+            episode = xpath_text(metadata, 'episode_title')
+            episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
+            duration = float_or_none(media_metadata.get('duration'), 1000)
+            thumbnail = xpath_text(metadata, 'episode_image_url')
+
+        if not episode:
+            episode = media_metadata.get('title')
+        if not episode_number:
+            episode_number = int_or_none(media_metadata.get('episode_number'))
+        if not thumbnail:
+            thumbnail = media_metadata.get('thumbnail', {}).get('url')
+
+        season_number = int_or_none(self._search_regex(
+            r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
+            webpage, 'season number', default=None))
+
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        return merge_dicts({
+            'id': video_id,
+            'title': video_title,
+            'description': video_description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'uploader': video_uploader,
+            'series': series,
+            'season': season,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'subtitles': subtitles,
+            'formats': formats,
+        }, info)
+
+
+class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
+    IE_NAME = 'crunchyroll:playlist'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
+
+    _TESTS = [{
+        'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
+        'info_dict': {
+            'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
+            'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
+        },
+        'playlist_count': 13,
+    }, {
+        # geo-restricted (US), 18+ maturity wall, non-premium available
+        'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
+        'info_dict': {
+            'id': 'cosplay-complex-ova',
+            'title': 'Cosplay Complex OVA'
+        },
+        'playlist_count': 3,
+        'skip': 'Georestricted',
+    }, {
+        # geo-restricted (US), 18+ maturity wall, non-premium available since 2015.11.14
+        'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            self._add_skip_wall(url), show_id,
+            headers=self.geo_verification_headers())
+        title = self._html_search_meta('name', webpage, default=None)
+
+        episode_paths = re.findall(
+            r'(?s)<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"',
+            webpage)
+        entries = [
+            self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll', ep_id)
+            for ep_id, ep in episode_paths
+        ]
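+        # The show page appears to list episodes newest-first; reverse them
+        # into chronological order.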
+        entries.reverse()
+
+        return {
+            '_type': 'playlist',
+            'id': show_id,
+            'title': title,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
new file mode 100644 (file)
index 0000000..67d6df4
--- /dev/null
@@ -0,0 +1,196 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    extract_attributes,
+    find_xpath_attr,
+    get_element_by_class,
+    int_or_none,
+    smuggle_url,
+    unescapeHTML,
+)
+from .senateisvp import SenateISVPIE
+from .ustream import UstreamIE
+
+
+class CSpanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
+    IE_DESC = 'C-SPAN'
+    _TESTS = [{
+        'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
+        'md5': '94b29a4f131ff03d23471dd6f60b6a1d',
+        'info_dict': {
+            'id': '315139',
+            'title': 'Attorney General Eric Holder on Voting Rights Act Decision',
+        },
+        'playlist_mincount': 2,
+        'skip': 'Regularly fails on travis, for unknown reasons',
+    }, {
+        'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
+        # md5 is unstable
+        'info_dict': {
+            'id': 'c4486943',
+            'ext': 'mp4',
+            'title': 'CSPAN - International Health Care Models',
+            'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
+        }
+    }, {
+        'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall',
+        'info_dict': {
+            'id': '342759',
+            'title': 'General Motors Ignition Switch Recall',
+        },
+        'playlist_mincount': 6,
+    }, {
+        # Video from senate.gov
+        'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'mp4',
+            'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
+        }
+    }, {
+        # Ustream embedded video
+        'url': 'https://www.c-span.org/video/?114917-1/armed-services',
+        'info_dict': {
+            'id': '58428542',
+            'ext': 'flv',
+            'title': 'USHR07 Armed Services Committee',
+            'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee',
+            'timestamp': 1423060374,
+            'upload_date': '20150204',
+            'uploader': 'HouseCommittee',
+            'uploader_id': '12987475',
+        },
+    }, {
+        # Audio Only
+        'url': 'https://www.c-span.org/video/?437336-1/judiciary-antitrust-competition-policy-consumer-rights',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_type = None
+        webpage = self._download_webpage(url, video_id)
+
+        ustream_url = UstreamIE._extract_url(webpage)
+        if ustream_url:
+            return self.url_result(ustream_url, UstreamIE.ie_key())
+
+        if '&vod' not in url:
+            bc = self._search_regex(
+                r"(<[^>]+id='brightcove-player-embed'[^>]+>)",
+                webpage, 'brightcove embed', default=None)
+            if bc:
+                bc_attr = extract_attributes(bc)
+                bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (
+                    bc_attr.get('data-bcaccountid', '3162030207001'),
+                    bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
+                    bc_attr.get('data-newbcplayerid', 'default'),
+                    bc_attr['data-bcid'])
+                return self.url_result(smuggle_url(bc_url, {'source_url': url}))
+
+        # Look for clipid first, because clipprog always appears before it
+        patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
+        results = list(filter(None, (re.search(p, webpage) for p in patterns)))
+        if results:
+            matches = results[0]
+            video_type, video_id = matches.groups()
+            video_type = 'clip' if video_type == 'id' else 'program'
+        else:
+            m = re.search(r'data-(?P<type>clip|prog)id=["\'](?P<id>\d+)', webpage)
+            if m:
+                video_id = m.group('id')
+                video_type = 'program' if m.group('type') == 'prog' else 'clip'
+            else:
+                senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+                if senate_isvp_url:
+                    title = self._og_search_title(webpage)
+                    surl = smuggle_url(senate_isvp_url, {'force_title': title})
+                    return self.url_result(surl, 'SenateISVP', video_id, title)
+                video_id = self._search_regex(
+                    r'jwsetup\.clipprog\s*=\s*(\d+);',
+                    webpage, 'jwsetup program id', default=None)
+                if video_id:
+                    video_type = 'program'
+        if video_type is None or video_id is None:
+            error_message = get_element_by_class('VLplayer-error-message', webpage)
+            if error_message:
+                raise ExtractorError(error_message)
+            raise ExtractorError('unable to find video id and type')
+
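+        # The player API wraps scalar values as {'#text': value} (XML-to-JSON
+        # conversion); unwrap them here.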
+        def get_text_attr(d, attr):
+            return d.get(attr, {}).get('#text')
+
+        data = self._download_json(
+            'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id),
+            video_id)['video']
+        if data['@status'] != 'Success':
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True)
+
+        doc = self._download_xml(
+            'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id),
+            video_id)
+
+        description = self._html_search_meta('description', webpage)
+
+        title = find_xpath_attr(doc, './/string', 'name', 'title').text
+        thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
+
+        files = data['files']
+        capfile = get_text_attr(data, 'capfile')
+
+        entries = []
+        for partnum, f in enumerate(files):
+            formats = []
+            for quality in f.get('qualities', []):
+                formats.append({
+                    'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')),
+                    'url': unescapeHTML(get_text_attr(quality, 'file')),
+                    'height': int_or_none(get_text_attr(quality, 'height')),
+                    'tbr': int_or_none(get_text_attr(quality, 'bitrate')),
+                })
+            if not formats:
+                path = unescapeHTML(get_text_attr(f, 'path'))
+                if not path:
+                    continue
+                formats = self._extract_m3u8_formats(
+                    path, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }]
+            self._sort_formats(formats)
+            entries.append({
+                'id': '%s_%d' % (video_id, partnum + 1),
+                'title': (
+                    title if len(files) == 1 else
+                    '%s part %d' % (title, partnum + 1)),
+                'formats': formats,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': int_or_none(get_text_attr(f, 'length')),
+                'subtitles': {
+                    'en': [{
+                        'url': capfile,
+                        'ext': determine_ext(capfile, 'dfxp')
+                    }],
+                } if capfile else None,
+            })
+
+        if len(entries) == 1:
+            entry = dict(entries[0])
+            entry['id'] = 'c' + video_id if video_type == 'clip' else video_id
+            return entry
+        else:
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+                'title': title,
+                'id': 'c' + video_id if video_type == 'clip' else video_id,
+            }
diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py
new file mode 100644 (file)
index 0000000..679f1d9
--- /dev/null
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_timestamp
+from .youtube import YoutubeIE
+
+
+class CtsNewsIE(InfoExtractor):
+    IE_DESC = '華視新聞'
+    _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
+    _TESTS = [{
+        'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html',
+        'md5': 'a9875cb790252b08431186d741beaabe',
+        'info_dict': {
+            'id': '201501291578109',
+            'ext': 'mp4',
+            'title': '以色列.真主黨交火 3人死亡 - 華視新聞網',
+            'description': '以色列和黎巴嫩真主黨,爆發五年最嚴重衝突,雙方砲轟交火,兩名以軍死亡,還有一名西班牙籍的聯合國維和人員也不幸罹難。大陸陝西、河南、安徽、江蘇和湖北五個省份出現大暴雪,嚴重影響陸空交通,不過九華山卻出現...',
+            'timestamp': 1422528540,
+            'upload_date': '20150129',
+        }
+    }, {
+        # News clip does not appear on the page but is still available in the database
+        'url': 'http://news.cts.com.tw/cts/international/201309/201309031304098.html',
+        'md5': '3aee7e0df7cdff94e43581f54c22619e',
+        'info_dict': {
+            'id': '201309031304098',
+            'ext': 'mp4',
+            'title': '韓國31歲童顏男 貌如十多歲小孩 - 華視新聞網',
+            'description': '越有年紀的人,越希望看起來年輕一點,而南韓卻有一位31歲的男子,看起來像是11、12歲的小孩,身...',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1378205880,
+            'upload_date': '20130903',
+        }
+    }, {
+        # With Youtube embedded video
+        'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html',
+        'md5': 'e4726b2ccd70ba2c319865e28f0a91d1',
+        'info_dict': {
+            'id': 'OVbfO7d0_hQ',
+            'ext': 'mp4',
+            'title': 'iPhone6熱銷 蘋果財報亮眼',
+            'description': 'md5:f395d4f485487bb0f992ed2c4b07aa7d',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20150128',
+            'uploader_id': 'TBSCTS',
+            'uploader': '中華電視公司',
+        },
+        'add_ie': ['Youtube'],
+    }]
+
+    def _real_extract(self, url):
+        news_id = self._match_id(url)
+        page = self._download_webpage(url, news_id)
+
+        news_id = self._hidden_inputs(page).get('get_id')
+
+        if news_id:
+            mp4_feed = self._download_json(
+                'http://news.cts.com.tw/action/test_mp4feed.php',
+                news_id, note='Fetching feed', query={'news_id': news_id})
+            video_url = mp4_feed['source_url']
+        else:
+            self.to_screen('Not a CTSPlayer video, trying YouTube...')
+            youtube_url = YoutubeIE._extract_url(page)
+
+            return self.url_result(youtube_url, ie='Youtube')
+
+        description = self._html_search_meta('description', page)
+        title = self._html_search_meta('title', page, fatal=True)
+        thumbnail = self._html_search_meta('image', page)
+
+        datetime_str = self._html_search_regex(
+            r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time', fatal=False)
+        timestamp = None
+        if datetime_str:
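+            # Timestamps on the page are presumably Taipei local time (UTC+8);
+            # subtract 8 hours to get UTC.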
+            timestamp = unified_timestamp(datetime_str) - 8 * 3600
+
+        return {
+            'id': news_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+        }
diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py
new file mode 100644 (file)
index 0000000..03f8cef
--- /dev/null
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import orderedSet
+
+
+class CTVNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
+    _TESTS = [{
+        'url': 'http://www.ctvnews.ca/video?clipId=901995',
+        'md5': '9b8624ba66351a23e0b6e1391971f9af',
+        'info_dict': {
+            'id': '901995',
+            'ext': 'flv',
+            'title': 'Extended: \'That person cannot be me\' Johnson says',
+            'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
+            'timestamp': 1467286284,
+            'upload_date': '20160630',
+        }
+    }, {
+        'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
+        'info_dict':
+        {
+            'id': '1.2966224',
+        },
+        'playlist_mincount': 19,
+    }, {
+        'url': 'http://www.ctvnews.ca/video?binId=1.2876780',
+        'info_dict':
+        {
+            'id': '1.2876780',
+        },
+        'playlist_mincount': 100,
+    }, {
+        'url': 'http://www.ctvnews.ca/1.810401',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231',
+        'only_matching': True,
+    }, {
+        'url': 'http://vancouverisland.ctvnews.ca/video?clipId=761241',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
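+        # Numeric clip ids are delegated to the NineCNineMedia extractor via a
+        # transparent URL result, keeping this id in the final metadata.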
+        def ninecninemedia_url_result(clip_id):
+            return {
+                '_type': 'url_transparent',
+                'id': clip_id,
+                'url': '9c9media:ctvnews_web:%s' % clip_id,
+                'ie_key': 'NineCNineMedia',
+            }
+
+        if page_id.isdigit():
+            return ninecninemedia_url_result(page_id)
+        else:
+            webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={
+                'ot': 'example.AjaxPageLayout.ot',
+                'maxItemsPerPage': 1000000,
+            })
+            entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
+                re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
+            return self.playlist_result(entries, page_id)
diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py
new file mode 100644 (file)
index 0000000..bcdf273
--- /dev/null
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    HEADRequest,
+)
+
+
+class CultureUnpluggedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?'
+    _TESTS = [{
+        'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West',
+        'md5': 'ac6c093b089f7d05e79934dcb3d228fc',
+        'info_dict': {
+            'id': '53662',
+            'display_id': 'The-Next--Best-West',
+            'ext': 'mp4',
+            'title': 'The Next, Best West',
+            'description': 'md5:0423cd00833dea1519cf014e9d0903b1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'creator': 'Coldstream Creative',
+            'duration': 2203,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        # Request setClientTimezone.php to get the PHPSESSID cookie, which is
+        # needed to get valid JSON data in the next request
+        self._request_webpage(HEADRequest(
+            'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id)
+        movie_data = self._download_json(
+            'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id)
+
+        video_url = movie_data['url']
+        title = movie_data['title']
+
+        description = movie_data.get('synopsis')
+        creator = movie_data.get('producer')
+        duration = int_or_none(movie_data.get('duration'))
+        view_count = int_or_none(movie_data.get('views'))
+
+        thumbnails = [{
+            'url': movie_data['%s_thumb' % size],
+            'id': size,
+            'preference': preference,
+        } for preference, size in enumerate((
+            'small', 'large')) if movie_data.get('%s_thumb' % size)]
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'creator': creator,
+            'duration': duration,
+            'view_count': view_count,
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py
new file mode 100644 (file)
index 0000000..e4a7fca
--- /dev/null
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    urlencode_postdata,
+    compat_str,
+    ExtractorError,
+)
+
+
+class CuriosityStreamBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'curiositystream'
+    _auth_token = None
+    _API_BASE_URL = 'https://api.curiositystream.com/v1/'
+
+    def _handle_errors(self, result):
+        error = result.get('error', {}).get('message')
+        if error:
+            if isinstance(error, dict):
+                error = ', '.join(error.values())
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+
+    def _call_api(self, path, video_id):
+        headers = {}
+        if self._auth_token:
+            headers['X-Auth-Token'] = self._auth_token
+        result = self._download_json(
+            self._API_BASE_URL + path, video_id, headers=headers)
+        self._handle_errors(result)
+        return result['data']
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+        result = self._download_json(
+            self._API_BASE_URL + 'login', None, data=urlencode_postdata({
+                'email': email,
+                'password': password,
+            }))
+        self._handle_errors(result)
+        self._auth_token = result['message']['auth_token']
+
+
+class CuriosityStreamIE(CuriosityStreamBaseIE):
+    IE_NAME = 'curiositystream'
+    _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://app.curiositystream.com/video/2',
+        'md5': '262bb2f257ff301115f1973540de8983',
+        'info_dict': {
+            'id': '2',
+            'ext': 'mp4',
+            'title': 'How Did You Develop The Internet?',
+            'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        media = self._call_api('media/' + video_id, video_id)
+        title = media['title']
+
+        formats = []
+        for encoding in media.get('encodings', []):
+            m3u8_url = encoding.get('master_playlist_url')
+            if m3u8_url:
+                formats.extend(self._extract_m3u8_formats(
+                    m3u8_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            encoding_url = encoding.get('url')
+            file_url = encoding.get('file_url')
+            if not encoding_url and not file_url:
+                continue
+            f = {
+                'width': int_or_none(encoding.get('width')),
+                'height': int_or_none(encoding.get('height')),
+                'vbr': int_or_none(encoding.get('video_bitrate')),
+                'abr': int_or_none(encoding.get('audio_bitrate')),
+                'filesize': int_or_none(encoding.get('size_in_bytes')),
+                'vcodec': encoding.get('video_codec'),
+                'acodec': encoding.get('audio_codec'),
+                'container': encoding.get('container_type'),
+            }
+            for f_url in (encoding_url, file_url):
+                if not f_url:
+                    continue
+                fmt = f.copy()
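+                # RTMP URLs bundle the app and play path into one string; split
+                # them so the RTMP downloader gets separate fields.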
+                rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
+                if rtmp:
+                    fmt.update({
+                        'url': rtmp.group('url'),
+                        'play_path': rtmp.group('playpath'),
+                        'app': rtmp.group('app'),
+                        'ext': 'flv',
+                        'format_id': 'rtmp',
+                    })
+                else:
+                    fmt.update({
+                        'url': f_url,
+                        'format_id': 'http',
+                    })
+                formats.append(fmt)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for closed_caption in media.get('closed_captions', []):
+            sub_url = closed_caption.get('file')
+            if not sub_url:
+                continue
+            lang = closed_caption.get('code') or closed_caption.get('language') or 'en'
+            subtitles.setdefault(lang, []).append({
+                'url': sub_url,
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': media.get('description'),
+            'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'),
+            'duration': int_or_none(media.get('duration')),
+            'tags': media.get('tags'),
+            'subtitles': subtitles,
+        }
+
+
+class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
+    IE_NAME = 'curiositystream:collection'
+    _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://app.curiositystream.com/collection/2',
+        'info_dict': {
+            'id': '2',
+            'title': 'Curious Minds: The Internet',
+            'description': 'How is the internet shaping our lives in the 21st Century?',
+        },
+        'playlist_mincount': 17,
+    }, {
+        'url': 'https://curiositystream.com/series/2',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        collection_id = self._match_id(url)
+        collection = self._call_api(
+            'collections/' + collection_id, collection_id)
+        entries = []
+        for media in collection.get('media', []):
+            media_id = compat_str(media.get('id'))
+            entries.append(self.url_result(
+                'https://curiositystream.com/video/' + media_id,
+                CuriosityStreamIE.ie_key(), media_id))
+        return self.playlist_result(
+            entries, collection_id,
+            collection.get('title'), collection.get('description'))
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
new file mode 100644 (file)
index 0000000..7338243
--- /dev/null
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
+    smuggle_url,
+    str_or_none,
+)
+
+
+class CWTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cw(?:tv(?:pr)?|seed)\.com/(?:shows/)?(?:[^/]+/)+[^?]*\?.*\b(?:play|watch)=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
+    _TESTS = [{
+        'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
+        'info_dict': {
+            'id': '6b15e985-9345-4f60-baf8-56e96be57c63',
+            'ext': 'mp4',
+            'title': 'Legends of Yesterday',
+            'description': 'Oliver and Barry Allen take Kendra Saunders and Carter Hall to a remote location to keep them hidden from Vandal Savage while they figure out how to defeat him.',
+            'duration': 2665,
+            'series': 'Arrow',
+            'season_number': 4,
+            'season': '4',
+            'episode_number': 8,
+            'upload_date': '20151203',
+            'timestamp': 1449122100,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'redirect to http://cwtv.com/shows/arrow/',
+    }, {
+        'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088',
+        'info_dict': {
+            'id': '24282b12-ead2-42f2-95ad-26770c2c6088',
+            'ext': 'mp4',
+            'title': 'Jeff Davis 4',
+            'description': 'Jeff Davis is back to make you laugh.',
+            'duration': 1263,
+            'series': 'Whose Line Is It Anyway?',
+            'season_number': 11,
+            'episode_number': 20,
+            'upload_date': '20151006',
+            'timestamp': 1444107300,
+            'age_limit': 14,
+            'uploader': 'CWTV',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6',
+        'only_matching': True,
+    }, {
+        'url': 'http://cwtvpr.com/the-cw/video?watch=9eee3f60-ef4e-440b-b3b2-49428ac9c54e',
+        'only_matching': True,
+    }, {
+        'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?watch=6b15e985-9345-4f60-baf8-56e96be57c63',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        data = self._download_json(
+            'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id,
+            video_id)
+        if data.get('result') != 'ok':
+            raise ExtractorError(data['msg'], expected=True)
+        video_data = data['video']
+        title = video_data['title']
+        mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id
+
+        season = str_or_none(video_data.get('season'))
+        episode = str_or_none(video_data.get('episode'))
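+        # The episode field apparently embeds the season number as a prefix
+        # (e.g. season '11', episode '1120'); strip it to get the bare
+        # episode number.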
+        if episode and season:
+            episode = episode[len(season):]
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'url': smuggle_url(mpx_url, {'force_smil_url': True}),
+            'description': video_data.get('description_long'),
+            'duration': int_or_none(video_data.get('duration_secs')),
+            'series': video_data.get('series_name'),
+            'season_number': int_or_none(season),
+            'episode_number': int_or_none(episode),
+            'timestamp': parse_iso8601(video_data.get('start_time')),
+            'age_limit': parse_age_limit(video_data.get('rating')),
+            'ie_key': 'ThePlatform',
+        }
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
new file mode 100644 (file)
index 0000000..67b88fd
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    determine_protocol,
+    try_get,
+    unescapeHTML,
+)
+
+
+class DailyMailIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
+        'md5': 'f6129624562251f628296c3a9ffde124',
+        'info_dict': {
+            'id': '1295863',
+            'ext': 'mp4',
+            'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'',
+            'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84',
+        }
+    }, {
+        'url': 'http://www.dailymail.co.uk/embed/video/1295863.html',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._parse_json(self._search_regex(
+            r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+        title = unescapeHTML(video_data['title'])
+
+        sources_url = (try_get(
+            video_data,
+            (lambda x: x['plugins']['sources']['url'],
+             lambda x: x['sources']['url']), compat_str)
+            or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id)
+
+        video_sources = self._download_json(sources_url, video_id)
+        body = video_sources.get('body')
+        if body:
+            video_sources = body
+
+        formats = []
+        for rendition in video_sources['renditions']:
+            rendition_url = rendition.get('url')
+            if not rendition_url:
+                continue
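+            # encodingRate is presumably bit/s; int_or_none(..., 1000)
+            # scales it down to the kbit/s expected in 'tbr'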
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+            container = rendition.get('videoContainer')
+            is_hls = container == 'M2TS'
+            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+            formats.append({
+                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+                'url': rendition_url,
+                'width': int_or_none(rendition.get('frameWidth')),
+                'height': int_or_none(rendition.get('frameHeight')),
+                'tbr': tbr,
+                'vcodec': rendition.get('videoCodec'),
+                'container': container,
+                'protocol': protocol,
+                'ext': 'mp4' if is_hls else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': unescapeHTML(video_data.get('descr')),
+            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
new file mode 100644 (file)
index 0000000..b852905
--- /dev/null
@@ -0,0 +1,393 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    age_restricted,
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    OnDemandPagedList,
+    try_get,
+    unescapeHTML,
+    urlencode_postdata,
+)
+
+
+class DailymotionBaseInfoExtractor(InfoExtractor):
+    _FAMILY_FILTER = None
+    _HEADERS = {
+        'Content-Type': 'application/json',
+        'Origin': 'https://www.dailymotion.com',
+    }
+    _NETRC_MACHINE = 'dailymotion'
+
+    def _get_dailymotion_cookies(self):
+        return self._get_cookies('https://www.dailymotion.com/')
+
+    @staticmethod
+    def _get_cookie_value(cookies, name):
+        cookie = cookies.get(name)
+        if cookie:
+            return cookie.value
+
+    def _set_dailymotion_cookie(self, name, value):
+        self._set_cookie('www.dailymotion.com', name, value)
+
+    def _real_initialize(self):
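+        # Honour an existing family-filter ('ff') cookie; otherwise enable
+        # the filter only when the configured age limit would block 18+
+        # content, and persist the decision back into the cookie.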
+        cookies = self._get_dailymotion_cookies()
+        ff = self._get_cookie_value(cookies, 'ff')
+        self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self._downloader.params.get('age_limit'))
+        self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off')
+
+    def _call_api(self, object_type, xid, object_fields, note, filter_extra=None):
+        if not self._HEADERS.get('Authorization'):
+            cookies = self._get_dailymotion_cookies()
+            token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token')
+            if not token:
+                data = {
+                    'client_id': 'f1a362d288c1b98099c7',
+                    'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
+                }
+                username, password = self._get_login_info()
+                if username:
+                    data.update({
+                        'grant_type': 'password',
+                        'password': password,
+                        'username': username,
+                    })
+                else:
+                    data['grant_type'] = 'client_credentials'
+                try:
+                    token = self._download_json(
+                        'https://graphql.api.dailymotion.com/oauth/token',
+                        None, 'Downloading Access Token',
+                        data=urlencode_postdata(data))['access_token']
+                except ExtractorError as e:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                        raise ExtractorError(self._parse_json(
+                            e.cause.read().decode(), xid)['error_description'], expected=True)
+                    raise
+                self._set_dailymotion_cookie('access_token' if username else 'client_token', token)
+            self._HEADERS['Authorization'] = 'Bearer ' + token
+
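+        # The rendered query looks roughly like
+        #   { media(xid: "x5kesuj", password: "...") { <object_fields> } }
+        # where the filter part is only present when filter_extra is given.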
+        resp = self._download_json(
+            'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({
+                'query': '''{
+  %s(xid: "%s"%s) {
+    %s
+  }
+}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields),
+            }).encode(), headers=self._HEADERS)
+        obj = resp['data'][object_type]
+        if not obj:
+            raise ExtractorError(resp['errors'][0]['message'], expected=True)
+        return obj
+
+
+class DailymotionIE(DailymotionBaseInfoExtractor):
+    _VALID_URL = r'''(?ix)
+                    https?://
+                        (?:
+                            (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)|
+                            (?:www\.)?lequipe\.fr/video
+                        )
+                        /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
+                    '''
+    IE_NAME = 'dailymotion'
+    _TESTS = [{
+        'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
+        'md5': '074b95bdee76b9e3654137aee9c79dfe',
+        'info_dict': {
+            'id': 'x5kesuj',
+            'ext': 'mp4',
+            'title': 'Office Christmas Party Review –  Jason Bateman, Olivia Munn, T.J. Miller',
+            'description': 'Office Christmas Party Review -  Jason Bateman, Olivia Munn, T.J. Miller',
+            'duration': 187,
+            'timestamp': 1493651285,
+            'upload_date': '20170501',
+            'uploader': 'Deadline',
+            'uploader_id': 'x1xm8ri',
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
+        'md5': '2137c41a8e78554bb09225b8eb322406',
+        'info_dict': {
+            'id': 'x2iuewm',
+            'ext': 'mp4',
+            'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+            'description': 'Several come bundled with the Steam Controller.',
+            'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+            'duration': 74,
+            'timestamp': 1425657362,
+            'upload_date': '20150306',
+            'uploader': 'IGN',
+            'uploader_id': 'xijv66',
+            'age_limit': 0,
+            'view_count': int,
+        },
+        'skip': 'video gone',
+    }, {
+        # Vevo video
+        'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
+        'info_dict': {
+            'title': 'Roar (Official)',
+            'id': 'USUV71301934',
+            'ext': 'mp4',
+            'uploader': 'Katy Perry',
+            'upload_date': '20130905',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'VEVO is only available in some countries',
+    }, {
+        # age-restricted video
+        'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+        'md5': '0d667a7b9cebecc3c89ee93099c4159d',
+        'info_dict': {
+            'id': 'xyh2zz',
+            'ext': 'mp4',
+            'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+            'uploader': 'HotWaves1012',
+            'age_limit': 18,
+        },
+        'skip': 'video gone',
+    }, {
+        # geo-restricted, player v5
+        'url': 'http://www.dailymotion.com/video/xhza0o',
+        'only_matching': True,
+    }, {
+        # with subtitles
+        'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.lequipe.fr/video/x791mem',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
+    _COMMON_MEDIA_FIELDS = '''description
+      geoblockedCountries {
+        allowed
+      }
+      xid'''
+
+    @staticmethod
+    def _extract_urls(webpage):
+        urls = []
+        # Look for embedded Dailymotion player
+        # https://developer.dailymotion.com/player#player-parameters
+        for mobj in re.finditer(
+                r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage):
+            urls.append(unescapeHTML(mobj.group('url')))
+        for mobj in re.finditer(
+                r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
+            urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id'))
+        return urls
+
+    def _real_extract(self, url):
+        video_id, playlist_id = re.match(self._VALID_URL, url).groups()
+
+        if playlist_id:
+            if not self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+                return self.url_result(
+                    'http://www.dailymotion.com/playlist/' + playlist_id,
+                    'DailymotionPlaylist', playlist_id)
+            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+        password = self._downloader.params.get('videopassword')
+        media = self._call_api(
+            'media', video_id, '''... on Video {
+      %s
+      stats {
+        likes {
+          total
+        }
+        views {
+          total
+        }
+      }
+    }
+    ... on Live {
+      %s
+      audienceCount
+      isOnAir
+    }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata',
+            'password: "%s"' % self._downloader.params.get('videopassword') if password else None)
+        xid = media['xid']
+
+        metadata = self._download_json(
+            'https://www.dailymotion.com/player/metadata/video/' + xid,
+            xid, 'Downloading metadata JSON',
+            query={'app': 'com.dailymotion.neon'})
+
+        error = metadata.get('error')
+        if error:
+            title = error.get('title') or error['raw_message']
+            # See https://developer.dailymotion.com/api#access-error
+            if error.get('code') == 'DM007':
+                allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list)
+                self.raise_geo_restricted(msg=title, countries=allowed_countries)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, title), expected=True)
+
+        title = metadata['title']
+        is_live = media.get('isOnAir')
+        formats = []
+        for quality, media_list in metadata['qualities'].items():
+            for m in media_list:
+                media_url = m.get('url')
+                media_type = m.get('type')
+                if not media_url or media_type == 'application/vnd.lumberjack.manifest':
+                    continue
+                if media_type == 'application/x-mpegURL':
+                    formats.extend(self._extract_m3u8_formats(
+                        media_url, video_id, 'mp4',
+                        'm3u8' if is_live else 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                else:
+                    f = {
+                        'url': media_url,
+                        'format_id': 'http-' + quality,
+                    }
+                    m = re.search(r'/H264-(\d+)x(\d+)(?:-(60)/)?', media_url)
+                    if m:
+                        width, height, fps = map(int_or_none, m.groups())
+                        f.update({
+                            'fps': fps,
+                            'height': height,
+                            'width': width,
+                        })
+                    formats.append(f)
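+        # Strip any URL fragment and fall back to inferring 60 fps from
+        # quality labels such as '720@60' that end up in the format_id.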
+        for f in formats:
+            f['url'] = f['url'].split('#')[0]
+            if not f.get('fps') and f['format_id'].endswith('@60'):
+                f['fps'] = 60
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {}
+        for subtitle_lang, subtitle in subtitles_data.items():
+            subtitles[subtitle_lang] = [{
+                'url': subtitle_url,
+            } for subtitle_url in subtitle.get('urls', [])]
+
+        thumbnails = []
+        for height, poster_url in metadata.get('posters', {}).items():
+            thumbnails.append({
+                'height': int_or_none(height),
+                'id': height,
+                'url': poster_url,
+            })
+
+        owner = metadata.get('owner') or {}
+        stats = media.get('stats') or {}
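+        # e.g. get_count('view') reads stats['views']['total']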
+        get_count = lambda x: int_or_none(try_get(stats, lambda y: y[x + 's']['total']))
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': clean_html(media.get('description')),
+            'thumbnails': thumbnails,
+            'duration': int_or_none(metadata.get('duration')) or None,
+            'timestamp': int_or_none(metadata.get('created_time')),
+            'uploader': owner.get('screenname'),
+            'uploader_id': owner.get('id') or metadata.get('screenname'),
+            'age_limit': 18 if metadata.get('explicit') else 0,
+            'tags': metadata.get('tags'),
+            'view_count': get_count('view') or int_or_none(media.get('audienceCount')),
+            'like_count': get_count('like'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': is_live,
+        }
+
+
+class DailymotionPlaylistBaseIE(DailymotionBaseInfoExtractor):
+    _PAGE_SIZE = 100
+
+    def _fetch_page(self, playlist_id, page):
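+        # OnDemandPagedList passes 0-based page indices, while the GraphQL
+        # API expects 1-based pages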
+        page += 1
+        videos = self._call_api(
+            self._OBJECT_TYPE, playlist_id,
+            '''videos(allowExplicit: %s, first: %d, page: %d) {
+      edges {
+        node {
+          xid
+          url
+        }
+      }
+    }''' % ('false' if self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page),
+            'Downloading page %d' % page)['videos']
+        for edge in videos['edges']:
+            node = edge['node']
+            yield self.url_result(
+                node['url'], DailymotionIE.ie_key(), node['xid'])
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        entries = OnDemandPagedList(functools.partial(
+            self._fetch_page, playlist_id), self._PAGE_SIZE)
+        return self.playlist_result(
+            entries, playlist_id)
+
+
+class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
+    IE_NAME = 'dailymotion:playlist'
+    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
+    _TESTS = [{
+        'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
+        'info_dict': {
+            'id': 'xv4bw',
+        },
+        'playlist_mincount': 20,
+    }]
+    _OBJECT_TYPE = 'collection'
+
+
+class DailymotionUserIE(DailymotionPlaylistBaseIE):
+    IE_NAME = 'dailymotion:user'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://www.dailymotion.com/user/nqtv',
+        'info_dict': {
+            'id': 'nqtv',
+        },
+        'playlist_mincount': 152,
+    }, {
+        'url': 'http://www.dailymotion.com/user/UnderProject',
+        'info_dict': {
+            'id': 'UnderProject',
+        },
+        'playlist_mincount': 1000,
+        'skip': 'Takes too long',
+    }, {
+        'url': 'https://www.dailymotion.com/user/nqtv',
+        'info_dict': {
+            'id': 'nqtv',
+        },
+        'playlist_mincount': 148,
+        'params': {
+            'age_limit': 0,
+        },
+    }]
+    _OBJECT_TYPE = 'channel'
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
new file mode 100644 (file)
index 0000000..1370955
--- /dev/null
@@ -0,0 +1,266 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_unquote,
+    compat_urlparse,
+)
+
+
+class DaumBaseIE(InfoExtractor):
+    _KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/'
+
+
+class DaumIE(DaumBaseIE):
+    _VALID_URL = r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P<id>[^?#&]+)'
+    IE_NAME = 'daum.net'
+
+    _TESTS = [{
+        'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz',
+        'info_dict': {
+            'id': 'vab4dyeDBysyBssyukBUjBz',
+            'ext': 'mp4',
+            'title': '마크 헌트 vs 안토니오 실바',
+            'description': 'Mark Hunt vs Antonio Silva',
+            'upload_date': '20131217',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'duration': 2117,
+            'view_count': int,
+            'comment_count': int,
+            'uploader_id': 186139,
+            'uploader': '콘간지',
+            'timestamp': 1387310323,
+        },
+    }, {
+        'url': 'http://m.tvpot.daum.net/v/65139429',
+        'info_dict': {
+            'id': '65139429',
+            'ext': 'mp4',
+            'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118',
+            'description': 'md5:79794514261164ff27e36a21ad229fc5',
+            'upload_date': '20150118',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'duration': 154,
+            'view_count': int,
+            'comment_count': int,
+            'uploader': 'MBC 예능',
+            'uploader_id': 132251,
+            'timestamp': 1421604228,
+        },
+    }, {
+        'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24',
+        'only_matching': True,
+    }, {
+        'url': 'http://videofarm.daum.net/controller/player/VodPlayer.swf?vid=vwIpVpCQsT8%24&ref=',
+        'info_dict': {
+            'id': 'vwIpVpCQsT8$',
+            'ext': 'flv',
+            'title': '01-Korean War ( Trouble on the horizon )',
+            'description': 'Korean War 01\r\nTrouble on the horizon\r\n전쟁의 먹구름',
+            'upload_date': '20080223',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'duration': 249,
+            'view_count': int,
+            'comment_count': int,
+            'uploader': '까칠한 墮落始祖 황비홍님의',
+            'uploader_id': 560824,
+            'timestamp': 1203770745,
+        },
+    }, {
+        # Requires dte_type=WEB (#9972)
+        'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU',
+        'md5': 'a8917742069a4dd442516b86e7d66529',
+        'info_dict': {
+            'id': 's3794Uf1NZeZ1qMpGpeqeRU',
+            'ext': 'mp4',
+            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
+            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
+            'upload_date': '20170129',
+            'uploader': '쇼! 음악중심',
+            'uploader_id': 2653210,
+            'timestamp': 1485684628,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = compat_urllib_parse_unquote(self._match_id(url))
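+        # ids that are not purely numeric get an '@my' suffix before being
+        # handed over to the Kakao embed player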
+        if not video_id.isdigit():
+            video_id += '@my'
+        return self.url_result(
+            self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id)
+
+
+class DaumClipIE(DaumBaseIE):
+    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView\.(?:do|tv)|mypot/View\.do)\?.*?clipid=(?P<id>\d+)'
+    IE_NAME = 'daum.net:clip'
+    _URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s'
+
+    _TESTS = [{
+        'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
+        'info_dict': {
+            'id': '52554690',
+            'ext': 'mp4',
+            'title': 'DOTA 2GETHER 시즌2 6회 - 2부',
+            'description': 'DOTA 2GETHER 시즌2 6회 - 2부',
+            'upload_date': '20130831',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'duration': 3868,
+            'view_count': int,
+            'uploader': 'GOMeXP',
+            'uploader_id': 6667,
+            'timestamp': 1377911092,
+        },
+    }, {
+        'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if DaumPlaylistIE.suitable(url) or DaumUserIE.suitable(url) else super(DaumClipIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id)
+
+
+class DaumListIE(InfoExtractor):
+    def _get_entries(self, list_id, list_id_type):
+        name = None
+        entries = []
+        for pagenum in itertools.count(1):
+            list_info = self._download_json(
+                'http://tvpot.daum.net/mypot/json/GetClipInfo.do?size=48&init=true&order=date&page=%d&%s=%s' % (
+                    pagenum, list_id_type, list_id), list_id, 'Downloading list info - %s' % pagenum)
+
+            entries.extend([
+                self.url_result(
+                    'http://tvpot.daum.net/v/%s' % clip['vid'])
+                for clip in list_info['clip_list']
+            ])
+
+            if not name:
+                name = (list_info.get('playlist_bean', {}).get('name')
+                        or list_info.get('potInfo', {}).get('name'))
+
+            if not list_info.get('has_more'):
+                break
+
+        return name, entries
+
+    def _check_clip(self, url, list_id):
+        query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query)
+        if 'clipid' in query_dict:
+            clip_id = query_dict['clipid'][0]
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % clip_id)
+                return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip')
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id)
+
+
+class DaumPlaylistIE(DaumListIE):
+    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View\.do|Top\.tv)\?.*?playlistid=(?P<id>[0-9]+)'
+    IE_NAME = 'daum.net:playlist'
+    _URL_TEMPLATE = 'http://tvpot.daum.net/mypot/View.do?playlistid=%s'
+
+    _TESTS = [{
+        'note': 'Playlist url with clipid',
+        'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844',
+        'info_dict': {
+            'id': '6213966',
+            'title': 'Woorissica Official',
+        },
+        'playlist_mincount': 181
+    }, {
+        'note': 'Playlist url with clipid - noplaylist',
+        'url': 'http://tvpot.daum.net/mypot/View.do?playlistid=6213966&clipid=73806844',
+        'info_dict': {
+            'id': '73806844',
+            'ext': 'mp4',
+            'title': '151017 Airport',
+            'upload_date': '20160117',
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': True,
+        }
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if DaumUserIE.suitable(url) else super(DaumPlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        clip_result = self._check_clip(url, list_id)
+        if clip_result:
+            return clip_result
+
+        name, entries = self._get_entries(list_id, 'playlistid')
+
+        return self.playlist_result(entries, list_id, name)
+
+
+class DaumUserIE(DaumListIE):
+    _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/mypot/(?:View|Top)\.(?:do|tv)\?.*?ownerid=(?P<id>[0-9a-zA-Z]+)'
+    IE_NAME = 'daum.net:user'
+
+    _TESTS = [{
+        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0',
+        'info_dict': {
+            'id': 'o2scDLIVbHc0',
+            'title': '마이 리틀 텔레비전',
+        },
+        'playlist_mincount': 213
+    }, {
+        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&clipid=73801156',
+        'info_dict': {
+            'id': '73801156',
+            'ext': 'mp4',
+            'title': '[미공개] 김구라, 오만석이 부릅니다 \'오케피\' - 마이 리틀 텔레비전 20160116',
+            'upload_date': '20160117',
+            'description': 'md5:5e91d2d6747f53575badd24bd62b9f36'
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': True,
+        }
+    }, {
+        'note': 'Playlist url has ownerid and playlistid, playlistid takes precedence',
+        'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=o2scDLIVbHc0&playlistid=6196631',
+        'info_dict': {
+            'id': '6196631',
+            'title': '마이 리틀 텔레비전 - 20160109',
+        },
+        'playlist_count': 11
+    }, {
+        'url': 'http://tvpot.daum.net/mypot/Top.do?ownerid=o2scDLIVbHc0',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.tvpot.daum.net/mypot/Top.tv?ownerid=45x1okb1If50&playlistid=3569733',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        clip_result = self._check_clip(url, list_id)
+        if clip_result:
+            return clip_result
+
+        query_dict = compat_parse_qs(compat_urlparse.urlparse(url).query)
+        if 'playlistid' in query_dict:
+            playlist_id = query_dict['playlistid'][0]
+            return self.url_result(DaumPlaylistIE._URL_TEMPLATE % playlist_id, 'DaumPlaylist')
+
+        name, entries = self._get_entries(list_id, 'ownerid')
+
+        return self.playlist_result(entries, list_id, name)
diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py
new file mode 100644 (file)
index 0000000..aaedf2e
--- /dev/null
@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class DBTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})'
+    _TESTS = [{
+        'url': 'https://www.dagbladet.no/video/PynxJnNWChE/',
+        'md5': 'b8f850ba1860adbda668d367f9b77699',
+        'info_dict': {
+            'id': 'PynxJnNWChE',
+            'ext': 'mp4',
+            'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen',
+            'description': 'md5:49cc8370e7d66e8a2ef15c3b4631fd3f',
+            'thumbnail': r're:https?://.*\.jpg',
+            'upload_date': '20160916',
+            'duration': 69,
+            'uploader_id': 'UCk5pvsyZJoYJBd7_oFPTlRQ',
+            'uploader': 'Dagbladet',
+        },
+        'add_ie': ['Youtube']
+    }, {
+        'url': 'https://www.dagbladet.no/video/embed/xlGmyIeN9Jo/?autoplay=false',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dagbladet.no/video/truer-iran-bor-passe-dere/PalfB2Cw',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+        info = {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'display_id': display_id,
+        }
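+        # 11-character ids are YouTube video ids, 8-character ids are
+        # JW Platform media ids (cf. the two alternatives in _VALID_URL)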
+        if len(video_id) == 11:
+            info.update({
+                'url': video_id,
+                'ie_key': 'Youtube',
+            })
+        else:
+            info.update({
+                'url': 'jwplatform:' + video_id,
+                'ie_key': 'JWPlatform',
+            })
+        return info
diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py
new file mode 100644 (file)
index 0000000..e700f8d
--- /dev/null
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class DctpTvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        # 4x3
+        'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/',
+        'md5': '3ffbd1556c3fe210724d7088fad723e3',
+        'info_dict': {
+            'id': '95eaa4f33dad413aa17b4ee613cccc6c',
+            'display_id': 'videoinstallation-fuer-eine-kaufhausfassade',
+            'ext': 'm4v',
+            'title': 'Videoinstallation für eine Kaufhausfassade',
+            'description': 'Kurzfilm',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 71.24,
+            'timestamp': 1302172322,
+            'upload_date': '20110407',
+        },
+    }, {
+        # 16x9
+        'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/',
+        'only_matching': True,
+    }]
+
+    _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        version = self._download_json(
+            '%s/version.json' % self._BASE_URL, display_id,
+            'Downloading version JSON')
+
+        restapi_base = '%s/%s/restapi' % (
+            self._BASE_URL, version['version_name'])
+
+        info = self._download_json(
+            '%s/slugs/%s.json' % (restapi_base, display_id), display_id,
+            'Downloading video info JSON')
+
+        media = self._download_json(
+            '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])),
+            display_id, 'Downloading media JSON')
+
+        uuid = media['uuid']
+        title = media['title']
+        is_wide = media.get('is_wide')
+        formats = []
+
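+        # Each rendition is published on three hosts sharing the same
+        # /{uuid}_dctp_{suffix}.m4v path: an HLS endpoint (cdn-segments),
+        # an S3 bucket (completed-media) and a plain HTTP CDN (cdn-media).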
+        def add_formats(suffix):
+            templ = 'https://%%s/%s_dctp_%s.m4v' % (uuid, suffix)
+            formats.extend([{
+                'format_id': 'hls-' + suffix,
+                'url': templ % 'cdn-segments.dctp.tv' + '/playlist.m3u8',
+                'protocol': 'm3u8_native',
+            }, {
+                'format_id': 's3-' + suffix,
+                'url': templ % 'completed-media.s3.amazonaws.com',
+            }, {
+                'format_id': 'http-' + suffix,
+                'url': templ % 'cdn-media.dctp.tv',
+            }])
+
+        add_formats('0500_' + ('16x9' if is_wide else '4x3'))
+        if is_wide:
+            add_formats('720p')
+
+        thumbnails = []
+        images = media.get('images')
+        if isinstance(images, list):
+            for image in images:
+                if not isinstance(image, dict):
+                    continue
+                image_url = url_or_none(image.get('url'))
+                if not image_url:
+                    continue
+                thumbnails.append({
+                    'url': image_url,
+                    'width': int_or_none(image.get('width')),
+                    'height': int_or_none(image.get('height')),
+                })
+
+        return {
+            'id': uuid,
+            'display_id': display_id,
+            'title': title,
+            'alt_title': media.get('subtitle'),
+            'description': media.get('description') or media.get('teaser'),
+            'timestamp': unified_timestamp(media.get('created')),
+            'duration': float_or_none(media.get('duration_in_ms'), scale=1000),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py
new file mode 100644 (file)
index 0000000..a38b268
--- /dev/null
@@ -0,0 +1,91 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    orderedSet,
+)
+
+
+class DeezerPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?deezer\.com/playlist/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.deezer.com/playlist/176747451',
+        'info_dict': {
+            'id': '176747451',
+            'title': 'Best!',
+            'uploader': 'Anonymous',
+            'thumbnail': r're:^https?://cdn-images\.deezer\.com/images/cover/.*\.jpg$',
+        },
+        'playlist_count': 30,
+        'skip': 'Only available in .de',
+    }
+
+    def _real_extract(self, url):
+        if 'test' not in self._downloader.params:
+            self._downloader.report_warning('For now, this extractor only supports the 30-second previews. Patches welcome!')
+
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, playlist_id)
+        geoblocking_msg = self._html_search_regex(
+            r'<p class="soon-txt">(.*?)</p>', webpage, 'geoblocking message',
+            default=None)
+        if geoblocking_msg is not None:
+            raise ExtractorError(
+                'Deezer said: %s' % geoblocking_msg, expected=True)
+
+        data_json = self._search_regex(
+            (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>',
+             r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'),
+            webpage, 'data JSON')
+        data = json.loads(data_json)
+
+        playlist_title = data.get('DATA', {}).get('TITLE')
+        playlist_uploader = data.get('DATA', {}).get('PARENT_USERNAME')
+        playlist_thumbnail = self._search_regex(
+            r'<img id="naboo_playlist_image".*?src="([^"]+)"', webpage,
+            'playlist thumbnail')
+
+        preview_pattern = self._search_regex(
+            r"var SOUND_PREVIEW_GATEWAY\s*=\s*'([^']+)';", webpage,
+            'preview URL pattern', fatal=False)
+        entries = []
+        for s in data['SONGS']['data']:
+            puid = s['MD5_ORIGIN']
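+            # SOUND_PREVIEW_GATEWAY is a URL template whose {0}/{1}/{2}
+            # placeholders take the first character of MD5_ORIGIN, the full
+            # hash and the media version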
+            preview_video_url = (
+                preview_pattern
+                .replace('{0}', puid[0])
+                .replace('{1}', puid)
+                .replace('{2}', s['MEDIA_VERSION']))
+            formats = [{
+                'format_id': 'preview',
+                'url': preview_video_url,
+                'preference': -100,  # Only the first 30 seconds
+                'ext': 'mp3',
+            }]
+            self._sort_formats(formats)
+            artists = ', '.join(
+                orderedSet(a['ART_NAME'] for a in s['ARTISTS']))
+            entries.append({
+                'id': s['SNG_ID'],
+                'duration': int_or_none(s.get('DURATION')),
+                'title': '%s - %s' % (artists, s['SNG_TITLE']),
+                'uploader': s['ART_NAME'],
+                'uploader_id': s['ART_ID'],
+                'age_limit': 16 if s.get('EXPLICIT_LYRICS') == '1' else 0,
+                'formats': formats,
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_title,
+            'uploader': playlist_uploader,
+            'thumbnail': playlist_thumbnail,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py
new file mode 100644 (file)
index 0000000..9fe144e
--- /dev/null
@@ -0,0 +1,39 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class DefenseGouvFrIE(InfoExtractor):
+    IE_NAME = 'defense.gouv.fr'
+    _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)'
+
+    _TEST = {
+        'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1',
+        'md5': '75bba6124da7e63d2d60b5244ec9430c',
+        'info_dict': {
+            'id': '11213',
+            'ext': 'mp4',
+            'title': 'attaque-chimique-syrienne-du-21-aout-2013-1'
+        }
+    }
+
+    def _real_extract(self, url):
+        title = self._match_id(url)
+        webpage = self._download_webpage(url, title)
+
+        video_id = self._search_regex(
+            r"flashvars.pvg_id=\"(\d+)\";",
+            webpage, 'ID')
+
+        json_url = (
+            'http://static.videos.gouv.fr/brightcovehub/export/json/%s' %
+            video_id)
+        info = self._download_json(json_url, title, 'Downloading JSON config')
+        video_url = info['renditions'][0]['url']
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'url': video_url,
+            'title': title,
+        }
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
new file mode 100644 (file)
index 0000000..5c9c0ec
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import os.path
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    url_basename,
+    remove_start,
+)
+
+
+class DemocracynowIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)'
+    IE_NAME = 'democracynow'
+    _TESTS = [{
+        'url': 'http://www.democracynow.org/shows/2015/7/3',
+        'md5': '3757c182d3d84da68f5c8f506c18c196',
+        'info_dict': {
+            'id': '2015-0703-001',
+            'ext': 'mp4',
+            'title': 'Daily Show for July 03, 2015',
+            'description': 'md5:80eb927244d6749900de6072c7cc2c86',
+        },
+    }, {
+        'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
+        'info_dict': {
+            'id': '2015-0703-001',
+            'ext': 'mp4',
+            'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
+            'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        json_data = self._parse_json(self._search_regex(
+            r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
+            display_id)
+
+        title = json_data['title']
+        formats = []
+
+        video_id = None
+
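+        # The id is derived from the media filename, e.g. a URL ending in
+        # dn2015-0703-001.mp4 yields '2015-0703-001' (cf. the test ids above).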
+        for key in ('file', 'audio', 'video', 'high_res_video'):
+            media_url = json_data.get(key, '')
+            if not media_url:
+                continue
+            media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
+            video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
+            formats.append({
+                'url': media_url,
+                'vcodec': 'none' if key == 'audio' else None,
+            })
+
+        self._sort_formats(formats)
+
+        default_lang = 'en'
+        subtitles = {}
+
+        def add_subtitle_item(lang, info_dict):
+            if lang not in subtitles:
+                subtitles[lang] = []
+            subtitles[lang].append(info_dict)
+
+        # chapter_file entries are not subtitles
+        if 'caption_file' in json_data:
+            add_subtitle_item(default_lang, {
+                'url': compat_urlparse.urljoin(url, json_data['caption_file']),
+            })
+
+        for subtitle_item in json_data.get('captions', []):
+            lang = subtitle_item.get('language', '').lower() or default_lang
+            add_subtitle_item(lang, {
+                'url': compat_urlparse.urljoin(url, subtitle_item['url']),
+            })
+
+        description = self._og_search_description(webpage, default=None)
+
+        return {
+            'id': video_id or display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': json_data.get('image'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py
new file mode 100644 (file)
index 0000000..a4d0448
--- /dev/null
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class DFBIE(InfoExtractor):
+    IE_NAME = 'tv.dfb.de'
+    _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
+        'md5': 'ac0f98a52a330f700b4b3034ad240649',
+        'info_dict': {
+            'id': '11633',
+            'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
+            'ext': 'mp4',
+            'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
+            'upload_date': '20150714',
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        player_info = self._download_xml(
+            'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
+            display_id)
+        video_info = player_info.find('video')
+        stream_access_url = self._proto_relative_url(video_info.find('url').text.strip())
+
+        formats = []
+        # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats
+        for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'):
+            stream_access_info = self._download_xml(sa_url, display_id)
+            token_el = stream_access_info.find('token')
+            manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth']
+            if '.f4m' in manifest_url:
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url + '&hdcore=3.2.0',
+                    display_id, f4m_id='hds', fatal=False))
+            else:
+                formats.extend(self._extract_m3u8_formats(
+                    manifest_url, display_id, 'mp4',
+                    'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': video_info.find('title').text,
+            'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id,
+            'upload_date': unified_strdate(video_info.find('time_date').text),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py
new file mode 100644 (file)
index 0000000..aee72a6
--- /dev/null
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class DHMIE(InfoExtractor):
+    IE_DESC = 'Filmarchiv - Deutsches Historisches Museum'
+    _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/',
+        'md5': '11c475f670209bf6acca0b2b7ef51827',
+        'info_dict': {
+            'id': 'the-marshallplan-at-work-in-west-germany',
+            'ext': 'flv',
+            'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE',
+            'description': 'md5:1fabd480c153f97b07add61c44407c82',
+            'duration': 660,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/',
+        'md5': '09890226332476a3e3f6f2cb74734aa5',
+        'info_dict': {
+            'id': 'rolle-1',
+            'ext': 'flv',
+            'title': 'ROLLE 1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        playlist_url = self._search_regex(
+            r"file\s*:\s*'([^']+)'", webpage, 'playlist url')
+
+        entries = self._extract_xspf_playlist(playlist_url, playlist_id)
+
+        title = self._search_regex(
+            [r'dc:title="([^"]+)"', r'<title> &raquo;([^<]+)</title>'],
+            webpage, 'title').strip()
+        description = self._html_search_regex(
+            r'<p><strong>Description:</strong>(.+?)</p>',
+            webpage, 'description', default=None)
+        duration = parse_duration(self._search_regex(
+            r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)',
+            webpage, 'duration', default=None))
+
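+        # The XSPF playlist itself carries little metadata, so the first
+        # (main) entry is enriched with title, description and duration
+        # scraped from the page.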
+        entries[0].update({
+            'title': title,
+            'description': description,
+            'duration': duration,
+        })
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/digg.py b/youtube_dl/extractor/digg.py
new file mode 100644 (file)
index 0000000..913c175
--- /dev/null
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class DiggIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?digg\.com/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        # JWPlatform via provider
+        'url': 'http://digg.com/video/sci-fi-short-jonah-daniel-kaluuya-get-out',
+        'info_dict': {
+            'id': 'LcqvmS0b',
+            'ext': 'mp4',
+            'title': "'Get Out' Star Daniel Kaluuya Goes On 'Moby Dick'-Like Journey In Sci-Fi Short 'Jonah'",
+            'description': 'md5:541bb847648b6ee3d6514bc84b82efda',
+            'upload_date': '20180109',
+            'timestamp': 1515530551,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # Youtube via provider
+        'url': 'http://digg.com/video/dog-boat-seal-play',
+        'only_matching': True,
+    }, {
+        # vimeo as regular embed
+        'url': 'http://digg.com/video/dream-girl-short-film',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        info = self._parse_json(
+            self._search_regex(
+                r'(?s)video_info\s*=\s*({.+?});\n', webpage, 'video info',
+                default='{}'), display_id, transform_source=js_to_json,
+            fatal=False)
+
+        video_id = info.get('video_id')
+
+        if video_id:
+            provider = info.get('provider_name')
+            if provider == 'youtube':
+                return self.url_result(
+                    video_id, ie='Youtube', video_id=video_id)
+            elif provider == 'jwplayer':
+                return self.url_result(
+                    'jwplatform:%s' % video_id, ie='JWPlatform',
+                    video_id=video_id)
+
+        return self.url_result(url, 'Generic')
diff --git a/youtube_dl/extractor/digiteka.py b/youtube_dl/extractor/digiteka.py
new file mode 100644 (file)
index 0000000..3dfde0d
--- /dev/null
@@ -0,0 +1,112 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class DigitekaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?(?:digiteka\.net|ultimedia\.com)/
+        (?:
+            deliver/
+            (?P<embed_type>
+                generic|
+                musique
+            )
+            (?:/[^/]+)*/
+            (?:
+                src|
+                article
+            )|
+            default/index/video
+            (?P<site_type>
+                generic|
+                music
+            )
+            /id
+        )/(?P<id>[\d+a-z]+)'''
+    _TESTS = [{
+        # news
+        'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r',
+        'md5': '276a0e49de58c7e85d32b057837952a2',
+        'info_dict': {
+            'id': 's8uk0r',
+            'ext': 'mp4',
+            'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 74,
+            'upload_date': '20150317',
+            'timestamp': 1426604939,
+            'uploader_id': '3fszv',
+        },
+    }, {
+        # music
+        'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8',
+        'md5': '2ea3513813cf230605c7e2ffe7eca61c',
+        'info_dict': {
+            'id': 'xvpfp8',
+            'ext': 'mp4',
+            'title': 'Two - C\'est La Vie (clip)',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 233,
+            'upload_date': '20150224',
+            'timestamp': 1424760500,
+            'uploader_id': '3rfzk',
+        },
+    }, {
+        'url': 'https://www.digiteka.net/deliver/generic/iframe/mdtk/01637594/src/lqm3kl/zone/1/showtitle/1/autoplay/yes',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        video_type = mobj.group('embed_type') or mobj.group('site_type')
+        if video_type == 'music':
+            video_type = 'musique'
+
+        deliver_info = self._download_json(
+            'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type),
+            video_id)
+
+        yt_id = deliver_info.get('yt_id')
+        if yt_id:
+            return self.url_result(yt_id, 'Youtube')
+
+        jwconf = deliver_info['jwconf']
+
+        formats = []
+        for source in jwconf['playlist'][0]['sources']:
+            formats.append({
+                'url': source['file'],
+                'format_id': source.get('label'),
+            })
+
+        self._sort_formats(formats)
+
+        title = deliver_info['title']
+        thumbnail = jwconf.get('image')
+        duration = int_or_none(deliver_info.get('duration'))
+        timestamp = int_or_none(deliver_info.get('release_time'))
+        uploader_id = deliver_info.get('owner_id')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader_id': uploader_id,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
new file mode 100644 (file)
index 0000000..e0139cc
--- /dev/null
@@ -0,0 +1,118 @@
+from __future__ import unicode_literals
+
+import random
+import re
+import string
+
+from .discoverygo import DiscoveryGoBaseIE
+from ..compat import compat_urllib_parse_unquote
+from ..utils import ExtractorError
+from ..compat import compat_HTTPError
+
+
+class DiscoveryIE(DiscoveryGoBaseIE):
+    _VALID_URL = r'''(?x)https?://
+        (?P<site>
+            go\.discovery|
+            www\.
+                (?:
+                    investigationdiscovery|
+                    discoverylife|
+                    animalplanet|
+                    ahctv|
+                    destinationamerica|
+                    sciencechannel|
+                    tlc
+                )|
+            watch\.
+                (?:
+                    hgtv|
+                    foodnetwork|
+                    travelchannel|
+                    diynetwork|
+                    cookingchanneltv|
+                    motortrend
+                )
+        )\.com/tv-shows/(?P<show_slug>[^/]+)/(?:video|full-episode)s/(?P<id>[^./?#]+)'''
+    _TESTS = [{
+        'url': 'https://go.discovery.com/tv-shows/cash-cab/videos/riding-with-matthew-perry',
+        'info_dict': {
+            'id': '5a2f35ce6b66d17a5026e29e',
+            'ext': 'mp4',
+            'title': 'Riding with Matthew Perry',
+            'description': 'md5:a34333153e79bc4526019a5129e7f878',
+            'duration': 84,
+        },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        }
+    }, {
+        'url': 'https://www.investigationdiscovery.com/tv-shows/final-vision/full-episodes/final-vision',
+        'only_matching': True,
+    }, {
+        'url': 'https://go.discovery.com/tv-shows/alaskan-bush-people/videos/follow-your-own-road',
+        'only_matching': True,
+    }, {
+        # using `show_slug` is important to get the correct video data
+        'url': 'https://www.sciencechannel.com/tv-shows/mythbusters-on-science/full-episodes/christmas-special',
+        'only_matching': True,
+    }]
+    _GEO_COUNTRIES = ['US']
+    _GEO_BYPASS = False
+    _API_BASE_URL = 'https://api.discovery.com/v1/'
+
+    def _real_extract(self, url):
+        site, show_slug, display_id = re.match(self._VALID_URL, url).groups()
+
+        access_token = None
+        cookies = self._get_cookies(url)
+
+        # prefer Affiliate Auth Token over Anonymous Auth Token
+        auth_storage_cookie = cookies.get('eosAf') or cookies.get('eosAn')
+        if auth_storage_cookie and auth_storage_cookie.value:
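+            # the cookie value is URL-encoded twice, hence the double unquote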
+            auth_storage = self._parse_json(compat_urllib_parse_unquote(
+                compat_urllib_parse_unquote(auth_storage_cookie.value)),
+                display_id, fatal=False) or {}
+            access_token = auth_storage.get('a') or auth_storage.get('access_token')
+
+        if not access_token:
+            access_token = self._download_json(
+                'https://%s.com/anonymous' % site, display_id,
+                'Downloading token JSON metadata', query={
+                    'authRel': 'authorization',
+                    'client_id': '3020a40c2356a645b4b4',
+                    'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]),
+                    'redirectUri': 'https://www.discovery.com/',
+                })['access_token']
+
+        headers = self.geo_verification_headers()
+        headers['Authorization'] = 'Bearer ' + access_token
+
+        try:
+            video = self._download_json(
+                self._API_BASE_URL + 'content/videos',
+                display_id, 'Downloading content JSON metadata',
+                headers=headers, query={
+                    'embed': 'show.name',
+                    'fields': 'authenticated,description.detailed,duration,episodeNumber,id,name,parental.rating,season.number,show,tags',
+                    'slug': display_id,
+                    'show_slug': show_slug,
+                })[0]
+            video_id = video['id']
+            stream = self._download_json(
+                self._API_BASE_URL + 'streaming/video/' + video_id,
+                display_id, 'Downloading streaming JSON metadata', headers=headers)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
+                e_description = self._parse_json(
+                    e.cause.read().decode(), display_id)['description']
+                if 'resource not available for country' in e_description:
+                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                if 'Authorized Networks' in e_description:
+                    raise ExtractorError(
+                        'This video is only available via a cable service provider subscription,'
+                        ' which is not currently supported. You may want to use --cookies.', expected=True)
+                raise ExtractorError(e_description)
+            raise
+
+        return self._extract_video_info(video, stream, display_id)
diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py
new file mode 100644 (file)
index 0000000..9e7b14a
--- /dev/null
@@ -0,0 +1,175 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    remove_end,
+    unescapeHTML,
+    url_or_none,
+)
+
+
+class DiscoveryGoBaseIE(InfoExtractor):
+    _VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?:
+            discovery|
+            investigationdiscovery|
+            discoverylife|
+            animalplanet|
+            ahctv|
+            destinationamerica|
+            sciencechannel|
+            tlc|
+            velocitychannel
+        )go\.com/%s(?P<id>[^/?#&]+)'''
+
+    def _extract_video_info(self, video, stream, display_id):
+        title = video['name']
+
+        if not stream:
+            if video.get('authenticated') is True:
+                raise ExtractorError(
+                    'This video is only available via a cable service provider subscription,'
+                    ' which is not currently supported. You may want to use --cookies.', expected=True)
+            else:
+                raise ExtractorError('Unable to find stream')
+        STREAM_URL_SUFFIX = 'streamUrl'
+        formats = []
+        for stream_kind in ('', 'hds'):
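+            # build 'streamUrl' / 'hdsStreamUrl'; upper-case only the first
+            # letter (str.capitalize() would also lowercase the inner 'U'
+            # and miss the camelCase key)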
+            suffix = (STREAM_URL_SUFFIX[0].upper() + STREAM_URL_SUFFIX[1:]) if stream_kind else STREAM_URL_SUFFIX
+            stream_url = stream.get('%s%s' % (stream_kind, suffix))
+            if not stream_url:
+                continue
+            if stream_kind == '':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, display_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif stream_kind == 'hds':
+                formats.extend(self._extract_f4m_formats(
+                    stream_url, display_id, f4m_id=stream_kind, fatal=False))
+        self._sort_formats(formats)
+
+        video_id = video.get('id') or display_id
+        description = video.get('description', {}).get('detailed')
+        duration = int_or_none(video.get('duration'))
+
+        series = video.get('show', {}).get('name')
+        season_number = int_or_none(video.get('season', {}).get('number'))
+        episode_number = int_or_none(video.get('episodeNumber'))
+
+        tags = video.get('tags')
+        age_limit = parse_age_limit(video.get('parental', {}).get('rating'))
+
+        subtitles = {}
+        captions = stream.get('captions')
+        if isinstance(captions, list):
+            for caption in captions:
+                subtitle_url = url_or_none(caption.get('fileUrl'))
+                if not subtitle_url or not subtitle_url.startswith('http'):
+                    continue
+                lang = caption.get('fileLang', 'en')
+                ext = determine_ext(subtitle_url)
+                subtitles.setdefault(lang, []).append({
+                    'url': subtitle_url,
+                    'ext': 'ttml' if ext == 'xml' else ext,
+                })
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'series': series,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'tags': tags,
+            'age_limit': age_limit,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class DiscoveryGoIE(DiscoveryGoBaseIE):
+    _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+'
+    _GEO_COUNTRIES = ['US']
+    _TEST = {
+        'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/',
+        'info_dict': {
+            'id': '58c167d86b66d12f2addeb01',
+            'ext': 'mp4',
+            'title': 'Reaper Madness',
+            'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78',
+            'duration': 2519,
+            'series': 'Bering Sea Gold',
+            'season_number': 8,
+            'episode_number': 6,
+            'age_limit': 14,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        container = extract_attributes(
+            self._search_regex(
+                r'(<div[^>]+class=["\']video-player-container[^>]+>)',
+                webpage, 'video container'))
+
+        video = self._parse_json(
+            container.get('data-video') or container.get('data-json'),
+            display_id)
+
+        stream = video.get('stream')
+
+        return self._extract_video_info(video, stream, display_id)
+
+
+class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE):
+    _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % ''
+    _TEST = {
+        'url': 'https://www.discoverygo.com/bering-sea-gold/',
+        'info_dict': {
+            'id': 'bering-sea-gold',
+            'title': 'Bering Sea Gold',
+            'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e',
+        },
+        'playlist_mincount': 6,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if DiscoveryGoIE.suitable(url) else super(
+            DiscoveryGoPlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        entries = []
+        for mobj in re.finditer(r'data-json=(["\'])(?P<json>{.+?})\1', webpage):
+            data = self._parse_json(
+                mobj.group('json'), display_id,
+                transform_source=unescapeHTML, fatal=False)
+            if not isinstance(data, dict) or data.get('type') != 'episode':
+                continue
+            episode_url = data.get('socialUrl')
+            if not episode_url:
+                continue
+            entries.append(self.url_result(
+                episode_url, ie=DiscoveryGoIE.ie_key(),
+                video_id=data.get('id')))
+
+        return self.playlist_result(
+            entries, display_id,
+            remove_end(self._og_search_title(
+                webpage, fatal=False), ' | Discovery GO'),
+            self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py
new file mode 100644 (file)
index 0000000..607a549
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .dplay import DPlayIE
+
+
+class DiscoveryNetworksDeIE(DPlayIE):
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100',
+        'info_dict': {
+            'id': '78867',
+            'ext': 'mp4',
+            'title': 'Die Welt da draußen',
+            'description': 'md5:61033c12b73286e409d99a41742ef608',
+            'timestamp': 1554069600,
+            'upload_date': '20190331',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        domain, programme, alternate_id = re.match(self._VALID_URL, url).groups()
+        country = 'GB' if domain == 'dplay.co.uk' else 'DE'
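+        # the disco-api realm is the domain with the dot removed (e.g. 'tlcde',
+        # 'dmaxde'); dplay.co.uk uses the 'questuk' realm instead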
+        realm = 'questuk' if country == 'GB' else domain.replace('.', '')
+        return self._get_disco_api_info(
+            url, '%s/%s' % (programme, alternate_id),
+            'sonic-eu1-prod.disco-api.com', realm, country)
diff --git a/youtube_dl/extractor/discoveryvr.py b/youtube_dl/extractor/discoveryvr.py
new file mode 100644 (file)
index 0000000..cb63c26
--- /dev/null
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class DiscoveryVRIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction',
+        'md5': '32b1929798c464a54356378b7912eca4',
+        'info_dict': {
+            'id': 'discovery-vr-an-introduction',
+            'ext': 'mp4',
+            'title': 'Discovery VR - An Introduction',
+            'description': 'md5:80d418a10efb8899d9403e61d8790f06',
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        bootstrap_data = self._search_regex(
+            r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";',
+            webpage, 'bootstrap data')
+        bootstrap_data = self._parse_json(
+            bootstrap_data.encode('utf-8').decode('unicode_escape'),
+            display_id)
+        videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos']
+        video_data = next(video for video in videos if video.get('slug') == display_id)
+
+        series = video_data.get('showTitle')
+        title = episode = video_data.get('title') or series
+        if series and series != title:
+            title = '%s - %s' % (series, title)
+
+        formats = []
+        for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')):
+            f_url = video_data.get(f)
+            if not f_url:
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': f_url,
+            })
+
+        return {
+            'id': display_id,
+            'display_id': display_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('thumbnail'),
+            'duration': parse_duration(video_data.get('runTime')),
+            'formats': formats,
+            'episode': episode,
+            'series': series,
+        }
diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py
new file mode 100644 (file)
index 0000000..0eee82f
--- /dev/null
@@ -0,0 +1,170 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+    compat_str,
+    determine_ext,
+    ExtractorError,
+    update_url_query,
+)
+
+
+class DisneyIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://(?P<domain>(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P<id>[a-z0-9]{24})|(?:[^/]+/)?(?P<display_id>[^/?#]+))'''
+    _TESTS = [{
+        # Disney.EmbedVideo
+        'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977',
+        'info_dict': {
+            'id': '545ed1857afee5a0ec239977',
+            'ext': 'mp4',
+            'title': 'Moana - Trailer',
+            'description': 'A fun adventure for the entire Family!  Bring home Moana on Digital HD Feb 21 & Blu-ray March 7',
+            'upload_date': '20170112',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # Grill.burger
+        'url': 'http://www.starwars.com/video/rogue-one-a-star-wars-story-intro-featurette',
+        'info_dict': {
+            'id': '5454e9f4e9804a552e3524c8',
+            'ext': 'mp4',
+            'title': '"Intro" Featurette: Rogue One: A Star Wars Story',
+            'upload_date': '20170104',
+            'description': 'Go behind-the-scenes of Rogue One: A Star Wars Story in this featurette with Director Gareth Edwards and the cast of the film.',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2',
+        'only_matching': True,
+    }, {
+        'url': 'http://video.en.disneyme.com/watch/future-worm/robo-carp-2001-544b66002aa7353cdd3f5114',
+        'only_matching': True,
+    }, {
+        'url': 'http://video.disneyturkiye.com.tr/izle/7c-7-cuceler/kimin-sesi-zaten-5456f3d015f6b36c8afdd0e2',
+        'only_matching': True,
+    }, {
+        'url': 'http://disneyjunior.disney.com/embed/546a4798ddba3d1612e4005d',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.starwars.com/embed/54690d1e6c42e5f09a0fb097',
+        'only_matching': True,
+    }, {
+        'url': 'http://spiderman.marvelkids.com/embed/522900d2ced3c565e4cc0677',
+        'only_matching': True,
+    }, {
+        'url': 'http://spiderman.marvelkids.com/videos/contest-of-champions-part-four-clip-1',
+        'only_matching': True,
+    }, {
+        'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo',
+        'only_matching': True,
+    }, {
+        'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268',
+        'only_matching': True,
+    }, {
+        'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+        if not video_id:
+            webpage = self._download_webpage(url, display_id)
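+            # page data lives in a Grill.burger JS object whose strings are
+            # split by '" + "' concatenations; collapse them before parsing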
+            grill = re.sub(r'"\s*\+\s*"', '', self._search_regex(
+                r'Grill\.burger\s*=\s*({.+})\s*:',
+                webpage, 'grill data'))
+            page_data = next(s for s in self._parse_json(grill, display_id)['stack'] if s.get('type') == 'video')
+            video_data = page_data['data'][0]
+        else:
+            webpage = self._download_webpage(
+                'http://%s/embed/%s' % (domain, video_id), video_id)
+            page_data = self._parse_json(self._search_regex(
+                r'Disney\.EmbedVideo\s*=\s*({.+});',
+                webpage, 'embed data'), video_id)
+            video_data = page_data['video']
+
+        for external in video_data.get('externals', []):
+            if external.get('source') == 'vevo':
+                return self.url_result('vevo:' + external['data_id'], 'Vevo')
+
+        video_id = video_data['id']
+        title = video_data['title']
+
+        formats = []
+        for flavor in video_data.get('flavors', []):
+            flavor_format = flavor.get('format')
+            flavor_url = flavor.get('url')
+            if not flavor_url or not re.match(r'https?://', flavor_url) or flavor_format == 'mp4_access':
+                continue
+            tbr = int_or_none(flavor.get('bitrate'))
+            if tbr == 99999:
+                # a wrong ks (Kaltura Signature) causes a 404 error
+                flavor_url = update_url_query(flavor_url, {'ks': ''})
+                m3u8_formats = self._extract_m3u8_formats(
+                    flavor_url, video_id, 'mp4',
+                    m3u8_id=flavor_format, fatal=False)
+                for f in m3u8_formats:
+                    # Apple FairPlay
+                    if '/fpshls/' in f['url']:
+                        continue
+                    formats.append(f)
+                continue
+            format_id = []
+            if flavor_format:
+                format_id.append(flavor_format)
+            if tbr:
+                format_id.append(compat_str(tbr))
+            ext = determine_ext(flavor_url)
+            if flavor_format == 'applehttp' or ext == 'm3u8':
+                ext = 'mp4'
+            width = int_or_none(flavor.get('width'))
+            height = int_or_none(flavor.get('height'))
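+            # flavors reported as 0x0 carry no video track, so mark them audio-only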
+            formats.append({
+                'format_id': '-'.join(format_id),
+                'url': flavor_url,
+                'width': width,
+                'height': height,
+                'tbr': tbr,
+                'ext': ext,
+                'vcodec': 'none' if (width == 0 and height == 0) else None,
+            })
+        if not formats and video_data.get('expired'):
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']),
+                expected=True)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for caption in video_data.get('captions', []):
+            caption_url = caption.get('url')
+            caption_format = caption.get('format')
+            if not caption_url or caption_format.startswith('unknown'):
+                continue
+            subtitles.setdefault(caption.get('language', 'en'), []).append({
+                'url': caption_url,
+                'ext': {
+                    'webvtt': 'vtt',
+                }.get(caption_format, caption_format),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description') or video_data.get('short_desc'),
+            'thumbnail': video_data.get('thumb') or video_data.get('thumb_secure'),
+            'duration': int_or_none(video_data.get('duration_sec')),
+            'upload_date': unified_strdate(video_data.get('publish_date')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py
new file mode 100644 (file)
index 0000000..c345e02
--- /dev/null
@@ -0,0 +1,125 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    remove_end,
+    xpath_element,
+    xpath_text,
+)
+
+
+class DigitallySpeakingIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+
+    _TESTS = [{
+        # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
+        'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml',
+        'md5': 'a8efb6c31ed06ca8739294960b2dbabd',
+        'info_dict': {
+            'id': '840376_BQRC',
+            'ext': 'mp4',
+            'title': 'Tenacious Design and The Interface of \'Destiny\'',
+        },
+    }, {
+        # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
+        'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
+        'only_matching': True,
+    }, {
+        # From http://www.gdcvault.com/play/1013700/Advanced-Material
+        'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
+        'only_matching': True,
+    }]
+
+    def _parse_mp4(self, metadata):
+        video_formats = []
+        video_root = None
+
+        mp4_video = xpath_text(metadata, './mp4video', default=None)
+        if mp4_video is not None:
+            mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video)
+            video_root = mobj.group('root')
+        if video_root is None:
+            http_host = xpath_text(metadata, 'httpHost', default=None)
+            if http_host:
+                video_root = 'http://%s/' % http_host
+        if video_root is None:
+            # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js
+            # Works for GPUTechConf, too
+            video_root = 'http://s3-2u.digitallyspeaking.com/'
+
+        formats = metadata.findall('./MBRVideos/MBRVideo')
+        if not formats:
+            return None
+        for a_format in formats:
+            stream_name = xpath_text(a_format, 'streamName', fatal=True)
+            video_path = re.match(r'mp4\:(?P<path>.*)', stream_name).group('path')
+            url = video_root + video_path
+            bitrate = xpath_text(a_format, 'bitrate')
+            tbr = int_or_none(bitrate)
+            vbr = int_or_none(self._search_regex(
+                r'-(\d+)\.mp4', video_path, 'vbr', default=None))
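+            # the bitrate embedded in the file name is video-only; subtracting
+            # it from the total bitrate estimates the audio bitrate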
+            abr = tbr - vbr if tbr and vbr else None
+            video_formats.append({
+                'format_id': bitrate,
+                'url': url,
+                'tbr': tbr,
+                'vbr': vbr,
+                'abr': abr,
+            })
+        return video_formats
+
+    def _parse_flv(self, metadata):
+        formats = []
+        akamai_url = xpath_text(metadata, './akamaiHost', fatal=True)
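+        # FLV talks are served as separate RTMP streams: audio-only tracks,
+        # the slide deck video and the speaker video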
+        audios = metadata.findall('./audios/audio')
+        for audio in audios:
+            formats.append({
+                'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+                'play_path': remove_end(audio.get('url'), '.flv'),
+                'ext': 'flv',
+                'vcodec': 'none',
+                'format_id': audio.get('code'),
+            })
+        slide_video_path = xpath_text(metadata, './slideVideo', fatal=True)
+        formats.append({
+            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+            'play_path': remove_end(slide_video_path, '.flv'),
+            'ext': 'flv',
+            'format_note': 'slide deck video',
+            'quality': -2,
+            'preference': -2,
+            'format_id': 'slides',
+        })
+        speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True)
+        formats.append({
+            'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+            'play_path': remove_end(speaker_video_path, '.flv'),
+            'ext': 'flv',
+            'format_note': 'speaker video',
+            'quality': -1,
+            'preference': -1,
+            'format_id': 'speaker',
+        })
+        return formats
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        xml_description = self._download_xml(url, video_id)
+        metadata = xpath_element(xml_description, 'metadata')
+
+        video_formats = self._parse_mp4(metadata)
+        if video_formats is None:
+            video_formats = self._parse_flv(metadata)
+
+        return {
+            'id': video_id,
+            'formats': video_formats,
+            'title': xpath_text(metadata, 'title', fatal=True),
+            'duration': parse_duration(xpath_text(metadata, 'endTime')),
+            'creator': xpath_text(metadata, 'speaker'),
+        }
diff --git a/youtube_dl/extractor/dlive.py b/youtube_dl/extractor/dlive.py
new file mode 100644 (file)
index 0000000..d95c67a
--- /dev/null
@@ -0,0 +1,97 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class DLiveVODIE(InfoExtractor):
+    IE_NAME = 'dlive:vod'
+    _VALID_URL = r'https?://(?:www\.)?dlive\.tv/p/(?P<uploader_id>.+?)\+(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://dlive.tv/p/pdp+3mTzOl4WR',
+        'info_dict': {
+            'id': '3mTzOl4WR',
+            'ext': 'mp4',
+            'title': 'Minecraft with james charles epic',
+            'upload_date': '20190701',
+            'timestamp': 1562011015,
+            'uploader_id': 'pdp',
+        }
+    }, {
+        'url': 'https://dlive.tv/p/pdpreplay+D-RD-xSZg',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        uploader_id, vod_id = re.match(self._VALID_URL, url).groups()
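+        # DLive exposes a single GraphQL endpoint; past broadcasts are keyed
+        # by permlink, i.e. '<uploader_id>+<vod_id>'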
+        broadcast = self._download_json(
+            'https://graphigo.prd.dlive.tv/', vod_id,
+            data=json.dumps({'query': '''query {
+  pastBroadcast(permlink:"%s+%s") {
+    content
+    createdAt
+    length
+    playbackUrl
+    title
+    thumbnailUrl
+    viewCount
+  }
+}''' % (uploader_id, vod_id)}).encode())['data']['pastBroadcast']
+        title = broadcast['title']
+        formats = self._extract_m3u8_formats(
+            broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+        return {
+            'id': vod_id,
+            'title': title,
+            'uploader_id': uploader_id,
+            'formats': formats,
+            'description': broadcast.get('content'),
+            'thumbnail': broadcast.get('thumbnailUrl'),
+            'timestamp': int_or_none(broadcast.get('createdAt'), 1000),
+            'view_count': int_or_none(broadcast.get('viewCount')),
+        }
+
+
+class DLiveStreamIE(InfoExtractor):
+    IE_NAME = 'dlive:stream'
+    _VALID_URL = r'https?://(?:www\.)?dlive\.tv/(?!p/)(?P<id>[\w.-]+)'
+
+    def _real_extract(self, url):
+        display_name = self._match_id(url)
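+        # the same GraphQL endpoint resolves a display name to the channel's
+        # current livestream and canonical username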
+        user = self._download_json(
+            'https://graphigo.prd.dlive.tv/', display_name,
+            data=json.dumps({'query': '''query {
+  userByDisplayName(displayname:"%s") {
+    livestream {
+      content
+      createdAt
+      title
+      thumbnailUrl
+      watchingCount
+    }
+    username
+  }
+}''' % display_name}).encode())['data']['userByDisplayName']
+        livestream = user['livestream']
+        title = livestream['title']
+        username = user['username']
+        formats = self._extract_m3u8_formats(
+            'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username,
+            display_name, 'mp4')
+        self._sort_formats(formats)
+        return {
+            'id': display_name,
+            'title': self._live_title(title),
+            'uploader': display_name,
+            'uploader_id': username,
+            'formats': formats,
+            'description': livestream.get('content'),
+            'thumbnail': livestream.get('thumbnailUrl'),
+            'is_live': True,
+            'timestamp': int_or_none(livestream.get('createdAt'), 1000),
+            'view_count': int_or_none(livestream.get('watchingCount')),
+        }
diff --git a/youtube_dl/extractor/doodstream.py b/youtube_dl/extractor/doodstream.py
new file mode 100644 (file)
index 0000000..2c9ea68
--- /dev/null
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import string
+import random
+import time
+
+from .common import InfoExtractor
+
+
+class DoodStreamIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)'
+    _TESTS = [{
+        'url': 'http://dood.to/e/5s1wmbdacezb',
+        'md5': '4568b83b31e13242b3f1ff96c55f0595',
+        'info_dict': {
+            'id': '5s1wmbdacezb',
+            'ext': 'mp4',
+            'title': 'Kat Wonders - Monthly May 2020',
+            'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
+            'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
+        }
+    }, {
+        'url': 'https://dood.to/d/jzrxn12t2s7n',
+        'md5': '3207e199426eca7c2aa23c2872e6728a',
+        'info_dict': {
+            'id': 'jzrxn12t2s7n',
+            'ext': 'mp4',
+            'title': 'Stacy Cruz Cute ALLWAYSWELL',
+            'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com',
+            'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
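+        # download pages (/d/) only embed the real player page (/e/); fetch it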
+        if '/d/' in url:
+            url = "https://dood.to" + self._html_search_regex(
+                r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed')
+            video_id = self._match_id(url)
+            webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta(['og:title', 'twitter:title'],
+                                       webpage, default=None)
+        thumb = self._html_search_meta(['og:image', 'twitter:image'],
+                                       webpage, default=None)
+        token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token')
+        description = self._html_search_meta(
+            ['og:description', 'description', 'twitter:description'],
+            webpage, default=None)
+        auth_url = 'https://dood.to' + self._html_search_regex(
+            r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0',
+            'Referer': url,
+        }
+
+        webpage = self._download_webpage(auth_url, video_id, headers=headers)
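+        # pass_md5 returns a bare URL prefix; the player completes it with a
+        # random 10-character tail, the page token and an expiry timestamp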
+        final_url = '%s%s?token=%s&expiry=%d' % (
+            webpage,
+            ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(10)),
+            token, int(time.time() * 1000))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': final_url,
+            'http_headers': headers,
+            'ext': 'mp4',
+            'description': description,
+            'thumbnail': thumb,
+        }
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
new file mode 100644 (file)
index 0000000..148605c
--- /dev/null
@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+)
+
+
+class DotsubIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09',
+        'md5': '21c7ff600f545358134fea762a6d42b6',
+        'info_dict': {
+            'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09',
+            'ext': 'flv',
+            'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever',
+            'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6',
+            'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p',
+            'duration': 198,
+            'uploader': 'liuxt',
+            'timestamp': 1385778501.104,
+            'upload_date': '20131130',
+            'view_count': int,
+        }
+    }, {
+        'url': 'https://dotsub.com/view/747bcf58-bd59-45b7-8c8c-ac312d084ee6',
+        'md5': '2bb4a83896434d5c26be868c609429a3',
+        'info_dict': {
+            'id': '168006778',
+            'ext': 'mp4',
+            'title': 'Apartments and flats in Raipur the white symphony',
+            'description': 'md5:784d0639e6b7d1bc29530878508e38fe',
+            'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',
+            'duration': 290,
+            'timestamp': 1476767794.2809999,
+            'upload_date': '20161018',
+            'uploader': 'parthivi001',
+            'uploader_id': 'user52596202',
+            'view_count': int,
+        },
+        'add_ie': ['Vimeo'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'https://dotsub.com/api/media/%s/metadata' % video_id, video_id)
+        video_url = info.get('mediaURI')
+
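+        # fall back from the API's mediaURI to the page's <source> tag, and
+        # finally to the player setup data (e.g. a Vimeo URL handled
+        # transparently below)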
+        if not video_url:
+            webpage = self._download_webpage(url, video_id)
+            video_url = self._search_regex(
+                [r'<source[^>]+src="([^"]+)"', r'"file"\s*:\s*\'([^\']+)'],
+                webpage, 'video url', default=None)
+
+        info_dict = {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'flv',
+        }
+
+        if not video_url:
+            setup_data = self._parse_json(self._html_search_regex(
+                r'(?s)data-setup=([\'"])(?P<content>(?!\1).+?)\1',
+                webpage, 'setup data', group='content'), video_id)
+            info_dict = {
+                '_type': 'url_transparent',
+                'url': setup_data['src'],
+            }
+
+        info_dict.update({
+            'title': info['title'],
+            'description': info.get('description'),
+            'thumbnail': info.get('screenshotURI'),
+            'duration': int_or_none(info.get('duration'), 1000),
+            'uploader': info.get('user'),
+            'timestamp': float_or_none(info.get('dateCreated'), 1000),
+            'view_count': int_or_none(info.get('numberOfViews')),
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
new file mode 100644 (file)
index 0000000..9757f44
--- /dev/null
@@ -0,0 +1,201 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import time
+import hashlib
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unescapeHTML,
+    unified_strdate,
+    urljoin,
+)
+
+
+class DouyuTVIE(InfoExtractor):
+    IE_DESC = '斗鱼'
+    _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.douyutv.com/iseven',
+        'info_dict': {
+            'id': '17732',
+            'display_id': 'iseven',
+            'ext': 'flv',
+            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': r're:.*m7show@163\.com.*',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': '7师傅',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.douyutv.com/85982',
+        'info_dict': {
+            'id': '85982',
+            'display_id': '85982',
+            'ext': 'flv',
+            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'douyu小漠',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Room not found',
+    }, {
+        'url': 'http://www.douyutv.com/17732',
+        'info_dict': {
+            'id': '17732',
+            'display_id': '17732',
+            'ext': 'flv',
+            'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': r're:.*m7show@163\.com.*',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': '7师傅',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.douyu.com/xiaocang',
+        'only_matching': True,
+    }, {
+        # \"room_id\"
+        'url': 'http://www.douyu.com/t/lpl',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        if video_id.isdigit():
+            room_id = video_id
+        else:
+            page = self._download_webpage(url, video_id)
+            room_id = self._html_search_regex(
+                r'"room_id\\?"\s*:\s*(\d+),', page, 'room id')
+
+        # Grab metadata from mobile API
+        room = self._download_json(
+            'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id,
+            note='Downloading room info')['data']
+
+        # 1 = live, 2 = offline
+        if room.get('show_status') == '2':
+            raise ExtractorError('Live stream is offline', expected=True)
+
+        # Grab the URL from PC client API
+        # The m3u8 url from mobile API requires re-authentication every 5 minutes
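+        # The 'auth' header is the md5 of the request path and query string
+        # plus a salt hard-coded in the PC client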
+        tt = int(time.time())
+        signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt)
+        sign = hashlib.md5(signContent.encode('ascii')).hexdigest()
+        video_url = self._download_json(
+            'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id,
+            video_id, note='Downloading video URL info',
+            query={'rate': 0}, headers={
+                'auth': sign,
+                'time': str(tt),
+                'aid': 'pcclient'
+            })['data']['live_url']
+
+        title = self._live_title(unescapeHTML(room['room_name']))
+        description = room.get('show_details')
+        thumbnail = room.get('room_src')
+        uploader = room.get('nickname')
+
+        return {
+            'id': room_id,
+            'display_id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'is_live': True,
+        }
+
+
+class DouyuShowIE(InfoExtractor):
+    _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
+
+    _TESTS = [{
+        'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
+        'md5': '0c2cfd068ee2afe657801269b2d86214',
+        'info_dict': {
+            'id': 'rjNBdvnVXNzvE2yw',
+            'ext': 'mp4',
+            'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
+            'duration': 7150.08,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': '陈一发儿',
+            'uploader_id': 'XrZwYelr5wbK',
+            'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
+            'upload_date': '20170402',
+        },
+    }, {
+        'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        url = url.replace('vmobile.', 'v.')
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        room_info = self._parse_json(self._search_regex(
+            r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+
+        video_info = None
+
+        for trial in range(5):
+            # Douyu sometimes rejects the request; retry a few times
+            try:
+                video_info = self._download_json(
+                    'https://vmobile.douyu.com/video/getInfo', video_id,
+                    query={'vid': video_id},
+                    headers={
+                        'Referer': url,
+                        'x-requested-with': 'XMLHttpRequest',
+                    })
+                break
+            except ExtractorError:
+                self._sleep(1, video_id)
+
+        if not video_info:
+            raise ExtractorError('Can\'t fetch video info')
+
+        formats = self._extract_m3u8_formats(
+            video_info['data']['video_url'], video_id,
+            entry_protocol='m3u8_native', ext='mp4')
+
+        upload_date = unified_strdate(self._html_search_regex(
+            r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
+            'upload date', fatal=False))
+
+        uploader = uploader_id = uploader_url = None
+        mobj = re.search(
+            r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
+            webpage)
+        if mobj:
+            uploader_id, uploader = mobj.groups()
+            uploader_url = urljoin(url, '/author/' + uploader_id)
+
+        return {
+            'id': video_id,
+            'title': room_info['name'],
+            'formats': formats,
+            'duration': room_info.get('duration'),
+            'thumbnail': room_info.get('pic'),
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+        }
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
new file mode 100644 (file)
index 0000000..a7b9db5
--- /dev/null
@@ -0,0 +1,247 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class DPlayIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://
+        (?P<domain>
+            (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))|
+            (?P<subdomain_country>es|it)\.dplay\.com
+        )/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
+
+    _TESTS = [{
+        # non geo restricted, via secure api, unsigned download hls URL
+        'url': 'https://www.dplay.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101',
+        'info_dict': {
+            'id': '13628',
+            'display_id': 'nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101',
+            'ext': 'mp4',
+            'title': 'Svensken lär sig njuta av livet',
+            'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8',
+            'duration': 2649.856,
+            'timestamp': 1365453720,
+            'upload_date': '20130408',
+            'creator': 'Kanal 5',
+            'series': 'Nugammalt - 77 händelser som format Sverige',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        # geo restricted, via secure api, unsigned download hls URL
+        'url': 'http://www.dplay.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster',
+        'info_dict': {
+            'id': '104465',
+            'display_id': 'ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster',
+            'ext': 'mp4',
+            'title': 'Ted Bundy: Mind Of A Monster',
+            'description': 'md5:8b780f6f18de4dae631668b8a9637995',
+            'duration': 5290.027,
+            'timestamp': 1570694400,
+            'upload_date': '20191010',
+            'creator': 'ID - Investigation Discovery',
+            'series': 'Ted Bundy: Mind Of A Monster',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        # disco-api
+        'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7',
+        'info_dict': {
+            'id': '40206',
+            'display_id': 'i-kongens-klr/sesong-1-episode-7',
+            'ext': 'mp4',
+            'title': 'Episode 7',
+            'description': 'md5:e3e1411b2b9aebeea36a6ec5d50c60cf',
+            'duration': 2611.16,
+            'timestamp': 1516726800,
+            'upload_date': '20180123',
+            'series': 'I kongens klær',
+            'season_number': 1,
+            'episode_number': 7,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+        'skip': 'Available for Premium users',
+    }, {
+        'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/',
+        'md5': '2b808ffb00fc47b884a172ca5d13053c',
+        'info_dict': {
+            'id': '6918',
+            'display_id': 'biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij',
+            'ext': 'mp4',
+            'title': 'Luigi Di Maio: la psicosi di Stanislawskij',
+            'description': 'md5:3c7a4303aef85868f867a26f5cc14813',
+            'thumbnail': r're:^https?://.*\.jpe?g',
+            'upload_date': '20160524',
+            'timestamp': 1464076800,
+            'series': 'Biografie imbarazzanti',
+            'season_number': 1,
+            'episode': 'Episode 1',
+            'episode_number': 1,
+        },
+    }, {
+        'url': 'https://es.dplay.com/dmax/la-fiebre-del-oro/temporada-8-episodio-1/',
+        'info_dict': {
+            'id': '21652',
+            'display_id': 'la-fiebre-del-oro/temporada-8-episodio-1',
+            'ext': 'mp4',
+            'title': 'Episodio 1',
+            'description': 'md5:b9dcff2071086e003737485210675f69',
+            'thumbnail': r're:^https?://.*\.png',
+            'upload_date': '20180709',
+            'timestamp': 1531173540,
+            'series': 'La fiebre del oro',
+            'season_number': 8,
+            'episode': 'Episode 1',
+            'episode_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.dplay.fi/videot/shifting-gears-with-aaron-kaufman/episode-16',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dplay.jp/video/gold-rush/24086',
+        'only_matching': True,
+    }]
+
+    def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
+        geo_countries = [country.upper()]
+        self._initialize_geo_bypass({
+            'countries': geo_countries,
+        })
+        disco_base = 'https://%s/' % disco_host
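+        # an anonymous bearer token scoped to the realm is sufficient for free
+        # content; premium-only videos still fail with a 403 handled below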
+        token = self._download_json(
+            disco_base + 'token', display_id, 'Downloading token',
+            query={
+                'realm': realm,
+            })['data']['attributes']['token']
+        headers = {
+            'Referer': url,
+            'Authorization': 'Bearer ' + token,
+        }
+        video = self._download_json(
+            disco_base + 'content/videos/' + display_id, display_id,
+            headers=headers, query={
+                'fields[channel]': 'name',
+                'fields[image]': 'height,src,width',
+                'fields[show]': 'name',
+                'fields[tag]': 'name',
+                'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
+                'include': 'images,primaryChannel,show,tags'
+            })
+        video_id = video['data']['id']
+        info = video['data']['attributes']
+        title = info['name'].strip()
+        formats = []
+        try:
+            streaming = self._download_json(
+                disco_base + 'playback/videoPlaybackInfo/' + video_id,
+                display_id, headers=headers)['data']['attributes']['streaming']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
+                error = info['errors'][0]
+                error_code = error.get('code')
+                if error_code == 'access.denied.geoblocked':
+                    self.raise_geo_restricted(countries=geo_countries)
+                elif error_code == 'access.denied.missingpackage':
+                    self.raise_login_required()
+                raise ExtractorError(info['errors'][0]['detail'], expected=True)
+            raise
+        for format_id, format_dict in streaming.items():
+            if not isinstance(format_dict, dict):
+                continue
+            format_url = format_dict.get('url')
+            if not format_url:
+                continue
+            ext = determine_ext(format_url)
+            if format_id == 'dash' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, display_id, mpd_id='dash', fatal=False))
+            elif format_id == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, display_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls',
+                    fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                })
+        self._sort_formats(formats)
+
+        creator = series = None
+        tags = []
+        thumbnails = []
+        included = video.get('included') or []
+        if isinstance(included, list):
+            for e in included:
+                attributes = e.get('attributes')
+                if not attributes:
+                    continue
+                e_type = e.get('type')
+                if e_type == 'channel':
+                    creator = attributes.get('name')
+                elif e_type == 'image':
+                    src = attributes.get('src')
+                    if src:
+                        thumbnails.append({
+                            'url': src,
+                            'width': int_or_none(attributes.get('width')),
+                            'height': int_or_none(attributes.get('height')),
+                        })
+                elif e_type == 'show':
+                    series = attributes.get('name')
+                elif e_type == 'tag':
+                    name = attributes.get('name')
+                    if name:
+                        tags.append(name)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': info.get('description'),
+            'duration': float_or_none(info.get('videoDuration'), 1000),
+            'timestamp': unified_timestamp(info.get('publishStart')),
+            'series': series,
+            'season_number': int_or_none(info.get('seasonNumber')),
+            'episode_number': int_or_none(info.get('episodeNumber')),
+            'creator': creator,
+            'tags': tags,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        # str.lstrip() strips a set of characters, not a prefix, so remove the
+        # optional 'www.' explicitly
+        domain = re.sub(r'^www\.', '', mobj.group('domain'))
+        country = mobj.group('country') or mobj.group('subdomain_country')
+        host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com'
+        return self._get_disco_api_info(
+            url, display_id, host, 'dplay' + country, country)
diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py
new file mode 100644 (file)
index 0000000..164e97c
--- /dev/null
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    parse_duration,
+    unescapeHTML,
+)
+
+
+class DRBonanzaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-',
+        'info_dict': {
+            'id': '40312',
+            'display_id': 'matador---0824-komme-fremmede-',
+            'ext': 'mp4',
+            'title': 'MATADOR - 08:24. "Komme fremmede".',
+            'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84',
+            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
+            'duration': 4613,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        info = self._parse_html5_media_entries(
+            url, webpage, display_id, m3u8_id='hls',
+            m3u8_entry_protocol='m3u8_native')[0]
+        self._sort_formats(info['formats'])
+
+        asset = self._parse_json(
+            self._search_regex(
+                r'(?s)currentAsset\s*=\s*({.+?})\s*</script', webpage, 'asset'),
+            display_id, transform_source=js_to_json)
+
+        title = unescapeHTML(asset['AssetTitle']).strip()
+
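+        # metadata on the page is rendered as adjacent '<label>:' and value
+        # paragraphs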
+        def extract(field):
+            return self._search_regex(
+                r'<div[^>]+>\s*<p>%s:<p>\s*</div>\s*<div[^>]+>\s*<p>([^<]+)</p>' % field,
+                webpage, field, default=None)
+
+        info.update({
+            'id': asset.get('AssetId') or video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': extract('Programinfo'),
+            'duration': parse_duration(extract('Tid')),
+            'thumbnail': asset.get('AssetImageUrl'),
+        })
+        return info
diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py
new file mode 100644 (file)
index 0000000..9dc6614
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os.path
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import url_basename
+
+
+class DropboxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*'
+    _TESTS = [
+        {
+            'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dlc%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0',
+            'info_dict': {
+                'id': 'nelirfsxnmcfbfh',
+                'ext': 'mp4',
+                'title': 'youtube-dlc test video \'ä"BaW_jenozKc'
+            }
+        }, {
+            'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        fn = compat_urllib_parse_unquote(url_basename(url))
+        title = os.path.splitext(fn)[0]
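+        # dl=0 renders a preview page; forcing dl=1 yields the file itself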
+        video_url = re.sub(r'[?&]dl=0', '', url)
+        video_url += ('?' if '?' not in video_url else '&') + 'dl=1'
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+        }
diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py
new file mode 100644 (file)
index 0000000..2baea58
--- /dev/null
@@ -0,0 +1,112 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    NO_DEFAULT,
+    parse_duration,
+    str_to_int,
+)
+
+
+class DrTuberIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?'
+    _TESTS = [{
+        'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf',
+        'md5': '93e680cf2536ad0dfb7e74d94a89facd',
+        'info_dict': {
+            'id': '1740434',
+            'display_id': 'hot-perky-blonde-naked-golf',
+            'ext': 'mp4',
+            'title': 'hot perky blonde naked golf',
+            'like_count': int,
+            'comment_count': int,
+            'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'],
+            'thumbnail': r're:https?://.*\.jpg$',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://www.drtuber.com/embed/489939',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.drtuber.com/video/3893529/lingerie-blowjob-from-beautiful-teen',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(
+            'http://www.drtuber.com/video/%s' % video_id, display_id)
+
+        video_data = self._download_json(
+            'http://www.drtuber.com/player_config_json/', video_id, query={
+                'vid': video_id,
+                'embed': 0,
+                'aid': 0,
+                'domain_id': 0,
+            })
+
+        formats = []
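+        # 'files' maps quality ids to direct video URLs; the 'hq' variant,
+        # when present, is ranked above the others.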
+        for format_id, video_url in video_data['files'].items():
+            if video_url:
+                formats.append({
+                    'format_id': format_id,
+                    'quality': 2 if format_id == 'hq' else 1,
+                    'url': video_url
+                })
+        self._sort_formats(formats)
+
+        duration = int_or_none(video_data.get('duration')) or parse_duration(
+            video_data.get('duration_format'))
+
+        title = self._html_search_regex(
+            (r'<h1[^>]+class=["\']title[^>]+>([^<]+)',
+             r'<title>([^<]+)\s*@\s+DrTuber',
+             r'class="title_watch"[^>]*><(?:p|h\d+)[^>]*>([^<]+)<',
+             r'<p[^>]+class="title_substrate">([^<]+)</p>',
+             r'<title>([^<]+) - \d+'),
+            webpage, 'title')
+
+        thumbnail = self._html_search_regex(
+            r'poster="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+
+        def extract_count(id_, name, default=NO_DEFAULT):
+            return str_to_int(self._html_search_regex(
+                r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
+                webpage, '%s count' % name, default=default, fatal=False))
+
+        like_count = extract_count('rate_likes', 'like')
+        dislike_count = extract_count('rate_dislikes', 'dislike', default=None)
+        comment_count = extract_count('comments_count', 'comment')
+
+        cats_str = self._search_regex(
+            r'<div[^>]+class="categories_list">(.+?)</div>',
+            webpage, 'categories', fatal=False)
+        categories = [] if not cats_str else re.findall(
+            r'<a title="([^"]+)"', cats_str)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': formats,
+            'title': title,
+            'thumbnail': thumbnail,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
+            'categories': categories,
+            'age_limit': self._rta_search(webpage),
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
new file mode 100644 (file)
index 0000000..390e79f
--- /dev/null
@@ -0,0 +1,358 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import binascii
+import hashlib
+import re
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    bytes_to_intlist,
+    ExtractorError,
+    int_or_none,
+    intlist_to_bytes,
+    float_or_none,
+    mimetype2ext,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
+)
+
+
+class DRTVIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*|
+                            (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/
+                        )
+                        (?P<id>[\da-z_-]+)
+                    '''
+    _GEO_BYPASS = False
+    _GEO_COUNTRIES = ['DK']
+    IE_NAME = 'drtv'
+    _TESTS = [{
+        'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
+        'md5': '25e659cccc9a2ed956110a299fdf5983',
+        'info_dict': {
+            'id': 'klassen-darlig-taber-10',
+            'ext': 'mp4',
+            'title': 'Klassen - Dårlig taber (10)',
+            'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa',
+            'timestamp': 1539085800,
+            'upload_date': '20181009',
+            'duration': 606.84,
+            'series': 'Klassen',
+            'season': 'Klassen I',
+            'season_number': 1,
+            'season_id': 'urn:dr:mu:bundle:57d7e8216187a4031cfd6f6b',
+            'episode': 'Episode 10',
+            'episode_number': 10,
+            'release_year': 2016,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        # embed
+        'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
+        'info_dict': {
+            'id': 'urn:dr:mu:programcard:57c926176187a50a9c6e83c6',
+            'ext': 'mp4',
+            'title': 'christiania pusher street ryddes drdkrjpo',
+            'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
+            'timestamp': 1472800279,
+            'upload_date': '20160902',
+            'duration': 131.4,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        # with SignLanguage formats
+        'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
+        'info_dict': {
+            'id': 'historien-om-danmark-stenalder',
+            'ext': 'mp4',
+            'title': 'Historien om Danmark: Stenalder',
+            'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
+            'timestamp': 1546628400,
+            'upload_date': '20190104',
+            'duration': 3502.56,
+            'formats': 'mincount:20',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769',
+        'info_dict': {
+            'id': '00951930010',
+            'ext': 'mp4',
+            'title': 'Bonderøven (1:8)',
+            'description': 'md5:3cf18fc0d3b205745d4505f896af8121',
+            'timestamp': 1546542000,
+            'upload_date': '20190103',
+            'duration': 2576.6,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769',
+        'only_matching': True,
+    }, {
+        'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        if '>Programmet er ikke længere tilgængeligt' in webpage:
+            raise ExtractorError(
+                'Video %s is not available' % video_id, expected=True)
+
+        video_id = self._search_regex(
+            (r'data-(?:material-identifier|episode-slug)="([^"]+)"',
+             r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'),
+            webpage, 'video id', default=None)
+
+        if not video_id:
+            video_id = self._search_regex(
+                r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)',
+                webpage, 'urn', default=None)
+            if video_id:
+                video_id = compat_urllib_parse_unquote(video_id)
+
+        _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard'
+        query = {'expanded': 'true'}
+
+        if video_id:
+            programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
+        else:
+            programcard_url = _PROGRAMCARD_BASE
+            page = self._parse_json(
+                self._search_regex(
+                    r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage,
+                    'data'), '1')['cache']['page']
+            page = page[list(page.keys())[0]]
+            item = try_get(
+                page, (lambda x: x['item'], lambda x: x['entries'][0]['item']),
+                dict)
+            video_id = item['customId'].split(':')[-1]
+            query['productionnumber'] = video_id
+
+        data = self._download_json(
+            programcard_url, video_id, 'Downloading video JSON', query=query)
+
+        title = str_or_none(data.get('Title')) or re.sub(
+            r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
+            self._og_search_title(webpage))
+        description = self._og_search_description(
+            webpage, default=None) or data.get('Description')
+
+        timestamp = unified_timestamp(
+            data.get('PrimaryBroadcastStartTime') or data.get('SortDateTime'))
+
+        thumbnail = None
+        duration = None
+
+        restricted_to_denmark = False
+
+        formats = []
+        subtitles = {}
+
+        assets = []
+        primary_asset = data.get('PrimaryAsset')
+        if isinstance(primary_asset, dict):
+            assets.append(primary_asset)
+        secondary_assets = data.get('SecondaryAssets')
+        if isinstance(secondary_assets, list):
+            for secondary_asset in secondary_assets:
+                if isinstance(secondary_asset, dict):
+                    assets.append(secondary_asset)
+
+        def hex_to_bytes(hex):
+            return binascii.a2b_hex(hex.encode('ascii'))
+
+        def decrypt_uri(e):
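+            # Apparent EncryptedUri layout: chars 2-9 give the hex length n of
+            # the AES-CBC ciphertext at chars 10..10+n; the remainder is the
+            # hex IV, which is also salted into the SHA-256-derived key.
+            # PKCS#7 padding is stripped after decryption.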
+            n = int(e[2:10], 16)
+            a = e[10 + n:]
+            data = bytes_to_intlist(hex_to_bytes(e[10:10 + n]))
+            key = bytes_to_intlist(hashlib.sha256(
+                ('%s:sRBzYNXBzkKgnjj8pGtkACch' % a).encode('utf-8')).digest())
+            iv = bytes_to_intlist(hex_to_bytes(a))
+            decrypted = aes_cbc_decrypt(data, key, iv)
+            return intlist_to_bytes(
+                decrypted[:-decrypted[-1]]).decode('utf-8').split('?')[0]
+
+        for asset in assets:
+            kind = asset.get('Kind')
+            if kind == 'Image':
+                thumbnail = url_or_none(asset.get('Uri'))
+            elif kind in ('VideoResource', 'AudioResource'):
+                duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
+                restricted_to_denmark = asset.get('RestrictedToDenmark')
+                asset_target = asset.get('Target')
+                for link in asset.get('Links', []):
+                    uri = link.get('Uri')
+                    if not uri:
+                        encrypted_uri = link.get('EncryptedUri')
+                        if not encrypted_uri:
+                            continue
+                        try:
+                            uri = decrypt_uri(encrypted_uri)
+                        except Exception:
+                            self.report_warning(
+                                'Unable to decrypt EncryptedUri', video_id)
+                            continue
+                    uri = url_or_none(uri)
+                    if not uri:
+                        continue
+                    target = link.get('Target')
+                    format_id = target or ''
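+                    # Accessibility variants (spoken subtitles, sign language,
+                    # visually interpreted) rank below the default stream;
+                    # other targets get no explicit preference.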
+                    if asset_target in ('SpokenSubtitles', 'SignLanguage', 'VisuallyInterpreted'):
+                        preference = -1
+                        format_id += '-%s' % asset_target
+                    elif asset_target == 'Default':
+                        preference = 1
+                    else:
+                        preference = None
+                    if target == 'HDS':
+                        f4m_formats = self._extract_f4m_formats(
+                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+                            video_id, preference, f4m_id=format_id, fatal=False)
+                        if kind == 'AudioResource':
+                            for f in f4m_formats:
+                                f['vcodec'] = 'none'
+                        formats.extend(f4m_formats)
+                    elif target == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            uri, video_id, 'mp4', entry_protocol='m3u8_native',
+                            preference=preference, m3u8_id=format_id,
+                            fatal=False))
+                    else:
+                        bitrate = link.get('Bitrate')
+                        if bitrate:
+                            format_id += '-%s' % bitrate
+                        formats.append({
+                            'url': uri,
+                            'format_id': format_id,
+                            'tbr': int_or_none(bitrate),
+                            'ext': link.get('FileFormat'),
+                            'vcodec': 'none' if kind == 'AudioResource' else None,
+                            'preference': preference,
+                        })
+            subtitles_list = asset.get('SubtitlesList') or asset.get('Subtitleslist')
+            if isinstance(subtitles_list, list):
+                LANGS = {
+                    'Danish': 'da',
+                }
+                for subs in subtitles_list:
+                    if not isinstance(subs, dict):
+                        continue
+                    sub_uri = url_or_none(subs.get('Uri'))
+                    if not sub_uri:
+                        continue
+                    lang = subs.get('Language') or 'da'
+                    subtitles.setdefault(LANGS.get(lang, lang), []).append({
+                        'url': sub_uri,
+                        'ext': mimetype2ext(subs.get('MimeType')) or 'vtt'
+                    })
+
+        if not formats and restricted_to_denmark:
+            self.raise_geo_restricted(
+                'Unfortunately, DR is not allowed to show this program outside Denmark.',
+                countries=self._GEO_COUNTRIES)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+            'series': str_or_none(data.get('SeriesTitle')),
+            'season': str_or_none(data.get('SeasonTitle')),
+            'season_number': int_or_none(data.get('SeasonNumber')),
+            'season_id': str_or_none(data.get('SeasonUrn')),
+            'episode': str_or_none(data.get('EpisodeTitle')),
+            'episode_number': int_or_none(data.get('EpisodeNumber')),
+            'release_year': int_or_none(data.get('ProductionYear')),
+        }
+
+
+class DRTVLiveIE(InfoExtractor):
+    IE_NAME = 'drtv:live'
+    _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)'
+    _GEO_COUNTRIES = ['DK']
+    _TEST = {
+        'url': 'https://www.dr.dk/tv/live/dr1',
+        'info_dict': {
+            'id': 'dr1',
+            'ext': 'mp4',
+            'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        channel_data = self._download_json(
+            'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id,
+            channel_id)
+        title = self._live_title(channel_data['Title'])
+
+        formats = []
+        for streaming_server in channel_data.get('StreamingServers', []):
+            server = streaming_server.get('Server')
+            if not server:
+                continue
+            link_type = streaming_server.get('LinkType')
+            for quality in streaming_server.get('Qualities', []):
+                for stream in quality.get('Streams', []):
+                    stream_path = stream.get('Stream')
+                    if not stream_path:
+                        continue
+                    stream_url = update_url_query(
+                        '%s/%s' % (server, stream_path), {'b': ''})
+                    if link_type == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, channel_id, 'mp4',
+                            m3u8_id=link_type, fatal=False, live=True))
+                    elif link_type == 'HDS':
+                        formats.extend(self._extract_f4m_formats(update_url_query(
+                            '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}),
+                            channel_id, f4m_id=link_type, fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': channel_id,
+            'title': title,
+            'thumbnail': channel_data.get('PrimaryImageUri'),
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/dtube.py b/youtube_dl/extractor/dtube.py
new file mode 100644 (file)
index 0000000..114d2db
--- /dev/null
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+from socket import timeout
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class DTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})'
+    _TEST = {
+        'url': 'https://d.tube/#!/v/broncnutz/x380jtr1',
+        'md5': '9f29088fa08d699a7565ee983f56a06e',
+        'info_dict': {
+            'id': 'x380jtr1',
+            'ext': 'mp4',
+            'title': 'Lefty 3-Rings is Back Baby!! NCAA Picks',
+            'description': 'md5:60be222088183be3a42f196f34235776',
+            'uploader_id': 'broncnutz',
+            'upload_date': '20190107',
+            'timestamp': 1546854054,
+        },
+        'params': {
+            'format': '480p',
+        },
+    }
+
+    def _real_extract(self, url):
+        uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+        result = self._download_json('https://api.steemit.com/', video_id, data=json.dumps({
+            'jsonrpc': '2.0',
+            'method': 'get_content',
+            'params': [uploader_id, video_id],
+        }).encode())['result']
+
+        metadata = json.loads(result['json_metadata'])
+        video = metadata['video']
+        content = video['content']
+        info = video.get('info', {})
+        title = info.get('title') or result['title']
+
+        def canonical_url(h):
+            if not h:
+                return None
+            return 'https://video.dtube.top/ipfs/' + h
+
+        formats = []
+        for q in ('240', '480', '720', '1080', ''):
+            video_url = canonical_url(content.get('video%shash' % q))
+            if not video_url:
+                continue
+            format_id = (q + 'p') if q else 'Source'
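+            # IPFS gateway links are often dead; probe each URL with a short
+            # timeout and skip variants that do not respond in time.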
+            try:
+                self.to_screen('%s: Checking %s video format URL' % (video_id, format_id))
+                self._downloader._opener.open(video_url, timeout=5).close()
+            except timeout:
+                self.to_screen(
+                    '%s: %s URL is invalid, skipping' % (video_id, format_id))
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': video_url,
+                'height': int_or_none(q),
+                'ext': 'mp4',
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': content.get('description'),
+            'thumbnail': canonical_url(info.get('snaphash')),
+            'tags': content.get('tags') or metadata.get('tags'),
+            'duration': info.get('duration'),
+            'formats': formats,
+            'timestamp': parse_iso8601(result.get('created')),
+            'uploader_id': uploader_id,
+        }
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
new file mode 100644 (file)
index 0000000..d9d9afd
--- /dev/null
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    qualities,
+)
+
+
+class DumpertIE(InfoExtractor):
+    _VALID_URL = r'(?P<protocol>https?)://(?:(?:www|legacy)\.)?dumpert\.nl/(?:mediabase|embed|item)/(?P<id>[0-9]+[/_][0-9a-zA-Z]+)'
+    _TESTS = [{
+        'url': 'https://www.dumpert.nl/item/6646981_951bc60f',
+        'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
+        'info_dict': {
+            'id': '6646981/951bc60f',
+            'ext': 'mp4',
+            'title': 'Ik heb nieuws voor je',
+            'description': 'Niet schrikken hoor',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'https://www.dumpert.nl/embed/6675421_dc440fe7',
+        'only_matching': True,
+    }, {
+        'url': 'http://legacy.dumpert.nl/mediabase/6646981/951bc60f',
+        'only_matching': True,
+    }, {
+        'url': 'http://legacy.dumpert.nl/embed/6675421/dc440fe7',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
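+        # New-style URLs join the id and hash with '_', legacy ones with '/';
+        # the mobile API expects the '_' form, the info dict keeps '/'.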
+        video_id = self._match_id(url).replace('_', '/')
+        item = self._download_json(
+            'http://api-live.dumpert.nl/mobile_api/json/info/' + video_id.replace('/', '_'),
+            video_id)['items'][0]
+        title = item['title']
+        media = next(m for m in item['media'] if m.get('mediatype') == 'VIDEO')
+
+        quality = qualities(['flv', 'mobile', 'tablet', '720p'])
+        formats = []
+        for variant in media.get('variants', []):
+            uri = variant.get('uri')
+            if not uri:
+                continue
+            version = variant.get('version')
+            formats.append({
+                'url': uri,
+                'format_id': version,
+                'quality': quality(version),
+            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        stills = item.get('stills') or {}
+        for t in ('thumb', 'still'):
+            for s in ('', '-medium', '-large'):
+                still_id = t + s
+                still_url = stills.get(still_id)
+                if not still_url:
+                    continue
+                thumbnails.append({
+                    'id': still_id,
+                    'url': still_url,
+                })
+
+        stats = item.get('stats') or {}
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': item.get('description'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'duration': int_or_none(media.get('duration')),
+            'like_count': int_or_none(stats.get('kudos_total')),
+            'view_count': int_or_none(stats.get('views_total')),
+        }
diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py
new file mode 100644 (file)
index 0000000..de7f6d6
--- /dev/null
@@ -0,0 +1,186 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    try_get,
+    unescapeHTML,
+    parse_iso8601,
+)
+
+
+class DVTVIE(InfoExtractor):
+    IE_NAME = 'dvtv'
+    IE_DESC = 'http://video.aktualne.cz/'
+    _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})'
+    _TESTS = [{
+        'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/',
+        'md5': '67cb83e4a955d36e1b5d31993134a0c2',
+        'info_dict': {
+            'id': 'dc0768de855511e49e4b0025900fea04',
+            'ext': 'mp4',
+            'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně',
+            'duration': 1484,
+            'upload_date': '20141217',
+            'timestamp': 1418792400,
+        }
+    }, {
+        'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/',
+        'info_dict': {
+            'title': r'DVTV 16. 12. 2014: útok Talibanu, boj o kliniku, uprchlíci',
+            'id': '973eb3bc854e11e498be002590604f2e',
+        },
+        'playlist': [{
+            'md5': 'da7ca6be4935532241fa9520b3ad91e4',
+            'info_dict': {
+                'id': 'b0b40906854d11e4bdad0025900fea04',
+                'ext': 'mp4',
+                'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne',
+                'description': 'md5:0916925dea8e30fe84222582280b47a0',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
+            }
+        }, {
+            'md5': '5f7652a08b05009c1292317b449ffea2',
+            'info_dict': {
+                'id': '420ad9ec854a11e4bdad0025900fea04',
+                'ext': 'mp4',
+                'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka',
+                'description': 'md5:ff2f9f6de73c73d7cef4f756c1c1af42',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
+            }
+        }, {
+            'md5': '498eb9dfa97169f409126c617e2a3d64',
+            'info_dict': {
+                'id': '95d35580846a11e4b6d20025900fea04',
+                'ext': 'mp4',
+                'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?',
+                'description': 'md5:889fe610a70fee5511dc3326a089188e',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
+            }
+        }, {
+            'md5': 'b8dc6b744844032dab6ba3781a7274b9',
+            'info_dict': {
+                'id': '6fe14d66853511e4833a0025900fea04',
+                'ext': 'mp4',
+                'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády',
+                'description': 'md5:544f86de6d20c4815bea11bf2ac3004f',
+                'timestamp': 1418760010,
+                'upload_date': '20141216',
+            }
+        }],
+    }, {
+        'url': 'https://video.aktualne.cz/dvtv/zeman-si-jen-leci-mindraky-sobotku-nenavidi-a-babis-se-mu-te/r~960cdb3a365a11e7a83b0025900fea04/',
+        'md5': 'f8efe9656017da948369aa099788c8ea',
+        'info_dict': {
+            'id': '3c496fec365911e7a6500025900fea04',
+            'ext': 'mp4',
+            'title': 'Zeman si jen léčí mindráky, Sobotku nenávidí a Babiš se mu teď hodí, tvrdí Kmenta',
+            'duration': 1103,
+            'upload_date': '20170511',
+            'timestamp': 1494514200,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/',
+        'only_matching': True,
+    }, {
+        # Test live stream video (liveStarter) parsing
+        'url': 'https://video.aktualne.cz/dvtv/zive-mistryne-sveta-eva-samkova-po-navratu-ze-sampionatu/r~182654c2288811e990fd0cc47ab5f122/',
+        'md5': '2e552e483f2414851ca50467054f9d5d',
+        'info_dict': {
+            'id': '8d116360288011e98c840cc47ab5f122',
+            'ext': 'mp4',
+            'title': 'Živě: Mistryně světa Eva Samková po návratu ze šampionátu',
+            'upload_date': '20190204',
+            'timestamp': 1549289591,
+        },
+        'params': {
+            # Video content is no longer available
+            'skip_download': True,
+        },
+    }]
+
+    def _parse_video_metadata(self, js, video_id, timestamp):
+        data = self._parse_json(js, video_id, transform_source=js_to_json)
+        title = unescapeHTML(data['title'])
+
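+        # Live streams appear to nest the playable config under
+        # plugins.liveStarter; merge it over the top-level data when present.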
+        live_starter = try_get(data, lambda x: x['plugins']['liveStarter'], dict)
+        if live_starter:
+            data.update(live_starter)
+
+        formats = []
+        for tracks in data.get('tracks', {}).values():
+            for video in tracks:
+                video_url = video.get('src')
+                if not video_url:
+                    continue
+                video_type = video.get('type')
+                ext = determine_ext(video_url, mimetype2ext(video_type))
+                if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif video_type == 'application/dash+xml' or ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        video_url, video_id, mpd_id='dash', fatal=False))
+                else:
+                    label = video.get('label')
+                    height = self._search_regex(
+                        r'^(\d+)[pP]', label or '', 'height', default=None)
+                    format_id = ['http']
+                    for f in (ext, label):
+                        if f:
+                            format_id.append(f)
+                    formats.append({
+                        'url': video_url,
+                        'format_id': '-'.join(format_id),
+                        'height': int_or_none(height),
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': data.get('mediaid') or video_id,
+            'title': title,
+            'description': data.get('description'),
+            'thumbnail': data.get('image'),
+            'duration': int_or_none(data.get('duration')),
+            'timestamp': int_or_none(timestamp),
+            'formats': formats
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        timestamp = parse_iso8601(self._html_search_meta(
+            'article:published_time', webpage, 'published time', default=None))
+
+        items = re.findall(r'(?s)playlist\.push\(({.+?})\);', webpage)
+        if items:
+            return self.playlist_result(
+                [self._parse_video_metadata(i, video_id, timestamp) for i in items],
+                video_id, self._html_search_meta('twitter:title', webpage))
+
+        item = self._search_regex(
+            r'(?s)BBXPlayer\.setup\((.+?)\);',
+            webpage, 'video', default=None)
+        if item:
+            # remove wrapping function calls (e.g. htmldeentitize)
+            # TODO: this should be fixed in a general way in js_to_json
+            item = re.sub(r'\w+?\((.+)\)', r'\1', item)
+            return self._parse_video_metadata(item, video_id, timestamp)
+
+        raise ExtractorError('Could not find either video or playlist')
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py
new file mode 100644 (file)
index 0000000..d740652
--- /dev/null
@@ -0,0 +1,110 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+    IE_NAME = 'dw'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)'
+    _TESTS = [{
+        # video
+        'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+        'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+        'info_dict': {
+            'id': '19112290',
+            'ext': 'mp4',
+            'title': 'Intelligent light',
+            'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+            'upload_date': '20160311',
+        }
+    }, {
+        # audio
+        'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+        'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+        'info_dict': {
+            'id': '19111941',
+            'ext': 'mp3',
+            'title': 'WorldLink: My business',
+            'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+            'upload_date': '20160311',
+        }
+    }, {
+        # DW documentaries; these only last for one or two weeks
+        'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798',
+        'md5': '56b6214ef463bfb9a3b71aeb886f3cf1',
+        'info_dict': {
+            'id': '19274438',
+            'ext': 'mp4',
+            'title': 'Welcome to the 90s – Hip Hop',
+            'description': 'Welcome to the 90s - The Golden Decade of Hip Hop',
+            'upload_date': '20160521',
+        },
+        'skip': 'Video removed',
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        webpage = self._download_webpage(url, media_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        title = hidden_inputs['media_title']
+        media_id = hidden_inputs.get('media_id') or media_id
+
+        if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
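+            # The SMIL manifest lists RTMP URLs; rewriting them onto the HTTP
+            # download host appears to yield directly fetchable files.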
+            formats = self._extract_smil_formats(
+                'http://www.dw.com/smil/v-%s' % media_id, media_id,
+                transform_source=lambda s: s.replace(
+                    'rtmp://tv-od.dw.de/flash/',
+                    'http://tv-download.dw.de/dwtv_video/flv/'))
+            self._sort_formats(formats)
+        else:
+            formats = [{'url': hidden_inputs['file_name']}]
+
+        upload_date = hidden_inputs.get('display_date')
+        if not upload_date:
+            upload_date = self._html_search_regex(
+                r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage,
+                'upload date', default=None)
+            upload_date = unified_strdate(upload_date)
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': hidden_inputs.get('preview_image'),
+            'duration': int_or_none(hidden_inputs.get('file_duration')),
+            'upload_date': upload_date,
+            'formats': formats,
+        }
+
+
+class DWArticleIE(InfoExtractor):
+    IE_NAME = 'dw:article'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+        'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+        'info_dict': {
+            'id': '19105868',
+            'ext': 'mp4',
+            'title': 'The harsh life of refugees in Idomeni',
+            'description': 'md5:196015cc7e48ebf474db9399420043c7',
+            'upload_date': '20160310',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        media_id = hidden_inputs['media_id']
+        media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+        media_url = compat_urlparse.urljoin(url, media_path)
+        return self.url_result(media_url, 'DW', media_id)
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
new file mode 100644 (file)
index 0000000..36fef07
--- /dev/null
@@ -0,0 +1,208 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    unsmuggle_url,
+    url_or_none,
+)
+
+
+class EaglePlatformIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        eagleplatform:(?P<custom_host>[^/]+):|
+                        https?://(?P<host>.+?\.media\.eagleplatform\.com)/index/player\?.*\brecord_id=
+                    )
+                    (?P<id>\d+)
+                '''
+    _TESTS = [{
+        # http://lenta.ru/news/2015/03/06/navalny/
+        'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
+        # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
+        'info_dict': {
+            'id': '227304',
+            'ext': 'mp4',
+            'title': 'Навальный вышел на свободу',
+            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 87,
+            'view_count': int,
+            'age_limit': 0,
+        },
+    }, {
+        # http://muz-tv.ru/play/7129/
+        # http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true
+        'url': 'eagleplatform:media.clipyou.ru:12820',
+        'md5': '358597369cf8ba56675c1df15e7af624',
+        'info_dict': {
+            'id': '12820',
+            'ext': 'mp4',
+            'title': "'O Sole Mio",
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 216,
+            'view_count': int,
+        },
+        'skip': 'Georestricted',
+    }, {
+        # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/)
+        'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        # Regular iframe embedding
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1',
+            webpage)
+        if mobj is not None:
+            return mobj.group('url')
+        PLAYER_JS_RE = r'''
+                        <script[^>]+
+                            src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
+                        .+?
+                    '''
+        # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)
+        mobj = re.search(
+            r'''(?xs)
+                    %s
+                    <div[^>]+
+                        class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+
+                        data-id=["\'](?P<id>\d+)
+            ''' % PLAYER_JS_RE, webpage)
+        if mobj is not None:
+            return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
+        # Generalization of "Javascript code usage", "Combined usage" and
+        # "Usage without attaching to DOM" embeddings (see
+        # http://dultonmedia.github.io/eplayer/)
+        mobj = re.search(
+            r'''(?xs)
+                    %s
+                    <script>
+                    .+?
+                    new\s+EaglePlayer\(
+                        (?:[^,]+\s*,\s*)?
+                        {
+                            .+?
+                            \bid\s*:\s*["\']?(?P<id>\d+)
+                            .+?
+                        }
+                    \s*\)
+                    .+?
+                    </script>
+            ''' % PLAYER_JS_RE, webpage)
+        if mobj is not None:
+            return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
+
+    @staticmethod
+    def _handle_error(response):
+        status = int_or_none(response.get('status', 200))
+        if status != 200:
+            raise ExtractorError(' '.join(response['errors']), expected=True)
+
+    def _download_json(self, url_or_request, video_id, *args, **kwargs):
+        try:
+            response = super(EaglePlatformIE, self)._download_json(
+                url_or_request, video_id, *args, **kwargs)
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError):
+                response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)
+                self._handle_error(response)
+            raise
+        return response
+
+    def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):
+        return self._download_json(url_or_request, video_id, note)['data'][0]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        mobj = re.match(self._VALID_URL, url)
+        host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
+
+        headers = {}
+        query = {
+            'id': video_id,
+        }
+
+        referrer = smuggled_data.get('referrer')
+        if referrer:
+            headers['Referer'] = referrer
+            query['referrer'] = referrer
+
+        player_data = self._download_json(
+            'http://%s/api/player_data' % host, video_id,
+            headers=headers, query=query)
+
+        media = player_data['data']['playlist']['viewports'][0]['medialist'][0]
+
+        title = media['title']
+        description = media.get('description')
+        thumbnail = self._proto_relative_url(media.get('snapshot'), 'http:')
+        duration = int_or_none(media.get('duration'))
+        view_count = int_or_none(media.get('views'))
+
+        age_restriction = media.get('age_restriction')
+        age_limit = None
+        if age_restriction:
+            age_limit = 0 if age_restriction == 'allow_all' else 18
+
+        secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')
+
+        formats = []
+
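+        # 'secure_m3u8' is itself a JSON endpoint; _get_video_url reads the
+        # actual m3u8 manifest URL from its data[0] entry.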
+        m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
+        m3u8_formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls', fatal=False)
+        formats.extend(m3u8_formats)
+
+        m3u8_formats_dict = {}
+        for f in m3u8_formats:
+            if f.get('height') is not None:
+                m3u8_formats_dict[f['height']] = f
+
+        mp4_data = self._download_json(
+            # Secure mp4 URL is constructed according to Player.prototype.mp4 from
+            # http://lentaru.media.eagleplatform.com/player/player.js
+            re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4s', secure_m3u8),
+            video_id, 'Downloading mp4 JSON', fatal=False)
+        if mp4_data:
+            for format_id, format_url in mp4_data.get('data', {}).items():
+                if not url_or_none(format_url):
+                    continue
+                height = int_or_none(format_id)
+                if height is not None and m3u8_formats_dict.get(height):
+                    f = m3u8_formats_dict[height].copy()
+                    f.update({
+                        'format_id': f['format_id'].replace('hls', 'http'),
+                        'protocol': 'http',
+                    })
+                else:
+                    f = {
+                        'format_id': 'http-%s' % format_id,
+                        'height': int_or_none(format_id),
+                    }
+                f['url'] = format_url
+                formats.append(f)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py
new file mode 100644 (file)
index 0000000..c97682c
--- /dev/null
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class EbaumsWorldIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/',
+        'info_dict': {
+            'id': '83367677',
+            'ext': 'mp4',
+            'title': 'A Giant Python Opens The Door',
+            'description': 'This is how nightmares start...',
+            'uploader': 'jihadpizza',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
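+        # The player endpoint returns a flat XML config whose child nodes
+        # carry the stream URL and metadata.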
+        config = self._download_xml(
+            'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
+        video_url = config.find('file').text
+
+        return {
+            'id': video_id,
+            'title': config.find('title').text,
+            'url': video_url,
+            'description': config.find('description').text,
+            'thumbnail': config.find('image').text,
+            'uploader': config.find('username').text,
+        }
diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py
new file mode 100644 (file)
index 0000000..6b7cc65
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EchoMskIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.echo.msk.ru/sounds/1464134.html',
+        'md5': '2e44b3b78daff5b458e4dbc37f191f7c',
+        'info_dict': {
+            'id': '1464134',
+            'ext': 'mp3',
+            'title': 'Особое мнение - 29 декабря 2014, 19:08',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        audio_url = self._search_regex(
+            r'<a rel="mp3" href="([^"]+)">', webpage, 'audio URL')
+
+        title = self._html_search_regex(
+            r'<a href="/programs/[^"]+" target="_blank">([^<]+)</a>',
+            webpage, 'title')
+
+        air_date = self._html_search_regex(
+            r'(?s)<div class="date">(.+?)</div>',
+            webpage, 'date', fatal=False, default=None)
+
+        if air_date:
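+            # collapse runs of repeated whitespace in the scraped date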
+            air_date = re.sub(r'(\s)\1+', r'\1', air_date)
+            if air_date:
+                title = '%s - %s' % (title, air_date)
+
+        return {
+            'id': video_id,
+            'url': audio_url,
+            'title': title,
+        }
diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py
new file mode 100644 (file)
index 0000000..df11dc2
--- /dev/null
@@ -0,0 +1,131 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class EggheadCourseIE(InfoExtractor):
+    IE_DESC = 'egghead.io course'
+    IE_NAME = 'egghead:course'
+    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
+        'playlist_count': 29,
+        'info_dict': {
+            'id': '72',
+            'title': 'Professor Frisby Introduces Composable Functional JavaScript',
+            'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$',
+        },
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        lessons = self._download_json(
+            'https://egghead.io/api/v1/series/%s/lessons' % playlist_id,
+            playlist_id, 'Downloading course lessons JSON')
+
+        entries = []
+        for lesson in lessons:
+            lesson_url = url_or_none(lesson.get('http_url'))
+            if not lesson_url:
+                continue
+            lesson_id = lesson.get('id')
+            if lesson_id:
+                lesson_id = compat_str(lesson_id)
+            entries.append(self.url_result(
+                lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id))
+
+        course = self._download_json(
+            'https://egghead.io/api/v1/series/%s' % playlist_id,
+            playlist_id, 'Downloading course JSON', fatal=False) or {}
+
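+        # Prefer the numeric course id over the URL slug when the course JSON
+        # is available.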
+        playlist_id = course.get('id')
+        if playlist_id:
+            playlist_id = compat_str(playlist_id)
+
+        return self.playlist_result(
+            entries, playlist_id, course.get('title'),
+            course.get('description'))
+
+
+class EggheadLessonIE(InfoExtractor):
+    IE_DESC = 'egghead.io lesson'
+    IE_NAME = 'egghead:lesson'
+    _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+        'info_dict': {
+            'id': '1196',
+            'display_id': 'javascript-linear-data-flow-with-container-style-types-box',
+            'ext': 'mp4',
+            'title': 'Create linear data flow with container style types (Box)',
+            'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
+            'thumbnail': r're:^https?:.*\.jpg$',
+            'timestamp': 1481296768,
+            'upload_date': '20161209',
+            'duration': 304,
+            'view_count': 0,
+            'tags': ['javascript', 'free'],
+        },
+        'params': {
+            'skip_download': True,
+            'format': 'bestvideo',
+        },
+    }, {
+        'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        lesson = self._download_json(
+            'https://egghead.io/api/v1/lessons/%s' % display_id, display_id)
+
+        lesson_id = compat_str(lesson['id'])
+        title = lesson['title']
+
+        formats = []
+        for _, format_url in lesson['media_urls'].items():
+            format_url = url_or_none(format_url)
+            if not format_url:
+                continue
+            ext = determine_ext(format_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, lesson_id, 'mp4', entry_protocol='m3u8',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, lesson_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': lesson_id,
+            'display_id': display_id,
+            'title': title,
+            'description': lesson.get('summary'),
+            'thumbnail': lesson.get('thumb_nail'),
+            'timestamp': unified_timestamp(lesson.get('published_at')),
+            'duration': int_or_none(lesson.get('duration')),
+            'view_count': int_or_none(lesson.get('plays_count')),
+            'tags': try_get(lesson, lambda x: x['tag_list'], list),
+            'series': try_get(
+                lesson, lambda x: x['series']['title'], compat_str),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
new file mode 100644 (file)
index 0000000..b1cd4f5
--- /dev/null
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class EHowIE(InfoExtractor):
+    IE_NAME = 'eHow'
+    _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
+        'md5': '9809b4e3f115ae2088440bcb4efbf371',
+        'info_dict': {
+            'id': '12245069',
+            'ext': 'flv',
+            'title': 'Hardwood Flooring Basics',
+            'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...',
+            'uploader': 'Erick Nathan',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
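+        # The player is configured via a 'file='/'source=' query value, which
+        # is URL-encoded and must be unquoted to obtain the real stream URL.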
+        video_url = self._search_regex(
+            r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL')
+        final_url = compat_urllib_parse_unquote(video_url)
+        uploader = self._html_search_meta('uploader', webpage)
+        title = self._og_search_title(webpage).replace(' | eHow', '')
+
+        return {
+            'id': video_id,
+            'url': final_url,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+            'uploader': uploader,
+        }
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py
new file mode 100644 (file)
index 0000000..5ededd3
--- /dev/null
@@ -0,0 +1,167 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import random
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+)
+
+
+class EightTracksIE(InfoExtractor):
+    IE_NAME = '8tracks'
+    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
+    _TEST = {
+        'name': 'EightTracks',
+        'url': 'http://8tracks.com/ytdl/youtube-dlc-test-tracks-a',
+        'info_dict': {
+            'id': '1336550',
+            'display_id': 'youtube-dlc-test-tracks-a',
+            'description': "test chars:  \"'/\\ä↭",
+            'title': "youtube-dlc test tracks \"'/\\ä↭<>",
+        },
+        'playlist': [
+            {
+                'md5': '96ce57f24389fc8734ce47f4c1abcc55',
+                'info_dict': {
+                    'id': '11885610',
+                    'ext': 'm4a',
+                    'title': "youtue-dl project<>\"' - youtube-dlc test track 1 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': '4ab26f05c1f7291ea460a3920be8021f',
+                'info_dict': {
+                    'id': '11885608',
+                    'ext': 'm4a',
+                    'title': "youtube-dlc project - youtube-dlc test track 2 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': 'd30b5b5f74217410f4689605c35d1fd7',
+                'info_dict': {
+                    'id': '11885679',
+                    'ext': 'm4a',
+                    'title': "youtube-dlc project as well - youtube-dlc test track 3 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': '4eb0a669317cd725f6bbd336a29f923a',
+                'info_dict': {
+                    'id': '11885680',
+                    'ext': 'm4a',
+                    'title': "youtube-dlc project as well - youtube-dlc test track 4 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': '1893e872e263a2705558d1d319ad19e8',
+                'info_dict': {
+                    'id': '11885682',
+                    'ext': 'm4a',
+                    'title': "PH - youtube-dlc test track 5 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': 'b673c46f47a216ab1741ae8836af5899',
+                'info_dict': {
+                    'id': '11885683',
+                    'ext': 'm4a',
+                    'title': "PH - youtube-dlc test track 6 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': '1d74534e95df54986da7f5abf7d842b7',
+                'info_dict': {
+                    'id': '11885684',
+                    'ext': 'm4a',
+                    'title': "phihag - youtube-dlc test track 7 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            },
+            {
+                'md5': 'f081f47af8f6ae782ed131d38b9cd1c0',
+                'info_dict': {
+                    'id': '11885685',
+                    'ext': 'm4a',
+                    'title': "phihag - youtube-dlc test track 8 \"'/\\\u00e4\u21ad",
+                    'uploader_id': 'ytdl'
+                }
+            }
+        ]
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        data = self._parse_json(
+            self._search_regex(
+                r"(?s)PAGE\.mix\s*=\s*({.+?});\n", webpage, 'trax information'),
+            playlist_id)
+
+        session = str(random.randint(0, 1000000000))
+        mix_id = data['id']
+        track_count = data['tracks_count']
+        duration = data['duration']
+        avg_song_duration = float(duration) / track_count
+        # duration is sometimes negative; fall back to a predefined average
+        if avg_song_duration <= 0:
+            avg_song_duration = 300
+        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
+        next_url = first_url
+        entries = []
+
+        for i in range(track_count):
+            api_json = None
+            download_tries = 0
+
+            while api_json is None:
+                try:
+                    api_json = self._download_webpage(
+                        next_url, playlist_id,
+                        note='Downloading song information %d/%d' % (i + 1, track_count),
+                        errnote='Failed to download song information')
+                except ExtractorError:
+                    if download_tries > 3:
+                        raise
+                    else:
+                        download_tries += 1
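+                        # 8tracks appears to hand out the next track only
+                        # after roughly a song's worth of time has passed,
+                        # so wait about one average song length and retry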
+                        self._sleep(avg_song_duration, playlist_id)
+
+            api_data = json.loads(api_json)
+            track_data = api_data['set']['track']
+            info = {
+                'id': compat_str(track_data['id']),
+                'url': track_data['track_file_stream_url'],
+                'title': track_data['performer'] + ' - ' + track_data['name'],
+                'raw_title': track_data['name'],
+                'uploader_id': data['user']['login'],
+                'ext': 'm4a',
+            }
+            entries.append(info)
+
+            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (
+                session, mix_id, track_data['id'])
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': compat_str(mix_id),
+            'display_id': playlist_id,
+            'title': data.get('name'),
+            'description': data.get('description'),
+        }
diff --git a/youtube_dl/extractor/einthusan.py b/youtube_dl/extractor/einthusan.py
new file mode 100644
index 0000000..4e0f8bc
--- /dev/null
@@ -0,0 +1,111 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    extract_attributes,
+    ExtractorError,
+    get_elements_by_class,
+    urlencode_postdata,
+)
+
+
+class EinthusanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?P<host>einthusan\.(?:tv|com|ca))/movie/watch/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://einthusan.tv/movie/watch/9097/',
+        'md5': 'ff0f7f2065031b8a2cf13a933731c035',
+        'info_dict': {
+            'id': '9097',
+            'ext': 'mp4',
+            'title': 'Ae Dil Hai Mushkil',
+            'description': 'md5:33ef934c82a671a94652a9b4e54d931b',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'https://einthusan.tv/movie/watch/51MZ/?lang=hindi',
+        'only_matching': True,
+    }, {
+        'url': 'https://einthusan.com/movie/watch/9097/',
+        'only_matching': True,
+    }, {
+        'url': 'https://einthusan.ca/movie/watch/4E9n/?lang=hindi',
+        'only_matching': True,
+    }]
+
+    # reversed from jsoncrypto.prototype.decrypt() in einthusan-PGMovieWatcher.js
+    def _decrypt(self, encrypted_data, video_id):
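+        # the payload is base64 with a simple shuffle applied: the 11th
+        # and 12th characters are dropped and the final character spliced
+        # in at their place (offsets observed in the player JS, so they
+        # may change between site versions)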
+        return self._parse_json(compat_b64decode((
+            encrypted_data[:10] + encrypted_data[-1] + encrypted_data[12:-1]
+        )).decode('utf-8'), video_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<h3>([^<]+)</h3>', webpage, 'title')
+
+        player_params = extract_attributes(self._search_regex(
+            r'(<section[^>]+id="UIVideoPlayer"[^>]+>)', webpage, 'player parameters'))
+
+        page_id = self._html_search_regex(
+            '<html[^>]+data-pageid="([^"]+)"', webpage, 'page ID')
+        video_data = self._download_json(
+            'https://%s/ajax/movie/watch/%s/' % (host, video_id), video_id,
+            data=urlencode_postdata({
+                'xEvent': 'UIVideoPlayer.PingOutcome',
+                'xJson': json.dumps({
+                    'EJOutcomes': player_params['data-ejpingables'],
+                    'NativeHLS': False
+                }),
+                'arcVersion': 3,
+                'appVersion': 59,
+                'gorilla.csrf.Token': page_id,
+            }))['Data']
+
+        if isinstance(video_data, compat_str) and video_data.startswith('/ratelimited/'):
+            raise ExtractorError(
+                'Download rate reached. Please try again later.', expected=True)
+
+        ej_links = self._decrypt(video_data['EJLinks'], video_id)
+
+        formats = []
+
+        m3u8_url = ej_links.get('HLSLink')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native'))
+
+        mp4_url = ej_links.get('MP4Link')
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
+            })
+
+        self._sort_formats(formats)
+
+        description = get_elements_by_class('synopsis', webpage)[0]
+        thumbnail = self._html_search_regex(
+            r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''',
+            webpage, 'thumbnail url', fatal=False, group='url')
+        if thumbnail is not None:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py
new file mode 100644
index 0000000..ee5ead1
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    sanitized_Request,
+)
+
+
+class EitbIE(InfoExtractor):
+    IE_NAME = 'eitb.tv'
+    _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/',
+        'md5': 'edf4436247185adee3ea18ce64c47998',
+        'info_dict': {
+            'id': '4090227752001',
+            'ext': 'mp4',
+            'title': '60 minutos (Lasa y Zabala, 30 años)',
+            'description': 'Programa de reportajes de actualidad.',
+            'duration': 3996.76,
+            'timestamp': 1381789200,
+            'upload_date': '20131014',
+            'tags': list,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id,
+            video_id, 'Downloading video JSON')
+
+        media = video['web_media'][0]
+
+        formats = []
+        for rendition in media['RENDITIONS']:
+            video_url = rendition.get('PMD_URL')
+            if not video_url:
+                continue
+            tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000)
+            format_id = 'http'
+            if tbr:
+                format_id += '-%d' % int(tbr)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': int_or_none(rendition.get('FRAME_WIDTH')),
+                'height': int_or_none(rendition.get('FRAME_HEIGHT')),
+                'tbr': tbr,
+            })
+
+        hls_url = media.get('HLS_SURL')
+        if hls_url:
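+            # HLS playback needs an auth token appended as ?hdnts=...;
+            # the token endpoint is queried with the page URL as Referer
+            # (an Akamai-style scheme, by the look of it)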
+            request = sanitized_Request(
+                'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/',
+                headers={'Referer': url})
+            token_data = self._download_json(
+                request, video_id, 'Downloading auth token', fatal=False)
+            if token_data:
+                token = token_data.get('token')
+                if token:
+                    formats.extend(self._extract_m3u8_formats(
+                        '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False))
+
+        hds_url = media.get('HDS_SURL')
+        if hds_url:
+            formats.extend(self._extract_f4m_formats(
+                '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'),
+                video_id, f4m_id='hds', fatal=False))
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'],
+            'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'),
+            'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'),
+            'duration': float_or_none(media.get('LENGTH'), 1000),
+            'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '),
+            'tags': media.get('TAGS'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ellentube.py b/youtube_dl/extractor/ellentube.py
new file mode 100644
index 0000000..5444732
--- /dev/null
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    extract_attributes,
+    float_or_none,
+    int_or_none,
+    try_get,
+)
+
+
+class EllenTubeBaseIE(InfoExtractor):
+    def _extract_data_config(self, webpage, video_id):
+        details = self._search_regex(
+            r'(<[^>]+\bdata-component=(["\'])[Dd]etails.+?></div>)', webpage,
+            'details')
+        return self._parse_json(
+            extract_attributes(details)['data-config'], video_id)
+
+    def _extract_video(self, data, video_id):
+        title = data['title']
+
+        formats = []
+        duration = None
+        for entry in data.get('media') or []:
+            if entry.get('id') == 'm3u8':
+                formats = self._extract_m3u8_formats(
+                    entry['url'], video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls')
+                duration = int_or_none(entry.get('duration'))
+                break
+        self._sort_formats(formats)
+
+        def get_insight(kind):
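+            # data['insight'] keys its counters as 'views'/'likes'; the
+            # singular kind name is pluralized to look them up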
+            return int_or_none(try_get(
+                data, lambda x: x['insight']['%ss' % kind]))
+
+        return {
+            'extractor_key': EllenTubeIE.ie_key(),
+            'id': video_id,
+            'title': title,
+            'description': data.get('description'),
+            'duration': duration,
+            'thumbnail': data.get('thumbnail'),
+            'timestamp': float_or_none(data.get('publishTime'), scale=1000),
+            'view_count': get_insight('view'),
+            'like_count': get_insight('like'),
+            'formats': formats,
+        }
+
+
+class EllenTubeIE(EllenTubeBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            ellentube:|
+                            https://api-prod\.ellentube\.com/ellenapi/api/item/
+                        )
+                        (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
+                    '''
+    _TESTS = [{
+        'url': 'https://api-prod.ellentube.com/ellenapi/api/item/0822171c-3829-43bf-b99f-d77358ae75e3',
+        'md5': '2fabc277131bddafdd120e0fc0f974c9',
+        'info_dict': {
+            'id': '0822171c-3829-43bf-b99f-d77358ae75e3',
+            'ext': 'mp4',
+            'title': 'Ellen Meets Las Vegas Survivors Jesus Campos and Stephen Schuck',
+            'description': 'md5:76e3355e2242a78ad9e3858e5616923f',
+            'thumbnail': r're:^https?://.+?',
+            'duration': 514,
+            'timestamp': 1508505120,
+            'upload_date': '20171020',
+            'view_count': int,
+            'like_count': int,
+        }
+    }, {
+        'url': 'ellentube:734a3353-f697-4e79-9ca9-bfc3002dc1e0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        data = self._download_json(
+            'https://api-prod.ellentube.com/ellenapi/api/item/%s' % video_id,
+            video_id)
+        return self._extract_video(data, video_id)
+
+
+class EllenTubeVideoIE(EllenTubeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/video/(?P<id>.+?)\.html'
+    _TEST = {
+        'url': 'https://www.ellentube.com/video/ellen-meets-las-vegas-survivors-jesus-campos-and-stephen-schuck.html',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._extract_data_config(webpage, display_id)['id']
+        return self.url_result(
+            'ellentube:%s' % video_id, ie=EllenTubeIE.ie_key(),
+            video_id=video_id)
+
+
+class EllenTubePlaylistIE(EllenTubeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+?)\.html'
+    _TESTS = [{
+        'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html',
+        'info_dict': {
+            'id': 'dax-shepard-jordan-fisher-haim',
+            'title': "Dax Shepard, 'DWTS' Team Jordan Fisher & Lindsay Arnold, HAIM",
+            'description': 'md5:bfc982194dabb3f4e325e43aa6b2e21c',
+        },
+        'playlist_count': 6,
+    }, {
+        'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        data = self._extract_data_config(webpage, display_id)['data']
+        feed = self._download_json(
+            'https://api-prod.ellentube.com/ellenapi/api/feed/?%s'
+            % data['filter'], display_id)
+        entries = [
+            self._extract_video(elem, elem['id'])
+            for elem in feed if elem.get('type') == 'VIDEO' and elem.get('id')]
+        return self.playlist_result(
+            entries, display_id, data.get('title'),
+            clean_html(data.get('description')))
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
new file mode 100644
index 0000000..b89f6db
--- /dev/null
@@ -0,0 +1,95 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import strip_jsonp, unified_strdate
+
+
+class ElPaisIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
+    IE_DESC = 'El País'
+
+    _TESTS = [{
+        'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
+        'md5': '98406f301f19562170ec071b83433d55',
+        'info_dict': {
+            'id': 'tiempo-nuevo-recetas-viejas',
+            'ext': 'mp4',
+            'title': 'Tiempo nuevo, recetas viejas',
+            'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
+            'upload_date': '20140206',
+        }
+    }, {
+        'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t',
+        'md5': '3bd5b09509f3519d7d9e763179b013de',
+        'info_dict': {
+            'id': '1456340311_668921',
+            'ext': 'mp4',
+            'title': 'Cómo hacer el mejor café con cafetera italiana',
+            'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.',
+            'upload_date': '20160303',
+        }
+    }, {
+        'url': 'http://elpais.com/elpais/2017/01/26/ciencia/1485456786_417876.html',
+        'md5': '9c79923a118a067e1a45789e1e0b0f9c',
+        'info_dict': {
+            'id': '1485456786_417876',
+            'ext': 'mp4',
+            'title': 'Hallado un barco de la antigua Roma que naufragó en Baleares hace 1.800 años',
+            'description': 'La nave portaba cientos de ánforas y se hundió cerca de la isla de Cabrera por razones desconocidas',
+            'upload_date': '20170127',
+        },
+    }, {
+        'url': 'http://epv.elpais.com/epv/2017/02/14/programa_la_voz_de_inaki/1487062137_075943.html',
+        'info_dict': {
+            'id': '1487062137_075943',
+            'ext': 'mp4',
+            'title': 'Disyuntivas',
+            'description': 'md5:a0fb1485c4a6a8a917e6f93878e66218',
+            'upload_date': '20170214',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
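+        # the page defines a common URL prefix (url_cache) plus per-video
+        # suffixes; full media and thumbnail URLs are prefix + suffix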
+        prefix = self._html_search_regex(
+            r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')
+        id_multimedia = self._search_regex(
+            r"id_multimedia\s*=\s*'([^']+)'", webpage, 'ID multimedia', default=None)
+        if id_multimedia:
+            url_info = self._download_json(
+                'http://elpais.com/vdpep/1/?pepid=' + id_multimedia, video_id, transform_source=strip_jsonp)
+            video_suffix = url_info['mp4']
+        else:
+            video_suffix = self._search_regex(
+                r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')
+        video_url = prefix + video_suffix
+        thumbnail_suffix = self._search_regex(
+            r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'",
+            webpage, 'thumbnail URL', default=None)
+        thumbnail = (
+            None if thumbnail_suffix is None
+            else prefix + thumbnail_suffix) or self._og_search_thumbnail(webpage)
+        title = self._html_search_regex(
+            (r"tituloVideo\s*=\s*'([^']+)'",
+             r'<h2 class="entry-header entry-title.*?>(.*?)</h2>',
+             r'<h1[^>]+class="titulo"[^>]*>([^<]+)'),
+            webpage, 'title', default=None) or self._og_search_title(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'<p class="date-header date-int updated"\s+title="([^"]+)">',
+            webpage, 'upload date', default=None) or self._html_search_meta(
+            'datePublished', webpage, 'timestamp'))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/embedly.py b/youtube_dl/extractor/embedly.py
new file mode 100644
index 0000000..a5820b2
--- /dev/null
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class EmbedlyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|cdn)\.)?embedly\.com/widgets/media\.html\?(?:[^#]*?&)?url=(?P<id>[^#&]+)'
+    _TESTS = [{
+        'url': 'https://cdn.embedly.com/widgets/media.html?src=http%3A%2F%2Fwww.youtube.com%2Fembed%2Fvideoseries%3Flist%3DUUGLim4T2loE5rwCMdpCIPVg&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DSU4fj_aEMVw%26list%3DUUGLim4T2loE5rwCMdpCIPVg&image=http%3A%2F%2Fi.ytimg.com%2Fvi%2FSU4fj_aEMVw%2Fhqdefault.jpg&key=8ee8a2e6a8cc47aab1a5ee67f9a178e0&type=text%2Fhtml&schema=youtube&autoplay=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self.url_result(compat_urllib_parse_unquote(self._match_id(url)))
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
new file mode 100644
index 0000000..65635c1
--- /dev/null
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class EngadgetIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        # video with 5min ID
+        'url': 'http://www.engadget.com/video/518153925/',
+        'md5': 'c6820d4828a5064447a4d9fc73f312c9',
+        'info_dict': {
+            'id': '518153925',
+            'ext': 'mp4',
+            'title': 'Samsung Galaxy Tab Pro 8.4 Review',
+        },
+        'add_ie': ['FiveMin'],
+    }, {
+        # video with vidible ID
+        'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result('aol-video:%s' % video_id)
diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py
new file mode 100644
index 0000000..fe42821
--- /dev/null
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    encode_base_n,
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+    parse_duration,
+    str_to_int,
+    url_or_none,
+)
+
+
+class EpornerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?'
+    _TESTS = [{
+        'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
+        'md5': '39d486f046212d8e1b911c52ab4691f8',
+        'info_dict': {
+            'id': 'qlDUmNsj6VS',
+            'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video',
+            'ext': 'mp4',
+            'title': 'Infamous Tiffany Teen Strip Tease Video',
+            'description': 'md5:764f39abf932daafa37485eb46efa152',
+            'timestamp': 1232520922,
+            'upload_date': '20090121',
+            'duration': 1838,
+            'view_count': int,
+            'age_limit': 18,
+        },
+        'params': {
+            'proxy': '127.0.0.1:8118'
+        }
+    }, {
+        # New (May 2016) URL layout
+        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage, urlh = self._download_webpage_handle(url, display_id)
+
+        video_id = self._match_id(urlh.geturl())
+
+        video_hash = self._search_regex(
+            r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash')
+
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+            r'<title>(.+?) - EPORNER', webpage, 'title')
+
+        # Reverse engineered from vjs.js
+        def calc_hash(s):
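+            # split the 32-char hex hash into four 8-char chunks and
+            # re-encode each chunk in base 36, mirroring the player JS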
+            return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8)))
+
+        video = self._download_json(
+            'http://www.eporner.com/xhr/video/%s' % video_id,
+            display_id, note='Downloading video JSON',
+            query={
+                'hash': calc_hash(video_hash),
+                'device': 'generic',
+                'domain': 'www.eporner.com',
+                'fallback': 'false',
+            })
+
+        if video.get('available') is False:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, video['message']), expected=True)
+
+        sources = video['sources']
+
+        formats = []
+        for kind, formats_dict in sources.items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_dict in formats_dict.items():
+                if not isinstance(format_dict, dict):
+                    continue
+                src = url_or_none(format_dict.get('src'))
+                if not src or not src.startswith('http'):
+                    continue
+                if kind == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        src, display_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id=kind, fatal=False))
+                else:
+                    height = int_or_none(self._search_regex(
+                        r'(\d+)[pP]', format_id, 'height', default=None))
+                    fps = int_or_none(self._search_regex(
+                        r'(\d+)fps', format_id, 'fps', default=None))
+
+                    formats.append({
+                        'url': src,
+                        'format_id': format_id,
+                        'height': height,
+                        'fps': fps,
+                    })
+        self._sort_formats(formats)
+
+        json_ld = self._search_json_ld(webpage, display_id, default={})
+
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, default=None))
+        view_count = str_to_int(self._search_regex(
+            r'id="cinemaviews">\s*([0-9,]+)\s*<small>views',
+            webpage, 'view count', fatal=False))
+
+        return merge_dicts(json_ld, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+            'age_limit': 18,
+        })
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
new file mode 100644
index 0000000..c08643a
--- /dev/null
@@ -0,0 +1,95 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlencode
+from ..utils import (
+    ExtractorError,
+    unescapeHTML
+)
+
+
+class EroProfileIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
+    _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?'
+    _NETRC_MACHINE = 'eroprofile'
+    _TESTS = [{
+        'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
+        'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
+        'info_dict': {
+            'id': '3733775',
+            'display_id': 'sexy-babe-softcore',
+            'ext': 'm4v',
+            'title': 'sexy babe softcore',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
+        'md5': '1baa9602ede46ce904c431f5418d8916',
+        'info_dict': {
+            'id': '1133519',
+            'ext': 'm4v',
+            'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'skip': 'Requires login',
+    }]
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        query = compat_urllib_parse_urlencode({
+            'username': username,
+            'password': password,
+            'url': 'http://www.eroprofile.com/',
+        })
+        login_url = self._LOGIN_URL + query
+        login_page = self._download_webpage(login_url, None, False)
+
+        m = re.search(r'Your username or password was incorrect\.', login_page)
+        if m:
+            raise ExtractorError(
+                'Wrong username and/or password.', expected=True)
+
+        self.report_login()
+        redirect_url = self._search_regex(
+            r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url')
+        self._download_webpage(redirect_url, None, False)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        m = re.search(r'You must be logged in to view this video\.', webpage)
+        if m:
+            self.raise_login_required('This video requires login')
+
+        video_id = self._search_regex(
+            [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
+            webpage, 'video id', default=None)
+
+        video_url = unescapeHTML(self._search_regex(
+            r'<source src="([^"]+)', webpage, 'video url'))
+        title = self._html_search_regex(
+            r'Title:</th><td>([^<]+)</td>', webpage, 'title')
+        thumbnail = self._search_regex(
+            r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
+            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py
new file mode 100644
index 0000000..4cd815e
--- /dev/null
@@ -0,0 +1,111 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    clean_html,
+    int_or_none,
+    float_or_none,
+)
+
+
+def _decrypt_config(key, string):
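+    # simple XOR cipher reversed from the site player: tile the key to
+    # half the hex string's length, decode the hex string byte-wise, then
+    # XOR each byte against the corresponding key byte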
+    a = ''
+    i = ''
+    r = ''
+
+    while len(a) < (len(string) / 2):
+        a += key
+
+    a = a[0:int(len(string) / 2)]
+
+    t = 0
+    while t < len(string):
+        i += chr(int(string[t] + string[t + 1], 16))
+        t += 2
+
+    icko = list(i)
+
+    for t, c in enumerate(a):
+        r += chr(ord(c) ^ ord(icko[t]))
+
+    return r
+
+
+class EscapistIE(InfoExtractor):
+    _VALID_URL = r'https?://?(?:(?:www|v1)\.)?escapistmagazine\.com/videos/view/[^/]+/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
+        'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
+        'info_dict': {
+            'id': '6618',
+            'ext': 'mp4',
+            'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
+            'title': "Breaking Down Baldur's Gate",
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 264,
+            'uploader': 'The Escapist',
+        }
+    }, {
+        'url': 'http://www.escapistmagazine.com/videos/view/zero-punctuation/10044-Evolve-One-vs-Multiplayer',
+        'md5': '9e8c437b0dbb0387d3bd3255ca77f6bf',
+        'info_dict': {
+            'id': '10044',
+            'ext': 'mp4',
+            'description': 'This week, Zero Punctuation reviews Evolve.',
+            'title': 'Evolve - One vs Multiplayer',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 304,
+            'uploader': 'The Escapist',
+        }
+    }, {
+        'url': 'http://escapistmagazine.com/videos/view/the-escapist-presents/6618',
+        'only_matching': True,
+    }, {
+        'url': 'https://v1.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        ims_video = self._parse_json(
+            self._search_regex(
+                r'imsVideo\.play\(({.+?})\);', webpage, 'imsVideo'),
+            video_id)
+        video_id = ims_video['videoID']
+        key = ims_video['hash']
+
+        config = self._download_webpage(
+            'http://www.escapistmagazine.com/videos/vidconfig.php',
+            video_id, 'Downloading video config', headers={
+                'Referer': url,
+            }, query={
+                'videoID': video_id,
+                'hash': key,
+            })
+
+        data = self._parse_json(_decrypt_config(key, config), video_id)
+
+        video_data = data['videoData']
+
+        title = clean_html(video_data['title'])
+
+        formats = [{
+            'url': video['src'],
+            'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']),
+            'height': int_or_none(video.get('res')),
+        } for video in data['files']['videos']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage) or data.get('poster'),
+            'description': self._og_search_description(webpage),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'uploader': video_data.get('publisher'),
+            'series': video_data.get('show'),
+        }
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
new file mode 100644
index 0000000..6cf05e6
--- /dev/null
@@ -0,0 +1,238 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .once import OnceIE
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class ESPNIE(OnceIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:
+                                (?:
+                                    (?:(?:\w+\.)+)?espn\.go|
+                                    (?:www\.)?espn
+                                )\.com/
+                                (?:
+                                    (?:
+                                        video/(?:clip|iframe/twitter)|
+                                        watch/player
+                                    )
+                                    (?:
+                                        .*?\?.*?\bid=|
+                                        /_/id/
+                                    )|
+                                    [^/]+/video/
+                                )
+                            )|
+                            (?:www\.)?espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/
+                        )
+                        (?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'http://espn.go.com/video/clip?id=10365079',
+        'info_dict': {
+            'id': '10365079',
+            'ext': 'mp4',
+            'title': '30 for 30 Shorts: Judging Jewell',
+            'description': 'md5:39370c2e016cb4ecf498ffe75bef7f0f',
+            'timestamp': 1390936111,
+            'upload_date': '20140128',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://broadband.espn.go.com/video/clip?id=18910086',
+        'info_dict': {
+            'id': '18910086',
+            'ext': 'mp4',
+            'title': 'Kyrie spins around defender for two',
+            'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b',
+            'timestamp': 1489539155,
+            'upload_date': '20170315',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672',
+        'only_matching': True,
+    }, {
+        'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player?id=19141491',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/watch/player/_/id/19141491',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/video/clip?id=10365079',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/video/clip/_/id/17989860',
+        'only_matching': True,
+    }, {
+        'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espnfc.us/video/espn-fc-tv/86/video/3319154/nashville-unveiled-as-the-newest-club-in-mls',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        clip = self._download_json(
+            'http://api-app.espn.com/v1/video/clips/%s' % video_id,
+            video_id)['videos'][0]
+
+        title = clip['headline']
+
+        format_urls = set()
+        formats = []
+
+        def traverse_source(source, base_source_id=None):
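+            # walk the nested links dict depth-first, deriving compound
+            # format ids from the nesting path (e.g. 'mobile-<name>')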
+            for source_id, source in source.items():
+                if source_id == 'alert':
+                    continue
+                elif isinstance(source, compat_str):
+                    extract_source(source, base_source_id)
+                elif isinstance(source, dict):
+                    traverse_source(
+                        source,
+                        '%s-%s' % (base_source_id, source_id)
+                        if base_source_id else source_id)
+
+        def extract_source(source_url, source_id=None):
+            if source_url in format_urls:
+                return
+            format_urls.add(source_url)
+            ext = determine_ext(source_url)
+            if OnceIE.suitable(source_url):
+                formats.extend(self._extract_once_formats(source_url))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    source_url, video_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    source_url, video_id, f4m_id=source_id, fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=source_id, fatal=False))
+            else:
+                f = {
+                    'url': source_url,
+                    'format_id': source_id,
+                }
+                mobj = re.search(r'(\d+)p(\d+)_(\d+)k\.', source_url)
+                if mobj:
+                    f.update({
+                        'height': int(mobj.group(1)),
+                        'fps': int(mobj.group(2)),
+                        'tbr': int(mobj.group(3)),
+                    })
+                if source_id == 'mezzanine':
+                    f['preference'] = 1
+                formats.append(f)
+
+        links = clip.get('links', {})
+        traverse_source(links.get('source', {}))
+        traverse_source(links.get('mobile', {}))
+        self._sort_formats(formats)
+
+        description = clip.get('caption') or clip.get('description')
+        thumbnail = clip.get('thumbnail')
+        duration = int_or_none(clip.get('duration'))
+        timestamp = unified_timestamp(clip.get('originalPublishDate'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class ESPNArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://espn.go.com/nba/recap?gameId=400793786',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings',
+        'only_matching': True,
+    }, {
+        'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._search_regex(
+            r'class=(["\']).*?video-play-button.*?\1[^>]+data-id=["\'](?P<id>\d+)',
+            webpage, 'video id', group='id')
+
+        return self.url_result(
+            'http://espn.go.com/video/clip?id=%s' % video_id, ESPNIE.ie_key())
+
+
+class FiveThirtyEightIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fivethirtyeight\.com/features/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://fivethirtyeight.com/features/how-the-6-8-raiders-can-still-make-the-playoffs/',
+        'info_dict': {
+            'id': '56032156',
+            'ext': 'flv',
+            'title': 'FiveThirtyEight: The Raiders can still make the playoffs',
+            'description': 'Neil Paine breaks down the simplest scenario that will put the Raiders into the playoffs at 8-8.',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        embed_url = self._search_regex(
+            r'<iframe[^>]+src=["\'](https?://fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/\d+)',
+            webpage, 'embed url')
+
+        return self.url_result(embed_url, 'AbcNewsVideo')
diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py
new file mode 100644
index 0000000..e9dcaeb
--- /dev/null
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+    unified_strdate,
+)
+
+
+class EsriVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications',
+        'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc',
+        'info_dict': {
+            'id': '1124',
+            'ext': 'mp4',
+            'title': 'ArcGIS Online - Developing Applications',
+            'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 185,
+            'upload_date': '20120419',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = []
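+        # downloads are listed as plain HTML: one <li> per resolution,
+        # with an <a> per container format and a human-readable filesize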
+        for width, height, content in re.findall(
+                r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage):
+            for video_url, ext, filesize in re.findall(
+                    r'<a[^>]+href="([^"]+)">([^<]+)&nbsp;\(([^<]+)\)</a>', content):
+                formats.append({
+                    'url': compat_urlparse.urljoin(url, video_url),
+                    'ext': ext.lower(),
+                    'format_id': '%s-%s' % (ext.lower(), height),
+                    'width': int(width),
+                    'height': int(height),
+                    'filesize_approx': parse_filesize(filesize),
+                })
+        self._sort_formats(formats)
+
+        title = self._html_search_meta('title', webpage, 'title')
+        description = self._html_search_meta(
+            'description', webpage, 'description', fatal=False)
+
+        thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail)
+
+        duration = int_or_none(self._search_regex(
+            [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"],
+            webpage, 'duration', fatal=False))
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'last-modified', webpage, 'upload date', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py
new file mode 100644
index 0000000..1efc0b2
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    orderedSet,
+    parse_duration,
+    qualities,
+    unified_strdate,
+    xpath_text
+)
+
+
+class EuropaIE(InfoExtractor):
+    _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9-]+)'
+    _TESTS = [{
+        'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
+        'md5': '574f080699ddd1e19a675b0ddf010371',
+        'info_dict': {
+            'id': 'I107758',
+            'ext': 'mp4',
+            'title': 'TRADE - Wikileaks on TTIP',
+            'description': 'NEW  LIVE EC Midday press briefing of 11/08/2015',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20150811',
+            'duration': 34,
+            'view_count': int,
+            'formats': 'mincount:3',
+        }
+    }, {
+        'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
+        'only_matching': True,
+    }, {
+        'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        playlist = self._download_xml(
+            'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id)
+
+        def get_item(type_, preference):
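+            # collect the per-language labels for this field, then return
+            # the first one available in the caller's preference order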
+            items = {}
+            for item in playlist.findall('./info/%s/item' % type_):
+                lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
+                if lang and label:
+                    items[lang] = label.strip()
+            for p in preference:
+                if items.get(p):
+                    return items[p]
+
+        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        preferred_lang = query.get('sitelang', ('en', ))[0]
+
+        preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
+
+        title = get_item('title', preferred_langs) or video_id
+        description = get_item('description', preferred_langs)
+        thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail')
+        upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
+        duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
+        view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
+
+        language_preference = qualities(preferred_langs[::-1])
+
+        formats = []
+        for file_ in playlist.findall('./files/file'):
+            video_url = xpath_text(file_, './url')
+            if not video_url:
+                continue
+            lang = xpath_text(file_, './lg')
+            formats.append({
+                'url': video_url,
+                'format_id': lang,
+                'format_note': xpath_text(file_, './lglabel'),
+                'language_preference': language_preference(lang)
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/everyonesmixtape.py b/youtube_dl/extractor/everyonesmixtape.py
new file mode 100644
index 0000000..84a9b75
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    sanitized_Request,
+)
+
+
+class EveryonesMixtapeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]+))?$'
+
+    _TESTS = [{
+        'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5',
+        'info_dict': {
+            'id': '5bfseWNmlds',
+            'ext': 'mp4',
+            'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)",
+            'uploader': 'FKR.TV',
+            'uploader_id': 'frenchkissrecords',
+            'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com",
+            'upload_date': '20081015'
+        },
+        'params': {
+            'skip_download': True,  # This is simply YouTube
+        }
+    }, {
+        'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi',
+        'info_dict': {
+            'id': 'm7m0jJAbMQi',
+            'title': 'Driving',
+        },
+        'playlist_count': 24
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+
+        pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id
+        pllist_req = sanitized_Request(pllist_url)
+        pllist_req.add_header('X-Requested-With', 'XMLHttpRequest')
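+        # the endpoint presumably only answers XHR-style requests, hence
+        # the X-Requested-With header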
+
+        playlist_list = self._download_json(
+            pllist_req, playlist_id, note='Downloading playlist metadata')
+        try:
+            playlist_no = next(playlist['id']
+                               for playlist in playlist_list
+                               if playlist['code'] == playlist_id)
+        except StopIteration:
+            raise ExtractorError('Playlist id not found')
+
+        pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no
+        pl_req = sanitized_Request(pl_url)
+        pl_req.add_header('X-Requested-With', 'XMLHttpRequest')
+        playlist = self._download_json(
+            pl_req, playlist_id, note='Downloading playlist info')
+
+        entries = [{
+            '_type': 'url',
+            'url': t['url'],
+            'title': t['title'],
+        } for t in playlist['tracks']]
+
+        if mobj.group('songnr'):
+            songnr = int(mobj.group('songnr')) - 1
+            return entries[songnr]
+
+        playlist_title = playlist['mixData']['name']
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_title,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py
new file mode 100644
index 0000000..95a8977
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
+
+
+class ExpoTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])'
+    _TEST = {
+        'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916',
+        'md5': 'fe1d728c3a813ff78f595bc8b7a707a8',
+        'info_dict': {
+            'id': '667916',
+            'ext': 'mp4',
+            'title': 'NYX Butter Lipstick Little Susie',
+            'description': 'Goes on like butter, but looks better!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Stephanie S.',
+            'upload_date': '20150520',
+            'view_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        player_key = self._search_regex(
+            r'<param name="playerKey" value="([^"]+)"', webpage, 'player key')
+        config = self._download_json(
+            'http://client.expotv.com/video/config/%s/%s' % (video_id, player_key),
+            video_id, 'Downloading video configuration')
+
+        formats = []
+        for fcfg in config['sources']:
+            media_url = fcfg.get('file')
+            if not media_url:
+                continue
+            if fcfg.get('type') == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls'))
+            else:
+                formats.append({
+                    'url': media_url,
+                    'height': int_or_none(fcfg.get('height')),
+                    'format_id': fcfg.get('label'),
+                    'ext': self._search_regex(
+                        r'filename=.*\.([a-z0-9_A-Z]+)&', media_url,
+                        'file extension', default=None) or fcfg.get('type'),
+                })
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = config.get('image')
+        view_count = int_or_none(self._search_regex(
+            r'<h5>Plays: ([0-9]+)</h5>', webpage, 'view counts'))
+        uploader = self._search_regex(
+            r'<div class="reviewer">\s*<img alt="([^"]+)"', webpage, 'uploader',
+            fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date',
+            fatal=False), day_first=False)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'view_count': view_count,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/expressen.py b/youtube_dl/extractor/expressen.py
new file mode 100644
index 0000000..f793650
--- /dev/null
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    unescapeHTML,
+    unified_timestamp,
+)
+
+
+class ExpressenIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?expressen\.se/
+                        (?:(?:tvspelare/video|videoplayer/embed)/)?
+                        tv/(?:[^/]+/)*
+                        (?P<id>[^/?#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
+        'md5': '2fbbe3ca14392a6b1b36941858d33a45',
+        'info_dict': {
+            'id': '8690962',
+            'ext': 'mp4',
+            'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
+            'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 788,
+            'timestamp': 1526639109,
+            'upload_date': '20180518',
+        },
+    }, {
+        'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.expressen.se/tvspelare/video/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url') for mobj in re.finditer(
+                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
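+        # the player element carries HTML-escaped JSON in data-* attributes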
+        def extract_data(name):
+            return self._parse_json(
+                self._search_regex(
+                    r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
+                    webpage, 'info', group='value'),
+                display_id, transform_source=unescapeHTML)
+
+        info = extract_data('video-tracking-info')
+        video_id = info['videoId']
+
+        data = extract_data('article-data')
+        stream = data['stream']
+
+        if determine_ext(stream) == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                stream, display_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        else:
+            formats = [{
+                'url': stream,
+            }]
+        self._sort_formats(formats)
+
+        title = info.get('titleRaw') or data['title']
+        description = info.get('descriptionRaw')
+        thumbnail = info.get('socialMediaImage') or data.get('image')
+        duration = int_or_none(info.get('videoTotalSecondsDuration')
+                               or data.get('totalSecondsDuration'))
+        timestamp = unified_timestamp(info.get('publishDate'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
new file mode 100644 (file)
index 0000000..6c281ef
--- /dev/null
@@ -0,0 +1,1524 @@
+# flake8: noqa
+from __future__ import unicode_literals
+
+from .abc import (
+    ABCIE,
+    ABCIViewIE,
+)
+from .abcnews import (
+    AbcNewsIE,
+    AbcNewsVideoIE,
+)
+from .abcotvs import (
+    ABCOTVSIE,
+    ABCOTVSClipsIE,
+)
+from .academicearth import AcademicEarthCourseIE
+from .acast import (
+    ACastIE,
+    ACastChannelIE,
+)
+from .adn import ADNIE
+from .adobeconnect import AdobeConnectIE
+from .adobetv import (
+    AdobeTVEmbedIE,
+    AdobeTVIE,
+    AdobeTVShowIE,
+    AdobeTVChannelIE,
+    AdobeTVVideoIE,
+)
+from .adultswim import AdultSwimIE
+from .aenetworks import (
+    AENetworksIE,
+    HistoryTopicIE,
+)
+from .afreecatv import AfreecaTVIE
+from .airmozilla import AirMozillaIE
+from .aljazeera import AlJazeeraIE
+from .alphaporno import AlphaPornoIE
+from .amcnetworks import AMCNetworksIE
+from .americastestkitchen import AmericasTestKitchenIE
+from .animeondemand import AnimeOnDemandIE
+from .anvato import AnvatoIE
+from .aol import AolIE
+from .allocine import AllocineIE
+from .aliexpress import AliExpressLiveIE
+from .apa import APAIE
+from .aparat import AparatIE
+from .appleconnect import AppleConnectIE
+from .appletrailers import (
+    AppleTrailersIE,
+    AppleTrailersSectionIE,
+)
+from .archiveorg import ArchiveOrgIE
+from .arkena import ArkenaIE
+from .ard import (
+    ARDBetaMediathekIE,
+    ARDIE,
+    ARDMediathekIE,
+)
+from .arte import (
+    ArteTVPlus7IE,
+    ArteTVEmbedIE,
+    ArteTVPlaylistIE,
+)
+from .asiancrush import (
+    AsianCrushIE,
+    AsianCrushPlaylistIE,
+)
+from .atresplayer import AtresPlayerIE
+from .atttechchannel import ATTTechChannelIE
+from .atvat import ATVAtIE
+from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
+from .audiomack import AudiomackIE, AudiomackAlbumIE
+from .awaan import (
+    AWAANIE,
+    AWAANVideoIE,
+    AWAANLiveIE,
+    AWAANSeasonIE,
+)
+from .azmedien import AZMedienIE
+from .baidu import BaiduVideoIE
+from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
+from .bbc import (
+    BBCCoUkIE,
+    BBCCoUkArticleIE,
+    BBCCoUkIPlayerPlaylistIE,
+    BBCCoUkPlaylistIE,
+    BBCIE,
+)
+from .beampro import (
+    BeamProLiveIE,
+    BeamProVodIE,
+)
+from .beeg import BeegIE
+from .behindkink import BehindKinkIE
+from .bellmedia import BellMediaIE
+from .beatport import BeatportIE
+from .bet import BetIE
+from .bfi import BFIPlayerIE
+from .bigflix import BigflixIE
+from .bild import BildIE
+from .bilibili import (
+    BiliBiliIE,
+    BiliBiliBangumiIE,
+    BilibiliAudioIE,
+    BilibiliAudioAlbumIE,
+    BiliBiliPlayerIE,
+)
+from .biobiochiletv import BioBioChileTVIE
+from .bitchute import (
+    BitChuteIE,
+    BitChuteChannelIE,
+)
+from .biqle import BIQLEIE
+from .bleacherreport import (
+    BleacherReportIE,
+    BleacherReportCMSIE,
+)
+from .blinkx import BlinkxIE
+from .bloomberg import BloombergIE
+from .bokecc import BokeCCIE
+from .bostonglobe import BostonGlobeIE
+from .bpb import BpbIE
+from .br import (
+    BRIE,
+    BRMediathekIE,
+)
+from .bravotv import BravoTVIE
+from .breakcom import BreakIE
+from .brightcove import (
+    BrightcoveLegacyIE,
+    BrightcoveNewIE,
+)
+from .businessinsider import BusinessInsiderIE
+from .buzzfeed import BuzzFeedIE
+from .byutv import BYUtvIE
+from .c56 import C56IE
+from .camdemy import (
+    CamdemyIE,
+    CamdemyFolderIE
+)
+from .cammodels import CamModelsIE
+from .camtube import CamTubeIE
+from .camwithher import CamWithHerIE
+from .canalplus import CanalplusIE
+from .canalc2 import Canalc2IE
+from .canvas import (
+    CanvasIE,
+    CanvasEenIE,
+    VrtNUIE,
+)
+from .carambatv import (
+    CarambaTVIE,
+    CarambaTVPageIE,
+)
+from .cartoonnetwork import CartoonNetworkIE
+from .cbc import (
+    CBCIE,
+    CBCPlayerIE,
+    CBCWatchVideoIE,
+    CBCWatchIE,
+    CBCOlympicsIE,
+)
+from .cbs import CBSIE
+from .cbslocal import CBSLocalIE
+from .cbsinteractive import CBSInteractiveIE
+from .cbsnews import (
+    CBSNewsEmbedIE,
+    CBSNewsIE,
+    CBSNewsLiveVideoIE,
+)
+from .cbssports import CBSSportsIE
+from .ccc import (
+    CCCIE,
+    CCCPlaylistIE,
+)
+from .ccma import CCMAIE
+from .cctv import CCTVIE
+from .cda import CDAIE
+from .ceskatelevize import (
+    CeskaTelevizeIE,
+    CeskaTelevizePoradyIE,
+)
+from .channel9 import Channel9IE
+from .charlierose import CharlieRoseIE
+from .chaturbate import ChaturbateIE
+from .chilloutzone import ChilloutzoneIE
+from .chirbit import (
+    ChirbitIE,
+    ChirbitProfileIE,
+)
+from .cinchcast import CinchcastIE
+from .cinemax import CinemaxIE
+from .ciscolive import (
+    CiscoLiveSessionIE,
+    CiscoLiveSearchIE,
+)
+from .cjsw import CJSWIE
+from .cliphunter import CliphunterIE
+from .clippit import ClippitIE
+from .cliprs import ClipRsIE
+from .clipsyndicate import ClipsyndicateIE
+from .closertotruth import CloserToTruthIE
+from .cloudflarestream import CloudflareStreamIE
+from .cloudy import CloudyIE
+from .clubic import ClubicIE
+from .clyp import ClypIE
+from .cmt import CMTIE
+from .cnbc import (
+    CNBCIE,
+    CNBCVideoIE,
+)
+from .cnn import (
+    CNNIE,
+    CNNBlogsIE,
+    CNNArticleIE,
+)
+from .coub import CoubIE
+from .comedycentral import (
+    ComedyCentralFullEpisodesIE,
+    ComedyCentralIE,
+    ComedyCentralShortnameIE,
+    ComedyCentralTVIE,
+    ToshIE,
+)
+from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
+from .commonprotocols import (
+    MmsIE,
+    RtmpIE,
+)
+from .condenast import CondeNastIE
+from .contv import CONtvIE
+from .corus import CorusIE
+from .cracked import CrackedIE
+from .crackle import CrackleIE
+from .crooksandliars import CrooksAndLiarsIE
+from .crunchyroll import (
+    CrunchyrollIE,
+    CrunchyrollShowPlaylistIE
+)
+from .cspan import CSpanIE
+from .ctsnews import CtsNewsIE
+from .ctvnews import CTVNewsIE
+from .cultureunplugged import CultureUnpluggedIE
+from .curiositystream import (
+    CuriosityStreamIE,
+    CuriosityStreamCollectionIE,
+)
+from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionPlaylistIE,
+    DailymotionUserIE,
+)
+from .daum import (
+    DaumIE,
+    DaumClipIE,
+    DaumPlaylistIE,
+    DaumUserIE,
+)
+from .dbtv import DBTVIE
+from .dctp import DctpTvIE
+from .deezer import DeezerPlaylistIE
+from .democracynow import DemocracynowIE
+from .dfb import DFBIE
+from .dhm import DHMIE
+from .digg import DiggIE
+from .dotsub import DotsubIE
+from .douyutv import (
+    DouyuShowIE,
+    DouyuTVIE,
+)
+from .dplay import DPlayIE
+from .drbonanza import DRBonanzaIE
+from .drtuber import DrTuberIE
+from .drtv import (
+    DRTVIE,
+    DRTVLiveIE,
+)
+from .dtube import DTubeIE
+from .dvtv import DVTVIE
+from .dumpert import DumpertIE
+from .defense import DefenseGouvFrIE
+from .discovery import DiscoveryIE
+from .discoverygo import (
+    DiscoveryGoIE,
+    DiscoveryGoPlaylistIE,
+)
+from .discoverynetworks import DiscoveryNetworksDeIE
+from .discoveryvr import DiscoveryVRIE
+from .disney import DisneyIE
+from .dispeak import DigitallySpeakingIE
+from .doodstream import DoodStreamIE
+from .dropbox import DropboxIE
+from .dw import (
+    DWIE,
+    DWArticleIE,
+)
+from .eagleplatform import EaglePlatformIE
+from .ebaumsworld import EbaumsWorldIE
+from .echomsk import EchoMskIE
+from .egghead import (
+    EggheadCourseIE,
+    EggheadLessonIE,
+)
+from .ehow import EHowIE
+from .eighttracks import EightTracksIE
+from .einthusan import EinthusanIE
+from .eitb import EitbIE
+from .ellentube import (
+    EllenTubeIE,
+    EllenTubeVideoIE,
+    EllenTubePlaylistIE,
+)
+from .elpais import ElPaisIE
+from .embedly import EmbedlyIE
+from .engadget import EngadgetIE
+from .eporner import EpornerIE
+from .eroprofile import EroProfileIE
+from .escapist import EscapistIE
+from .espn import (
+    ESPNIE,
+    ESPNArticleIE,
+    FiveThirtyEightIE,
+)
+from .esri import EsriVideoIE
+from .europa import EuropaIE
+from .everyonesmixtape import EveryonesMixtapeIE
+from .expotv import ExpoTVIE
+from .expressen import ExpressenIE
+from .extremetube import ExtremeTubeIE
+from .eyedotv import EyedoTVIE
+from .facebook import (
+    FacebookIE,
+    FacebookPluginsVideoIE,
+)
+from .faz import FazIE
+from .fc2 import (
+    FC2IE,
+    FC2EmbedIE,
+)
+from .fczenit import FczenitIE
+from .filmon import (
+    FilmOnIE,
+    FilmOnChannelIE,
+)
+from .filmweb import FilmwebIE
+from .firsttv import FirstTVIE
+from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
+from .flickr import FlickrIE
+from .folketinget import FolketingetIE
+from .footyroom import FootyRoomIE
+from .formula1 import Formula1IE
+from .fourtube import (
+    FourTubeIE,
+    PornTubeIE,
+    PornerBrosIE,
+    FuxIE,
+)
+from .fox import FOXIE
+from .fox9 import (
+    FOX9IE,
+    FOX9NewsIE,
+)
+from .foxgay import FoxgayIE
+from .foxnews import (
+    FoxNewsIE,
+    FoxNewsArticleIE,
+)
+from .foxsports import FoxSportsIE
+from .franceculture import FranceCultureIE
+from .franceinter import FranceInterIE
+from .francetv import (
+    FranceTVIE,
+    FranceTVSiteIE,
+    FranceTVEmbedIE,
+    FranceTVInfoIE,
+    FranceTVInfoSportIE,
+    FranceTVJeunesseIE,
+    GenerationWhatIE,
+    CultureboxIE,
+)
+from .freesound import FreesoundIE
+from .freespeech import FreespeechIE
+from .freshlive import FreshLiveIE
+from .frontendmasters import (
+    FrontendMastersIE,
+    FrontendMastersLessonIE,
+    FrontendMastersCourseIE
+)
+from .funimation import FunimationIE
+from .funk import FunkIE
+from .fusion import FusionIE
+from .fxnetworks import FXNetworksIE
+from .gaia import GaiaIE
+from .gameinformer import GameInformerIE
+from .gamespot import GameSpotIE
+from .gamestar import GameStarIE
+from .gaskrank import GaskrankIE
+from .gazeta import GazetaIE
+from .gdcvault import GDCVaultIE
+from .generic import GenericIE
+from .gfycat import GfycatIE
+from .giantbomb import GiantBombIE
+from .giga import GigaIE
+from .glide import GlideIE
+from .globo import (
+    GloboIE,
+    GloboArticleIE,
+)
+from .go import GoIE
+from .godtube import GodTubeIE
+from .golem import GolemIE
+from .googledrive import GoogleDriveIE
+from .googleplus import GooglePlusIE
+from .googlesearch import GoogleSearchIE
+from .goshgay import GoshgayIE
+from .gputechconf import GPUTechConfIE
+from .groupon import GrouponIE
+from .hbo import HBOIE
+from .hearthisat import HearThisAtIE
+from .heise import HeiseIE
+from .hellporno import HellPornoIE
+from .helsinki import HelsinkiIE
+from .hentaistigma import HentaiStigmaIE
+from .hgtv import HGTVComShowIE
+from .hketv import HKETVIE
+from .hidive import HiDiveIE
+from .historicfilms import HistoricFilmsIE
+from .hitbox import HitboxIE, HitboxLiveIE
+from .hitrecord import HitRecordIE
+from .hornbunny import HornBunnyIE
+from .hotnewhiphop import HotNewHipHopIE
+from .hotstar import (
+    HotStarIE,
+    HotStarPlaylistIE,
+)
+from .howcast import HowcastIE
+from .howstuffworks import HowStuffWorksIE
+from .hrfensehen import HRFernsehenIE
+from .hrti import (
+    HRTiIE,
+    HRTiPlaylistIE,
+)
+from .huajiao import HuajiaoIE
+from .huffpost import HuffPostIE
+from .hungama import (
+    HungamaIE,
+    HungamaSongIE,
+)
+from .hypem import HypemIE
+from .ign import (
+    IGNIE,
+    OneUPIE,
+    PCMagIE,
+)
+from .imdb import (
+    ImdbIE,
+    ImdbListIE
+)
+from .imgur import (
+    ImgurIE,
+    ImgurAlbumIE,
+    ImgurGalleryIE,
+)
+from .ina import InaIE
+from .inc import IncIE
+from .indavideo import IndavideoEmbedIE
+from .infoq import InfoQIE
+from .instagram import (
+    InstagramIE,
+    InstagramUserIE,
+    InstagramTagIE,
+)
+from .internazionale import InternazionaleIE
+from .internetvideoarchive import InternetVideoArchiveIE
+from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
+from .ir90tv import Ir90TvIE
+from .itv import (
+    ITVIE,
+    ITVBTCCIE,
+)
+from .ivi import (
+    IviIE,
+    IviCompilationIE
+)
+from .ivideon import IvideonIE
+from .iwara import IwaraIE
+from .izlesene import IzleseneIE
+from .jamendo import (
+    JamendoIE,
+    JamendoAlbumIE,
+)
+from .jeuxvideo import JeuxVideoIE
+from .jove import JoveIE
+from .joj import JojIE
+from .jwplatform import JWPlatformIE
+from .kakao import KakaoIE
+from .kaltura import KalturaIE
+from .kanalplay import KanalPlayIE
+from .kankan import KankanIE
+from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
+from .keezmovies import KeezMoviesIE
+from .ketnet import KetnetIE
+from .khanacademy import KhanAcademyIE
+from .kickstarter import KickStarterIE
+from .kinja import KinjaEmbedIE
+from .kinopoisk import KinoPoiskIE
+from .konserthusetplay import KonserthusetPlayIE
+from .krasview import KrasViewIE
+from .ku6 import Ku6IE
+from .kusi import KUSIIE
+from .kuwo import (
+    KuwoIE,
+    KuwoAlbumIE,
+    KuwoChartIE,
+    KuwoSingerIE,
+    KuwoCategoryIE,
+    KuwoMvIE,
+)
+from .la7 import LA7IE
+from .laola1tv import (
+    Laola1TvEmbedIE,
+    Laola1TvIE,
+    EHFTVIE,
+    ITTFIE,
+)
+from .lci import LCIIE
+from .lcp import (
+    LcpPlayIE,
+    LcpIE,
+)
+from .lecture2go import Lecture2GoIE
+from .lecturio import (
+    LecturioIE,
+    LecturioCourseIE,
+    LecturioDeCourseIE,
+)
+from .leeco import (
+    LeIE,
+    LePlaylistIE,
+    LetvCloudIE,
+)
+from .lego import LEGOIE
+from .lemonde import LemondeIE
+from .lenta import LentaIE
+from .libraryofcongress import LibraryOfCongressIE
+from .libsyn import LibsynIE
+from .lifenews import (
+    LifeNewsIE,
+    LifeEmbedIE,
+)
+from .limelight import (
+    LimelightMediaIE,
+    LimelightChannelIE,
+    LimelightChannelListIE,
+)
+from .line import LineTVIE
+from .linkedin import (
+    LinkedInLearningIE,
+    LinkedInLearningCourseIE,
+)
+from .linuxacademy import LinuxAcademyIE
+from .litv import LiTVIE
+from .livejournal import LiveJournalIE
+from .liveleak import (
+    LiveLeakIE,
+    LiveLeakEmbedIE,
+)
+from .livestream import (
+    LivestreamIE,
+    LivestreamOriginalIE,
+    LivestreamShortenerIE,
+)
+from .lnkgo import LnkGoIE
+from .localnews8 import LocalNews8IE
+from .lovehomeporn import LoveHomePornIE
+from .lrt import LRTIE
+from .lynda import (
+    LyndaIE,
+    LyndaCourseIE
+)
+from .m6 import M6IE
+from .mailru import (
+    MailRuIE,
+    MailRuMusicIE,
+    MailRuMusicSearchIE,
+)
+from .malltv import MallTVIE
+from .mangomolo import (
+    MangomoloVideoIE,
+    MangomoloLiveIE,
+)
+from .manyvids import ManyVidsIE
+from .markiza import (
+    MarkizaIE,
+    MarkizaPageIE,
+)
+from .massengeschmacktv import MassengeschmackTVIE
+from .matchtv import MatchTVIE
+from .mdr import MDRIE
+from .mediaset import MediasetIE
+from .mediasite import (
+    MediasiteIE,
+    MediasiteCatalogIE,
+    MediasiteNamedCatalogIE,
+)
+from .medici import MediciIE
+from .megaphone import MegaphoneIE
+from .meipai import MeipaiIE
+from .melonvod import MelonVODIE
+from .meta import METAIE
+from .metacafe import MetacafeIE
+from .metacritic import MetacriticIE
+from .mgoon import MgoonIE
+from .mgtv import MGTVIE
+from .miaopai import MiaoPaiIE
+from .microsoftvirtualacademy import (
+    MicrosoftVirtualAcademyIE,
+    MicrosoftVirtualAcademyCourseIE,
+)
+from .ministrygrid import MinistryGridIE
+from .minoto import MinotoIE
+from .miomio import MioMioIE
+from .mit import TechTVMITIE, OCWMITIE
+from .mitele import MiTeleIE
+from .mixcloud import (
+    MixcloudIE,
+    MixcloudUserIE,
+    MixcloudPlaylistIE,
+)
+from .mlb import MLBIE
+from .mnet import MnetIE
+from .moevideo import MoeVideoIE
+from .mofosex import (
+    MofosexIE,
+    MofosexEmbedIE,
+)
+from .mojvideo import MojvideoIE
+from .morningstar import MorningstarIE
+from .motherless import (
+    MotherlessIE,
+    MotherlessGroupIE
+)
+from .motorsport import MotorsportIE
+from .movieclips import MovieClipsIE
+from .moviezine import MoviezineIE
+from .movingimage import MovingImageIE
+from .msn import MSNIE
+from .mtv import (
+    MTVIE,
+    MTVVideoIE,
+    MTVServicesEmbeddedIE,
+    MTVDEIE,
+    MTVJapanIE,
+)
+from .muenchentv import MuenchenTVIE
+from .mwave import MwaveIE, MwaveMeetGreetIE
+from .mychannels import MyChannelsIE
+from .myspace import MySpaceIE, MySpaceAlbumIE
+from .myspass import MySpassIE
+from .myvi import (
+    MyviIE,
+    MyviEmbedIE,
+)
+from .myvidster import MyVidsterIE
+from .nationalgeographic import (
+    NationalGeographicVideoIE,
+    NationalGeographicTVIE,
+)
+from .naver import NaverIE
+from .nba import NBAIE
+from .nbc import (
+    CSNNEIE,
+    NBCIE,
+    NBCNewsIE,
+    NBCOlympicsIE,
+    NBCOlympicsStreamIE,
+    NBCSportsIE,
+    NBCSportsStreamIE,
+    NBCSportsVPlayerIE,
+)
+from .ndr import (
+    NDRIE,
+    NJoyIE,
+    NDREmbedBaseIE,
+    NDREmbedIE,
+    NJoyEmbedIE,
+)
+from .ndtv import NDTVIE
+from .netzkino import NetzkinoIE
+from .nerdcubed import NerdCubedFeedIE
+from .neteasemusic import (
+    NetEaseMusicIE,
+    NetEaseMusicAlbumIE,
+    NetEaseMusicSingerIE,
+    NetEaseMusicListIE,
+    NetEaseMusicMvIE,
+    NetEaseMusicProgramIE,
+    NetEaseMusicDjRadioIE,
+)
+from .newgrounds import (
+    NewgroundsIE,
+    NewgroundsPlaylistIE,
+)
+from .newstube import NewstubeIE
+from .nextmedia import (
+    NextMediaIE,
+    NextMediaActionNewsIE,
+    AppleDailyIE,
+    NextTVIE,
+)
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
+from .nfl import NFLIE
+from .nhk import NhkVodIE
+from .nhl import NHLIE
+from .nick import (
+    NickIE,
+    NickBrIE,
+    NickDeIE,
+    NickNightIE,
+    NickRuIE,
+)
+from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .ninecninemedia import NineCNineMediaIE
+from .ninegag import NineGagIE
+from .ninenow import NineNowIE
+from .nintendo import NintendoIE
+from .njpwworld import NJPWWorldIE
+from .nobelprize import NobelPrizeIE
+from .noco import NocoIE
+from .nonktube import NonkTubeIE
+from .noovo import NoovoIE
+from .normalboots import NormalbootsIE
+from .nosvideo import NosVideoIE
+from .nova import (
+    NovaEmbedIE,
+    NovaIE,
+)
+from .nowness import (
+    NownessIE,
+    NownessPlaylistIE,
+    NownessSeriesIE,
+)
+from .noz import NozIE
+from .npo import (
+    AndereTijdenIE,
+    NPOIE,
+    NPOLiveIE,
+    NPORadioIE,
+    NPORadioFragmentIE,
+    SchoolTVIE,
+    HetKlokhuisIE,
+    VPROIE,
+    WNLIE,
+)
+from .npr import NprIE
+from .nrk import (
+    NRKIE,
+    NRKPlaylistIE,
+    NRKSkoleIE,
+    NRKTVIE,
+    NRKTVDirekteIE,
+    NRKTVEpisodeIE,
+    NRKTVEpisodesIE,
+    NRKTVSeasonIE,
+    NRKTVSeriesIE,
+)
+from .nrl import NRLTVIE
+from .ntvcojp import NTVCoJpCUIE
+from .ntvde import NTVDeIE
+from .ntvru import NTVRuIE
+from .nytimes import (
+    NYTimesIE,
+    NYTimesArticleIE,
+)
+from .nuvid import NuvidIE
+from .nzz import NZZIE
+from .odatv import OdaTVIE
+from .odnoklassniki import OdnoklassnikiIE
+from .oktoberfesttv import OktoberfestTVIE
+from .ondemandkorea import OnDemandKoreaIE
+from .onet import (
+    OnetIE,
+    OnetChannelIE,
+    OnetMVPIE,
+    OnetPlIE,
+)
+from .onionstudios import OnionStudiosIE
+from .ooyala import (
+    OoyalaIE,
+    OoyalaExternalIE,
+)
+from .ora import OraTVIE
+from .orf import (
+    ORFTVthekIE,
+    ORFFM4IE,
+    ORFFM4StoryIE,
+    ORFOE1IE,
+    ORFOE3IE,
+    ORFNOEIE,
+    ORFWIEIE,
+    ORFBGLIE,
+    ORFOOEIE,
+    ORFSTMIE,
+    ORFKTNIE,
+    ORFSBGIE,
+    ORFTIRIE,
+    ORFVBGIE,
+    ORFIPTVIE,
+)
+from .outsidetv import OutsideTVIE
+from .packtpub import (
+    PacktPubIE,
+    PacktPubCourseIE,
+)
+from .pandoratv import PandoraTVIE
+from .parliamentliveuk import ParliamentLiveUKIE
+from .patreon import PatreonIE
+from .pbs import PBSIE
+from .pearvideo import PearVideoIE
+from .peertube import PeerTubeIE
+from .people import PeopleIE
+from .performgroup import PerformGroupIE
+from .periscope import (
+    PeriscopeIE,
+    PeriscopeUserIE,
+)
+from .philharmoniedeparis import PhilharmonieDeParisIE
+from .phoenix import PhoenixIE
+from .photobucket import PhotobucketIE
+from .picarto import (
+    PicartoIE,
+    PicartoVodIE,
+)
+from .piksel import PikselIE
+from .pinkbike import PinkbikeIE
+from .pladform import PladformIE
+from .platzi import (
+    PlatziIE,
+    PlatziCourseIE,
+)
+from .playfm import PlayFMIE
+from .playplustv import PlayPlusTVIE
+from .plays import PlaysTVIE
+from .playtvak import PlaytvakIE
+from .playvid import PlayvidIE
+from .playwire import PlaywireIE
+from .pluralsight import (
+    PluralsightIE,
+    PluralsightCourseIE,
+)
+from .podomatic import PodomaticIE
+from .pokemon import PokemonIE
+from .polskieradio import (
+    PolskieRadioIE,
+    PolskieRadioCategoryIE,
+)
+from .popcorntimes import PopcorntimesIE
+from .popcorntv import PopcornTVIE
+from .porn91 import Porn91IE
+from .porncom import PornComIE
+from .pornhd import PornHdIE
+from .pornhub import (
+    PornHubIE,
+    PornHubUserIE,
+    PornHubPagedVideoListIE,
+    PornHubUserVideosUploadIE,
+)
+from .pornotube import PornotubeIE
+from .pornovoisines import PornoVoisinesIE
+from .pornoxo import PornoXOIE
+from .puhutv import (
+    PuhuTVIE,
+    PuhuTVSerieIE,
+)
+from .presstv import PressTVIE
+from .prosiebensat1 import ProSiebenSat1IE
+from .puls4 import Puls4IE
+from .pyvideo import PyvideoIE
+from .qqmusic import (
+    QQMusicIE,
+    QQMusicSingerIE,
+    QQMusicAlbumIE,
+    QQMusicToplistIE,
+    QQMusicPlaylistIE,
+)
+from .r7 import (
+    R7IE,
+    R7ArticleIE,
+)
+from .radiocanada import (
+    RadioCanadaIE,
+    RadioCanadaAudioVideoIE,
+)
+from .radiode import RadioDeIE
+from .radiojavan import RadioJavanIE
+from .radiobremen import RadioBremenIE
+from .radiofrance import RadioFranceIE
+from .rai import (
+    RaiPlayIE,
+    RaiPlayLiveIE,
+    RaiPlayPlaylistIE,
+    RaiIE,
+)
+from .raywenderlich import (
+    RayWenderlichIE,
+    RayWenderlichCourseIE,
+)
+from .rbmaradio import RBMARadioIE
+from .rds import RDSIE
+from .redbulltv import (
+    RedBullTVIE,
+    RedBullTVRrnContentIE,
+)
+from .reddit import (
+    RedditIE,
+    RedditRIE,
+)
+from .redtube import RedTubeIE
+from .regiotv import RegioTVIE
+from .rentv import (
+    RENTVIE,
+    RENTVArticleIE,
+)
+from .restudy import RestudyIE
+from .reuters import ReutersIE
+from .reverbnation import ReverbNationIE
+from .rice import RICEIE
+from .rmcdecouverte import RMCDecouverteIE
+from .ro220 import Ro220IE
+from .rockstargames import RockstarGamesIE
+from .roosterteeth import RoosterTeethIE
+from .rottentomatoes import RottenTomatoesIE
+from .roxwel import RoxwelIE
+from .rozhlas import RozhlasIE
+from .rtbf import RTBFIE
+from .rte import RteIE, RteRadioIE
+from .rtlnl import RtlNlIE
+from .rtl2 import (
+    RTL2IE,
+    RTL2YouIE,
+    RTL2YouSeriesIE,
+)
+from .rtp import RTPIE
+from .rts import RTSIE
+from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVETelevisionIE
+from .rtvnh import RTVNHIE
+from .rtvs import RTVSIE
+from .ruhd import RUHDIE
+from .rutube import (
+    RutubeIE,
+    RutubeChannelIE,
+    RutubeEmbedIE,
+    RutubeMovieIE,
+    RutubePersonIE,
+    RutubePlaylistIE,
+)
+from .rutv import RUTVIE
+from .ruutu import RuutuIE
+from .ruv import RuvIE
+from .safari import (
+    SafariIE,
+    SafariApiIE,
+    SafariCourseIE,
+)
+from .sapo import SapoIE
+from .savefrom import SaveFromIE
+from .sbs import SBSIE
+from .screencast import ScreencastIE
+from .screencastomatic import ScreencastOMaticIE
+from .scrippsnetworks import (
+    ScrippsNetworksWatchIE,
+    ScrippsNetworksIE,
+)
+from .scte import (
+    SCTEIE,
+    SCTECourseIE,
+)
+from .seeker import SeekerIE
+from .senateisvp import SenateISVPIE
+from .sendtonews import SendtoNewsIE
+from .servus import ServusIE
+from .sevenplus import SevenPlusIE
+from .sexu import SexuIE
+from .seznamzpravy import (
+    SeznamZpravyIE,
+    SeznamZpravyArticleIE,
+)
+from .shahid import (
+    ShahidIE,
+    ShahidShowIE,
+)
+from .shared import (
+    SharedIE,
+    VivoIE,
+)
+from .showroomlive import ShowRoomLiveIE
+from .sina import SinaIE
+from .sixplay import SixPlayIE
+from .skylinewebcams import SkylineWebcamsIE
+from .skynewsarabia import (
+    SkyNewsArabiaIE,
+    SkyNewsArabiaArticleIE,
+)
+from .sky import (
+    SkyNewsIE,
+    SkySportsIE,
+)
+from .slideshare import SlideshareIE
+from .slideslive import SlidesLiveIE
+from .slutload import SlutloadIE
+from .smotri import (
+    SmotriIE,
+    SmotriCommunityIE,
+    SmotriUserIE,
+    SmotriBroadcastIE,
+)
+from .snotr import SnotrIE
+from .sohu import SohuIE
+from .sonyliv import SonyLIVIE
+from .soundcloud import (
+    SoundcloudEmbedIE,
+    SoundcloudIE,
+    SoundcloudSetIE,
+    SoundcloudUserIE,
+    SoundcloudTrackStationIE,
+    SoundcloudPlaylistIE,
+    SoundcloudSearchIE,
+)
+from .soundgasm import (
+    SoundgasmIE,
+    SoundgasmProfileIE
+)
+from .southpark import (
+    SouthParkIE,
+    SouthParkDeIE,
+    SouthParkDkIE,
+    SouthParkEsIE,
+    SouthParkNlIE
+)
+from .spankbang import (
+    SpankBangIE,
+    SpankBangPlaylistIE,
+)
+from .spankwire import SpankwireIE
+from .spiegel import SpiegelIE, SpiegelArticleIE
+from .spiegeltv import SpiegeltvIE
+from .spike import (
+    BellatorIE,
+    ParamountNetworkIE,
+)
+from .storyfire import (
+    StoryFireIE,
+    StoryFireUserIE,
+    StoryFireSeriesIE,
+)
+from .stitcher import StitcherIE
+from .sport5 import Sport5IE
+from .sportbox import SportBoxIE
+from .sportdeutschland import SportDeutschlandIE
+from .springboardplatform import SpringboardPlatformIE
+from .sprout import SproutIE
+from .srgssr import (
+    SRGSSRIE,
+    SRGSSRPlayIE,
+)
+from .srmediathek import SRMediathekIE
+from .stanfordoc import StanfordOpenClassroomIE
+from .steam import SteamIE
+from .streamable import StreamableIE
+from .streamcloud import StreamcloudIE
+from .streamcz import StreamCZIE
+from .streetvoice import StreetVoiceIE
+from .stretchinternet import StretchInternetIE
+from .stv import STVPlayerIE
+from .sunporno import SunPornoIE
+from .sverigesradio import (
+    SverigesRadioEpisodeIE,
+    SverigesRadioPublicationIE,
+)
+from .svt import (
+    SVTIE,
+    SVTPageIE,
+    SVTPlayIE,
+    SVTSeriesIE,
+)
+from .swrmediathek import SWRMediathekIE
+from .syfy import SyfyIE
+from .sztvhu import SztvHuIE
+from .tagesschau import (
+    TagesschauPlayerIE,
+    TagesschauIE,
+)
+from .tass import TassIE
+from .tastytrade import TastyTradeIE
+from .tbs import TBSIE
+from .tdslifeway import TDSLifewayIE
+from .teachable import (
+    TeachableIE,
+    TeachableCourseIE,
+)
+from .teachertube import (
+    TeacherTubeIE,
+    TeacherTubeUserIE,
+)
+from .teachingchannel import TeachingChannelIE
+from .teamcoco import TeamcocoIE
+from .teamtreehouse import TeamTreeHouseIE
+from .techtalks import TechTalksIE
+from .ted import TEDIE
+from .tele5 import Tele5IE
+from .tele13 import Tele13IE
+from .telebruxelles import TeleBruxellesIE
+from .telecinco import TelecincoIE
+from .telegraaf import TelegraafIE
+from .telemb import TeleMBIE
+from .telequebec import (
+    TeleQuebecIE,
+    TeleQuebecSquatIE,
+    TeleQuebecEmissionIE,
+    TeleQuebecLiveIE,
+)
+from .teletask import TeleTaskIE
+from .telewebion import TelewebionIE
+from .tennistv import TennisTVIE
+from .tenplay import TenPlayIE
+from .testurl import TestURLIE
+from .tf1 import TF1IE
+from .tfo import TFOIE
+from .theintercept import TheInterceptIE
+from .theplatform import (
+    ThePlatformIE,
+    ThePlatformFeedIE,
+)
+from .thescene import TheSceneIE
+from .thestar import TheStarIE
+from .thesun import TheSunIE
+from .theweatherchannel import TheWeatherChannelIE
+from .thisamericanlife import ThisAmericanLifeIE
+from .thisav import ThisAVIE
+from .thisoldhouse import ThisOldHouseIE
+from .threeqsdn import ThreeQSDNIE
+from .tiktok import (
+    TikTokIE,
+    TikTokUserIE,
+)
+from .tinypic import TinyPicIE
+from .tmz import (
+    TMZIE,
+    TMZArticleIE,
+)
+from .tnaflix import (
+    TNAFlixNetworkEmbedIE,
+    TNAFlixIE,
+    EMPFlixIE,
+    MovieFapIE,
+)
+from .toggle import ToggleIE
+from .tonline import TOnlineIE
+from .toongoggles import ToonGogglesIE
+from .toutv import TouTvIE
+from .toypics import ToypicsUserIE, ToypicsIE
+from .traileraddict import TrailerAddictIE
+from .trilulilu import TriluliluIE
+from .trunews import TruNewsIE
+from .trutv import TruTVIE
+from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
+from .tumblr import TumblrIE
+from .tunein import (
+    TuneInClipIE,
+    TuneInStationIE,
+    TuneInProgramIE,
+    TuneInTopicIE,
+    TuneInShortenerIE,
+)
+from .tunepk import TunePkIE
+from .turbo import TurboIE
+from .tv2 import (
+    TV2IE,
+    TV2ArticleIE,
+    KatsomoIE,
+)
+from .tv2dk import (
+    TV2DKIE,
+    TV2DKBornholmPlayIE,
+)
+from .tv2hu import TV2HuIE
+from .tv4 import TV4IE
+from .tv5mondeplus import TV5MondePlusIE
+from .tva import TVAIE
+from .tvanouvelles import (
+    TVANouvellesIE,
+    TVANouvellesArticleIE,
+)
+from .tvc import (
+    TVCIE,
+    TVCArticleIE,
+)
+from .tvigle import TvigleIE
+from .tvland import TVLandIE
+from .tvn24 import TVN24IE
+from .tvnet import TVNetIE
+from .tvnoe import TVNoeIE
+from .tvnow import (
+    TVNowIE,
+    TVNowNewIE,
+    TVNowSeasonIE,
+    TVNowAnnualIE,
+    TVNowShowIE,
+)
+from .tvp import (
+    TVPEmbedIE,
+    TVPIE,
+    TVPWebsiteIE,
+)
+from .tvplay import (
+    TVPlayIE,
+    ViafreeIE,
+    TVPlayHomeIE,
+)
+from .tvplayer import TVPlayerIE
+from .tweakers import TweakersIE
+from .twentyfourvideo import TwentyFourVideoIE
+from .twentymin import TwentyMinutenIE
+from .twentythreevideo import TwentyThreeVideoIE
+from .twitcasting import TwitCastingIE
+from .twitch import (
+    TwitchVideoIE,
+    TwitchChapterIE,
+    TwitchVodIE,
+    TwitchProfileIE,
+    TwitchAllVideosIE,
+    TwitchUploadsIE,
+    TwitchPastBroadcastsIE,
+    TwitchHighlightsIE,
+    TwitchStreamIE,
+    TwitchClipsIE,
+)
+from .twitter import (
+    TwitterCardIE,
+    TwitterIE,
+    TwitterAmplifyIE,
+    TwitterBroadcastIE,
+)
+from .udemy import (
+    UdemyIE,
+    UdemyCourseIE
+)
+from .udn import UDNEmbedIE
+from .ufctv import (
+    UFCTVIE,
+    UFCArabiaIE,
+)
+from .uktvplay import UKTVPlayIE
+from .digiteka import DigitekaIE
+from .dlive import (
+    DLiveVODIE,
+    DLiveStreamIE,
+)
+from .umg import UMGDeIE
+from .unistra import UnistraIE
+from .unity import UnityIE
+from .uol import UOLIE
+from .uplynk import (
+    UplynkIE,
+    UplynkPreplayIE,
+)
+from .urort import UrortIE
+from .urplay import URPlayIE
+from .usanetwork import USANetworkIE
+from .usatoday import USATodayIE
+from .ustream import UstreamIE, UstreamChannelIE
+from .ustudio import (
+    UstudioIE,
+    UstudioEmbedIE,
+)
+from .varzesh3 import Varzesh3IE
+from .vbox7 import Vbox7IE
+from .veehd import VeeHDIE
+from .veoh import VeohIE
+from .vesti import VestiIE
+from .vevo import (
+    VevoIE,
+    VevoPlaylistIE,
+)
+from .vgtv import (
+    BTArticleIE,
+    BTVestlendingenIE,
+    VGTVIE,
+)
+from .vh1 import VH1IE
+from .vice import (
+    ViceIE,
+    ViceArticleIE,
+    ViceShowIE,
+)
+from .vidbit import VidbitIE
+from .viddler import ViddlerIE
+from .videa import VideaIE
+from .videodetective import VideoDetectiveIE
+from .videofyme import VideofyMeIE
+from .videomore import (
+    VideomoreIE,
+    VideomoreVideoIE,
+    VideomoreSeasonIE,
+)
+from .videopress import VideoPressIE
+from .vidio import VidioIE
+from .vidlii import VidLiiIE
+from .vidme import (
+    VidmeIE,
+    VidmeUserIE,
+    VidmeUserLikesIE,
+)
+from .vidzi import VidziIE
+from .vier import VierIE, VierVideosIE
+from .viewlift import (
+    ViewLiftIE,
+    ViewLiftEmbedIE,
+)
+from .viidea import ViideaIE
+from .vimeo import (
+    VimeoIE,
+    VimeoAlbumIE,
+    VimeoChannelIE,
+    VimeoGroupsIE,
+    VimeoLikesIE,
+    VimeoOndemandIE,
+    VimeoReviewIE,
+    VimeoUserIE,
+    VimeoWatchLaterIE,
+    VHXEmbedIE,
+)
+from .vimple import VimpleIE
+from .vine import (
+    VineIE,
+    VineUserIE,
+)
+from .viki import (
+    VikiIE,
+    VikiChannelIE,
+)
+from .viqeo import ViqeoIE
+from .viu import (
+    ViuIE,
+    ViuPlaylistIE,
+    ViuOTTIE,
+)
+from .vk import (
+    VKIE,
+    VKUserVideosIE,
+    VKWallPostIE,
+)
+from .vlive import (
+    VLiveIE,
+    VLiveChannelIE,
+    VLivePlaylistIE
+)
+from .vodlocker import VodlockerIE
+from .vodpl import VODPlIE
+from .vodplatform import VODPlatformIE
+from .voicerepublic import VoiceRepublicIE
+from .voot import VootIE
+from .voxmedia import (
+    VoxMediaVolumeIE,
+    VoxMediaIE,
+)
+from .vrt import VRTIE
+from .vrak import VrakIE
+from .vrv import (
+    VRVIE,
+    VRVSeriesIE,
+)
+from .vshare import VShareIE
+from .medialaan import MedialaanIE
+from .vube import VubeIE
+from .vuclip import VuClipIE
+from .vvvvid import VVVVIDIE
+from .vyborymos import VyboryMosIE
+from .vzaar import VzaarIE
+from .wakanim import WakanimIE
+from .walla import WallaIE
+from .washingtonpost import (
+    WashingtonPostIE,
+    WashingtonPostArticleIE,
+)
+from .wat import WatIE
+from .watchbox import WatchBoxIE
+from .watchindianporn import WatchIndianPornIE
+from .wdr import (
+    WDRIE,
+    WDRPageIE,
+    WDRElefantIE,
+    WDRMobileIE,
+)
+from .webcaster import (
+    WebcasterIE,
+    WebcasterFeedIE,
+)
+from .webofstories import (
+    WebOfStoriesIE,
+    WebOfStoriesPlaylistIE,
+)
+from .weibo import (
+    WeiboIE,
+    WeiboMobileIE
+)
+from .weiqitv import WeiqiTVIE
+from .wistia import WistiaIE
+from .worldstarhiphop import WorldStarHipHopIE
+from .wsj import (
+    WSJIE,
+    WSJArticleIE,
+)
+from .wwe import WWEIE
+from .xbef import XBefIE
+from .xboxclips import XboxClipsIE
+from .xfileshare import XFileShareIE
+from .xhamster import (
+    XHamsterIE,
+    XHamsterEmbedIE,
+    XHamsterUserIE,
+)
+from .xiami import (
+    XiamiSongIE,
+    XiamiAlbumIE,
+    XiamiArtistIE,
+    XiamiCollectionIE
+)
+from .ximalaya import (
+    XimalayaIE,
+    XimalayaAlbumIE
+)
+from .xminus import XMinusIE
+from .xnxx import XNXXIE
+from .xstream import XstreamIE
+from .xtube import XTubeUserIE, XTubeIE
+from .xuite import XuiteIE
+from .xvideos import XVideosIE
+from .xxxymovies import XXXYMoviesIE
+from .yahoo import (
+    YahooIE,
+    YahooSearchIE,
+    YahooGyaOPlayerIE,
+    YahooGyaOIE,
+    YahooJapanNewsIE,
+)
+from .yandexdisk import YandexDiskIE
+from .yandexmusic import (
+    YandexMusicTrackIE,
+    YandexMusicAlbumIE,
+    YandexMusicPlaylistIE,
+)
+from .yandexvideo import YandexVideoIE
+from .yapfiles import YapFilesIE
+from .yesjapan import YesJapanIE
+from .yinyuetai import YinYueTaiIE
+from .ynet import YnetIE
+from .youjizz import YouJizzIE
+from .youku import (
+    YoukuIE,
+    YoukuShowIE,
+)
+from .younow import (
+    YouNowLiveIE,
+    YouNowChannelIE,
+    YouNowMomentIE,
+)
+from .youporn import YouPornIE
+from .yourporn import YourPornIE
+from .yourupload import YourUploadIE
+from .youtube import (
+    YoutubeIE,
+    YoutubeChannelIE,
+    YoutubeFavouritesIE,
+    YoutubeHistoryIE,
+    YoutubeLiveIE,
+    YoutubePlaylistIE,
+    YoutubePlaylistsIE,
+    YoutubeRecommendedIE,
+    YoutubeSearchDateIE,
+    YoutubeSearchIE,
+    YoutubeSearchURLIE,
+    YoutubeShowIE,
+    YoutubeSubscriptionsIE,
+    YoutubeTruncatedIDIE,
+    YoutubeTruncatedURLIE,
+    YoutubeUserIE,
+    YoutubeWatchLaterIE,
+)
+from .zapiks import ZapiksIE
+from .zaq1 import Zaq1IE
+from .zattoo import (
+    BBVTVIE,
+    EinsUndEinsTVIE,
+    EWETVIE,
+    GlattvisionTVIE,
+    MNetTVIE,
+    MyVisionTVIE,
+    NetPlusIE,
+    OsnatelTVIE,
+    QuantumTVIE,
+    QuicklineIE,
+    QuicklineLiveIE,
+    SaltTVIE,
+    SAKTVIE,
+    VTXTVIE,
+    WalyTVIE,
+    ZattooIE,
+    ZattooLiveIE,
+)
+from .zdf import ZDFIE, ZDFChannelIE
+from .zingmp3 import ZingMp3IE
+from .zype import ZypeIE
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
new file mode 100644 (file)
index 0000000..acd4090
--- /dev/null
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+from ..utils import str_to_int
+from .keezmovies import KeezMoviesIE
+
+
+class ExtremeTubeIE(KeezMoviesIE):
+    _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
+    _TESTS = [{
+        'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+        'md5': '92feaafa4b58e82f261e5419f39c60cb',
+        'info_dict': {
+            'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+            'ext': 'mp4',
+            'title': 'Music Video 14 british euro brit european cumshots swallow',
+            'uploader': 'anonim',
+            'view_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://www.extremetube.com/gay/video/abcde-1234',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.extremetube.com/video/652431',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        webpage, info = self._extract_info(url)
+
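+        # the shared KeezMovies extraction provides formats and base info;
+        # only title, uploader and view count need site-specific parsing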
+        if not info['title']:
+            info['title'] = self._search_regex(
+                r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title')
+
+        uploader = self._html_search_regex(
+            r'Uploaded by:\s*</[^>]+>\s*<a[^>]+>(.+?)</a>',
+            webpage, 'uploader', fatal=False)
+        view_count = str_to_int(self._search_regex(
+            r'Views:\s*</[^>]+>\s*<[^>]+>([\d,\.]+)</',
+            webpage, 'view count', fatal=False))
+
+        info.update({
+            'uploader': uploader,
+            'view_count': view_count,
+        })
+
+        return info
diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py
new file mode 100644 (file)
index 0000000..f62ddeb
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    parse_duration,
+    ExtractorError,
+)
+
+
+class EyedoTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?eyedo\.tv/[^/]+/(?:#!/)?Live/Detail/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://www.eyedo.tv/en-US/#!/Live/Detail/16301',
+        'md5': 'ba14f17995cdfc20c36ba40e21bf73f7',
+        'info_dict': {
+            'id': '16301',
+            'ext': 'mp4',
+            'title': 'Journée du conseil scientifique de l\'Afnic 2015',
+            'description': 'md5:4abe07293b2f73efc6e1c37028d58c98',
+            'uploader': 'Afnic Live',
+            'uploader_id': '8023',
+        }
+    }
+    _ROOT_URL = 'http://live.eyedo.net:1935/'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_xml('http://eyedo.tv/api/live/GetLive/%s' % video_id, video_id)
+
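+        # all fields in the API response live in a single XML namespace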
+        def _add_ns(path):
+            return self._xpath_ns(path, 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api')
+
+        title = xpath_text(video_data, _add_ns('Titre'), 'title', True)
+        state_live_code = xpath_text(video_data, _add_ns('StateLiveCode'), 'state live code', True)
+        if state_live_code == 'avenir':
+            raise ExtractorError(
+                '%s said: We\'re sorry, but this video is not yet available.' % self.IE_NAME,
+                expected=True)
+
+        is_live = state_live_code == 'live'
+        m3u8_url = None
+        # http://eyedo.tv/Content/Html5/Scripts/html5view.js
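+        # the JS player builds the stream URL the same way: CDN-backed
+        # lives use the xlcdn edge, other lives the Wowza origin, and
+        # replays a VOD playlist on the same origin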
+        if is_live:
+            if xpath_text(video_data, 'Cdn') == 'true':
+                m3u8_url = 'http://rrr.sz.xlcdn.com/?account=eyedo&file=A%s&type=live&service=wowza&protocol=http&output=playlist.m3u8' % video_id
+            else:
+                m3u8_url = self._ROOT_URL + 'w/%s/eyedo_720p/playlist.m3u8' % video_id
+        else:
+            m3u8_url = self._ROOT_URL + 'replay-w/%s/mp4:%s.mp4/playlist.m3u8' % (video_id, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native'),
+            'description': xpath_text(video_data, _add_ns('Description')),
+            'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),
+            'uploader': xpath_text(video_data, _add_ns('Createur')),
+            'uploader_id': xpath_text(video_data, _add_ns('CreateurId')),
+            'chapter': xpath_text(video_data, _add_ns('ChapitreTitre')),
+            'chapter_id': xpath_text(video_data, _add_ns('ChapitreId')),
+        }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
new file mode 100644 (file)
index 0000000..610d667
--- /dev/null
@@ -0,0 +1,514 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import socket
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_etree_fromstring,
+    compat_http_client,
+    compat_urllib_error,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
+)
+from ..utils import (
+    clean_html,
+    error_to_compat_str,
+    ExtractorError,
+    get_element_by_id,
+    int_or_none,
+    js_to_json,
+    limit_length,
+    parse_count,
+    sanitized_Request,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class FacebookIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                (?:
+                    https?://
+                        (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/
+                        (?:[^#]*?\#!/)?
+                        (?:
+                            (?:
+                                video/video\.php|
+                                photo\.php|
+                                video\.php|
+                                video/embed|
+                                story\.php
+                            )\?(?:.*?)(?:v|video_id|story_fbid)=|
+                            [^/]+/videos/(?:[^/]+/)?|
+                            [^/]+/posts/|
+                            groups/[^/]+/permalink/
+                        )|
+                    facebook:
+                )
+                (?P<id>[0-9]+)
+                '''
+    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
+    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
+    _NETRC_MACHINE = 'facebook'
+    IE_NAME = 'facebook'
+
+    _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
+
+    _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
+    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
+
+    _TESTS = [{
+        'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
+        'md5': '6a40d33c0eccbb1af76cf0485a052659',
+        'info_dict': {
+            'id': '637842556329505',
+            'ext': 'mp4',
+            'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
+            'uploader': 'Tennis on Facebook',
+            'upload_date': '20140908',
+            'timestamp': 1410199200,
+        },
+        'skip': 'Requires logging in',
+    }, {
+        'url': 'https://www.facebook.com/video.php?v=274175099429670',
+        'info_dict': {
+            'id': '274175099429670',
+            'ext': 'mp4',
+            'title': 're:^Asif Nawab Butt posted a video',
+            'uploader': 'Asif Nawab Butt',
+            'upload_date': '20140506',
+            'timestamp': 1399398998,
+            'thumbnail': r're:^https?://.*',
+        },
+        'expected_warnings': [
+            'title'
+        ]
+    }, {
+        'note': 'Video with DASH manifest',
+        'url': 'https://www.facebook.com/video.php?v=957955867617029',
+        'md5': 'b2c28d528273b323abe5c6ab59f0f030',
+        'info_dict': {
+            'id': '957955867617029',
+            'ext': 'mp4',
+            'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...',
+            'uploader': 'Demy de Zeeuw',
+            'upload_date': '20160110',
+            'timestamp': 1452431627,
+        },
+        'skip': 'Requires logging in',
+    }, {
+        'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
+        'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
+        'info_dict': {
+            'id': '544765982287235',
+            'ext': 'mp4',
+            'title': '"What are you doing running in the snow?"',
+            'uploader': 'FailArmy',
+        },
+        'skip': 'Video gone',
+    }, {
+        'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
+        'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
+        'info_dict': {
+            'id': '1035862816472149',
+            'ext': 'mp4',
+            'title': 'What the Flock Is Going On In New Zealand  Credit: ViralHog',
+            'uploader': 'S. Saint',
+        },
+        'skip': 'Video gone',
+    }, {
+        'note': 'swf params escaped',
+        'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
+        'md5': '97ba073838964d12c70566e0085c2b91',
+        'info_dict': {
+            'id': '10153664894881749',
+            'ext': 'mp4',
+            'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...',
+            'thumbnail': r're:^https?://.*',
+            'timestamp': 1456259628,
+            'upload_date': '20160223',
+            'uploader': 'Barack Obama',
+        },
+    }, {
+        # have 1080P, but only up to 720p in swf params
+        'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
+        'md5': '9571fae53d4165bbbadb17a94651dcdc',
+        'info_dict': {
+            'id': '10155529876156509',
+            'ext': 'mp4',
+            'title': 'She survived the holocaust — and years later, she’s getting her citizenship s...',
+            'timestamp': 1477818095,
+            'upload_date': '20161030',
+            'uploader': 'CNN',
+            'thumbnail': r're:^https?://.*',
+            'view_count': int,
+        },
+    }, {
+        # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
+        'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
+        'info_dict': {
+            'id': '1417995061575415',
+            'ext': 'mp4',
+            'title': 'md5:1db063d6a8c13faa8da727817339c857',
+            'timestamp': 1486648217,
+            'upload_date': '20170209',
+            'uploader': 'Yaroslav Korpan',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
+        'info_dict': {
+            'id': '1072691702860471',
+            'ext': 'mp4',
+            'title': 'md5:ae2d22a93fbb12dad20dc393a869739d',
+            'timestamp': 1477305000,
+            'upload_date': '20161024',
+            'uploader': 'La Guía Del Varón',
+            'thumbnail': r're:^https?://.*',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
+        'info_dict': {
+            'id': '1396382447100162',
+            'ext': 'mp4',
+            'title': 'md5:19a428bbde91364e3de815383b54a235',
+            'timestamp': 1486035494,
+            'upload_date': '20170202',
+            'uploader': 'Elisabeth Ahtn',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.facebook.com/video.php?v=10204634152394104',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
+        'only_matching': True,
+    }, {
+        'url': 'facebook:544765982287235',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+        'only_matching': True,
+    }, {
+        'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
+        'only_matching': True,
+    }, {
+        # no title
+        'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
+        'info_dict': {
+            'id': '359649331226507',
+            'ext': 'mp4',
+            'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
+            'uploader': 'ESL One Dota 2',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        urls = []
+        for mobj in re.finditer(
+                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
+                webpage):
+            urls.append(mobj.group('url'))
+        # Facebook API embed
+        # see https://developers.facebook.com/docs/plugins/embedded-video-player
+        for mobj in re.finditer(r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook\.com/.+?)(?P=q2)''', webpage):
+            urls.append(mobj.group('url'))
+        return urls
+
+    def _login(self):
+        useremail, password = self._get_login_info()
+        if useremail is None:
+            return
+
+        login_page_req = sanitized_Request(self._LOGIN_URL)
+        self._set_cookie('facebook.com', 'locale', 'en_US')
+        login_page = self._download_webpage(login_page_req, None,
+                                            note='Downloading login page',
+                                            errnote='Unable to download login page')
+        lsd = self._search_regex(
+            r'<input type="hidden" name="lsd" value="([^"]*)"',
+            login_page, 'lsd')
+        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
+
+        login_form = {
+            'email': useremail,
+            'pass': password,
+            'lsd': lsd,
+            'lgnrnd': lgnrnd,
+            'next': 'http://facebook.com/home.php',
+            'default_persistent': '0',
+            'legacy_return': '1',
+            'timezone': '-60',
+            'trynum': '1',
+        }
+        request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        try:
+            login_results = self._download_webpage(request, None,
+                                                   note='Logging in', errnote='unable to log in')
+            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
+                error = self._html_search_regex(
+                    r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>',
+                    login_results, 'login error', default=None, group='error')
+                if error:
+                    raise ExtractorError('Unable to login: %s' % error, expected=True)
+                self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
+                return
+
+            fb_dtsg = self._search_regex(
+                r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None)
+            h = self._search_regex(
+                r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None)
+
+            if not fb_dtsg or not h:
+                return
+
+            check_form = {
+                'fb_dtsg': fb_dtsg,
+                'h': h,
+                'name_action_selected': 'dont_save',
+            }
+            check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
+            check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            check_response = self._download_webpage(check_req, None,
+                                                    note='Confirming login')
+            if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
+                self._downloader.report_warning('Unable to confirm login: you need to log in via your browser and authorize this login attempt.')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err))
+            return
+
+    def _real_initialize(self):
+        self._login()
+
+    def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
+        req = sanitized_Request(url)
+        req.add_header('User-Agent', self._CHROME_USER_AGENT)
+        webpage = self._download_webpage(req, video_id)
+
+        video_data = None
+
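+        # handleServerJS() instances include a 'VideoConfig' entry whose
+        # payload carries the per-format video data list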
+        def extract_video_data(instances):
+            for item in instances:
+                if item[1][0] == 'VideoConfig':
+                    video_item = item[2][0]
+                    if video_item.get('video_id'):
+                        return video_item['videoData']
+
+        server_js_data = self._parse_json(self._search_regex(
+            r'handleServerJS\(({.+})(?:\);|,")', webpage,
+            'server js data', default='{}'), video_id, fatal=False)
+
+        if server_js_data:
+            video_data = extract_video_data(server_js_data.get('instances', []))
+
+        def extract_from_jsmods_instances(js_data):
+            if js_data:
+                return extract_video_data(try_get(
+                    js_data, lambda x: x['jsmods']['instances'], list) or [])
+
+        if not video_data:
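+            # fallback: page content delivered asynchronously via
+            # bigPipe.onPageletArrive also embeds jsmods instances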
+            server_js_data = self._parse_json(
+                self._search_regex(
+                    r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
+                    webpage, 'js data', default='{}'),
+                video_id, transform_source=js_to_json, fatal=False)
+            video_data = extract_from_jsmods_instances(server_js_data)
+
+        if not video_data:
+            if not fatal_if_no_video:
+                return webpage, False
+            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
+            if m_msg is not None:
+                raise ExtractorError(
+                    'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+                    expected=True)
+            elif '>You must log in to continue' in webpage:
+                self.raise_login_required()
+
+            # Video info was not in the first response; retry with the
+            # tahoe-player-specific URL
+            tahoe_data = self._download_webpage(
+                self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
+                data=urlencode_postdata({
+                    '__a': 1,
+                    '__pc': self._search_regex(
+                        r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
+                        'pkg cohort', default='PHASED:DEFAULT'),
+                    '__rev': self._search_regex(
+                        r'client_revision["\']\s*:\s*(\d+),', webpage,
+                        'client revision', default='3944515'),
+                    'fb_dtsg': self._search_regex(
+                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+                        webpage, 'dtsg token', default=''),
+                }),
+                headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                })
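+            # tahoe responses are prefixed with a for(;;); anti-JSON-hijacking
+            # guard; the regex below strips it before parsing the payload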
+            tahoe_js_data = self._parse_json(
+                self._search_regex(
+                    r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
+                    'tahoe js data', default='{}'),
+                video_id, fatal=False)
+            video_data = extract_from_jsmods_instances(tahoe_js_data)
+
+        if not video_data:
+            raise ExtractorError('Cannot parse data')
+
+        subtitles = {}
+        formats = []
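+        # each video_data entry may be a single dict or a list of dicts;
+        # normalize to a list before indexing so malformed entries are skipped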
+        for f in video_data:
+            if f and isinstance(f, dict):
+                f = [f]
+            if not f or not isinstance(f, list):
+                continue
+            format_id = f[0]['stream_type']
+            for quality in ('sd', 'hd'):
+                for src_type in ('src', 'src_no_ratelimit'):
+                    src = f[0].get('%s_%s' % (quality, src_type))
+                    if src:
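+                        # demote progressive downloads below DASH/HLS and
+                        # prefer the HD variant within each stream type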
+                        preference = -10 if format_id == 'progressive' else 0
+                        if quality == 'hd':
+                            preference += 5
+                        formats.append({
+                            'format_id': '%s_%s_%s' % (format_id, quality, src_type),
+                            'url': src,
+                            'preference': preference,
+                        })
+            dash_manifest = f[0].get('dash_manifest')
+            if dash_manifest:
+                formats.extend(self._parse_mpd_formats(
+                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+            subtitles_src = f[0].get('subtitles_src')
+            if subtitles_src:
+                subtitles.setdefault('en', []).append({'url': subtitles_src})
+        if not formats:
+            raise ExtractorError('Cannot find video formats')
+
+        # Downloads with a browser User-Agent are rate limited. Work around
+        # this by using a non-browser User-Agent.
+        for f in formats:
+            f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+
+        self._sort_formats(formats)
+
+        video_title = self._html_search_regex(
+            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
+            'title', default=None)
+        if not video_title:
+            video_title = self._html_search_regex(
+                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
+                webpage, 'alternative title', default=None)
+        if not video_title:
+            video_title = self._html_search_meta(
+                'description', webpage, 'title', default=None)
+        if video_title:
+            video_title = limit_length(video_title, 80)
+        else:
+            video_title = 'Facebook video #%s' % video_id
+        uploader = clean_html(get_element_by_id(
+            'fbPhotoPageAuthorName', webpage)) or self._search_regex(
+            r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
+            default=None) or self._og_search_title(webpage, fatal=False)
+        timestamp = int_or_none(self._search_regex(
+            r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+            'timestamp', default=None))
+        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
+
+        view_count = parse_count(self._search_regex(
+            r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
+            default=None))
+
+        info_dict = {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'thumbnail': thumbnail,
+            'view_count': view_count,
+            'subtitles': subtitles,
+        }
+
+        return webpage, info_dict
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
+        webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
+
+        if info_dict:
+            return info_dict
+
+        if '/posts/' in url:
+            video_id_json = self._search_regex(
+                r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids',
+                default='')
+            if video_id_json:
+                entries = [
+                    self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
+                    for vid in self._parse_json(video_id_json, video_id)]
+                return self.playlist_result(entries, video_id)
+
+            # no video_ids list found; assume the post embeds a single video
+            video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id')
+            return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
+        else:
+            _, info_dict = self._extract_from_url(
+                self._VIDEO_PAGE_TEMPLATE % video_id,
+                video_id, fatal_if_no_video=True)
+            return info_dict
+
+
+class FacebookPluginsVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)'
+
+    _TESTS = [{
+        'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560',
+        'md5': '5954e92cdfe51fe5782ae9bda7058a07',
+        'info_dict': {
+            'id': '10154383743583686',
+            'ext': 'mp4',
+            'title': 'What to do during the haze?',
+            'uploader': 'Gov.sg',
+            'upload_date': '20160826',
+            'timestamp': 1472184808,
+        },
+        'add_ie': [FacebookIE.ie_key()],
+    }, {
+        'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self.url_result(
+            compat_urllib_parse_unquote(self._match_id(url)),
+            FacebookIE.ie_key())
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py
new file mode 100644 (file)
index 0000000..312ee2a
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_etree_fromstring
+from ..utils import (
+    xpath_element,
+    xpath_text,
+    int_or_none,
+)
+
+
+class FazIE(InfoExtractor):
+    IE_NAME = 'faz.net'
+    _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P<id>\d+)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html',
+        'info_dict': {
+            'id': '12610585',
+            'ext': 'mp4',
+            'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher',
+            'description': 'md5:1453fbf9a0d041d985a47306192ea253',
+        },
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/aktuell/politik/-13659345.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.faz.net/foobarblafasel-13659345.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        description = self._og_search_description(webpage)
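+        # data-videojs-media holds either the literal string 'extern'
+        # (external Perform player embed) or an inline XML config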
+        media = self._html_search_regex(
+            r"data-videojs-media='([^']+)",
+            webpage, 'media')
+        if media == 'extern':
+            perform_url = self._search_regex(
+                r"<iframe[^>]+?src='((?:http:)?//player\.performgroup\.com/eplayer/eplayer\.html#/?[0-9a-f]{26}\.[0-9a-z]{26})",
+                webpage, 'perform url')
+            return self.url_result(perform_url)
+        config = compat_etree_fromstring(media)
+
+        encodings = xpath_element(config, 'ENCODINGS', 'encodings', True)
+        formats = []
+        for pref, code in enumerate(['LOW', 'HIGH', 'HQ']):
+            encoding = xpath_element(encodings, code)
+            if encoding is not None:
+                encoding_url = xpath_text(encoding, 'FILENAME')
+                if encoding_url:
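+                    # AVERAGEBITRATE may contain a decimal comma, hence the
+                    # replace below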
+                    tbr = xpath_text(encoding, 'AVERAGEBITRATE', 1000)
+                    if tbr:
+                        tbr = int_or_none(tbr.replace(',', '.'))
+                    f = {
+                        'url': encoding_url,
+                        'format_id': code.lower(),
+                        'quality': pref,
+                        'tbr': tbr,
+                        'vcodec': xpath_text(encoding, 'CODEC'),
+                    }
+                    mobj = re.search(r'(\d+)x(\d+)_(\d+)\.mp4', encoding_url)
+                    if mobj:
+                        f.update({
+                            'width': int(mobj.group(1)),
+                            'height': int(mobj.group(2)),
+                            'tbr': tbr or int(mobj.group(3)),
+                        })
+                    formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'formats': formats,
+            'description': description.strip() if description else None,
+            'thumbnail': xpath_text(config, 'STILL/STILL_BIG'),
+            'duration': int_or_none(xpath_text(config, 'DURATION')),
+        }
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
new file mode 100644 (file)
index 0000000..4355611
--- /dev/null
@@ -0,0 +1,160 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    sanitized_Request,
+    urlencode_postdata,
+)
+
+
+class FC2IE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)'
+    IE_NAME = 'fc2'
+    _NETRC_MACHINE = 'fc2'
+    _TESTS = [{
+        'url': 'http://video.fc2.com/en/content/20121103kUan1KHs',
+        'md5': 'a6ebe8ebe0396518689d963774a54eb7',
+        'info_dict': {
+            'id': '20121103kUan1KHs',
+            'ext': 'flv',
+            'title': 'Boxing again with Puff',
+        },
+    }, {
+        'url': 'http://video.fc2.com/en/content/20150125cEva0hDn/',
+        'info_dict': {
+            'id': '20150125cEva0hDn',
+            'ext': 'mp4',
+        },
+        'params': {
+            'username': 'ytdl@yt-dl.org',
+            'password': '(snip)',
+        },
+        'skip': 'requires actual password',
+    }, {
+        'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF',
+        'only_matching': True,
+    }]
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None or password is None:
+            return False
+
+        # Log in
+        login_form_strs = {
+            'email': username,
+            'password': password,
+            'done': 'video',
+            'Submit': ' Login ',
+        }
+
+        login_data = urlencode_postdata(login_form_strs)
+        request = sanitized_Request(
+            'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data)
+
+        login_results = self._download_webpage(request, None, note='Logging in', errnote='Unable to log in')
+        if 'mode=redirect&login=done' not in login_results:
+            self.report_warning('unable to log in: bad username or password')
+            return False
+
+        # a second request to the redirect URL is needed to complete the login
+        login_redir = sanitized_Request('http://id.fc2.com/?mode=redirect&login=done')
+        self._download_webpage(
+            login_redir, None, note='Login redirect', errnote='Login redirect failed')
+
+        return True
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        self._login()
+        webpage = None
+        if not url.startswith('fc2:'):
+            webpage = self._download_webpage(url, video_id)
+            # session cookies from the anonymous request must be cleared
+            # before logging in again
+            self._downloader.cookiejar.clear_session_cookies()
+            self._login()
+
+        title = 'FC2 video %s' % video_id
+        thumbnail = None
+        if webpage is not None:
+            title = self._og_search_title(webpage)
+            thumbnail = self._og_search_thumbnail(webpage)
+        refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url
+
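+        # "mimi" is an MD5 of the video ID plus a static salt, presumably
+        # mirroring the token computed by the site's own player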
+        mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
+
+        info_url = (
+            'http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&'.
+            format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E')))
+
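+        # ginfo.php replies with URL-encoded key=value pairs rather than
+        # JSON, hence parse_qs below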
+        info_webpage = self._download_webpage(
+            info_url, video_id, note='Downloading info page')
+        info = compat_urlparse.parse_qs(info_webpage)
+
+        if 'err_code' in info:
+            # most of the time we can still download the video even if err_code is 403 or 602
+            self.report_warning(
+                'Error code was: %s... but still trying' % info['err_code'][0])
+
+        if 'filepath' not in info:
+            raise ExtractorError('Cannot download file. Are you logged in?')
+
+        video_url = info['filepath'][0] + '?mid=' + info['mid'][0]
+        title_info = info.get('title')
+        if title_info:
+            title = title_info[0]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'ext': 'flv',
+            'thumbnail': thumbnail,
+        }
+
+
+class FC2EmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P<query>.+)'
+    IE_NAME = 'fc2:embed'
+
+    _TEST = {
+        'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】',
+        'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a',
+        'info_dict': {
+            'id': '201403223kCqB3Ez',
+            'ext': 'flv',
+            'title': 'プリズン・ブレイク S1-01 マイケル 【吹替】',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        query = compat_parse_qs(mobj.group('query'))
+
+        video_id = query['i'][-1]
+        title = query.get('tl', ['FC2 video %s' % video_id])[0]
+
+        sj = query.get('sj', [None])[0]
+        thumbnail = None
+        if sj:
+            # See thumbnailImagePath() in ServerConst.as of flv2.swf
+            thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % (
+                sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id)))
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': FC2IE.ie_key(),
+            'url': 'fc2:%s' % video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py
new file mode 100644 (file)
index 0000000..8db7c59
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+)
+
+
+class FczenitIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://fc-zenit.ru/video/41044/',
+        'md5': '0e3fab421b455e970fa1aa3891e57df0',
+        'info_dict': {
+            'id': '41044',
+            'ext': 'mp4',
+            'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
+            'timestamp': 1462283735,
+            'upload_date': '20160503',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        msi_id = self._search_regex(
+            r"(?s)config\s*=\s*{.+?video_id\s*:\s*'([^']+)'", webpage, 'msi id')
+
+        msi_data = self._download_json(
+            'http://player.fc-zenit.ru/msi/video', msi_id, query={
+                'video': msi_id,
+            })['data']
+        title = msi_data['name']
+
+        formats = [{
+            'format_id': q.get('label'),
+            'url': q['url'],
+            'height': int_or_none(q.get('label')),
+        } for q in msi_data['qualities'] if q.get('url')]
+
+        self._sort_formats(formats)
+
+        tags = [tag['label'] for tag in msi_data.get('tags', []) if tag.get('label')]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': msi_data.get('preview'),
+            'formats': formats,
+            'duration': float_or_none(msi_data.get('duration')),
+            'timestamp': int_or_none(msi_data.get('date')),
+            'tags': tags,
+        }
diff --git a/youtube_dl/extractor/filmon.py b/youtube_dl/extractor/filmon.py
new file mode 100644 (file)
index 0000000..f775fe0
--- /dev/null
@@ -0,0 +1,178 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_HTTPError,
+)
+from ..utils import (
+    qualities,
+    strip_or_none,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class FilmOnIE(InfoExtractor):
+    IE_NAME = 'filmon'
+    _VALID_URL = r'(?:https?://(?:www\.)?filmon\.com/vod/view/|filmon:)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.filmon.com/vod/view/24869-0-plan-9-from-outer-space',
+        'info_dict': {
+            'id': '24869',
+            'ext': 'mp4',
+            'title': 'Plan 9 From Outer Space',
+            'description': 'Dead human, zombies and vampires',
+        },
+    }, {
+        'url': 'https://www.filmon.com/vod/view/2825-1-popeye-series-1',
+        'info_dict': {
+            'id': '2825',
+            'title': 'Popeye Series 1',
+            'description': 'The original series of Popeye.',
+        },
+        'playlist_mincount': 8,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            response = self._download_json(
+                'https://www.filmon.com/api/vod/movie?id=%s' % video_id,
+                video_id)['response']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason']
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
+            raise
+
+        title = response['title']
+        description = strip_or_none(response.get('description'))
+
+        if response.get('type_id') == 1:
+            entries = [self.url_result('filmon:' + episode_id) for episode_id in response.get('episodes', [])]
+            return self.playlist_result(entries, video_id, title, description)
+
+        QUALITY = qualities(('low', 'high'))
+        formats = []
+        for format_id, stream in response.get('streams', {}).items():
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': stream_url,
+                'ext': 'mp4',
+                'quality': QUALITY(stream.get('quality')),
+                'protocol': 'm3u8_native',
+            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        poster = response.get('poster', {})
+        thumbs = poster.get('thumbs', {})
+        thumbs['poster'] = poster
+        for thumb_id, thumb in thumbs.items():
+            thumb_url = thumb.get('url')
+            if not thumb_url:
+                continue
+            thumbnails.append({
+                'id': thumb_id,
+                'url': thumb_url,
+                'width': int_or_none(thumb.get('width')),
+                'height': int_or_none(thumb.get('height')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnails': thumbnails,
+        }
+
+
+class FilmOnChannelIE(InfoExtractor):
+    IE_NAME = 'filmon:channel'
+    _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P<id>[a-z0-9-]+)'
+    _TESTS = [{
+        # VOD
+        'url': 'http://www.filmon.com/tv/sports-haters',
+        'info_dict': {
+            'id': '4190',
+            'ext': 'mp4',
+            'title': 'Sports Haters',
+            'description': 'md5:dabcb4c1d9cfc77085612f1a85f8275d',
+        },
+    }, {
+        # LIVE
+        'url': 'https://www.filmon.com/channel/filmon-sports',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.filmon.com/tv/2894',
+        'only_matching': True,
+    }]
+
+    _THUMBNAIL_RES = [
+        ('logo', 56, 28),
+        ('big_logo', 106, 106),
+        ('extra_big_logo', 300, 300),
+    ]
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        try:
+            channel_data = self._download_json(
+                'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message']
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
+            raise
+
+        channel_id = compat_str(channel_data['id'])
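+        # treat the channel as live unless it is flagged as VOD or VOX content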
+        is_live = not channel_data.get('is_vod') and not channel_data.get('is_vox')
+        title = channel_data['title']
+
+        QUALITY = qualities(('low', 'high'))
+        formats = []
+        for stream in channel_data.get('streams', []):
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            if not is_live:
+                formats.extend(self._extract_wowza_formats(
+                    stream_url, channel_id, skip_protocols=['dash', 'rtmp', 'rtsp']))
+                continue
+            quality = stream.get('quality')
+            formats.append({
+                'format_id': quality,
+                # this is an m3u8 stream, but we deliberately skip
+                # _extract_m3u8_formats because the stream has no bitrate
+                # variants anyway
+                'url': stream_url,
+                'ext': 'mp4',
+                'quality': QUALITY(quality),
+            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for name, width, height in self._THUMBNAIL_RES:
+            thumbnails.append({
+                'id': name,
+                'url': 'http://static.filmon.com/assets/channels/%s/%s.png' % (channel_id, name),
+                'width': width,
+                'height': height,
+            })
+
+        return {
+            'id': channel_id,
+            'display_id': channel_data.get('alias'),
+            'title': self._live_title(title) if is_live else title,
+            'description': channel_data.get('description'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/filmweb.py b/youtube_dl/extractor/filmweb.py
new file mode 100644 (file)
index 0000000..56000bc
--- /dev/null
@@ -0,0 +1,42 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class FilmwebIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?filmweb\.no/(?P<type>trailere|filmnytt)/article(?P<id>\d+)\.ece'
+    _TEST = {
+        'url': 'http://www.filmweb.no/trailere/article1264921.ece',
+        'md5': 'e353f47df98e557d67edaceda9dece89',
+        'info_dict': {
+            'id': '13033574',
+            'ext': 'mp4',
+            'title': 'Det som en gang var',
+            'upload_date': '20160316',
+            'timestamp': 1458140101,
+            'uploader_id': '12639966',
+            'uploader': 'Live Roaldset',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_type, article_id = re.match(self._VALID_URL, url).groups()
+        if article_type == 'filmnytt':
+            webpage = self._download_webpage(url, article_id)
+            article_id = self._search_regex(r'data-videoid="(\d+)"', webpage, 'article id')
+        embed_code = self._download_json(
+            'https://www.filmweb.no/template_v2/ajax/json_trailerEmbed.jsp',
+            article_id, query={
+                'articleId': article_id,
+            })['embedCode']
+        iframe_url = self._proto_relative_url(self._search_regex(
+            r'<iframe[^>]+src="([^"]+)', embed_code, 'iframe url'))
+
+        return {
+            '_type': 'url_transparent',
+            'id': article_id,
+            'url': iframe_url,
+            'ie_key': 'TwentyThreeVideo',
+        }
diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py
new file mode 100644 (file)
index 0000000..28617d8
--- /dev/null
@@ -0,0 +1,156 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    int_or_none,
+    qualities,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class FirstTVIE(InfoExtractor):
+    IE_NAME = '1tv'
+    IE_DESC = 'Первый канал'
+    _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)'
+
+    _TESTS = [{
+        # single format
+        'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015',
+        'md5': 'a1b6b60d530ebcf8daacf4565762bbaf',
+        'info_dict': {
+            'id': '40049',
+            'ext': 'mp4',
+            'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015',
+            'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            'upload_date': '20150212',
+            'duration': 2694,
+        },
+    }, {
+        # multiple formats
+        'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016',
+        'info_dict': {
+            'id': '364746',
+            'ext': 'mp4',
+            'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016',
+            'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            'upload_date': '20160407',
+            'duration': 179,
+            'formats': 'mincount:3',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.1tv.ru/news/issue/2016-12-01/14:00',
+        'info_dict': {
+            'id': '14:00',
+            'title': 'Выпуск новостей в 14:00   1 декабря 2016 года. Новости. Первый канал',
+            'description': 'md5:2e921b948f8c1ff93901da78ebdb1dfd',
+        },
+        'playlist_count': 13,
+    }, {
+        'url': 'http://www.1tv.ru/shows/tochvtoch-supersezon/vystupleniya/evgeniy-dyatlov-vladimir-vysockiy-koni-priveredlivye-toch-v-toch-supersezon-fragment-vypuska-ot-06-11-2016',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+        playlist_url = compat_urlparse.urljoin(url, self._search_regex(
+            r'data-playlist-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'playlist url', group='url'))
+
+        parsed_url = compat_urlparse.urlparse(playlist_url)
+        qs = compat_urlparse.parse_qs(parsed_url.query)
+        item_ids = qs.get('videos_ids[]') or qs.get('news_ids[]')
+
+        items = self._download_json(playlist_url, display_id)
+
+        if item_ids:
+            items = [
+                item for item in items
+                if item.get('uid') and compat_str(item['uid']) in item_ids]
+        else:
+            items = [items[0]]
+
+        entries = []
+        QUALITIES = ('ld', 'sd', 'hd', )
+
+        for item in items:
+            title = item['title']
+            quality = qualities(QUALITIES)
+            formats = []
+            path = None
+            for f in item.get('mbr', []):
+                src = url_or_none(f.get('src'))
+                if not src:
+                    continue
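+                # mbr src URLs end in _<tbr>.mp4 (e.g. .../path_512.mp4); the
+                # path part is reused below to reconstruct the HLS master URL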
+                tbr = int_or_none(self._search_regex(
+                    r'_(\d{3,})\.mp4', src, 'tbr', default=None))
+                if not path:
+                    path = self._search_regex(
+                        r'//[^/]+/(.+?)_\d+\.mp4', src,
+                        'm3u8 path', default=None)
+                formats.append({
+                    'url': src,
+                    'format_id': f.get('name'),
+                    'tbr': tbr,
+                    'source_preference': quality(f.get('name')),
+                    # quality metadata of http formats may be incorrect
+                    'preference': -1,
+                })
+            # m3u8 URL format is reverse engineered from [1] (search for
+            # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru)
+            # is taken from [2].
+            # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted
+            # 2. http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834
+            if not path and len(formats) == 1:
+                path = self._search_regex(
+                    r'//[^/]+/(.+?$)', formats[0]['url'],
+                    'm3u8 path', default=None)
+            if path:
+                if len(formats) == 1:
+                    m3u8_path = ','
+                else:
+                    tbrs = [compat_str(t) for t in sorted(f['tbr'] for f in formats)]
+                    m3u8_path = '_,%s,%s' % (','.join(tbrs), '.mp4')
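+                # e.g. with tbrs 512 and 768 the resulting master URL is
+                # http://balancer-vod.1tv.ru/<path>_,512,768,.mp4.urlset/master.m3u8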
+                formats.extend(self._extract_m3u8_formats(
+                    'http://balancer-vod.1tv.ru/%s%s.urlset/master.m3u8'
+                    % (path, m3u8_path),
+                    display_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            self._sort_formats(formats)
+
+            thumbnail = item.get('poster') or self._og_search_thumbnail(webpage)
+            duration = int_or_none(item.get('duration') or self._html_search_meta(
+                'video:duration', webpage, 'video duration', fatal=False))
+            upload_date = unified_strdate(self._html_search_meta(
+                'ya:ovs:upload_date', webpage, 'upload date', default=None))
+
+            entries.append({
+                'id': compat_str(item.get('id') or item['uid']),
+                'thumbnail': thumbnail,
+                'title': title,
+                'upload_date': upload_date,
+                'duration': int_or_none(duration),
+                'formats': formats
+            })
+
+        title = self._html_search_regex(
+            (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
+             r"'title'\s*:\s*'([^']+)'"),
+            webpage, 'title', default=None) or self._og_search_title(
+            webpage, default=None)
+        description = self._html_search_regex(
+            r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
+            webpage, 'description', default=None) or self._html_search_meta(
+            'description', webpage, 'description', default=None)
+
+        return self.playlist_result(entries, display_id, title, description)
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
new file mode 100644 (file)
index 0000000..f3f876e
--- /dev/null
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FiveMinIE(InfoExtractor):
+    IE_NAME = '5min'
+    _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)'
+
+    _TESTS = [
+        {
+            # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/
+            'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791',
+            'md5': '4f7b0b79bf1a470e5004f7112385941d',
+            'info_dict': {
+                'id': '518013791',
+                'ext': 'mp4',
+                'title': 'iPad Mini with Retina Display Review',
+                'description': 'iPad mini with Retina Display review',
+                'duration': 177,
+                'uploader': 'engadget',
+                'upload_date': '20131115',
+                'timestamp': 1384515288,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
+        },
+        {
+            # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247
+            'url': '5min:518086247',
+            'md5': 'e539a9dd682c288ef5a498898009f69e',
+            'info_dict': {
+                'id': '518086247',
+                'ext': 'mp4',
+                'title': 'How to Make a Next-Level Fruit Salad',
+                'duration': 184,
+            },
+            'skip': 'no longer available',
+        },
+        {
+            'url': 'http://embed.5min.com/518726732/',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://delivery.vidible.tv/aol?playList=518013791',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result('aol-video:%s' % video_id)
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
new file mode 100644 (file)
index 0000000..c4c0f1b
--- /dev/null
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class FiveTVIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?5-tv\.ru/
+                        (?:
+                            (?:[^/]+/)+(?P<id>\d+)|
+                            (?P<path>[^/?#]+)(?:[/?#])?
+                        )
+                    '''
+
+    _TESTS = [{
+        'url': 'http://5-tv.ru/news/96814/',
+        'md5': 'bbff554ad415ecf5416a2f48c22d9283',
+        'info_dict': {
+            'id': '96814',
+            'ext': 'mp4',
+            'title': 'Россияне выбрали имя для общенациональной платежной системы',
+            'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        'url': 'http://5-tv.ru/video/1021729/',
+        'info_dict': {
+            'id': '1021729',
+            'ext': 'mp4',
+            'title': '3D принтер',
+            'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 180,
+        },
+    }, {
+        # redirect to https://www.5-tv.ru/projects/1000095/izvestia-glavnoe/
+        'url': 'http://www.5-tv.ru/glavnoe/#itemDetails',
+        'info_dict': {
+            'id': 'glavnoe',
+            'ext': 'mp4',
+            'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'skip': 'redirect to «Известия. Главное» project page',
+    }, {
+        'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/films/1507502/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/programs/broadcast/508713/',
+        'only_matching': True,
+    }, {
+        'url': 'http://5-tv.ru/angel/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            [r'<div[^>]+?class="(?:flow)?player[^>]+?data-href="([^"]+)"',
+             r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
+            webpage, 'video url')
+
+        title = self._og_search_title(webpage, default=None) or self._search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, 'duration', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
new file mode 100644 (file)
index 0000000..9f166ef
--- /dev/null
@@ -0,0 +1,116 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+)
+
+
+class FlickrIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/',
+        'md5': '164fe3fa6c22e18d448d4d5af2330f31',
+        'info_dict': {
+            'id': '5645318632',
+            'ext': 'mpg',
+            'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.',
+            'title': 'Dark Hollow Waterfalls',
+            'duration': 19,
+            'timestamp': 1303528740,
+            'upload_date': '20110423',
+            'uploader_id': '10922353@N03',
+            'uploader': 'Forest Wander',
+            'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/',
+            'comment_count': int,
+            'view_count': int,
+            'tags': list,
+            'license': 'Attribution-ShareAlike',
+        }
+    }
+    _API_BASE_URL = 'https://api.flickr.com/services/rest?'
+    # https://help.yahoo.com/kb/flickr/SLN25525.html
+    _LICENSES = {
+        '0': 'All Rights Reserved',
+        '1': 'Attribution-NonCommercial-ShareAlike',
+        '2': 'Attribution-NonCommercial',
+        '3': 'Attribution-NonCommercial-NoDerivs',
+        '4': 'Attribution',
+        '5': 'Attribution-ShareAlike',
+        '6': 'Attribution-NoDerivs',
+        '7': 'No known copyright restrictions',
+        '8': 'United States government work',
+        '9': 'Public Domain Dedication (CC0)',
+        '10': 'Public Domain Work',
+    }
+
+    def _call_api(self, method, video_id, api_key, note, secret=None):
+        query = {
+            'photo_id': video_id,
+            'method': 'flickr.%s' % method,
+            'api_key': api_key,
+            'format': 'json',
+            'nojsoncallback': 1,
+        }
+        if secret:
+            query['secret'] = secret
+        data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note)
+        if data['stat'] != 'ok':
+            raise ExtractorError(data['message'])
+        return data
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        api_key = self._download_json(
+            'https://www.flickr.com/hermes_error_beacon.gne', video_id,
+            'Downloading api key')['site_key']
+
+        video_info = self._call_api(
+            'photos.getInfo', video_id, api_key, 'Downloading video info')['photo']
+        if video_info['media'] == 'video':
+            streams = self._call_api(
+                'video.getStreamInfo', video_id, api_key,
+                'Downloading streams info', video_info['secret'])['streams']
+
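+            # stream type labels as reported by the API, ranked worst to best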
+            preference = qualities(
+                ['288p', 'iphone_wifi', '100', '300', '700', '360p', 'appletv', '720p', '1080p', 'orig'])
+
+            formats = []
+            for stream in streams['stream']:
+                stream_type = compat_str(stream.get('type'))
+                formats.append({
+                    'format_id': stream_type,
+                    'url': stream['_content'],
+                    'preference': preference(stream_type),
+                })
+            self._sort_formats(formats)
+
+            owner = video_info.get('owner', {})
+            uploader_id = owner.get('nsid')
+            uploader_path = owner.get('path_alias') or uploader_id
+            uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
+
+            return {
+                'id': video_id,
+                'title': video_info['title']['_content'],
+                'description': video_info.get('description', {}).get('_content'),
+                'formats': formats,
+                'timestamp': int_or_none(video_info.get('dateuploaded')),
+                'duration': int_or_none(video_info.get('video', {}).get('duration')),
+                'uploader_id': uploader_id,
+                'uploader': owner.get('realname'),
+                'uploader_url': uploader_url,
+                'comment_count': int_or_none(video_info.get('comments', {}).get('_content')),
+                'view_count': int_or_none(video_info.get('views')),
+                'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])],
+                'license': self._LICENSES.get(video_info.get('license')),
+            }
+        else:
+            raise ExtractorError('not a video', expected=True)
diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py
new file mode 100644 (file)
index 0000000..b3df93f
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    xpath_text,
+)
+
+
+class FolketingetIE(InfoExtractor):
+    IE_DESC = 'Folketinget (ft.dk; Danish parliament)'
+    _VALID_URL = r'https?://(?:www\.)?ft\.dk/webtv/video/[^?#]*?\.(?P<id>[0-9]+)\.aspx'
+    _TEST = {
+        'url': 'http://www.ft.dk/webtv/video/20141/eru/td.1165642.aspx?as=1#player',
+        'md5': '6269e8626fa1a891bf5369b386ae996a',
+        'info_dict': {
+            'id': '1165642',
+            'ext': 'mp4',
+            'title': 'Åbent samråd i Erhvervsudvalget',
+            'description': 'Åbent samråd med erhvervs- og vækstministeren om regeringens politik på teleområdet',
+            'view_count': int,
+            'width': 768,
+            'height': 432,
+            'tbr': 928000,
+            'timestamp': 1416493800,
+            'upload_date': '20141120',
+            'duration': 3960,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+        description = self._html_search_regex(
+            r'(?s)<div class="video-item-agenda"[^>]*>(.*?)<',
+            webpage, 'description', fatal=False)
+
+        player_params = compat_parse_qs(self._search_regex(
+            r'<embed src="http://ft\.arkena\.tv/flash/ftplayer\.swf\?([^"]+)"',
+            webpage, 'player params'))
+        xml_url = player_params['xml'][0]
+        doc = self._download_xml(xml_url, video_id)
+
+        timestamp = parse_iso8601(xpath_text(doc, './/date'))
+        duration = parse_duration(xpath_text(doc, './/duration'))
+        width = int_or_none(xpath_text(doc, './/width'))
+        height = int_or_none(xpath_text(doc, './/height'))
+        view_count = int_or_none(xpath_text(doc, './/views'))
+
+        formats = [{
+            'format_id': n.attrib['bitrate'],
+            'url': xpath_text(n, './url', fatal=True),
+            'tbr': int_or_none(n.attrib['bitrate']),
+        } for n in doc.findall('.//streams/stream')]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'timestamp': timestamp,
+            'width': width,
+            'height': height,
+            'duration': duration,
+            'view_count': view_count,
+        }
diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py
new file mode 100644 (file)
index 0000000..118325b
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .streamable import StreamableIE
+
+
+class FootyRoomIE(InfoExtractor):
+    _VALID_URL = r'https?://footyroom\.com/matches/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://footyroom.com/matches/79922154/hull-city-vs-chelsea/review',
+        'info_dict': {
+            'id': '79922154',
+            'title': 'VIDEO Hull City 0 - 2 Chelsea',
+        },
+        'playlist_count': 2,
+        'add_ie': [StreamableIE.ie_key()],
+    }, {
+        'url': 'http://footyroom.com/matches/75817984/georgia-vs-germany/review',
+        'info_dict': {
+            'id': '75817984',
+            'title': 'VIDEO Georgia 0 - 2 Germany',
+        },
+        'playlist_count': 1,
+        'add_ie': ['Playwire']
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        playlist = self._parse_json(self._search_regex(
+            r'DataStore\.media\s*=\s*([^;]+)', webpage, 'media data'),
+            playlist_id)
+
+        playlist_title = self._og_search_title(webpage)
+
+        entries = []
+        for video in playlist:
+            payload = video.get('payload')
+            if not payload:
+                continue
+            playwire_url = self._html_search_regex(
+                r'data-config="([^"]+)"', payload,
+                'playwire url', default=None)
+            if playwire_url:
+                entries.append(self.url_result(self._proto_relative_url(
+                    playwire_url, 'http:'), 'Playwire'))
+
+            streamable_url = StreamableIE._extract_url(payload)
+            if streamable_url:
+                entries.append(self.url_result(
+                    streamable_url, StreamableIE.ie_key()))
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py
new file mode 100644 (file)
index 0000000..fecfc28
--- /dev/null
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class Formula1IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
+    _TESTS = [{
+        'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
+        'md5': '8c79e54be72078b26b89e0e111c0502b',
+        'info_dict': {
+            'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+            'ext': 'mp4',
+            'title': 'Race highlights - Spain 2016',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        ooyala_embed_code = self._search_regex(
+            r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+        return self.url_result(
+            'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
new file mode 100644 (file)
index 0000000..be4e813
--- /dev/null
@@ -0,0 +1,309 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_str,
+    compat_urllib_parse_unquote,
+    compat_urlparse,
+)
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    str_or_none,
+    str_to_int,
+    try_get,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class FourTubeBaseIE(InfoExtractor):
+    def _extract_formats(self, url, video_id, media_id, sources):
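+        # the token host expects the media id plus a '+'-joined list of the
+        # requested qualities; an empty POST returns a tokenized URL per quality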
+        token_url = 'https://%s/%s/desktop/%s' % (
+            self._TKN_HOST, media_id, '+'.join(sources))
+
+        parsed_url = compat_urlparse.urlparse(url)
+        tokens = self._download_json(token_url, video_id, data=b'', headers={
+            'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname),
+            'Referer': url,
+        })
+        formats = [{
+            'url': tokens[format]['token'],
+            'format_id': format + 'p',
+            'resolution': format + 'p',
+            'quality': int(format),
+        } for format in sources]
+        self._sort_formats(formats)
+        return formats
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')
+
+        if kind == 'm' or not display_id:
+            url = self._URL_TEMPLATE % video_id
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta('name', webpage)
+        timestamp = parse_iso8601(self._html_search_meta(
+            'uploadDate', webpage))
+        thumbnail = self._html_search_meta('thumbnailUrl', webpage)
+        uploader_id = self._html_search_regex(
+            r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">',
+            webpage, 'uploader id', fatal=False)
+        uploader = self._html_search_regex(
+            r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">',
+            webpage, 'uploader', fatal=False)
+
+        categories_html = self._search_regex(
+            r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>',
+            webpage, 'categories', fatal=False)
+        categories = None
+        if categories_html:
+            categories = [
+                c.strip() for c in re.findall(
+                    r'(?s)<li><a.*?>(.*?)</a>', categories_html)]
+
+        view_count = str_to_int(self._search_regex(
+            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">',
+            webpage, 'view count', default=None))
+        like_count = str_to_int(self._search_regex(
+            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">',
+            webpage, 'like count', default=None))
+        duration = parse_duration(self._html_search_meta('duration', webpage))
+
+        media_id = self._search_regex(
+            r'<button[^>]+data-id=(["\'])(?P<id>\d+)\1[^>]+data-quality=', webpage,
+            'media id', default=None, group='id')
+        sources = [
+            quality
+            for _, quality in re.findall(r'<button[^>]+data-quality=(["\'])(.+?)\1', webpage)]
+        if not (media_id and sources):
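+            # fall back to the player JS, whose initialization call carries
+            # the media id and the available qualities as plain arguments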
+            player_js = self._download_webpage(
+                self._search_regex(
+                    r'<script[^>]id=(["\'])playerembed\1[^>]+src=(["\'])(?P<url>.+?)\2',
+                    webpage, 'player JS', group='url'),
+                video_id, 'Downloading player JS')
+            params_js = self._search_regex(
+                r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)',
+                player_js, 'initialization parameters')
+            params = self._parse_json('[%s]' % params_js, video_id)
+            media_id = params[0]
+            sources = ['%s' % p for p in params[2]]
+
+        formats = self._extract_formats(url, video_id, media_id, sources)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'categories': categories,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'timestamp': timestamp,
+            'like_count': like_count,
+            'view_count': view_count,
+            'duration': duration,
+            'age_limit': 18,
+        }
+
+
+class FourTubeIE(FourTubeBaseIE):
+    IE_NAME = '4tube'
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video'
+    _TKN_HOST = 'token.4tube.com'
+    _TESTS = [{
+        'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+        'md5': '6516c8ac63b03de06bc8eac14362db4f',
+        'info_dict': {
+            'id': '209733',
+            'ext': 'mp4',
+            'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
+            'uploader': 'WCP Club',
+            'uploader_id': 'wcp-club',
+            'upload_date': '20131031',
+            'timestamp': 1383263892,
+            'duration': 583,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'http://www.4tube.com/embed/209733',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+        'only_matching': True,
+    }]
+
+
+class FuxIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _URL_TEMPLATE = 'https://www.fux.com/video/%s/video'
+    _TKN_HOST = 'token.fux.com'
+    _TESTS = [{
+        'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+        'info_dict': {
+            'id': '195359',
+            'ext': 'mp4',
+            'title': 'Awesome fucking in the kitchen ends with cum swallow',
+            'uploader': 'alenci2342',
+            'uploader_id': 'alenci2342',
+            'upload_date': '20131230',
+            'timestamp': 1388361660,
+            'duration': 289,
+            'view_count': int,
+            'like_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.fux.com/embed/195359',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+        'only_matching': True,
+    }]
+
+
+class PornTubeIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+    _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s'
+    _TKN_HOST = 'tkn.porntube.com'
+    _TESTS = [{
+        'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759',
+        'info_dict': {
+            'id': '7089759',
+            'ext': 'mp4',
+            'title': 'Teen couple doing anal',
+            'uploader': 'Alexy',
+            'uploader_id': '91488',
+            'upload_date': '20150606',
+            'timestamp': 1433595647,
+            'duration': 5052,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.porntube.com/videos/squirting-teen-ballerina-ecg_1331406',
+        'info_dict': {
+            'id': '1331406',
+            'ext': 'mp4',
+            'title': 'Squirting Teen Ballerina on ECG',
+            'uploader': 'Exploited College Girls',
+            'uploader_id': '665',
+            'channel': 'Exploited College Girls',
+            'channel_id': '665',
+            'upload_date': '20130920',
+            'timestamp': 1379685485,
+            'duration': 851,
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.porntube.com/embed/7089759',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
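+        # INITIALSTATE is a base64-encoded, percent-escaped JSON blob;
+        # decode both layers and drill down to the page.video object.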
+        video = self._parse_json(
+            self._search_regex(
+                r'INITIALSTATE\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+                webpage, 'data', group='value'), video_id,
+            transform_source=lambda x: compat_urllib_parse_unquote(
+                compat_b64decode(x).decode('utf-8')))['page']['video']
+
+        title = video['title']
+        media_id = video['mediaId']
+        sources = [compat_str(e['height'])
+                   for e in video['encodings'] if e.get('height')]
+        formats = self._extract_formats(url, video_id, media_id, sources)
+
+        thumbnail = url_or_none(video.get('masterThumb'))
+        uploader = try_get(video, lambda x: x['user']['username'], compat_str)
+        uploader_id = str_or_none(try_get(
+            video, lambda x: x['user']['id'], int))
+        channel = try_get(video, lambda x: x['channel']['name'], compat_str)
+        channel_id = str_or_none(try_get(
+            video, lambda x: x['channel']['id'], int))
+        like_count = int_or_none(video.get('likes'))
+        dislike_count = int_or_none(video.get('dislikes'))
+        view_count = int_or_none(video.get('playsQty'))
+        duration = int_or_none(video.get('durationInSeconds'))
+        timestamp = unified_timestamp(video.get('publishedAt'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'uploader': uploader or channel,
+            'uploader_id': uploader_id or channel_id,
+            'channel': channel,
+            'channel_id': channel_id,
+            'timestamp': timestamp,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'view_count': view_count,
+            'duration': duration,
+            'age_limit': 18,
+        }
+
+
+class PornerBrosIE(FourTubeBaseIE):
+    _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+    _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s'
+    _TKN_HOST = 'token.pornerbros.com'
+    _TESTS = [{
+        'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+        'md5': '6516c8ac63b03de06bc8eac14362db4f',
+        'info_dict': {
+            'id': '181369',
+            'ext': 'mp4',
+            'title': 'Skinny brunette takes big cock down her anal hole',
+            'uploader': 'PornerBros HD',
+            'uploader_id': 'pornerbros-hd',
+            'upload_date': '20130130',
+            'timestamp': 1359527401,
+            'duration': 1224,
+            'view_count': int,
+            'categories': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.pornerbros.com/embed/181369',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py
new file mode 100644 (file)
index 0000000..04f4bdb
--- /dev/null
@@ -0,0 +1,150 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import uuid
+
+from .adobepass import AdobePassIE
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    parse_duration,
+    try_get,
+    unified_timestamp,
+)
+
+
+class FOXIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+    _TESTS = [{
+        # clip
+        'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
+        'md5': 'ebd296fcc41dd4b19f8115d8461a3165',
+        'info_dict': {
+            'id': '4b765a60490325103ea69888fb2bd4e8',
+            'ext': 'mp4',
+            'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight',
+            'description': 'md5:549cd9c70d413adb32ce2a779b53b486',
+            'duration': 102,
+            'timestamp': 1504291893,
+            'upload_date': '20170901',
+            'creator': 'FOX',
+            'series': 'Gotham',
+            'age_limit': 14,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # episode, geo-restricted
+        'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
+        'only_matching': True,
+    }, {
+        # episode, geo-restricted, tv provider required
+        'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
+    _HOME_PAGE_URL = 'https://www.fox.com/'
+    _API_KEY = 'abdcbed02c124d393b39e818a4312055'
+    _access_token = None
+
+    def _call_api(self, path, video_id, data=None):
+        headers = {
+            'X-Api-Key': self._API_KEY,
+        }
+        if self._access_token:
+            headers['Authorization'] = 'Bearer ' + self._access_token
+        try:
+            return self._download_json(
+                'https://api2.fox.com/v2.0/' + path,
+                video_id, data=data, headers=headers)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                entitlement_issues = self._parse_json(
+                    e.cause.read().decode(), video_id)['entitlementIssues']
+                for issue in entitlement_issues:
+                    if issue.get('errorCode') == 1005:
+                        raise ExtractorError(
+                            'This video is only available via cable service provider '
+                            'subscription. You may want to use --cookies.', expected=True)
+                messages = ', '.join(issue['message'] for issue in entitlement_issues)
+                raise ExtractorError(messages, expected=True)
+            raise
+
+    def _real_initialize(self):
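+        # Reuse the access token from an mvpd-auth cookie if the user has
+        # already signed in on fox.com; otherwise request a fresh token for
+        # a random device id.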
+        if not self._access_token:
+            mvpd_auth = self._get_cookies(self._HOME_PAGE_URL).get('mvpd-auth')
+            if mvpd_auth:
+                self._access_token = (self._parse_json(compat_urllib_parse_unquote(
+                    mvpd_auth.value), None, fatal=False) or {}).get('accessToken')
+            if not self._access_token:
+                self._access_token = self._call_api(
+                    'login', None, json.dumps({
+                        'deviceId': compat_str(uuid.uuid4()),
+                    }).encode())['accessToken']
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._call_api('vodplayer/' + video_id, video_id)
+
+        title = video['name']
+        release_url = video['url']
+        try:
+            m3u8_url = self._download_json(release_url, video_id)['playURL']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                error = self._parse_json(e.cause.read().decode(), video_id)
+                if error.get('exception') == 'GeoLocationBlocked':
+                    self.raise_geo_restricted(countries=['US'])
+                raise ExtractorError(error['description'], expected=True)
+            raise
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        data = try_get(
+            video, lambda x: x['trackingData']['properties'], dict) or {}
+
+        duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
+            video.get('duration')) or parse_duration(video.get('duration'))
+        timestamp = unified_timestamp(video.get('datePublished'))
+        creator = data.get('brand') or data.get('network') or video.get('network')
+        series = video.get('seriesName') or data.get(
+            'seriesName') or data.get('show')
+
+        subtitles = {}
+        for doc_rel in video.get('documentReleases', []):
+            rel_url = doc_rel.get('url')
+            if not rel_url or doc_rel.get('format') != 'SCC':
+                continue
+            subtitles['en'] = [{
+                'url': rel_url,
+                'ext': 'scc',
+            }]
+            break
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': video.get('description'),
+            'duration': duration,
+            'timestamp': timestamp,
+            'age_limit': parse_age_limit(video.get('contentRating')),
+            'creator': creator,
+            'series': series,
+            'season_number': int_or_none(video.get('seasonNumber')),
+            'episode': video.get('name'),
+            'episode_number': int_or_none(video.get('episodeNumber')),
+            'release_year': int_or_none(video.get('releaseYear')),
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/fox9.py b/youtube_dl/extractor/fox9.py
new file mode 100644 (file)
index 0000000..91f8f7b
--- /dev/null
@@ -0,0 +1,41 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FOX9IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fox9\.com/video/(?P<id>\d+)'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            'anvato:anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b:' + video_id,
+            'Anvato', video_id)
+
+
+class FOX9NewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fox9\.com/news/(?P<id>[^/?&#]+)'
+    _TEST = {
+        'url': 'https://www.fox9.com/news/black-bear-in-tree-draws-crowd-in-downtown-duluth-minnesota',
+        'md5': 'd6e1b2572c3bab8a849c9103615dd243',
+        'info_dict': {
+            'id': '314473',
+            'ext': 'mp4',
+            'title': 'Bear climbs tree in downtown Duluth',
+            'description': 'md5:6a36bfb5073a411758a752455408ac90',
+            'duration': 51,
+            'timestamp': 1478123580,
+            'upload_date': '20161102',
+            'uploader': 'EPFOX',
+            'categories': ['News', 'Sports'],
+            'tags': ['news', 'video'],
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        anvato_id = self._search_regex(
+            r'anvatoId\s*:\s*[\'"](\d+)', webpage, 'anvato id')
+        return self.url_result('https://www.fox9.com/video/' + anvato_id, 'FOX9')
diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py
new file mode 100644 (file)
index 0000000..512a106
--- /dev/null
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_id,
+    int_or_none,
+    remove_end,
+)
+
+
+class FoxgayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml'
+    _TEST = {
+        'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml',
+        'md5': '344558ccfea74d33b7adbce22e577f54',
+        'info_dict': {
+            'id': '2582',
+            'ext': 'mp4',
+            'title': 'Fuck Turkish-style',
+            'description': 'md5:6ae2d9486921891efe89231ace13ffdf',
+            'age_limit': 18,
+            'thumbnail': r're:https?://.*\.jpg$',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = remove_end(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'), ' - Foxgay.com')
+        description = get_element_by_id('inf_tit', webpage)
+
+        # The default user-agent with foxgay cookies leads to pages without videos
+        self._downloader.cookiejar.clear('.foxgay.com')
+        # Find the URL for the iFrame which contains the actual video.
+        iframe_url = self._html_search_regex(
+            r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage,
+            'video frame', group='url')
+        iframe = self._download_webpage(
+            iframe_url, video_id, headers={'User-Agent': 'curl/7.50.1'},
+            note='Downloading video frame')
+        video_data = self._parse_json(self._search_regex(
+            r'video_data\s*=\s*([^;]+);', iframe, 'video data'), video_id)
+
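+        # sources and resolutions are parallel lists; when the resolutions
+        # list is absent, repeat(None) stands in so every source still
+        # yields a format, just without a known height.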
+        formats = [{
+            'url': source,
+            'height': int_or_none(resolution),
+        } for source, resolution in zip(
+            video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': video_data.get('act_vid', {}).get('thumb'),
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
new file mode 100644 (file)
index 0000000..63613cb
--- /dev/null
@@ -0,0 +1,127 @@
+from __future__ import unicode_literals
+
+import re
+
+from .amp import AMPIE
+from .common import InfoExtractor
+
+
+class FoxNewsIE(AMPIE):
+    IE_NAME = 'foxnews'
+    IE_DESC = 'Fox News and Fox Business Video'
+    _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips',
+            'md5': '32aaded6ba3ef0d1c04e238d01031e5e',
+            'info_dict': {
+                'id': '3937480',
+                'ext': 'flv',
+                'title': 'Frozen in Time',
+                'description': '16-year-old girl is size of toddler',
+                'duration': 265,
+                'timestamp': 1304411491,
+                'upload_date': '20110503',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://video.foxnews.com/v/3922535568001/rep-luis-gutierrez-on-if-obamas-immigration-plan-is-legal/#sp=show-clips',
+            'md5': '5846c64a1ea05ec78175421b8323e2df',
+            'info_dict': {
+                'id': '3922535568001',
+                'ext': 'mp4',
+                'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal",
+                'description': "Congressman discusses president's plan",
+                'duration': 292,
+                'timestamp': 1417662047,
+                'upload_date': '20141204',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://video.foxbusiness.com/v/4442309889001',
+            'only_matching': True,
+        },
+        {
+            # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words
+            'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true',
+            'only_matching': True,
+        },
+    ]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        host, video_id = re.match(self._VALID_URL, url).groups()
+
+        info = self._extract_feed_info(
+            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id))
+        info['id'] = video_id
+        return info
+
+
+class FoxNewsArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)'
+    IE_NAME = 'foxnews:article'
+
+    _TESTS = [{
+        # data-video-id
+        'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html',
+        'md5': '83d44e1aff1433e7a29a7b537d1700b5',
+        'info_dict': {
+            'id': '5116295019001',
+            'ext': 'mp4',
+            'title': 'Trump and Clinton asked to defend positions on Iraq War',
+            'description': 'Veterans react on \'The Kelly File\'',
+            'timestamp': 1473301045,
+            'upload_date': '20160908',
+        },
+    }, {
+        # iframe embed
+        'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true',
+        'info_dict': {
+            'id': '5748266721001',
+            'ext': 'flv',
+            'title': 'Kyle Kashuv has a positive message for the Trump White House',
+            'description': 'Marjory Stoneman Douglas student disagrees with classmates.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 229,
+            'timestamp': 1520594670,
+            'upload_date': '20180309',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
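+        # Prefer the article's own data-video-id; otherwise fall back to
+        # the first embedded video.foxnews.com iframe.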
+        video_id = self._html_search_regex(
+            r'data-video-id=([\'"])(?P<id>[^\'"]+)\1',
+            webpage, 'video ID', group='id', default=None)
+        if video_id:
+            return self.url_result(
+                'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key())
+
+        return self.url_result(
+            FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key())
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
new file mode 100644 (file)
index 0000000..2b2cb6c
--- /dev/null
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class FoxSportsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.foxsports.com/tennessee/video/432609859715',
+        'md5': 'b49050e955bebe32c301972e4012ac17',
+        'info_dict': {
+            'id': '432609859715',
+            'ext': 'mp4',
+            'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
+            'description': 'Courtney Lee talks about Memphis being focused.',
+            # TODO: fix timestamp
+            'upload_date': '19700101',  # '20150423',
+            # 'timestamp': 1429761109,
+            'uploader': 'NEWA-FNG-FOXSPORTS',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        return self.url_result(
+            'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed')
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
new file mode 100644 (file)
index 0000000..306b45f
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    int_or_none,
+)
+
+
+class FranceCultureIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
+        'info_dict': {
+            'id': 'rendez-vous-au-pays-des-geeks',
+            'display_id': 'rendez-vous-au-pays-des-geeks',
+            'ext': 'mp3',
+            'title': 'Rendez-vous au pays des geeks',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20140301',
+            'timestamp': 1393642916,
+            'vcodec': 'none',
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
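+        # The page carries the media as data-* attributes on the play
+        # button (data-asset-source, data-asset-title,
+        # data-asset-created-date, data-duration), so grab the whole tag
+        # and parse its attributes.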
+        video_data = extract_attributes(self._search_regex(
+            r'''(?sx)
+                (?:
+                    </h1>|
+                    <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
+                ).*?
+                (<button[^>]+data-asset-source="[^"]+"[^>]+>)
+            ''',
+            webpage, 'video data'))
+
+        video_url = video_data['data-asset-source']
+        title = video_data.get('data-asset-title') or self._og_search_title(webpage)
+
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
+            webpage, 'description', default=None)
+        thumbnail = self._search_regex(
+            r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+        uploader = self._html_search_regex(
+            r'(?s)<span class="author">(.*?)</span>',
+            webpage, 'uploader', default=None)
+        ext = determine_ext(video_url.lower())
+
+        return {
+            'id': display_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'ext': ext,
+            'vcodec': 'none' if ext == 'mp3' else None,
+            'uploader': uploader,
+            'timestamp': int_or_none(video_data.get('data-asset-created-date')),
+            'duration': int_or_none(video_data.get('data-duration')),
+        }
diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py
new file mode 100644 (file)
index 0000000..0580689
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import month_by_name
+
+
+class FranceInterIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
+
+    _TEST = {
+        'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
+        'md5': '9e54d7bdb6fdc02a841007f8a975c094',
+        'info_dict': {
+            'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
+            'ext': 'mp3',
+            'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
+            'description': 'md5:401969c5d318c061f86bda1fa359292b',
+            'upload_date': '20160907',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'video url', group='url')
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+
+        upload_date_str = self._search_regex(
+            r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
+            webpage, 'upload date', fatal=False)
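+        # The date is shown in French, e.g. "07 septembre 2016"; reverse it
+        # to [year, month, day], map the month name to its number and
+        # zero-pad both fields to build a YYYYMMDD upload_date.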
+        if upload_date_str:
+            upload_date_list = upload_date_str.split()
+            upload_date_list.reverse()
+            upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0)
+            upload_date_list[2] = '%02d' % int(upload_date_list[2])
+            upload_date = ''.join(upload_date_list)
+        else:
+            upload_date = None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'formats': [{
+                'url': video_url,
+                'vcodec': 'none',
+            }],
+        }
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
new file mode 100644 (file)
index 0000000..e340cdd
--- /dev/null
@@ -0,0 +1,518 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    clean_html,
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    try_get,
+    url_or_none,
+)
+from .dailymotion import DailymotionIE
+
+
+class FranceTVBaseInfoExtractor(InfoExtractor):
+    def _make_url_result(self, video_or_full_id, catalog=None):
+        full_id = 'francetv:%s' % video_or_full_id
+        if '@' not in video_or_full_id and catalog:
+            full_id += '@%s' % catalog
+        return self.url_result(
+            full_id, ie=FranceTVIE.ie_key(),
+            video_id=video_or_full_id.split('@')[0])
+
+
+class FranceTVIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        https?://
+                            sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\?
+                            .*?\bidDiffusion=[^&]+|
+                        (?:
+                            https?://videos\.francetv\.fr/video/|
+                            francetv:
+                        )
+                        (?P<id>[^@]+)(?:@(?P<catalog>.+))?
+                    )
+                    '''
+
+    _TESTS = [{
+        # without catalog
+        'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0',
+        'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f',
+        'info_dict': {
+            'id': '162311093',
+            'ext': 'mp4',
+            'title': '13h15, le dimanche... - Les mystères de Jésus',
+            'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
+            'timestamp': 1502623500,
+            'upload_date': '20170813',
+        },
+    }, {
+        # with catalog
+        'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4',
+        'only_matching': True,
+    }, {
+        'url': 'http://videos.francetv.fr/video/NI_657393@Regions',
+        'only_matching': True,
+    }, {
+        'url': 'francetv:162311093',
+        'only_matching': True,
+    }, {
+        'url': 'francetv:NI_1004933@Zouzous',
+        'only_matching': True,
+    }, {
+        'url': 'francetv:NI_983319@Info-web',
+        'only_matching': True,
+    }, {
+        'url': 'francetv:NI_983319',
+        'only_matching': True,
+    }, {
+        'url': 'francetv:NI_657393@Regions',
+        'only_matching': True,
+    }, {
+        # france-3 live
+        'url': 'francetv:SIM_France3',
+        'only_matching': True,
+    }]
+
+    def _extract_video(self, video_id, catalogue=None):
+        # Videos are identified by idDiffusion, so the catalogue part is
+        # optional. However, when provided, extra formats may be returned,
+        # so we pass it along when available.
+        info = self._download_json(
+            'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
+            video_id, 'Downloading video JSON', query={
+                'idDiffusion': video_id,
+                'catalogue': catalogue or '',
+            })
+
+        if info.get('status') == 'NOK':
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, info['message']),
+                expected=True)
+        allowed_countries = info['videos'][0].get('geoblocage')
+        if allowed_countries:
+            georestricted = True
+            geo_info = self._download_json(
+                'http://geo.francetv.fr/ws/edgescape.json', video_id,
+                'Downloading geo restriction info')
+            country = geo_info['reponse']['geo_info']['country_code']
+            if country not in allowed_countries:
+                raise ExtractorError(
+                    'The video is not available from your location',
+                    expected=True)
+        else:
+            georestricted = False
+
+        def sign(manifest_url, manifest_id):
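+            # Try each token server in turn for a signed manifest URL and
+            # fall back to the unsigned one if both fail.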
+            for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
+                signed_url = url_or_none(self._download_webpage(
+                    'https://%s/esi/TA' % host, video_id,
+                    'Downloading signed %s manifest URL' % manifest_id,
+                    fatal=False, query={
+                        'url': manifest_url,
+                    }))
+                if signed_url:
+                    return signed_url
+            return manifest_url
+
+        is_live = None
+
+        formats = []
+        for video in info['videos']:
+            if video['statut'] != 'ONLINE':
+                continue
+            video_url = video['url']
+            if not video_url:
+                continue
+            if is_live is None:
+                is_live = (try_get(
+                    video, lambda x: x['plages_ouverture'][0]['direct'],
+                    bool) is True) or '/live.francetv.fr/' in video_url
+            format_id = video['format']
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
+                if georestricted:
+                    # See https://github.com/ytdl-org/youtube-dl/issues/3963
+                    # m3u8 urls work fine
+                    continue
+                formats.extend(self._extract_f4m_formats(
+                    sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
+                    video_id, f4m_id=format_id, fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    sign(video_url, format_id), video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id=format_id,
+                    fatal=False))
+            elif video_url.startswith('rtmp'):
+                formats.append({
+                    'url': video_url,
+                    'format_id': 'rtmp-%s' % format_id,
+                    'ext': 'flv',
+                })
+            else:
+                if self._is_valid_url(video_url, video_id, format_id):
+                    formats.append({
+                        'url': video_url,
+                        'format_id': format_id,
+                    })
+        self._sort_formats(formats)
+
+        title = info['titre']
+        subtitle = info.get('sous_titre')
+        if subtitle:
+            title += ' - %s' % subtitle
+        title = title.strip()
+
+        subtitles = {}
+        subtitles_list = [{
+            'url': subformat['url'],
+            'ext': subformat.get('format'),
+        } for subformat in info.get('subtitles', []) if subformat.get('url')]
+        if subtitles_list:
+            subtitles['fr'] = subtitles_list
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': clean_html(info['synopsis']),
+            'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
+            'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
+            'timestamp': int_or_none(info['diffusion']['timestamp']),
+            'is_live': is_live,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        catalog = mobj.group('catalog')
+
+        if not video_id:
+            qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+            video_id = qs.get('idDiffusion', [None])[0]
+            catalog = qs.get('catalogue', [None])[0]
+            if not video_id:
+                raise ExtractorError('Invalid URL', expected=True)
+
+        return self._extract_video(video_id, catalog)
+
+
+class FranceTVSiteIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html'
+
+    _TESTS = [{
+        'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
+        'info_dict': {
+            'id': 'ec217ecc-0733-48cf-ac06-af1347b849d1',
+            'ext': 'mp4',
+            'title': '13h15, le dimanche... - Les mystères de Jésus',
+            'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
+            'timestamp': 1502623500,
+            'upload_date': '20170813',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [FranceTVIE.ie_key()],
+    }, {
+        # france3
+        'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
+        'only_matching': True,
+    }, {
+        # france4
+        'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
+        'only_matching': True,
+    }, {
+        # france5
+        'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
+        'only_matching': True,
+    }, {
+        # franceo
+        'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
+        'only_matching': True,
+    }, {
+        # france2 live
+        'url': 'https://www.france.tv/france-2/direct.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.france.tv/142749-rouge-sang.html',
+        'only_matching': True,
+    }, {
+        # france-3 live
+        'url': 'https://www.france.tv/france-3/direct.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        catalogue = None
+        video_id = self._search_regex(
+            r'(?:data-main-video\s*=|videoId["\']?\s*[:=])\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
+            webpage, 'video id', default=None, group='id')
+
+        if not video_id:
+            video_id, catalogue = self._html_search_regex(
+                r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
+                webpage, 'video ID').split('@')
+
+        return self._make_url_result(video_id, catalogue)
+
+
+class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
+
+    _TESTS = [{
+        'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
+        'info_dict': {
+            'id': 'NI_983319',
+            'ext': 'mp4',
+            'title': 'Le Pen Reims',
+            'upload_date': '20170505',
+            'timestamp': 1493981780,
+            'duration': 16,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [FranceTVIE.ie_key()],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
+            video_id)
+
+        return self._make_url_result(video['video_id'], video.get('catalog'))
+
+
+class FranceTVInfoIE(FranceTVBaseInfoExtractor):
+    IE_NAME = 'francetvinfo.fr'
+    _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
+
+    _TESTS = [{
+        'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html',
+        'info_dict': {
+            'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793',
+            'ext': 'mp4',
+            'title': 'Soir 3',
+            'upload_date': '20190822',
+            'timestamp': 1566510900,
+            'description': 'md5:72d167097237701d6e8452ff03b83c00',
+            'subtitles': {
+                'fr': 'mincount:2',
+            },
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [FranceTVIE.ie_key()],
+    }, {
+        'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html',
+        'only_matching': True,
+    }, {
+        # Dailymotion embed
+        'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
+        'md5': 'ee7f1828f25a648addc90cb2687b1f12',
+        'info_dict': {
+            'id': 'x4iiko0',
+            'ext': 'mp4',
+            'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
+            'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
+            'timestamp': 1467011958,
+            'upload_date': '20160627',
+            'uploader': 'France Inter',
+            'uploader_id': 'x2q2ez',
+        },
+        'add_ie': ['Dailymotion'],
+    }, {
+        'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
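+        # Some articles just embed Dailymotion players; hand those off as
+        # a playlist before looking for francetv video ids.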
+        dailymotion_urls = DailymotionIE._extract_urls(webpage)
+        if dailymotion_urls:
+            return self.playlist_result([
+                self.url_result(dailymotion_url, DailymotionIE.ie_key())
+                for dailymotion_url in dailymotion_urls])
+
+        video_id = self._search_regex(
+            (r'player\.load[^;]+src:\s*["\']([^"\']+)',
+             r'id-video=([^@]+@[^"]+)',
+             r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+             r'data-id="([^"]+)"'),
+            webpage, 'video id')
+
+        return self._make_url_result(video_id)
+
+
+class FranceTVInfoSportIE(FranceTVBaseInfoExtractor):
+    IE_NAME = 'sport.francetvinfo.fr'
+    _VALID_URL = r'https?://sport\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://sport.francetvinfo.fr/les-jeux-olympiques/retour-sur-les-meilleurs-moments-de-pyeongchang-2018',
+        'info_dict': {
+            'id': '6e49080e-3f45-11e8-b459-000d3a2439ea',
+            'ext': 'mp4',
+            'title': 'Retour sur les meilleurs moments de Pyeongchang 2018',
+            'timestamp': 1523639962,
+            'upload_date': '20180413',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [FranceTVIE.ie_key()],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(r'data-video="([^"]+)"', webpage, 'video_id')
+        return self._make_url_result(video_id, 'Sport-web')
+
+
+class GenerationWhatIE(InfoExtractor):
+    IE_NAME = 'france2.fr:generation-what'
+    _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://generation-what.francetv.fr/portrait/video/present-arms',
+        'info_dict': {
+            'id': 'wtvKYUG45iw',
+            'ext': 'mp4',
+            'title': 'Generation What - Garde à vous - FRA',
+            'uploader': 'Generation What',
+            'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w',
+            'upload_date': '20160411',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://generation-what.francetv.fr/europe/video/present-arms',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        youtube_id = self._search_regex(
+            r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';",
+            webpage, 'youtube id')
+
+        return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id)
+
+
+class CultureboxIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689',
+        'info_dict': {
+            'id': 'EV_134885',
+            'ext': 'mp4',
+            'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7',
+            'description': 'md5:19c44af004b88219f4daa50fa9a351d4',
+            'upload_date': '20180206',
+            'timestamp': 1517945220,
+            'duration': 5981,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [FranceTVIE.ie_key()],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        if ">Ce live n'est plus disponible en replay<" in webpage:
+            raise ExtractorError(
+                'Video %s is not available' % display_id, expected=True)
+
+        video_id, catalogue = self._search_regex(
+            r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]',
+            webpage, 'video id').split('@')
+
+        return self._make_url_result(video_id, catalogue)
+
+
+class FranceTVJeunesseIE(FranceTVBaseInfoExtractor):
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))'
+
+    _TESTS = [{
+        'url': 'https://www.zouzous.fr/heros/simon',
+        'info_dict': {
+            'id': 'simon',
+        },
+        'playlist_count': 9,
+    }, {
+        'url': 'https://www.ludo.fr/heros/ninjago',
+        'info_dict': {
+            'id': 'ninjago',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'https://www.zouzous.fr/heros/simon?abc',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+
+        playlist = self._download_json(
+            '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id)
+
+        if not playlist.get('count'):
+            raise ExtractorError(
+                '%s is not available' % playlist_id, expected=True)
+
+        entries = []
+        for item in playlist['items']:
+            identity = item.get('identity')
+            if identity and isinstance(identity, compat_str):
+                entries.append(self._make_url_result(identity))
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/freesound.py b/youtube_dl/extractor/freesound.py
new file mode 100644 (file)
index 0000000..138b6bc
--- /dev/null
@@ -0,0 +1,79 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    get_element_by_class,
+    get_element_by_id,
+    unified_strdate,
+)
+
+
+class FreesoundIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/[^/]+/sounds/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.freesound.org/people/miklovan/sounds/194503/',
+        'md5': '12280ceb42c81f19a515c745eae07650',
+        'info_dict': {
+            'id': '194503',
+            'ext': 'mp3',
+            'title': 'gulls in the city.wav',
+            'description': 'the sounds of seagulls in the city',
+            'duration': 130.233,
+            'uploader': 'miklovan',
+            'upload_date': '20130715',
+            'tags': list,
+        }
+    }
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, audio_id)
+
+        audio_url = self._og_search_property('audio', webpage, 'song url')
+        title = self._og_search_property('audio:title', webpage, 'song title')
+
+        description = self._html_search_regex(
+            r'(?s)id=["\']sound_description["\'][^>]*>(.+?)</div>',
+            webpage, 'description', fatal=False)
+
+        duration = float_or_none(
+            get_element_by_class('duration', webpage), scale=1000)
+
+        upload_date = unified_strdate(get_element_by_id('sound_date', webpage))
+        uploader = self._og_search_property(
+            'audio:artist', webpage, 'uploader', fatal=False)
+
+        channels = self._html_search_regex(
+            r'Channels</dt><dd>(.+?)</dd>', webpage,
+            'channels info', fatal=False)
+
+        tags_str = get_element_by_class('tags', webpage)
+        tags = re.findall(r'<a[^>]+>([^<]+)', tags_str) if tags_str else None
+
+        audio_urls = [audio_url]
+
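+        # The og:audio URL points at the low-quality '-lq.mp3' preview;
+        # when present, also add the matching '-hq.mp3' variant, ranked
+        # higher by its enumerate() index below.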
+        LQ_FORMAT = '-lq.mp3'
+        if LQ_FORMAT in audio_url:
+            audio_urls.append(audio_url.replace(LQ_FORMAT, '-hq.mp3'))
+
+        formats = [{
+            'url': format_url,
+            'format_note': channels,
+            'quality': quality,
+        } for quality, format_url in enumerate(audio_urls)]
+        self._sort_formats(formats)
+
+        return {
+            'id': audio_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'tags': tags,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py
new file mode 100644 (file)
index 0000000..ea9c3e3
--- /dev/null
@@ -0,0 +1,31 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+
+
+class FreespeechIE(InfoExtractor):
+    IE_NAME = 'freespeech.org'
+    _VALID_URL = r'https?://(?:www\.)?freespeech\.org/stories/(?P<id>.+)'
+    _TEST = {
+        'add_ie': ['Youtube'],
+        'url': 'http://www.freespeech.org/stories/fcc-announces-net-neutrality-rollback-whats-stake/',
+        'info_dict': {
+            'id': 'waRk6IPqyWM',
+            'ext': 'mp4',
+            'title': 'What\'s At Stake - Net Neutrality Special',
+            'description': 'Presented by MNN and FSTV',
+            'upload_date': '20170728',
+            'uploader_id': 'freespeechtv',
+            'uploader': 'freespeechtv',
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        youtube_url = self._search_regex(
+            r'data-video-url="([^"]+)"',
+            webpage, 'youtube url')
+
+        return self.url_result(youtube_url, YoutubeIE.ie_key())
diff --git a/youtube_dl/extractor/freshlive.py b/youtube_dl/extractor/freshlive.py
new file mode 100644 (file)
index 0000000..72a8459
--- /dev/null
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class FreshLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://freshlive\.tv/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://freshlive.tv/satotv/74712',
+        'md5': '9f0cf5516979c4454ce982df3d97f352',
+        'info_dict': {
+            'id': '74712',
+            'ext': 'mp4',
+            'title': 'テスト',
+            'description': 'テスト',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1511,
+            'timestamp': 1483619655,
+            'upload_date': '20170105',
+            'uploader': 'サトTV',
+            'uploader_id': 'satotv',
+            'view_count': int,
+            'comment_count': int,
+            'is_live': False,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        options = self._parse_json(
+            self._search_regex(
+                r'window\.__CONTEXT__\s*=\s*({.+?});\s*</script>',
+                webpage, 'initial context'),
+            video_id)
+
+        info = options['context']['dispatcher']['stores']['ProgramStore']['programs'][video_id]
+
+        title = info['title']
+
+        if info.get('status') == 'upcoming':
+            raise ExtractorError('Stream %s is upcoming' % video_id, expected=True)
+
+        stream_url = info.get('liveStreamUrl') or info['archiveStreamUrl']
+
+        is_live = info.get('liveStreamUrl') is not None
+
+        formats = self._extract_m3u8_formats(
+            stream_url, video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls')
+
+        if is_live:
+            title = self._live_title(title)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': info.get('description'),
+            'thumbnail': info.get('thumbnailUrl'),
+            'duration': int_or_none(info.get('airTime')),
+            'timestamp': unified_timestamp(info.get('createdAt')),
+            'uploader': try_get(
+                info, lambda x: x['channel']['title'], compat_str),
+            'uploader_id': try_get(
+                info, lambda x: x['channel']['code'], compat_str),
+            'uploader_url': try_get(
+                info, lambda x: x['channel']['permalink'], compat_str),
+            'view_count': int_or_none(info.get('viewCount')),
+            'comment_count': int_or_none(info.get('commentCount')),
+            'tags': info.get('tags', []),
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/frontendmasters.py b/youtube_dl/extractor/frontendmasters.py
new file mode 100644 (file)
index 0000000..f1db33f
--- /dev/null
@@ -0,0 +1,263 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class FrontendMastersBaseIE(InfoExtractor):
+    _API_BASE = 'https://api.frontendmasters.com/v1/kabuki'
+    _LOGIN_URL = 'https://frontendmasters.com/login/'
+
+    _NETRC_MACHINE = 'frontendmasters'
+
+    _QUALITIES = {
+        'low': {'width': 480, 'height': 360},
+        'mid': {'width': 1280, 'height': 720},
+        'high': {'width': 1920, 'height': 1080}
+    }
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+            'post_url', default=self._LOGIN_URL, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in', data=urlencode_postdata(login_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        # Successful login
+        if any(p in response for p in (
+                'wp-login.php?action=logout', '>Logout')):
+            return
+
+        error = self._html_search_regex(
+            r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<',
+            response, 'error message', default=None, group='error')
+        if error:
+            raise ExtractorError('Unable to login: %s' % error, expected=True)
+        raise ExtractorError('Unable to log in')
+
+
+class FrontendMastersPageBaseIE(FrontendMastersBaseIE):
+    def _download_course(self, course_name, url):
+        return self._download_json(
+            '%s/courses/%s' % (self._API_BASE, course_name), course_name,
+            'Downloading course JSON', headers={'Referer': url})
+
+    @staticmethod
+    def _extract_chapters(course):
+        chapters = []
+        lesson_elements = course.get('lessonElements')
+        if isinstance(lesson_elements, list):
+            chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)]
+        return chapters
+
+    @staticmethod
+    def _extract_lesson(chapters, lesson_id, lesson):
+        title = lesson.get('title') or lesson_id
+        display_id = lesson.get('slug')
+        description = lesson.get('description')
+        thumbnail = lesson.get('thumbnail')
+
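+        # elementIndex seemingly counts chapter headings as well as lessons,
+        # so the offset between the two indices gives the 1-based chapter
+        # number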
+        chapter_number = None
+        index = lesson.get('index')
+        element_index = lesson.get('elementIndex')
+        if (isinstance(index, int) and isinstance(element_index, int)
+                and index < element_index):
+            chapter_number = element_index - index
+        chapter = None
+        if chapter_number is not None and chapter_number - 1 < len(chapters):
+            chapter = chapters[chapter_number - 1]
+
+        duration = None
+        timestamp = lesson.get('timestamp')
+        if isinstance(timestamp, compat_str):
+            mobj = re.search(
+                r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})',
+                timestamp)
+            if mobj:
+                duration = parse_duration(mobj.group('end')) - parse_duration(
+                    mobj.group('start'))
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'frontendmasters:%s' % lesson_id,
+            'ie_key': FrontendMastersIE.ie_key(),
+            'id': lesson_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'chapter': chapter,
+            'chapter_number': chapter_number,
+        }
+
+
+class FrontendMastersIE(FrontendMastersBaseIE):
+    _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba',
+        'md5': '7f161159710d6b7016a4f4af6fcb05e2',
+        'info_dict': {
+            'id': 'a2qogef6ba',
+            'ext': 'mp4',
+            'title': 'a2qogef6ba',
+        },
+        'skip': 'Requires FrontendMasters account credentials',
+    }, {
+        'url': 'frontendmasters:a2qogef6ba',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        lesson_id = self._match_id(url)
+
+        source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id)
+
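+        # probe every container/quality combination; each has its own source
+        # endpoint and unavailable ones are skipped below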
+        formats = []
+        for ext in ('webm', 'mp4'):
+            for quality in ('low', 'mid', 'high'):
+                resolution = self._QUALITIES[quality].copy()
+                format_id = '%s-%s' % (ext, quality)
+                source = self._download_json(
+                    source_url, lesson_id,
+                    'Downloading %s source JSON' % format_id, query={
+                        'f': ext,
+                        'r': resolution['height'],
+                    }, headers={
+                        'Referer': url,
+                    }, fatal=False)
+                # with fatal=False the download can return False on error,
+                # so guard before indexing into the payload
+                format_url = source.get('url') if source else None
+
+                if not format_url:
+                    continue
+
+                f = resolution.copy()
+                f.update({
+                    'url': format_url,
+                    'ext': ext,
+                    'format_id': format_id,
+                })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {
+            'en': [{
+                'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id),
+            }]
+        }
+
+        return {
+            'id': lesson_id,
+            'title': lesson_id,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class FrontendMastersLessonIE(FrontendMastersPageBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)'
+    _TEST = {
+        'url': 'https://frontendmasters.com/courses/web-development/tools',
+        'info_dict': {
+            'id': 'a2qogef6ba',
+            'display_id': 'tools',
+            'ext': 'mp4',
+            'title': 'Tools',
+            'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'chapter': 'Introduction',
+            'chapter_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires FrontendMasters account credentials',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_name, lesson_name = mobj.group('course_name', 'lesson_name')
+
+        course = self._download_course(course_name, url)
+
+        lesson_id, lesson = next(
+            (video_id, data)
+            for video_id, data in course['lessonData'].items()
+            if data.get('slug') == lesson_name)
+
+        chapters = self._extract_chapters(course)
+        return self._extract_lesson(chapters, lesson_id, lesson)
+
+
+class FrontendMastersCourseIE(FrontendMastersPageBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'https://frontendmasters.com/courses/web-development/',
+        'info_dict': {
+            'id': 'web-development',
+            'title': 'Introduction to Web Development',
+            'description': 'md5:9317e6e842098bf725d62360e52d49a6',
+        },
+        'playlist_count': 81,
+        'skip': 'Requires FrontendMasters account credentials',
+    }
+
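+    # course URLs are a prefix of lesson URLs, so defer to the lesson
+    # extractor whenever its pattern also matches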
+    @classmethod
+    def suitable(cls, url):
+        return False if FrontendMastersLessonIE.suitable(url) else super(
+            FrontendMastersBaseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_name = self._match_id(url)
+
+        course = self._download_course(course_name, url)
+
+        chapters = self._extract_chapters(course)
+
+        lessons = sorted(
+            course['lessonData'].values(), key=lambda data: data['index'])
+
+        entries = []
+        for lesson in lessons:
+            lesson_name = lesson.get('slug')
+            if not lesson_name:
+                continue
+            lesson_id = lesson.get('hash') or lesson.get('statsId')
+            entries.append(self._extract_lesson(chapters, lesson_id, lesson))
+
+        title = course.get('title')
+        description = course.get('description')
+
+        return self.playlist_result(entries, course_name, title, description)
diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
new file mode 100644 (file)
index 0000000..8bbedca
--- /dev/null
@@ -0,0 +1,154 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import string
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    ExtractorError,
+    urlencode_postdata
+)
+
+
+class FunimationIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
+
+    _NETRC_MACHINE = 'funimation'
+    _TOKEN = None
+
+    _TESTS = [{
+        'url': 'https://www.funimation.com/shows/hacksign/role-play/',
+        'info_dict': {
+            'id': '91144',
+            'display_id': 'role-play',
+            'ext': 'mp4',
+            'title': '.hack//SIGN - Role Play',
+            'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
+            'thumbnail': r're:https?://.*\.jpg',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
+        'info_dict': {
+            'id': '210051',
+            'display_id': 'broadcast-dub-preview',
+            'ext': 'mp4',
+            'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
+            'thumbnail': r're:https?://.*\.(?:jpg|png)',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
+        'only_matching': True,
+    }]
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+        try:
+            data = self._download_json(
+                'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/',
+                None, 'Logging in', data=urlencode_postdata({
+                    'username': username,
+                    'password': password,
+                }))
+            self._TOKEN = data['token']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                error = self._parse_json(e.cause.read().decode(), None)['error']
+                raise ExtractorError(error, expected=True)
+            raise
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
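+        # KANE_customdimensions are apparently analytics variables inlined
+        # into the page script; they double as a metadata source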
+        def _search_kane(name):
+            return self._search_regex(
+                r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name,
+                webpage, name, default=None)
+
+        title_data = self._parse_json(self._search_regex(
+            r'TITLE_DATA\s*=\s*({[^}]+})',
+            webpage, 'title data', default=''),
+            display_id, js_to_json, fatal=False) or {}
+
+        video_id = title_data.get('id') or self._search_regex([
+            r"KANE_customdimensions.videoID\s*=\s*'(\d+)';",
+            r'<iframe[^>]+src="/player/(\d+)',
+        ], webpage, 'video_id', default=None)
+        if not video_id:
+            player_url = self._html_search_meta([
+                'al:web:url',
+                'og:video:url',
+                'og:video:secure_url',
+            ], webpage, fatal=True)
+            video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id')
+
+        title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage)
+        series = _search_kane('showName')
+        if series:
+            title = '%s - %s' % (series, title)
+        description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)
+
+        try:
+            headers = {}
+            if self._TOKEN:
+                headers['Authorization'] = 'Token %s' % self._TOKEN
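+            # pinst_id appears to be a random 8-character player instance id
+            # expected by the showexperience API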
+            sources = self._download_json(
+                'https://www.funimation.com/api/showexperience/%s/' % video_id,
+                video_id, headers=headers, query={
+                    'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]),
+                })['items']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                error = self._parse_json(e.cause.read(), video_id)['errors'][0]
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, error.get('detail') or error.get('title')), expected=True)
+            raise
+
+        formats = []
+        for source in sources:
+            source_url = source.get('src')
+            if not source_url:
+                continue
+            source_type = source.get('videoType') or determine_ext(source_url)
+            if source_type == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'format_id': source_type,
+                    'url': source_url,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'series': series,
+            'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')),
+            'episode_number': int_or_none(title_data.get('episodeNum')),
+            'episode': episode,
+            'season_id': title_data.get('seriesId'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py
new file mode 100644 (file)
index 0000000..81d1949
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .nexx import NexxIE
+from ..utils import (
+    int_or_none,
+    str_or_none,
+)
+
+
+class FunkIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:channel|playlist)/[^/]+/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.funk.net/channel/ba-793/die-lustigsten-instrumente-aus-dem-internet-teil-2-1155821',
+        'md5': '8dd9d9ab59b4aa4173b3197f2ea48e81',
+        'info_dict': {
+            'id': '1155821',
+            'ext': 'mp4',
+            'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2',
+            'description': 'md5:a691d0413ef4835588c5b03ded670c1f',
+            'timestamp': 1514507395,
+            'upload_date': '20171229',
+        },
+    }, {
+        'url': 'https://www.funk.net/playlist/neuesteVideos/kameras-auf-dem-fusion-festival-1618699',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id, nexx_id = re.match(self._VALID_URL, url).groups()
+        video = self._download_json(
+            'https://www.funk.net/api/v4.0/videos/' + nexx_id, nexx_id)
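+        # playback is delegated to the Nexx player (domain id 741); the funk
+        # API response only enriches the resulting entry via url_transparent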
+        return {
+            '_type': 'url_transparent',
+            'url': 'nexx:741:' + nexx_id,
+            'ie_key': NexxIE.ie_key(),
+            'id': nexx_id,
+            'title': video.get('title'),
+            'description': video.get('description'),
+            'duration': int_or_none(video.get('duration')),
+            'channel_id': str_or_none(video.get('channelId')),
+            'display_id': display_id,
+            'tags': video.get('tags'),
+            'thumbnail': video.get('imageUrlLandscape'),
+        }
diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py
new file mode 100644 (file)
index 0000000..a3f44b8
--- /dev/null
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+)
+
+
+class FusionIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fusion\.(?:net|tv)/(?:video/|show/.+?\bvideo=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://fusion.tv/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/',
+        'info_dict': {
+            'id': '3145868',
+            'ext': 'mp4',
+            'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs',
+            'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7',
+            'duration': 140.0,
+            'timestamp': 1442589635,
+            'uploader': 'UNIVISON',
+            'upload_date': '20150918',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Anvato'],
+    }, {
+        'url': 'http://fusion.tv/video/201781',
+        'only_matching': True,
+    }, {
+        'url': 'https://fusion.tv/show/food-exposed-with-nelufar-hedayat/?ancla=full-episodes&video=588644',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video = self._download_json(
+            'https://platform.fusion.net/wp-json/fusiondotnet/v1/video/' + video_id, video_id)
+
+        info = {
+            'id': video_id,
+            'title': video['title'],
+            'description': video.get('excerpt'),
+            'timestamp': parse_iso8601(video.get('published')),
+            'series': video.get('show'),
+        }
+
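+        # 'src' maps format families (e.g. 'm3u8-variant') to per-quality
+        # stream descriptors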
+        formats = []
+        src = video.get('src') or {}
+        for f_id, f in src.items():
+            for q_id, q in f.items():
+                q_url = q.get('url')
+                if not q_url:
+                    continue
+                ext = determine_ext(q_url, mimetype2ext(q.get('type')))
+                if ext == 'smil':
+                    formats.extend(self._extract_smil_formats(q_url, video_id, fatal=False))
+                elif f_id == 'm3u8-variant' or (ext == 'm3u8' and q_id == 'Variant'):
+                    formats.extend(self._extract_m3u8_formats(
+                        q_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                else:
+                    formats.append({
+                        'format_id': '-'.join([f_id, q_id]),
+                        'url': q_url,
+                        'width': int_or_none(q.get('width')),
+                        'height': int_or_none(q.get('height')),
+                        'tbr': int_or_none(self._search_regex(
+                            r'_(\d+)\.m(?:p4|3u8)', q_url, 'bitrate',
+                            default=None)),
+                        'ext': 'mp4' if ext == 'm3u8' else ext,
+                        'protocol': 'm3u8_native' if ext == 'm3u8' else 'https',
+                    })
+        if formats:
+            self._sort_formats(formats)
+            info['formats'] = formats
+        else:
+            info.update({
+                '_type': 'url',
+                'url': 'anvato:uni:' + video['video_ids']['anvato'],
+                'ie_key': 'Anvato',
+            })
+
+        return info
diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py
new file mode 100644 (file)
index 0000000..00e6742
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    parse_age_limit,
+    smuggle_url,
+    update_url_query,
+)
+
+
+class FXNetworksIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.fxnetworks.com/video/1032565827847',
+        'md5': '8d99b97b4aa7a202f55b6ed47ea7e703',
+        'info_dict': {
+            'id': 'dRzwHC_MMqIv',
+            'ext': 'mp4',
+            'title': 'First Look: Better Things - Season 2',
+            'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.',
+            'age_limit': 14,
+            'uploader': 'NEWA-FNG-FX',
+            'upload_date': '20170825',
+            'timestamp': 1503686274,
+            'episode_number': 0,
+            'season_number': 2,
+            'series': 'Better Things',
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'http://www.simpsonsworld.com/video/716094019682',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        if 'The content you are trying to access is not available in your region.' in webpage:
+            self.raise_geo_restricted()
+        video_data = extract_attributes(self._search_regex(
+            r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data'))
+        player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None)
+        release_url = video_data['rel']
+        title = video_data['data-title']
+        rating = video_data.get('data-rating')
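+        # build ThePlatform SMIL query: movies request an m3u manifest,
+        # everything else switches delivery to plain HTTP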
+        query = {
+            'mbr': 'true',
+        }
+        if player_type == 'movies':
+            query.update({
+                'manifest': 'm3u',
+            })
+        else:
+            query.update({
+                'switch': 'http',
+            })
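+        # protected videos additionally need an Adobe Pass (MVPD) auth token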
+        if video_data.get('data-req-auth') == '1':
+            resource = self._get_mvpd_resource(
+                video_data['data-channel'], title,
+                video_data.get('data-guid'), rating)
+            query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
+            'series': video_data.get('data-show-title'),
+            'episode_number': int_or_none(video_data.get('data-episode')),
+            'season_number': int_or_none(video_data.get('data-season')),
+            'thumbnail': video_data.get('data-large-thumb'),
+            'age_limit': parse_age_limit(rating),
+            'ie_key': 'ThePlatform',
+        }
diff --git a/youtube_dl/extractor/gaia.py b/youtube_dl/extractor/gaia.py
new file mode 100644 (file)
index 0000000..e952775
--- /dev/null
@@ -0,0 +1,130 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    strip_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class GaiaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gaia\.com/video/(?P<id>[^/?]+).*?\bfullplayer=(?P<type>feature|preview)'
+    _TESTS = [{
+        'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=feature',
+        'info_dict': {
+            'id': '89356',
+            'ext': 'mp4',
+            'title': 'Connecting with Universal Consciousness',
+            'description': 'md5:844e209ad31b7d31345f5ed689e3df6f',
+            'upload_date': '20151116',
+            'timestamp': 1447707266,
+            'duration': 936,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=preview',
+        'info_dict': {
+            'id': '89351',
+            'ext': 'mp4',
+            'title': 'Connecting with Universal Consciousness',
+            'description': 'md5:844e209ad31b7d31345f5ed689e3df6f',
+            'upload_date': '20151116',
+            'timestamp': 1447707266,
+            'duration': 53,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+    _NETRC_MACHINE = 'gaia'
+    _jwt = None
+
+    def _real_initialize(self):
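+        # reuse the auth cookie from an existing browser session before
+        # falling back to a username/password login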
+        auth = self._get_cookies('https://www.gaia.com/').get('auth')
+        if auth:
+            auth = self._parse_json(
+                compat_urllib_parse_unquote(auth.value),
+                None, fatal=False)
+        if not auth:
+            username, password = self._get_login_info()
+            if username is None:
+                return
+            auth = self._download_json(
+                'https://auth.gaia.com/v1/login',
+                None, data=urlencode_postdata({
+                    'username': username,
+                    'password': password
+                }))
+            if auth.get('success') is False:
+                raise ExtractorError(', '.join(auth['messages']), expected=True)
+        if auth:
+            self._jwt = auth.get('jwt')
+
+    def _real_extract(self, url):
+        display_id, vtype = re.match(self._VALID_URL, url).groups()
+        node_id = self._download_json(
+            'https://brooklyn.gaia.com/pathinfo', display_id, query={
+                'path': 'video/' + display_id,
+            })['id']
+        node = self._download_json(
+            'https://brooklyn.gaia.com/node/%d' % node_id, node_id)
+        vdata = node[vtype]
+        media_id = compat_str(vdata['nid'])
+        title = node['title']
+
+        headers = None
+        if self._jwt:
+            headers = {'Authorization': 'Bearer ' + self._jwt}
+        media = self._download_json(
+            'https://brooklyn.gaia.com/media/' + media_id,
+            media_id, headers=headers)
+        formats = self._extract_m3u8_formats(
+            media['mediaUrls']['bcHLS'], media_id, 'mp4')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        text_tracks = media.get('textTracks', {})
+        for key in ('captions', 'subtitles'):
+            for lang, sub_url in text_tracks.get(key, {}).items():
+                subtitles.setdefault(lang, []).append({
+                    'url': sub_url,
+                })
+
+        fivestar = node.get('fivestar', {})
+        fields = node.get('fields', {})
+
+        def get_field_value(key, value_key='value'):
+            return try_get(fields, lambda x: x[key][0][value_key])
+
+        return {
+            'id': media_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'description': strip_or_none(get_field_value('body') or get_field_value('teaser')),
+            'timestamp': int_or_none(node.get('created')),
+            'subtitles': subtitles,
+            'duration': int_or_none(vdata.get('duration')),
+            'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])),
+            'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])),
+            'comment_count': int_or_none(node.get('comment_count')),
+            'series': try_get(node, lambda x: x['series']['title'], compat_str),
+            'season_number': int_or_none(get_field_value('season')),
+            'season_id': str_or_none(get_field_value('series_nid', 'nid')),
+            'episode_number': int_or_none(get_field_value('episode')),
+        }
diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py
new file mode 100644 (file)
index 0000000..f1b96c1
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    get_element_by_id,
+)
+
+
+class GameInformerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P<id>[^.?&#]+)'
+    _TESTS = [{
+        # normal Brightcove embed code extracted with BrightcoveNewIE._extract_url
+        'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx',
+        'md5': '292f26da1ab4beb4c9099f1304d2b071',
+        'info_dict': {
+            'id': '4515472681001',
+            'ext': 'mp4',
+            'title': 'Replay - Animal Crossing',
+            'description': 'md5:2e211891b215c85d061adc7a4dd2d930',
+            'timestamp': 1443457610,
+            'upload_date': '20150928',
+            'uploader_id': '694940074001',
+        },
+    }, {
+        # Brightcove id inside unique element with field--name-field-brightcove-video-id class
+        'url': 'https://www.gameinformer.com/video-feature/new-gameplay-today/2019/07/09/new-gameplay-today-streets-of-rogue',
+        'info_dict': {
+            'id': '6057111913001',
+            'ext': 'mp4',
+            'title': 'New Gameplay Today – Streets Of Rogue',
+            'timestamp': 1562699001,
+            'upload_date': '20190709',
+            'uploader_id': '694940074001',
+        },
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, display_id, headers=self.geo_verification_headers())
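+        # prefer the explicit Brightcove id embedded in the page and fall
+        # back to scanning for a standard Brightcove embed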
+        brightcove_id = clean_html(
+            get_element_by_class('field--name-field-brightcove-video-id', webpage)
+            or get_element_by_id('video-source-content', webpage))
+        brightcove_url = (
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id if brightcove_id
+            else BrightcoveNewIE._extract_url(self, webpage))
+        return self.url_result(brightcove_url, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
new file mode 100644 (file)
index 0000000..4236a5e
--- /dev/null
@@ -0,0 +1,139 @@
+from __future__ import unicode_literals
+
+import re
+
+from .once import OnceIE
+from ..compat import (
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    unescapeHTML,
+    url_basename,
+    dict_get,
+)
+
+
+class GameSpotIE(OnceIE):
+    _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
+        'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
+        'info_dict': {
+            'id': 'gs-2300-6410818',
+            'ext': 'mp4',
+            'title': 'Arma 3 - Community Guide: SITREP I',
+            'description': 'Check out this video where some of the basics of Arma 3 is explained.',
+        },
+    }, {
+        'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
+        'info_dict': {
+            'id': 'gs-2300-6424837',
+            'ext': 'mp4',
+            'title': 'Now Playing - The Witcher 3: Wild Hunt',
+            'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
+        },
+    }, {
+        'url': 'https://www.gamespot.com/videos/embed/6439218/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.gamespot.com/reviews/gears-of-war-review/1900-6161188/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
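+        # the player configuration is stored HTML-escaped in the data-video
+        # attribute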
+        data_video_json = self._search_regex(
+            r'data-video=["\'](.*?)["\']', webpage, 'data video')
+        data_video = self._parse_json(unescapeHTML(data_video_json), page_id)
+        streams = data_video['videoStreams']
+
+        manifest_url = None
+        formats = []
+        f4m_url = streams.get('f4m_stream')
+        if f4m_url:
+            manifest_url = f4m_url
+            formats.extend(self._extract_f4m_formats(
+                f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))
+        # default so the progressive branch below can inspect it even when
+        # no HLS manifest is present
+        m3u8_formats = []
+        m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream'))
+        if m3u8_url:
+            manifest_url = m3u8_url
+            m3u8_formats = self._extract_m3u8_formats(
+                m3u8_url, page_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False)
+            formats.extend(m3u8_formats)
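+        # the HLS manifest basename encodes the available progressive
+        # bitrates as a comma-separated list; reuse it to synthesize direct
+        # HTTP format URLs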
+        progressive_url = dict_get(
+            streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr'))
+        if progressive_url and manifest_url:
+            qualities_basename = self._search_regex(
+                r'/([^/]+)\.csmil/',
+                manifest_url, 'qualities basename', default=None)
+            if qualities_basename:
+                QUALITIES_RE = r'((,\d+)+,?)'
+                qualities = self._search_regex(
+                    QUALITIES_RE, qualities_basename,
+                    'qualities', default=None)
+                if qualities:
+                    qualities = sorted(
+                        int(q) for q in qualities.strip(',').split(','))
+                    http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)
+                    http_url_basename = url_basename(progressive_url)
+                    if m3u8_formats:
+                        self._sort_formats(m3u8_formats)
+                        m3u8_formats = list(filter(
+                            lambda f: f.get('vcodec') != 'none', m3u8_formats))
+                    if len(qualities) == len(m3u8_formats):
+                        for q, m3u8_format in zip(qualities, m3u8_formats):
+                            f = m3u8_format.copy()
+                            f.update({
+                                'url': progressive_url.replace(
+                                    http_url_basename, http_template % q),
+                                'format_id': f['format_id'].replace('hls', 'http'),
+                                'protocol': 'http',
+                            })
+                            formats.append(f)
+                    else:
+                        for q in qualities:
+                            formats.append({
+                                'url': progressive_url.replace(
+                                    http_url_basename, http_template % q),
+                                'ext': 'mp4',
+                                'format_id': 'http-%d' % q,
+                                'tbr': q,
+                            })
+
+        onceux_json = self._search_regex(
+            r'data-onceux-options=["\'](.*?)["\']', webpage,
+            'onceux options', default=None)
+        if onceux_json:
+            onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')
+            if onceux_url:
+                formats.extend(self._extract_once_formats(re.sub(
+                    r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url),
+                    http_formats_preference=-1))
+
+        if not formats:
+            for quality in ['sd', 'hd']:
+                # It's actually a link to a flv file
+                flv_url = streams.get('f4m_{0}'.format(quality))
+                if flv_url is not None:
+                    formats.append({
+                        'url': flv_url,
+                        'ext': 'flv',
+                        'format_id': quality,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': data_video['guid'],
+            'display_id': page_id,
+            'title': compat_urllib_parse_unquote(data_video['title']),
+            'formats': formats,
+            'description': self._html_search_meta('description', webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py
new file mode 100644 (file)
index 0000000..f00dab2
--- /dev/null
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+)
+
+
+class GameStarIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?game(?P<site>pro|star)\.de/videos/.*,(?P<id>[0-9]+)\.html'
+    _TESTS = [{
+        'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html',
+        'md5': 'ee782f1f8050448c95c5cacd63bc851c',
+        'info_dict': {
+            'id': '76110',
+            'ext': 'mp4',
+            'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil',
+            'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1406542380,
+            'upload_date': '20140728',
+            'duration': 17,
+        }
+    }, {
+        'url': 'http://www.gamepro.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.gamestar.de/videos/top-10-indie-spiele-fuer-nintendo-switch-video-tolle-nindies-games-zum-download,95316.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site = mobj.group('site')
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        # TODO: there are multiple ld+json objects in the webpage,
+        # while _search_json_ld finds only the first one
+        json_ld = self._parse_json(self._search_regex(
+            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>',
+            webpage, 'JSON-LD', group='json_ld'), video_id)
+        info_dict = self._json_ld(json_ld, video_id)
+        info_dict['title'] = remove_end(
+            info_dict['title'], ' - Game%s' % site.title())
+
+        view_count = int_or_none(json_ld.get('interactionCount'))
+        comment_count = int_or_none(self._html_search_regex(
+            r'<span>Kommentare</span>\s*<span[^>]+class=["\']count[^>]+>\s*\(\s*([0-9]+)',
+            webpage, 'comment count', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'url': 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id,
+            'ext': 'mp4',
+            'view_count': view_count,
+            'comment_count': comment_count
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py
new file mode 100644 (file)
index 0000000..1726a67
--- /dev/null
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class GaskrankIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P<categories>[^/]+)/(?P<id>[^/]+)\.htm'
+    _TESTS = [{
+        'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm',
+        'md5': '1ae88dbac97887d85ebd1157a95fc4f9',
+        'info_dict': {
+            'id': '201601/26955',
+            'ext': 'mp4',
+            'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'categories': ['motorrad-fun'],
+            'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden',
+            'uploader_id': 'Bikefun',
+            'upload_date': '20170110',
+            'uploader_url': None,
+        }
+    }, {
+        'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm',
+        'md5': 'c33ee32c711bc6c8224bfcbe62b23095',
+        'info_dict': {
+            'id': '201106/15920',
+            'ext': 'mp4',
+            'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'categories': ['racing'],
+            'display_id': 'isle-of-man-tt-2011-michael-du-15920',
+            'uploader_id': 'IOM',
+            'upload_date': '20170523',
+            'uploader_url': 'www.iomtt.com',
+        }
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, fatal=True)
+
+        categories = [re.match(self._VALID_URL, url).group('categories')]
+
+        mobj = re.search(
+            r'Video von:\s*(?P<uploader_id>[^|]*?)\s*\|\s*vom:\s*(?P<upload_date>[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])',
+            webpage)
+        # initialize both fields so the entry dict below cannot raise a
+        # NameError when the uploader line is missing from the page
+        uploader_id = upload_date = None
+        if mobj is not None:
+            uploader_id = mobj.groupdict().get('uploader_id')
+            upload_date = unified_strdate(mobj.groupdict().get('upload_date'))
+
+        uploader_url = self._search_regex(
+            r'Homepage:\s*<[^>]*>(?P<uploader_url>[^<]*)',
+            webpage, 'uploader_url', default=None)
+        tags = re.findall(
+            r'/tv/tags/[^/]+/"\s*>(?P<tag>[^<]*?)<',
+            webpage)
+
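+        # view counts are German-formatted, using '.' as thousands separator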
+        view_count = self._search_regex(
+            r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P<view_count>[0-9\.]*)',
+            webpage, 'view_count', default=None)
+        if view_count:
+            view_count = int_or_none(view_count.replace('.', ''))
+
+        average_rating = self._search_regex(
+            r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P<average_rating>[0-9,]+)',
+            webpage, 'average_rating', fatal=False)
+        if average_rating:
+            average_rating = float_or_none(average_rating.replace(',', '.'))
+
+        video_id = self._search_regex(
+            r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4',
+            webpage, 'video id', default=display_id)
+
+        entry = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        entry.update({
+            'id': video_id,
+            'title': title,
+            'categories': categories,
+            'display_id': display_id,
+            'uploader_id': uploader_id,
+            'upload_date': upload_date,
+            'uploader_url': uploader_url,
+            'tags': tags,
+            'view_count': view_count,
+            'average_rating': average_rating,
+        })
+        self._sort_formats(entry['formats'])
+
+        return entry
diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py
new file mode 100644 (file)
index 0000000..57c67a4
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class GazetaIE(InfoExtractor):
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)'
+    _TESTS = [{
+        'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml',
+        'md5': 'd49c9bdc6e5a7888f27475dc215ee789',
+        'info_dict': {
+            'id': '205566',
+            'ext': 'mp4',
+            'title': '«70–80 процентов гражданских в Донецке на грани голода»',
+            'description': 'md5:38617526050bd17b234728e7f9620a71',
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+        'skip': 'video not found',
+    }, {
+        'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml',
+        'md5': '37f19f78355eb2f4256ee1688359f24c',
+        'info_dict': {
+            'id': '252048',
+            'ext': 'mp4',
+            'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"',
+        },
+        'add_ie': ['EaglePlatform'],
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        display_id = mobj.group('id')
+        embed_url = '%s?p=embed' % mobj.group('url')
+        embed_page = self._download_webpage(
+            embed_url, display_id, 'Downloading embed page')
+
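+        # the embed page exposes the EaglePlatform player id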
+        video_id = self._search_regex(
+            r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id')
+
+        return self.url_result(
+            'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform')
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
new file mode 100644 (file)
index 0000000..2f555c1
--- /dev/null
@@ -0,0 +1,188 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from ..utils import (
+    HEADRequest,
+    sanitized_Request,
+    smuggle_url,
+    urlencode_postdata,
+)
+
+
+class GDCVaultIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)(?:/(?P<name>[\w-]+))?'
+    _NETRC_MACHINE = 'gdcvault'
+    _TESTS = [
+        {
+            'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
+            'md5': '7ce8388f544c88b7ac11c7ab1b593704',
+            'info_dict': {
+                'id': '201311826596_AWNY',
+                'display_id': 'Doki-Doki-Universe-Sweet-Simple',
+                'ext': 'mp4',
+                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
+            }
+        },
+        {
+            'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
+            'info_dict': {
+                'id': '201203272_1330951438328RSXR',
+                'display_id': 'Embracing-the-Dark-Art-of',
+                'ext': 'flv',
+                'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
+            },
+            'params': {
+                'skip_download': True,  # Requires rtmpdump
+            }
+        },
+        {
+            'url': 'http://www.gdcvault.com/play/1015301/Thexder-Meets-Windows-95-or',
+            'md5': 'a5eb77996ef82118afbbe8e48731b98e',
+            'info_dict': {
+                'id': '1015301',
+                'display_id': 'Thexder-Meets-Windows-95-or',
+                'ext': 'flv',
+                'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment',
+            },
+            'skip': 'Requires login',
+        },
+        {
+            'url': 'http://gdcvault.com/play/1020791/',
+            'only_matching': True,
+        },
+        {
+            # Hard-coded hostname
+            'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface',
+            'md5': 'a8efb6c31ed06ca8739294960b2dbabd',
+            'info_dict': {
+                'id': '840376_BQRC',
+                'ext': 'mp4',
+                'display_id': 'Tenacious-Design-and-The-Interface',
+                'title': 'Tenacious Design and The Interface of \'Destiny\'',
+            },
+        },
+        {
+            # Multiple audios
+            'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC',
+            'info_dict': {
+                'id': '12396_1299111843500GMPX',
+                'ext': 'mp4',
+                'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man',
+            },
+            # 'params': {
+            #     'skip_download': True,  # Requires rtmpdump
+            #     'format': 'jp',  # The japanese audio
+            # }
+        },
+        {
+            # gdc-player.html
+            'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo',
+            'info_dict': {
+                'id': '9350_1238021887562UHXB',
+                'display_id': 'An-American-engine-in-Tokyo',
+                'ext': 'mp4',
+                'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT',
+            },
+        },
+        {
+            # Kaltura Embed
+            'url': 'https://www.gdcvault.com/play/1026180/Mastering-the-Apex-of-Scaling',
+            'info_dict': {
+                'id': '0_h1fg8j3p',
+                'ext': 'mp4',
+                'title': 'Mastering the Apex of Scaling Game Servers (Presented by Multiplay)',
+                'timestamp': 1554401811,
+                'upload_date': '20190404',
+                'uploader_id': 'joe@blazestreaming.com',
+            },
+            'params': {
+                'format': 'mp4-408',
+            },
+        },
+    ]
+
+    def _login(self, webpage_url, display_id):
+        username, password = self._get_login_info()
+        if username is None or password is None:
+            self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
+            return None
+
+        mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
+        login_url = mobj.group('root_url') + 'api/login.php'
+        logout_url = mobj.group('root_url') + 'logout'
+
+        login_form = {
+            'email': username,
+            'password': password,
+        }
+
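+        # log in, fetch the authenticated page, then log out again
+        # (presumably so the session is not left open)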
+        request = sanitized_Request(login_url, urlencode_postdata(login_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        self._download_webpage(request, display_id, 'Logging in')
+        start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
+        self._download_webpage(logout_url, display_id, 'Logging out')
+
+        return start_page
+
+    def _real_extract(self, url):
+        video_id, name = re.match(self._VALID_URL, url).groups()
+        display_id = name or video_id
+
+        webpage_url = 'http://www.gdcvault.com/play/' + video_id
+        start_page = self._download_webpage(webpage_url, display_id)
+
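+        # some talks expose a direct file URL through an s1.addVariable
+        # flash player call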
+        direct_url = self._search_regex(
+            r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);',
+            start_page, 'url', default=None)
+        if direct_url:
+            title = self._html_search_regex(
+                r'<td><strong>Session Name:?</strong></td>\s*<td>(.*?)</td>',
+                start_page, 'title')
+            video_url = 'http://www.gdcvault.com' + direct_url
+            # resolve the url so that we can detect the correct extension
+            video_url = self._request_webpage(
+                HEADRequest(video_url), video_id).geturl()
+
+            return {
+                'id': video_id,
+                'display_id': display_id,
+                'url': video_url,
+                'title': title,
+            }
+
+        embed_url = KalturaIE._extract_url(start_page)
+        if embed_url:
+            embed_url = smuggle_url(embed_url, {'source_url': url})
+            ie_key = 'Kaltura'
+        else:
+            PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'
+
+            xml_root = self._html_search_regex(
+                PLAYER_REGEX, start_page, 'xml root', default=None)
+            if xml_root is None:
+                # Probably need to authenticate
+                login_res = self._login(webpage_url, display_id)
+                if login_res is None:
+                    self.report_warning('Could not login.')
+                else:
+                    start_page = login_res
+                    # Grab the url from the authenticated page
+                    xml_root = self._html_search_regex(
+                        PLAYER_REGEX, start_page, 'xml root')
+
+            xml_name = self._html_search_regex(
+                r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>',
+                start_page, 'xml filename')
+            embed_url = '%s/xml/%s' % (xml_root, xml_name)
+            ie_key = 'DigitallySpeaking'
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'display_id': display_id,
+            'url': embed_url,
+            'ie_key': ie_key,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
new file mode 100644 (file)
index 0000000..aba06b3
--- /dev/null
@@ -0,0 +1,3459 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import os
+import re
+import sys
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..compat import (
+    compat_etree_fromstring,
+    compat_str,
+    compat_urllib_parse_unquote,
+    compat_urlparse,
+    compat_xml_parse_error,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    HEADRequest,
+    is_html,
+    js_to_json,
+    KNOWN_EXTENSIONS,
+    merge_dicts,
+    mimetype2ext,
+    orderedSet,
+    sanitized_Request,
+    smuggle_url,
+    unescapeHTML,
+    unified_strdate,
+    unsmuggle_url,
+    UnsupportedError,
+    xpath_text,
+)
+from .commonprotocols import RtmpIE
+from .brightcove import (
+    BrightcoveLegacyIE,
+    BrightcoveNewIE,
+)
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
+from .nbc import NBCSportsVPlayerIE
+from .ooyala import OoyalaIE
+from .rutv import RUTVIE
+from .tvc import TVCIE
+from .sportbox import SportBoxIE
+from .smotri import SmotriIE
+from .myvi import MyviIE
+from .condenast import CondeNastIE
+from .udn import UDNEmbedIE
+from .senateisvp import SenateISVPIE
+from .svt import SVTIE
+from .pornhub import PornHubIE
+from .xhamster import XHamsterEmbedIE
+from .tnaflix import TNAFlixNetworkEmbedIE
+from .drtuber import DrTuberIE
+from .redtube import RedTubeIE
+from .tube8 import Tube8IE
+from .mofosex import MofosexEmbedIE
+from .spankwire import SpankwireIE
+from .youporn import YouPornIE
+from .vimeo import VimeoIE
+from .dailymotion import DailymotionIE
+from .dailymail import DailyMailIE
+from .onionstudios import OnionStudiosIE
+from .viewlift import ViewLiftEmbedIE
+from .mtv import MTVServicesEmbeddedIE
+from .pladform import PladformIE
+from .videomore import VideomoreIE
+from .webcaster import WebcasterFeedIE
+from .googledrive import GoogleDriveIE
+from .jwplatform import JWPlatformIE
+from .digiteka import DigitekaIE
+from .arkena import ArkenaIE
+from .instagram import InstagramIE
+from .liveleak import LiveLeakIE
+from .threeqsdn import ThreeQSDNIE
+from .theplatform import ThePlatformIE
+from .kaltura import KalturaIE
+from .eagleplatform import EaglePlatformIE
+from .facebook import FacebookIE
+from .soundcloud import SoundcloudEmbedIE
+from .tunein import TuneInBaseIE
+from .vbox7 import Vbox7IE
+from .dbtv import DBTVIE
+from .piksel import PikselIE
+from .videa import VideaIE
+from .twentymin import TwentyMinutenIE
+from .ustream import UstreamIE
+from .videopress import VideoPressIE
+from .rutube import RutubeIE
+from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
+from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
+from .channel9 import Channel9IE
+from .vshare import VShareIE
+from .mediasite import MediasiteIE
+from .springboardplatform import SpringboardPlatformIE
+from .yapfiles import YapFilesIE
+from .vice import ViceIE
+from .xfileshare import XFileShareIE
+from .cloudflarestream import CloudflareStreamIE
+from .peertube import PeerTubeIE
+from .teachable import TeachableIE
+from .indavideo import IndavideoEmbedIE
+from .apa import APAIE
+from .foxnews import FoxNewsIE
+from .viqeo import ViqeoIE
+from .expressen import ExpressenIE
+from .zype import ZypeIE
+from .odnoklassniki import OdnoklassnikiIE
+from .kinja import KinjaEmbedIE
+
+
+class GenericIE(InfoExtractor):
+    IE_DESC = 'Generic downloader that works on some sites'
+    _VALID_URL = r'.*'
+    IE_NAME = 'generic'
+    _TESTS = [
+        # Direct link to a video
+        {
+            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
+            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
+            'info_dict': {
+                'id': 'trailer',
+                'ext': 'mp4',
+                'title': 'trailer',
+                'upload_date': '20100513',
+            }
+        },
+        # Direct link to media delivered compressed (until Accept-Encoding is *)
+        {
+            'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
+            'md5': '128c42e68b13950268b648275386fc74',
+            'info_dict': {
+                'id': 'FictionJunction-Parallel_Hearts',
+                'ext': 'flac',
+                'title': 'FictionJunction-Parallel_Hearts',
+                'upload_date': '20140522',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ],
+            'skip': 'URL invalid',
+        },
+        # Direct download with broken HEAD
+        {
+            'url': 'http://ai-radio.org:8000/radio.opus',
+            'info_dict': {
+                'id': 'radio',
+                'ext': 'opus',
+                'title': 'radio',
+            },
+            'params': {
+                'skip_download': True,  # infinite live stream
+            },
+            'expected_warnings': [
+                r'501.*Not Implemented',
+                r'400.*Bad Request',
+            ],
+        },
+        # Direct link with incorrect MIME type
+        {
+            'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+            'md5': '4ccbebe5f36706d85221f204d7eb5913',
+            'info_dict': {
+                'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+                'id': '5_Lennart_Poettering_-_Systemd',
+                'ext': 'webm',
+                'title': '5_Lennart_Poettering_-_Systemd',
+                'upload_date': '20141120',
+            },
+            'expected_warnings': [
+                'URL could be a direct video link, returning it as such.'
+            ]
+        },
+        # RSS feed
+        {
+            'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+            'info_dict': {
+                'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
+                'title': 'Zero Punctuation',
+                'description': 're:.*groundbreaking video review series.*'
+            },
+            'playlist_mincount': 11,
+        },
+        # RSS feed with enclosure
+        {
+            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+            'info_dict': {
+                'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+                'ext': 'm4v',
+                'upload_date': '20150228',
+                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
+            }
+        },
+        # RSS feed with enclosures and unsupported link URLs
+        {
+            'url': 'http://www.hellointernet.fm/podcast?format=rss',
+            'info_dict': {
+                'id': 'http://www.hellointernet.fm/podcast?format=rss',
+                'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.',
+                'title': 'Hello Internet',
+            },
+            'playlist_mincount': 100,
+        },
+        # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+        {
+            'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+            'info_dict': {
+                'id': 'smil',
+                'ext': 'mp4',
+                'title': 'Automatics, robotics and biocybernetics',
+                'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                'upload_date': '20130627',
+                'formats': 'mincount:16',
+                'subtitles': 'mincount:1',
+            },
+            'params': {
+                'force_generic_extractor': True,
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+        {
+            'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+            'info_dict': {
+                'id': 'hds',
+                'ext': 'flv',
+                'title': 'hds',
+                'formats': 'mincount:1',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from https://www.restudy.dk/video/play/id/1637
+        {
+            'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+            'info_dict': {
+                'id': 'video_1637',
+                'ext': 'flv',
+                'title': 'video_1637',
+                'formats': 'mincount:3',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+        {
+            'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+            'info_dict': {
+                'id': 'smil-service',
+                'ext': 'flv',
+                'title': 'smil-service',
+                'formats': 'mincount:1',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+        {
+            'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+            'info_dict': {
+                'id': '4719370',
+                'ext': 'mp4',
+                'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+                'formats': 'mincount:3',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html
+        {
+            'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf',
+            'info_dict': {
+                'id': 'mZlp2ctYIUEB',
+                'ext': 'mp4',
+                'title': 'Tikibad ontruimd wegens brand',
+                'description': 'md5:05ca046ff47b931f9b04855015e163a4',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 33,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # MPD from http://dash-mse-test.appspot.com/media.html
+        {
+            'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd',
+            'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53',
+            'info_dict': {
+                'id': 'car-20120827-manifest',
+                'ext': 'mp4',
+                'title': 'car-20120827-manifest',
+                'formats': 'mincount:9',
+                'upload_date': '20130904',
+            },
+            'params': {
+                'format': 'bestvideo',
+            },
+        },
+        # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
+        {
+            'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8',
+            'info_dict': {
+                'id': 'content',
+                'ext': 'mp4',
+                'title': 'content',
+                'formats': 'mincount:8',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+            'skip': 'video gone',
+        },
+        # m3u8 served with Content-Type: text/plain
+        {
+            'url': 'http://www.nacentapps.com/m3u8/index.m3u8',
+            'info_dict': {
+                'id': 'index',
+                'ext': 'mp4',
+                'title': 'index',
+                'upload_date': '20140720',
+                'formats': 'mincount:11',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+            'skip': 'video gone',
+        },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': r're:^Chris Ziegler takes a look at the\.*',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
+        {
+            # redirect in Refresh HTTP header
+            'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1',
+            'info_dict': {
+                'id': 'pO8h3EaFRdo',
+                'ext': 'mp4',
+                'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set',
+                'description': 'md5:6294cc1af09c4049e0652b51a2df10d5',
+                'upload_date': '20150917',
+                'uploader_id': 'brtvofficial',
+                'uploader': 'Boiler Room',
+            },
+            'params': {
+                'skip_download': False,
+            },
+        },
+        {
+            'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+            'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
+            'info_dict': {
+                'id': '13601338388002',
+                'ext': 'mp4',
+                'uploader': 'www.hodiho.fr',
+                'title': 'R\u00e9gis plante sa Jeep',
+            }
+        },
+        # bandcamp page with custom domain
+        {
+            'add_ie': ['Bandcamp'],
+            'url': 'http://bronyrock.com/track/the-pony-mash',
+            'info_dict': {
+                'id': '3235767654',
+                'ext': 'mp3',
+                'title': 'The Pony Mash',
+                'uploader': 'M_Pallante',
+            },
+            'skip': 'There is a limit of 200 free downloads / month for the test song',
+        },
+        {
+            # embedded brightcove video
+            # it also tests Brightcove videos that need the 'Referer'
+            # header set in the HTTP requests
+            'add_ie': ['BrightcoveLegacy'],
+            'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
+            'info_dict': {
+                'id': '2765128793001',
+                'ext': 'mp4',
+                'title': 'Le cours de bourse : l’analyse technique',
+                'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
+                'uploader': 'BFM BUSINESS',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # embedded with itemprop embedURL and video id spelled as `idVideo`
+            'add_ie': ['BrightcoveLegacy'],
+            'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/',
+            'info_dict': {
+                'id': '5255628253001',
+                'ext': 'mp4',
+                'title': 'md5:37c519b1128915607601e75a87995fc0',
+                'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26',
+                'uploader': 'BFM BUSINESS',
+                'uploader_id': '876450612001',
+                'timestamp': 1482255315,
+                'upload_date': '20161220',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # https://github.com/ytdl-org/youtube-dl/issues/2253
+            'url': 'http://bcove.me/i6nfkrc3',
+            'md5': '0ba9446db037002366bab3b3eb30c88c',
+            'info_dict': {
+                'id': '3101154703001',
+                'ext': 'mp4',
+                'title': 'Still no power',
+                'uploader': 'thestar.com',
+                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
+            },
+            'add_ie': ['BrightcoveLegacy'],
+            'skip': 'video gone',
+        },
+        {
+            'url': 'http://www.championat.com/video/football/v/87/87499.html',
+            'md5': 'fb973ecf6e4a78a67453647444222983',
+            'info_dict': {
+                'id': '3414141473001',
+                'ext': 'mp4',
+                'title': 'Видео. Удаление Дзагоева (ЦСКА)',
+                'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
+                'uploader': 'Championat',
+            },
+        },
+        {
+            # https://github.com/ytdl-org/youtube-dl/issues/3541
+            'add_ie': ['BrightcoveLegacy'],
+            'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+            'info_dict': {
+                'id': '3866516442001',
+                'ext': 'mp4',
+                'title': 'Leer mij vrouwen kennen: Aflevering 1',
+                'description': 'Leer mij vrouwen kennen: Aflevering 1',
+                'uploader': 'SBS Broadcasting',
+            },
+            'skip': 'Restricted to Netherlands',
+            'params': {
+                'skip_download': True,  # m3u8 download
+            },
+        },
+        {
+            # Brightcove video in <iframe>
+            'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724',
+            'md5': '36d74ef5e37c8b4a2ce92880d208b968',
+            'info_dict': {
+                'id': '5360463607001',
+                'ext': 'mp4',
+                'title': '叙利亚失明儿童在废墟上演唱《心跳》  呼吁获得正常童年生活',
+                'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。',
+                'uploader': 'United Nations',
+                'uploader_id': '1362235914001',
+                'timestamp': 1489593889,
+                'upload_date': '20170315',
+            },
+            'add_ie': ['BrightcoveLegacy'],
+        },
+        {
+            # Brightcove with alternative playerID key
+            'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',
+            'info_dict': {
+                'id': 'nmeth.2062_SV1',
+                'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': '2228375078001',
+                    'ext': 'mp4',
+                    'title': 'nmeth.2062-sv1',
+                    'description': 'nmeth.2062-sv1',
+                    'timestamp': 1363357591,
+                    'upload_date': '20130315',
+                    'uploader': 'Nature Publishing Group',
+                    'uploader_id': '1964492299001',
+                },
+            }],
+        },
+        {
+            # Brightcove with UUID in videoPlayer
+            'url': 'http://www8.hp.com/cn/zh/home.html',
+            'info_dict': {
+                'id': '5255815316001',
+                'ext': 'mp4',
+                'title': 'Sprocket Video - China',
+                'description': 'Sprocket Video - China',
+                'uploader': 'HP-Video Gallery',
+                'timestamp': 1482263210,
+                'upload_date': '20161220',
+                'uploader_id': '1107601872001',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 download
+            },
+            'skip': 'video rotates...weekly?',
+        },
+        {
+            # Brightcove:new type [2].
+            'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+            'md5': '2b35148fcf48da41c9fb4591650784f3',
+            'info_dict': {
+                'id': '5348741021001',
+                'ext': 'mp4',
+                'upload_date': '20170306',
+                'uploader_id': '4191638492001',
+                'timestamp': 1488769918,
+                'title': 'VIDEO:  St. Thomas More earns first trip to basketball semis',
+            },
+        },
+        {
+            # Alternative brightcove <video> attributes
+            'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+            'info_dict': {
+                'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+                'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+            },
+            'playlist': [{
+                'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+                'info_dict': {
+                    'id': '5311302538001',
+                    'ext': 'mp4',
+                    'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+                    'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+                    'timestamp': 1486321708,
+                    'upload_date': '20170205',
+                    'uploader_id': '800000640001',
+                },
+                'only_matching': True,
+            }],
+        },
+        # ooyala video
+        {
+            'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
+            'md5': '166dd577b433b4d4ebfee10b0824d8ff',
+            'info_dict': {
+                'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
+                'ext': 'mp4',
+                'title': '2cc213299525360.mov',  # that's what we get
+                'duration': 238.231,
+            },
+            'add_ie': ['Ooyala'],
+        },
+        {
+            # ooyala video embedded with http://player.ooyala.com/iframe.js
+            'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
+            'info_dict': {
+                'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
+                'ext': 'mp4',
+                'title': '"Steve Jobs: Man in the Machine" trailer',
+                'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
+                'duration': 135.427,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'movie expired',
+        },
+        # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+        {
+            'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+            'info_dict': {
+                'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+                'ext': 'mp4',
+                'title': 'Steampunk Fest Comes to Honesdale',
+                'duration': 43.276,
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
+        # embed.ly video
+        {
+            'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
+            'info_dict': {
+                'id': '9ODmcdjQcHQ',
+                'ext': 'mp4',
+                'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
+                'upload_date': '20140225',
+                'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
+                'uploader': 'Tested',
+                'uploader_id': 'testedcom',
+            },
+            # No need to test YoutubeIE here
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # funnyordie embed
+        {
+            'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
+            'info_dict': {
+                'id': '18e820ec3f',
+                'ext': 'mp4',
+                'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
+                'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
+            },
+            # HEAD requests lead to endless 301, while GET is OK
+            'expected_warnings': ['301'],
+        },
+        # RUTV embed
+        {
+            'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
+            'info_dict': {
+                'id': '776940',
+                'ext': 'mp4',
+                'title': 'Охотское море стало целиком российским',
+                'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # TVC embed
+        {
+            'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
+            'info_dict': {
+                'id': '55304',
+                'ext': 'mp4',
+                'title': 'Дошкольное воспитание',
+            },
+        },
+        # SportBox embed
+        {
+            'url': 'http://www.vestifinance.ru/articles/25753',
+            'info_dict': {
+                'id': '25753',
+                'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': '370908',
+                    'title': 'Госзаказ. День 3',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370905',
+                    'title': 'Госзаказ. День 2',
+                    'ext': 'mp4',
+                }
+            }, {
+                'info_dict': {
+                    'id': '370902',
+                    'title': 'Госзаказ. День 1',
+                    'ext': 'mp4',
+                }
+            }],
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # Myvi.ru embed
+        {
+            'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1',
+            'info_dict': {
+                'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e',
+                'ext': 'mp4',
+                'title': 'Ужастики, русский трейлер (2015)',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 153,
+            }
+        },
+        # XHamster embed
+        {
+            'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
+            'info_dict': {
+                'id': 'showthread',
+                'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
+            },
+            'playlist_mincount': 7,
+            # This forum does not allow <iframe> syntaxes anymore
+            # Now HTML tags are displayed as-is
+            'skip': 'No videos on this page',
+        },
+        # Embedded TED video
+        {
+            'url': 'http://en.support.wordpress.com/videos/ted-talks/',
+            'md5': '65fdff94098e4a607385a60c5177c638',
+            'info_dict': {
+                'id': '1969',
+                'ext': 'mp4',
+                'title': 'Hidden miracles of the natural world',
+                'uploader': 'Louie Schwartzberg',
+                'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
+            }
+        },
+        # nowvideo embed hidden behind percent encoding
+        {
+            'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
+            'md5': '2baf4ddd70f697d94b1c18cf796d5107',
+            'info_dict': {
+                'id': '06e53103ca9aa',
+                'ext': 'flv',
+                'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
+                'description': 'No description',
+            },
+        },
+        # arte embed
+        {
+            'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
+            'md5': '7653032cbb25bf6c80d80f217055fa43',
+            'info_dict': {
+                'id': '048195-004_PLUS7-F',
+                'ext': 'flv',
+                'title': 'X:enius',
+                'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
+                'upload_date': '20140320',
+            },
+            'params': {
+                'skip_download': 'Requires rtmpdump'
+            },
+            'skip': 'video gone',
+        },
+        # francetv embed
+        {
+            'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero',
+            'info_dict': {
+                'id': 'EV_30231',
+                'ext': 'mp4',
+                'title': 'Alcaline, le concert avec Calogero',
+                'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
+                'upload_date': '20150226',
+                'timestamp': 1424989860,
+                'duration': 5400,
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+            'expected_warnings': [
+                'Forbidden'
+            ]
+        },
+        # Condé Nast embed
+        {
+            'url': 'http://www.wired.com/2014/04/honda-asimo/',
+            'md5': 'ba0dfe966fa007657bd1443ee672db0f',
+            'info_dict': {
+                'id': '53501be369702d3275860000',
+                'ext': 'mp4',
+                'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
+            }
+        },
+        # Dailymotion embed
+        {
+            'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
+            'md5': '441aeeb82eb72c422c7f14ec533999cd',
+            'info_dict': {
+                'id': 'k2mm4bCdJ6CQ2i7c8o2',
+                'ext': 'mp4',
+                'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
+                'description': 'md5:faf028e48a461b8b7fad38f1e104b119',
+                'uploader': 'Spi0n',
+                'uploader_id': 'xgditw',
+                'upload_date': '20140425',
+                'timestamp': 1398441542,
+            },
+            'add_ie': ['Dailymotion'],
+        },
+        # DailyMail embed
+        {
+            'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
+            'info_dict': {
+                'id': '1495629',
+                'ext': 'mp4',
+                'title': 'Care worker punches elderly dementia patient in head 11 times',
+                'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
+            },
+            'add_ie': ['DailyMail'],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # YouTube embed
+        {
+            'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
+            'info_dict': {
+                'id': 'FXRb4ykk4S0',
+                'ext': 'mp4',
+                'title': 'The NBL Auction 2014',
+                'uploader': 'BADMINTON England',
+                'uploader_id': 'BADMINTONEvents',
+                'upload_date': '20140603',
+                'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
+            },
+            'add_ie': ['Youtube'],
+            'params': {
+                'skip_download': True,
+            }
+        },
+        # MTVServices embed
+        {
+            'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',
+            'md5': 'ca1aef97695ef2c1d6973256a57e5252',
+            'info_dict': {
+                'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1',
+                'ext': 'mp4',
+                'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored',
+                'description': 'Two valets share their love for movie star Liam Neesons.',
+                'timestamp': 1349922600,
+                'upload_date': '20121011',
+            },
+        },
+        # YouTube embed via <data-embed-url="">
+        {
+            'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
+            'info_dict': {
+                'id': '4vAffPZIT44',
+                'ext': 'mp4',
+                'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
+                'uploader': 'Gameloft',
+                'uploader_id': 'gameloft',
+                'upload_date': '20140828',
+                'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
+        # YouTube <object> embed
+        {
+            'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/',
+            'md5': '516718101ec834f74318df76259fb3cc',
+            'info_dict': {
+                'id': 'msN87y-iEx0',
+                'ext': 'webm',
+                'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
+                'upload_date': '20080526',
+                'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d',
+                'uploader': 'Christopher Sykes',
+                'uploader_id': 'ChristopherJSykes',
+            },
+            'add_ie': ['Youtube'],
+        },
+        # Camtasia studio
+        {
+            'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
+            'playlist': [{
+                'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
+                'info_dict': {
+                    'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
+                    'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
+                    'ext': 'flv',
+                    'duration': 2235.90,
+                }
+            }, {
+                'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
+                'info_dict': {
+                    'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
+                    'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
+                    'ext': 'flv',
+                    'duration': 2235.93,
+                }
+            }],
+            'info_dict': {
+                'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
+            }
+        },
+        # Flowplayer
+        {
+            'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
+            'md5': '9d65602bf31c6e20014319c7d07fba27',
+            'info_dict': {
+                'id': '5123ea6d5e5a7',
+                'ext': 'mp4',
+                'age_limit': 18,
+                'uploader': 'www.handjobhub.com',
+                'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
+            }
+        },
+        # Multiple brightcove videos
+        # https://github.com/ytdl-org/youtube-dl/issues/2283
+        {
+            'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
+            'info_dict': {
+                'id': 'always-never',
+                'title': 'Always / Never - The New Yorker',
+            },
+            'playlist_count': 3,
+            'params': {
+                'extract_flat': False,
+                'skip_download': True,
+            }
+        },
+        # MLB embed
+        {
+            'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
+            'md5': '96f09a37e44da40dd083e12d9a683327',
+            'info_dict': {
+                'id': '33322633',
+                'ext': 'mp4',
+                'title': 'Ump changes call to ball',
+                'description': 'md5:71c11215384298a172a6dcb4c2e20685',
+                'duration': 48,
+                'timestamp': 1401537900,
+                'upload_date': '20140531',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        # Wistia embed
+        {
+            'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+            'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
+            'info_dict': {
+                'id': '6e2wtrbdaf',
+                'ext': 'mov',
+                'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
+                'description': 'a Paywall Videos video from Remilon',
+                'duration': 644.072,
+                'uploader': 'study.com',
+                'timestamp': 1459678540,
+                'upload_date': '20160403',
+                'filesize': 24687186,
+            },
+        },
+        {
+            'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
+            'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
+            'info_dict': {
+                'id': 'uxjb0lwrcz',
+                'ext': 'mp4',
+                'title': 'Conversation about Hexagonal Rails Part 1',
+                'description': 'a Martin Fowler video from ThoughtWorks',
+                'duration': 1715.0,
+                'uploader': 'thoughtworks.wistia.com',
+                'timestamp': 1401832161,
+                'upload_date': '20140603',
+            },
+        },
+        # Wistia standard embed (async)
+        {
+            'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
+            'info_dict': {
+                'id': '807fafadvk',
+                'ext': 'mp4',
+                'title': 'Drip Brennan Dunn Workshop',
+                'description': 'a JV Webinars video from getdrip-1',
+                'duration': 4986.95,
+                'timestamp': 1463607249,
+                'upload_date': '20160518',
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
+        # Soundcloud embed
+        {
+            'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
+            'info_dict': {
+                'id': '174391317',
+                'ext': 'mp3',
+                'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
+                'uploader': 'Sophos Security',
+                'title': 'Chet Chat 171 - Oct 29, 2014',
+                'upload_date': '20141029',
+            }
+        },
+        # Soundcloud multiple embeds
+        {
+            'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809',
+            'info_dict': {
+                'id': '52809',
+                'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance  | TAB + AUDIO',
+            },
+            'playlist_mincount': 7,
+        },
+        # TuneIn station embed
+        {
+            'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/',
+            'info_dict': {
+                'id': '204146',
+                'ext': 'mp3',
+                'title': 'CNRV',
+                'location': 'Paris, France',
+                'is_live': True,
+            },
+            'params': {
+                # Live stream
+                'skip_download': True,
+            },
+        },
+        # Livestream embed
+        {
+            'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
+            'info_dict': {
+                'id': '67864563',
+                'ext': 'flv',
+                'upload_date': '20141112',
+                'title': 'Rosetta #CometLanding webcast HL 10',
+            }
+        },
+        # Another Livestream embed, without 'new.' in URL
+        {
+            'url': 'https://www.freespeech.org/',
+            'info_dict': {
+                'id': '123537347',
+                'ext': 'mp4',
+                'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            },
+            'params': {
+                # Live stream
+                'skip_download': True,
+            },
+        },
+        # LazyYT
+        {
+            'url': 'https://skiplagged.com/',
+            'info_dict': {
+                'id': 'skiplagged',
+                'title': 'Skiplagged: The smart way to find cheap flights',
+            },
+            'playlist_mincount': 1,
+            'add_ie': ['Youtube'],
+        },
+        # Cinchcast embed
+        {
+            'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+            'info_dict': {
+                'id': '7141703',
+                'ext': 'mp3',
+                'upload_date': '20141126',
+                'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+            }
+        },
+        # Cinerama player
+        {
+            'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
+            'info_dict': {
+                'id': '730m_DandD_1901_512k',
+                'ext': 'mp4',
+                'uploader': 'www.abc.net.au',
+                'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
+            }
+        },
+        # embedded viddler video
+        {
+            'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
+            'info_dict': {
+                'id': '4d03aad9',
+                'ext': 'mp4',
+                'uploader': 'deadspin',
+                'title': 'WALL-TO-GORTAT',
+                'timestamp': 1422285291,
+                'upload_date': '20150126',
+            },
+            'add_ie': ['Viddler'],
+        },
+        # Libsyn embed
+        {
+            'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
+            'info_dict': {
+                'id': '3377616',
+                'ext': 'mp3',
+                'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
+                'description': 'md5:601cb790edd05908957dae8aaa866465',
+                'upload_date': '20150220',
+            },
+            'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/',
+        },
+        # jwplayer YouTube
+        {
+            'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
+            'info_dict': {
+                'id': 'Mrj4DVp2zeA',
+                'ext': 'mp4',
+                'upload_date': '20150212',
+                'uploader': 'The National Archives UK',
+                'description': 'md5:8078af856dca76edc42910b61273dbbf',
+                'uploader_id': 'NationalArchives08',
+                'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
+            },
+        },
+        # jwplayer rtmp
+        {
+            'url': 'http://www.suffolk.edu/sjc/live.php',
+            'info_dict': {
+                'id': 'live',
+                'ext': 'flv',
+                'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
+                'uploader': 'www.suffolk.edu',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/',
+        },
+        # Complex jwplayer
+        {
+            'url': 'http://www.indiedb.com/games/king-machine/videos',
+            'info_dict': {
+                'id': 'videos',
+                'ext': 'mp4',
+                'title': 'king machine trailer 1',
+                'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            # JWPlayer config passed as variable
+            'url': 'http://www.txxx.com/videos/3326530/ariele/',
+            'info_dict': {
+                'id': '3326530_hq',
+                'ext': 'mp4',
+                'title': 'ARIELE | Tube Cup',
+                'uploader': 'www.txxx.com',
+                'age_limit': 18,
+            },
+            'params': {
+                'skip_download': True,
+            }
+        },
+        {
+            # JWPlatform iframe
+            'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/',
+            'md5': 'ca00a040364b5b439230e7ebfd02c4e9',
+            'info_dict': {
+                'id': 'O0c5JcKT',
+                'ext': 'mp4',
+                'upload_date': '20171122',
+                'timestamp': 1511366290,
+                'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone',
+            },
+            'add_ie': [JWPlatformIE.ie_key()],
+        },
+        {
+            # Video.js embed, multiple formats
+            'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
+            'info_dict': {
+                'id': 'yygqldloqIk',
+                'ext': 'mp4',
+                'title': 'SolidWorks. Урок 6 Настройка чертежа',
+                'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
+                'upload_date': '20130314',
+                'uploader': 'PROстое3D',
+                'uploader_id': 'PROstoe3D',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Video.js embed, single format
+            'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=',
+            'info_dict': {
+                'id': 'watch',
+                'ext': 'mp4',
+                'title': 'Step 1 -  Good Foundation',
+                'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # rtl.nl embed
+        {
+            'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
+            'playlist_mincount': 5,
+            'info_dict': {
+                'id': 'aanslagen-kopenhagen',
+                'title': 'Aanslagen Kopenhagen',
+            }
+        },
+        # Zapiks embed
+        {
+            'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
+            'info_dict': {
+                'id': '118046',
+                'ext': 'mp4',
+                'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
+            }
+        },
+        # Kaltura embed (different embed code)
+        {
+            'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
+            'info_dict': {
+                'id': '1_a52wc67y',
+                'ext': 'flv',
+                'upload_date': '20150127',
+                'uploader_id': 'PremierMedia',
+                'timestamp': int,
+                'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014',
+            },
+        },
+        # Kaltura embed with single quotes
+        {
+            'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
+            'info_dict': {
+                'id': '0_izeg5utt',
+                'ext': 'mp4',
+                'title': '35871',
+                'timestamp': 1355743100,
+                'upload_date': '20121217',
+                'uploader_id': 'cplapp@learn360.com',
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # Kaltura embedded via quoted entry_id
+            'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures',
+            'info_dict': {
+                'id': '0_utuok90b',
+                'ext': 'mp4',
+                'title': '06_matthew_brender_raj_dutt',
+                'timestamp': 1466638791,
+                'upload_date': '20160622',
+            },
+            'add_ie': ['Kaltura'],
+            'expected_warnings': [
+                'Could not send HEAD request'
+            ],
+            'params': {
+                'skip_download': True,
+            }
+        },
+        {
+            # Kaltura embedded, some fileExt broken (#11480)
+            'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics',
+            'info_dict': {
+                'id': '1_sgtvehim',
+                'ext': 'mp4',
+                'title': 'Our "Standard Models" of particle physics and cosmology',
+                'description': 'md5:67ea74807b8c4fea92a6f38d6d323861',
+                'timestamp': 1321158993,
+                'upload_date': '20111113',
+                'uploader_id': 'kps1',
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # Kaltura iframe embed
+            'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/',
+            'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44',
+            'info_dict': {
+                'id': '0_f2cfbpwy',
+                'ext': 'mp4',
+                'title': 'I. M. Pei: A Centennial Celebration',
+                'description': 'md5:1db8f40c69edc46ca180ba30c567f37c',
+                'upload_date': '20170403',
+                'uploader_id': 'batchUser',
+                'timestamp': 1491232186,
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # Kaltura iframe embed, more sophisticated
+            'url': 'http://www.cns.nyu.edu/~eero/math-tools/Videos/lecture-05sep2017.html',
+            'info_dict': {
+                'id': '1_9gzouybz',
+                'ext': 'mp4',
+                'title': 'lecture-05sep2017',
+                'description': 'md5:40f347d91fd4ba047e511c5321064b49',
+                'upload_date': '20170913',
+                'uploader_id': 'eps2',
+                'timestamp': 1505340777,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # meta twitter:player
+            'url': 'http://thechive.com/2017/12/08/all-i-want-for-christmas-is-more-twerk/',
+            'info_dict': {
+                'id': '0_01b42zps',
+                'ext': 'mp4',
+                'title': 'Main Twerk (Video)',
+                'upload_date': '20171208',
+                'uploader_id': 'sebastian.salinas@thechive.com',
+                'timestamp': 1512713057,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['Kaltura'],
+        },
+        # referrer-protected EaglePlatform embed
+        {
+            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+            'info_dict': {
+                'id': '582306',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 3382,
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # ClipYou (EaglePlatform) embed (custom URL)
+        {
+            'url': 'http://muz-tv.ru/play/7129/',
+            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
+            'info_dict': {
+                'id': '12820',
+                'ext': 'mp4',
+                'title': "'O Sole Mio",
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 216,
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'This video is unavailable.',
+        },
+        # Pladform embed
+        {
+            'url': 'http://muz-tv.ru/kinozal/view/7400/',
+            'info_dict': {
+                'id': '100183293',
+                'ext': 'mp4',
+                'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
+                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 694,
+                'age_limit': 0,
+            },
+            'skip': 'HTTP Error 404: Not Found',
+        },
+        # Playwire embed
+        {
+            'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
+            'info_dict': {
+                'id': '3519514',
+                'ext': 'mp4',
+                'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
+                'thumbnail': r're:^https?://.*\.png$',
+                'duration': 45.115,
+            },
+        },
+        # 5min embed
+        {
+            'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
+            'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
+            'info_dict': {
+                'id': '518726732',
+                'ext': 'mp4',
+                'title': 'Facebook Creates "On This Day" | Crunch Report',
+                'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild',
+                'timestamp': 1427237531,
+                'uploader': 'Crunch Report',
+                'upload_date': '20150324',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # Crooks and Liars embed
+        {
+            'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
+            'info_dict': {
+                'id': '8RUoRhRi',
+                'ext': 'mp4',
+                'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
+                'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
+                'timestamp': 1428207000,
+                'upload_date': '20150405',
+                'uploader': 'Heather',
+            },
+        },
+        # Crooks and Liars external embed
+        {
+            'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
+            'info_dict': {
+                'id': 'MTE3MjUtMzQ2MzA',
+                'ext': 'mp4',
+                'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
+                'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
+                'timestamp': 1265032391,
+                'upload_date': '20100201',
+                'uploader': 'Heather',
+            },
+        },
+        # NBC Sports vplayer embed
+        {
+            'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
+            'info_dict': {
+                'id': 'ln7x1qSThw4k',
+                'ext': 'flv',
+                'title': "PFT Live: New leader in the 'new-look' defense",
+                'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+                'uploader': 'NBCU-SPORTS',
+                'upload_date': '20140107',
+                'timestamp': 1389118457,
+            },
+            'skip': 'Invalid Page URL',
+        },
+        # NBC News embed
+        {
+            'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html',
+            'md5': '1aa589c675898ae6d37a17913cf68d66',
+            'info_dict': {
+                'id': 'x_dtl_oa_LettermanliftPR_160608',
+                'ext': 'mp4',
+                'title': 'David Letterman: A Preview',
+                'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.',
+                'upload_date': '20160609',
+                'timestamp': 1465431544,
+                'uploader': 'NBCU-NEWS',
+            },
+        },
+        # UDN embed
+        {
+            'url': 'https://video.udn.com/news/300346',
+            'md5': 'fd2060e988c326991037b9aff9df21a6',
+            'info_dict': {
+                'id': '300346',
+                'ext': 'mp4',
+                'title': '中一中男師變性 全校師生力挺',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'expected_warnings': ['Failed to parse JSON Expecting value'],
+        },
+        # Brightcove URL in single quotes
+        {
+            'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
+            'md5': '4ae374f1f8b91c889c4b9203c8c752af',
+            'info_dict': {
+                'id': '4255764656001',
+                'ext': 'mp4',
+                'title': 'SN Presents: Russell Martin, World Citizen',
+                'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
+                'uploader': 'Rogers Sportsnet',
+                'uploader_id': '1704050871',
+                'upload_date': '20150525',
+                'timestamp': 1432570283,
+            },
+        },
+        # Kinja embed
+        {
+            'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
+            'info_dict': {
+                'id': '106351',
+                'ext': 'mp4',
+                'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+                'description': 'Migrated from OnionStudios',
+                'thumbnail': r're:^https?://.*\.jpe?g$',
+                'uploader': 'clickhole',
+                'upload_date': '20150527',
+                'timestamp': 1432744860,
+            }
+        },
+        # SnagFilms embed
+        {
+            'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html',
+            'info_dict': {
+                'id': '74849a00-85a9-11e1-9660-123139220831',
+                'ext': 'mp4',
+                'title': '#whilewewatch',
+            }
+        },
+        # AdobeTVVideo embed
+        {
+            'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
+            'md5': '43662b577c018ad707a63766462b1e87',
+            'info_dict': {
+                'id': '2456',
+                'ext': 'mp4',
+                'title': 'New experience with Acrobat DC',
+                'description': 'New experience with Acrobat DC',
+                'duration': 248.667,
+            },
+        },
+        # BrightcoveInPageEmbed embed
+        {
+            'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
+            'info_dict': {
+                'id': '4238694884001',
+                'ext': 'flv',
+                'title': 'Tabletop: Dread, Last Thoughts',
+                'description': 'Tabletop: Dread, Last Thoughts',
+                'duration': 51690,
+            },
+        },
+        # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions'
+        # This video can't be played in browsers if Flash is disabled and the UA is set to iPhone, which is actually a false alarm
+        {
+            'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html',
+            'info_dict': {
+                'id': '4785848093001',
+                'ext': 'mp4',
+                'title': 'The Cardinal Pell Interview',
+                'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
+                'uploader': 'GlobeCast Australia - GlobeStream',
+                'uploader_id': '2733773828001',
+                'upload_date': '20160304',
+                'timestamp': 1457083087,
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+        },
+        {
+            # Brightcove embed with whitespace around attribute names
+            'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+            'info_dict': {
+                'id': '3167554373001',
+                'ext': 'mp4',
+                'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+                'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+                'uploader_id': '1079349493',
+                'upload_date': '20140207',
+                'timestamp': 1391810548,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # Another form of arte.tv embed
+        {
+            'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
+            'md5': '850bfe45417ddf221288c88a0cffe2e2',
+            'info_dict': {
+                'id': '030273-562_PLUS7-F',
+                'ext': 'mp4',
+                'title': 'ARTE Reportage - Nulle part, en France',
+                'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d',
+                'upload_date': '20160409',
+            },
+        },
+        # LiveLeak embed
+        {
+            'url': 'http://www.wykop.pl/link/3088787/',
+            'md5': '7619da8c820e835bef21a1efa2a0fc71',
+            'info_dict': {
+                'id': '874_1459135191',
+                'ext': 'mp4',
+                'title': 'Man shows poor quality of new apartment building',
+                'description': 'The wall is like a sand pile.',
+                'uploader': 'Lake8737',
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
+        },
+        # Another LiveLeak embed pattern (#13336)
+        {
+            'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
+            'info_dict': {
+                'id': '2eb_1496309988',
+                'ext': 'mp4',
+                'title': 'Thief robs place where everyone was armed',
+                'description': 'md5:694d73ee79e535953cf2488562288eee',
+                'uploader': 'brazilwtf',
+            },
+            'add_ie': [LiveLeakIE.ie_key()],
+        },
+        # Duplicated embedded video URLs
+        {
+            'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
+            'info_dict': {
+                'id': '149298443_480_16c25b74_2',
+                'ext': 'mp4',
+                'title': 'vs. Blue Orange Spring Game',
+                'uploader': 'www.hudl.com',
+            },
+        },
+        # twitter:player:stream embed
+        {
+            'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288',
+            'info_dict': {
+                'id': 'master',
+                'ext': 'mp4',
+                'title': 'Une nouvelle espèce de dinosaure découverte en Argentine',
+                'uploader': 'www.rtl.be',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+        },
+        # twitter:player embed
+        {
+            'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/',
+            'md5': 'a3e0df96369831de324f0778e126653c',
+            'info_dict': {
+                'id': '4909620399001',
+                'ext': 'mp4',
+                'title': 'What Do Black Holes Sound Like?',
+                'description': 'what do black holes sound like',
+                'upload_date': '20160524',
+                'uploader_id': '29913724001',
+                'timestamp': 1464107587,
+                'uploader': 'TheAtlantic',
+            },
+            'add_ie': ['BrightcoveLegacy'],
+        },
+        # Facebook <iframe> embed
+        {
+            'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
+            'md5': 'fbcde74f534176ecb015849146dd3aee',
+            'info_dict': {
+                'id': '599637780109885',
+                'ext': 'mp4',
+                'title': 'Facebook video #599637780109885',
+            },
+        },
+        # Facebook <iframe> embed, plugin video
+        {
+            'url': 'http://5pillarsuk.com/2017/06/07/tariq-ramadan-disagrees-with-pr-exercise-by-imams-refusing-funeral-prayers-for-london-attackers/',
+            'info_dict': {
+                'id': '1754168231264132',
+                'ext': 'mp4',
+                'title': 'About the Imams and Religious leaders refusing to perform funeral prayers for...',
+                'uploader': 'Tariq Ramadan (official)',
+                'timestamp': 1496758379,
+                'upload_date': '20170606',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # Facebook API embed
+        {
+            'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
+            'md5': 'a47372ee61b39a7b90287094d447d94e',
+            'info_dict': {
+                'id': '10153467542406923',
+                'ext': 'mp4',
+                'title': 'Facebook video #10153467542406923',
+            },
+        },
+        # Wordpress "YouTube Video Importer" plugin
+        {
+            'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/',
+            'md5': 'd16797741b560b485194eddda8121b48',
+            'info_dict': {
+                'id': 'HNTXWDXV9Is',
+                'ext': 'mp4',
+                'title': 'Blue Devils Drumline Stanford lot 2016',
+                'upload_date': '20160627',
+                'uploader_id': 'GENOCIDE8GENERAL10',
+                'uploader': 'cylus cyrus',
+            },
+        },
+        {
+            # video stored on custom kaltura server
+            'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv',
+            'md5': '537617d06e64dfed891fa1593c4b30cc',
+            'info_dict': {
+                'id': '0_1iotm5bh',
+                'ext': 'mp4',
+                'title': 'Elecciones británicas: 5 lecciones para Rajoy',
+                'description': 'md5:435a89d68b9760b92ce67ed227055f16',
+                'uploader_id': 'videos.expansion@el-mundo.net',
+                'upload_date': '20150429',
+                'timestamp': 1430303472,
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # multiple kaltura embeds, nsfw
+            'url': 'https://www.quartier-rouge.be/prive/femmes/kamila-avec-video-jaime-sadomie.html',
+            'info_dict': {
+                'id': 'kamila-avec-video-jaime-sadomie',
+                'title': "Kamila avec vídeo “J'aime sadomie”",
+            },
+            'playlist_count': 8,
+        },
+        {
+            # Non-standard Vimeo embed
+            'url': 'https://openclassrooms.com/courses/understanding-the-web',
+            'md5': '64d86f1c7d369afd9a78b38cbb88d80a',
+            'info_dict': {
+                'id': '148867247',
+                'ext': 'mp4',
+                'title': 'Understanding the web - Teaser',
+                'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.',
+                'upload_date': '20151214',
+                'uploader': 'OpenClassrooms',
+                'uploader_id': 'openclassrooms',
+            },
+            'add_ie': ['Vimeo'],
+        },
+        {
+            # generic vimeo embed that requires original URL passed as Referer
+            'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video',
+            'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
+            'info_dict': {
+                'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
+                'ext': 'mp4',
+                'title': 'Big Buck Bunny',
+                'description': 'Royalty free test video',
+                'timestamp': 1432816365,
+                'upload_date': '20150528',
+                'is_live': False,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [ArkenaIE.ie_key()],
+        },
+        {
+            'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/',
+            'info_dict': {
+                'id': '1c7141f46c',
+                'ext': 'mp4',
+                'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [Vbox7IE.ie_key()],
+        },
+        {
+            # DBTV embeds
+            'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/',
+            'info_dict': {
+                'id': '43254897',
+                'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
+            },
+            'playlist_mincount': 3,
+        },
+        {
+            # Videa embeds
+            'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html',
+            'info_dict': {
+                'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style',
+                'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. Style - DVD Talk Forum',
+            },
+            'playlist_mincount': 2,
+        },
+        {
+            # 20 minuten embed
+            'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552',
+            'info_dict': {
+                'id': '523629',
+                'ext': 'mp4',
+                'title': 'So kommen Sie bei Eis und Schnee sicher an',
+                'description': 'md5:117c212f64b25e3d95747e5276863f7d',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [TwentyMinutenIE.ie_key()],
+        },
+        {
+            # VideoPress embed
+            'url': 'https://en.support.wordpress.com/videopress/',
+            'info_dict': {
+                'id': 'OcobLTqC',
+                'ext': 'm4v',
+                'title': 'IMG_5786',
+                'timestamp': 1435711927,
+                'upload_date': '20150701',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [VideoPressIE.ie_key()],
+        },
+        {
+            # Rutube embed
+            'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2',
+            'info_dict': {
+                'id': '9b3d5bee0a8740bf70dfd29d3ea43541',
+                'ext': 'flv',
+                'title': 'Магаззино: Казань 2',
+                'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a',
+                'uploader': 'Магаззино',
+                'upload_date': '20170228',
+                'uploader_id': '996642',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [RutubeIE.ie_key()],
+        },
+        {
+            # ThePlatform embedded with whitespaces in URLs
+            'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',
+            'only_matching': True,
+        },
+        {
+            # Senate ISVP iframe https
+            'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security',
+            'md5': 'fb8c70b0b515e5037981a2492099aab8',
+            'info_dict': {
+                'id': 'govtaff020316',
+                'ext': 'mp4',
+                'title': 'Integrated Senate Video Player',
+            },
+            'add_ie': [SenateISVPIE.ie_key()],
+        },
+        {
+            # Limelight embeds (1 channel embed + 4 media embeds)
+            'url': 'http://www.sedona.com/FacilitatorTraining2017',
+            'info_dict': {
+                'id': 'FacilitatorTraining2017',
+                'title': 'Facilitator Training 2017',
+            },
+            'playlist_mincount': 5,
+        },
+        {
+            # Limelight embed (LimelightPlayerUtil.embed)
+            'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+            'info_dict': {
+                'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+                'ext': 'mp4',
+                'title': '07448641',
+                'timestamp': 1499890639,
+                'upload_date': '20170712',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': ['LimelightMedia'],
+        },
+        {
+            'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+            'info_dict': {
+                'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+                'title': 'Standoff with Walnut Creek murder suspect ends',
+                'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+            },
+            'playlist_mincount': 4,
+        },
+        {
+            # WashingtonPost embed
+            'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+            'info_dict': {
+                'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+                'ext': 'mp4',
+                'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+                'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+                'timestamp': 1455216756,
+                'uploader': 'The Washington Post',
+                'upload_date': '20160211',
+            },
+            'add_ie': [WashingtonPostIE.ie_key()],
+        },
+        {
+            # Mediaset embed
+            'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+            'info_dict': {
+                'id': '720642',
+                'ext': 'mp4',
+                'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [MediasetIE.ie_key()],
+        },
+        {
+            # JOJ.sk embeds
+            'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+            'info_dict': {
+                'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+                'title': 'Slovenskom sa prehnala vlna silných búrok',
+            },
+            'playlist_mincount': 5,
+            'add_ie': [JojIE.ie_key()],
+        },
+        {
+            # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
+            'url': 'https://tvrain.ru/amp/418921/',
+            'md5': 'cc00413936695987e8de148b67d14f1d',
+            'info_dict': {
+                'id': '418921',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+            },
+        },
+        {
+            # vzaar embed
+            'url': 'http://help.vzaar.com/article/165-embedding-video',
+            'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+            'info_dict': {
+                'id': '8707641',
+                'ext': 'mp4',
+                'title': 'Building A Business Online: Principal Chairs Q & A',
+            },
+        },
+        {
+            # multiple HTML5 videos on one page
+            'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
+            'info_dict': {
+                'id': 'keyscenarios',
+                'title': 'Rescue Kit 14 Free Edition - Getting started',
+            },
+            'playlist_count': 4,
+        },
+        {
+            # vshare embed
+            'url': 'https://youtube-dlc-demo.neocities.org/vshare.html',
+            'md5': '17b39f55b5497ae8b59f5fbce8e35886',
+            'info_dict': {
+                'id': '0f64ce6',
+                'title': 'vl14062007715967',
+                'ext': 'mp4',
+            }
+        },
+        {
+            'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/',
+            'md5': 'aecd089f55b1cb5a59032cb049d3a356',
+            'info_dict': {
+                'id': '90227f51a80c4d8f86c345a7fa62bd9a1d',
+                'ext': 'mp4',
+                'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare',
+                'description': 'md5:5a51db84a62def7b7054df2ade403c6c',
+                'timestamp': 1474354800,
+                'upload_date': '20160920',
+            }
+        },
+        {
+            'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton',
+            'info_dict': {
+                'id': '1731611',
+                'ext': 'mp4',
+                'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!',
+                'description': 'md5:eb5f23826a027ba95277d105f248b825',
+                'timestamp': 1516100691,
+                'upload_date': '20180116',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [SpringboardPlatformIE.ie_key()],
+        },
+        {
+            'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
+            'info_dict': {
+                'id': 'uPDB5I9wfp8',
+                'ext': 'webm',
+                'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
+                'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
+                'upload_date': '20160219',
+                'uploader': 'Pocoyo - Português (BR)',
+                'uploader_id': 'PocoyoBrazil',
+            },
+            'add_ie': [YoutubeIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
+            'info_dict': {
+                'id': 'vMDE4NzI1Mjgt690b',
+                'ext': 'mp4',
+                'title': 'Котята',
+            },
+            'add_ie': [YapFilesIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # CloudflareStream embed
+            'url': 'https://www.cloudflare.com/products/cloudflare-stream/',
+            'info_dict': {
+                'id': '31c9291ab41fac05471db4e73aa11717',
+                'ext': 'mp4',
+                'title': '31c9291ab41fac05471db4e73aa11717',
+            },
+            'add_ie': [CloudflareStreamIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # PeerTube embed
+            'url': 'https://joinpeertube.org/fr/home/',
+            'info_dict': {
+                'id': 'home',
+                'title': 'Reprenez le contrôle de vos vidéos ! #JoinPeertube',
+            },
+            'playlist_count': 2,
+        },
+        {
+            # Indavideo embed
+            'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/',
+            'info_dict': {
+                'id': '1693903',
+                'ext': 'mp4',
+                'title': 'Így kell otthon hamburgert sütni',
+                'description': 'md5:f5a730ecf900a5c852e1e00540bbb0f7',
+                'timestamp': 1426330212,
+                'upload_date': '20150314',
+                'uploader': 'StreetKitchen',
+                'uploader_id': '546363',
+            },
+            'add_ie': [IndavideoEmbedIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # APA embed via JWPlatform embed
+            'url': 'http://www.vol.at/blue-man-group/5593454',
+            'info_dict': {
+                'id': 'jjv85FdZ',
+                'ext': 'mp4',
+                'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 254,
+                'timestamp': 1519211149,
+                'upload_date': '20180221',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://share-videos.se/auto/video/83645793?uid=13',
+            'md5': 'b68d276de422ab07ee1d49388103f457',
+            'info_dict': {
+                'id': '83645793',
+                'title': 'Lock up and get excited',
+                'ext': 'mp4'
+            },
+            'skip': 'TODO: fix nested playlists processing in tests',
+        },
+        {
+            # Viqeo embeds
+            'url': 'https://viqeo.tv/',
+            'info_dict': {
+                'id': 'viqeo',
+                'title': 'All-new video platform',
+            },
+            'playlist_count': 6,
+        },
+        {
+            # Squarespace video embed, 2019-08-28
+            'url': 'http://ootboxford.com',
+            'info_dict': {
+                'id': 'Tc7b_JGdZfw',
+                'title': 'Out of the Blue, at Childish Things 10',
+                'ext': 'mp4',
+                'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f',
+                'uploader_id': 'helendouglashouse',
+                'uploader': 'Helen & Douglas House',
+                'upload_date': '20140328',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Zype embed
+            'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
+            'info_dict': {
+                'id': '5b400b834b32992a310622b9',
+                'ext': 'mp4',
+                'title': 'Smoky Barbecue Favorites',
+                'thumbnail': r're:^https?://.*\.jpe?g',
+                'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
+                'upload_date': '20170909',
+                'timestamp': 1504915200,
+            },
+            'add_ie': [ZypeIE.ie_key()],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # videojs embed
+            'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
+            'info_dict': {
+                'id': 'shell',
+                'ext': 'mp4',
+                'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано',
+                'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['Failed to download MPD manifest'],
+        },
+        {
+            # DailyMotion embed with DM.player
+            'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804',
+            'info_dict': {
+                'id': 'k6aKkGHd9FJs4mtJN39',
+                'ext': 'mp4',
+                'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final',
+                'description': 'This video is private.',
+                'uploader_id': 'x1jf30l',
+                'uploader': 'beIN SPORTS USA',
+                'upload_date': '20190528',
+                'timestamp': 1559062971,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # {
+        #     # TODO: find another test
+        #     # http://schema.org/VideoObject
+        #     'url': 'https://flipagram.com/f/nyvTSJMKId',
+        #     'md5': '888dcf08b7ea671381f00fab74692755',
+        #     'info_dict': {
+        #         'id': 'nyvTSJMKId',
+        #         'ext': 'mp4',
+        #         'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+        #         'description': '#love for cats.',
+        #         'timestamp': 1461244995,
+        #         'upload_date': '20160421',
+        #     },
+        #     'params': {
+        #         'force_generic_extractor': True,
+        #     },
+        # }
+    ]
+
+    def report_following_redirect(self, new_url):
+        """Report information extraction."""
+        self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
+
+    def _extract_rss(self, url, video_id, doc):
+        playlist_title = doc.find('./channel/title').text
+        playlist_desc_el = doc.find('./channel/description')
+        playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+
+        entries = []
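+        # Prefer an <enclosure> URL (the actual media file) over the item's
+        # <link>, which typically points at an HTML page; items that provide
+        # neither are skipped.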
+        for it in doc.findall('./channel/item'):
+            next_url = None
+            enclosure_nodes = it.findall('./enclosure')
+            for e in enclosure_nodes:
+                next_url = e.attrib.get('url')
+                if next_url:
+                    break
+
+            if not next_url:
+                next_url = xpath_text(it, 'link', fatal=False)
+
+            if not next_url:
+                continue
+
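+            # 'url_transparent' defers extraction to whatever extractor handles
+            # next_url while keeping this RSS item's title.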
+            entries.append({
+                '_type': 'url_transparent',
+                'url': next_url,
+                'title': it.find('title').text,
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': playlist_title,
+            'description': playlist_desc,
+            'entries': entries,
+        }
+
+    def _extract_camtasia(self, url, video_id, webpage):
+        """ Returns None if no camtasia video can be found. """
+
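+        # Camtasia pages register their player config via a Flash variable;
+        # the regex below captures the path from markup like (illustrative):
+        #   fo.addVariable("csConfigFile", "myproject_config.xml");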
+        camtasia_cfg = self._search_regex(
+            r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
+            webpage, 'camtasia configuration file', default=None)
+        if camtasia_cfg is None:
+            return None
+
+        title = self._html_search_meta('DC.title', webpage, fatal=True)
+
+        camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
+        camtasia_cfg = self._download_xml(
+            camtasia_url, video_id,
+            note='Downloading camtasia configuration',
+            errnote='Failed to download camtasia configuration')
+        fileset_node = camtasia_cfg.find('./playlist/array/fileset')
+
+        entries = []
+        # Element.getchildren() was deprecated and removed in Python 3.9;
+        # iterating over the element itself yields the same children.
+        for n in fileset_node:
+            url_n = n.find('./uri')
+            if url_n is None:
+                continue
+
+            entries.append({
+                'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
+                'title': '%s - %s' % (title, n.tag),
+                'url': compat_urlparse.urljoin(url, url_n.text),
+                'duration': float_or_none(n.find('./duration').text),
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': title,
+        }
+
+    def _real_extract(self, url):
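+        # Protocol-relative input such as '//example.com/video' (illustrative)
+        # is completed with the preferred scheme and re-dispatched.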
+        if url.startswith('//'):
+            return self.url_result(self.http_scheme() + url)
+
+        parsed_url = compat_urlparse.urlparse(url)
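+        # Scheme-less input is resolved according to --default-search:
+        #   'auto' / 'auto_warning' - treat 'host.tld/...' as http, otherwise
+        #                             fall back to a YouTube search (ytsearch:)
+        #   'error'                 - reject the input outright
+        #   'fixup_error' (default) - fix obvious URLs, error out otherwise
+        #   anything else           - used verbatim as a search prefix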
+        if not parsed_url.scheme:
+            default_search = self._downloader.params.get('default_search')
+            if default_search is None:
+                default_search = 'fixup_error'
+
+            if default_search in ('auto', 'auto_warning', 'fixup_error'):
+                if re.match(r'^[^\s/]+\.[^\s/]+/', url):
+                    self._downloader.report_warning('The URL doesn\'t specify the protocol; trying with http')
+                    return self.url_result('http://' + url)
+                elif default_search != 'fixup_error':
+                    if default_search == 'auto_warning':
+                        if re.match(r'^(?:url|URL)$', url):
+                            raise ExtractorError(
+                                'Invalid URL:  %r . Call youtube-dlc like this:  youtube-dlc -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
+                                expected=True)
+                        else:
+                            self._downloader.report_warning(
+                                'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
+                    return self.url_result('ytsearch:' + url)
+
+            if default_search in ('error', 'fixup_error'):
+                raise ExtractorError(
+                    '%r is not a valid URL. '
+                    'Set --default-search "ytsearch" (or run  youtube-dlc "ytsearch:%s" ) to search YouTube'
+                    % (url, url), expected=True)
+            else:
+                if ':' not in default_search:
+                    default_search += ':'
+                return self.url_result(default_search + url)
+
+        url, smuggled_data = unsmuggle_url(url)
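+        # Other extractors can smuggle state into the URL fragment, e.g.
+        # (illustrative): '...#__youtubedl_smuggle=%7B...%7D' unsmuggles into
+        # the clean URL plus a dict like {'force_videoid': ...} or
+        # {'to_generic': True}.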
+        force_videoid = None
+        is_intentional = smuggled_data and smuggled_data.get('to_generic')
+        if smuggled_data and 'force_videoid' in smuggled_data:
+            force_videoid = smuggled_data['force_videoid']
+            video_id = force_videoid
+        else:
+            video_id = self._generic_id(url)
+
+        self.to_screen('%s: Requesting header' % video_id)
+
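+        # Try a cheap HEAD request first: it is enough to follow redirects and
+        # to recognize direct media links without downloading the resource.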
+        head_req = HEADRequest(url)
+        head_response = self._request_webpage(
+            head_req, video_id,
+            note=False, errnote='Could not send HEAD request to %s' % url,
+            fatal=False)
+
+        if head_response is not False:
+            # Check for redirect
+            new_url = head_response.geturl()
+            if url != new_url:
+                self.report_following_redirect(new_url)
+                if force_videoid:
+                    new_url = smuggle_url(
+                        new_url, {'force_videoid': force_videoid})
+                return self.url_result(new_url)
+
+        full_response = None
+        if head_response is False:
+            request = sanitized_Request(url)
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
+            head_response = full_response
+
+        info_dict = {
+            'id': video_id,
+            'title': self._generic_title(url),
+            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+        }
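+        # upload_date is seeded from the Last-Modified response header when the
+        # server provides one.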
+
+        # Check for direct link to a video
+        content_type = head_response.headers.get('Content-Type', '').lower()
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
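+        # Illustrative matches: 'video/mp4' is a direct link; 'audio/mpegurl'
+        # and 'application/vnd.apple.mpegurl' yield a format_id ending in
+        # 'mpegurl' and are treated as HLS below; 'application/ogg' is direct.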
+        if m:
+            format_id = compat_str(m.group('format_id'))
+            if format_id.endswith('mpegurl'):
+                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            elif format_id == 'f4m':
+                formats = self._extract_f4m_formats(url, video_id)
+            else:
+                formats = [{
+                    'format_id': format_id,
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }]
+                info_dict['direct'] = True
+            self._sort_formats(formats)
+            info_dict['formats'] = formats
+            return info_dict
+
+        if not self._downloader.params.get('test', False) and not is_intentional:
+            force = self._downloader.params.get('force_generic_extractor', False)
+            self._downloader.report_warning(
+                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
+
+        if not full_response:
+            request = sanitized_Request(url)
+            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac),
+            # making it impossible to download only a chunk of the file (yet we need only 512kB to
+            # test whether it's HTML or not). With youtube-dlc's default Accept-Encoding header
+            # this would always result in downloading the whole file, which is not desirable.
+            # Therefore, for the extraction pass we override Accept-Encoding to '*' in order
+            # to accept raw bytes and be able to download only a chunk.
+            # It would probably be better to solve this by checking Content-Type for
+            # application/octet-stream after the HEAD request finishes, but it's unclear
+            # whether we can rely on that.
+            request.add_header('Accept-Encoding', '*')
+            full_response = self._request_webpage(request, video_id)
+
+        first_bytes = full_response.read(512)
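+        # 512 bytes are enough to distinguish HTML from playlists and raw
+        # media; _webpage_read_content() below reuses them as a prefix.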
+
+        # Is it an M3U playlist?
+        if first_bytes.startswith(b'#EXTM3U'):
+            info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
+            self._sort_formats(info_dict['formats'])
+            return info_dict
+
+        # Maybe it's a direct link to a video?
+        # Be careful not to download the whole thing!
+        if not is_html(first_bytes):
+            self._downloader.report_warning(
+                'URL could be a direct video link, returning it as such.')
+            info_dict.update({
+                'direct': True,
+                'url': url,
+            })
+            return info_dict
+
+        webpage = self._webpage_read_content(
+            full_response, url, video_id, prefix=first_bytes)
+
+        self.report_extraction(video_id)
+
+        # Is it an RSS feed, a SMIL file, an XSPF playlist or an MPD manifest?
+        try:
+            doc = compat_etree_fromstring(webpage.encode('utf-8'))
+            if doc.tag == 'rss':
+                return self._extract_rss(url, video_id, doc)
+            elif doc.tag == 'SmoothStreamingMedia':
+                info_dict['formats'] = self._parse_ism_formats(doc, url)
+                self._sort_formats(info_dict['formats'])
+                return info_dict
+            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+                smil = self._parse_smil(doc, url, video_id)
+                self._sort_formats(smil['formats'])
+                return smil
+            elif doc.tag == '{http://xspf.org/ns/0/}playlist':
+                return self.playlist_result(
+                    self._parse_xspf(
+                        doc, video_id, xspf_url=url,
+                        xspf_base_url=full_response.geturl()),
+                    video_id)
+            elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
+                info_dict['formats'] = self._parse_mpd_formats(
+                    doc,
+                    mpd_base_url=full_response.geturl().rpartition('/')[0],
+                    mpd_url=url)
+                self._sort_formats(info_dict['formats'])
+                return info_dict
+            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+                self._sort_formats(info_dict['formats'])
+                return info_dict
+        except compat_xml_parse_error:
+            pass
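+        # (webpage is not a parsable XML document; fall through to HTML-based
+        # detection below)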
+
+        # Is it a Camtasia project?
+        camtasia_res = self._extract_camtasia(url, video_id, webpage)
+        if camtasia_res is not None:
+            return camtasia_res
+
+        # Sometimes an embedded video player is hidden behind percent encoding
+        # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448).
+        # Unescaping the whole page lets us handle those cases in a generic way.
+        webpage = compat_urllib_parse_unquote(webpage)
+
+        # Unescape squarespace embeds to be detected by generic extractor,
+        # see https://github.com/ytdl-org/youtube-dl/issues/21294
+        webpage = re.sub(
+            r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',
+            lambda x: unescapeHTML(x.group(0)), webpage)
+
+        # it's tempting to parse this further, but you would
+        # have to take into account all the variations like
+        #   Video Title - Site Name
+        #   Site Name | Video Title
+        #   Video Title - Tagline | Site Name
+        # and so on and so forth; it's just not practical
+        video_title = self._og_search_title(
+            webpage, default=None) or self._html_search_regex(
+            r'(?s)<title>(.*?)</title>', webpage, 'video title',
+            default='video')
+
+        # Try to detect age limit automatically
+        age_limit = self._rta_search(webpage)
+        # And then there are the jokers who advertise that they use RTA,
+        # but actually don't.
+        AGE_LIMIT_MARKERS = [
+            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+        ]
+        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
+            age_limit = 18
+
+        # video uploader is domain name
+        video_uploader = self._search_regex(
+            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+
+        video_description = self._og_search_description(webpage, default=None)
+        video_thumbnail = self._og_search_thumbnail(webpage, default=None)
+
+        info_dict.update({
+            'title': video_title,
+            'description': video_description,
+            'thumbnail': video_thumbnail,
+            'age_limit': age_limit,
+        })
+
+        # Look for Brightcove Legacy Studio embeds
+        bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
+        if bc_urls:
+            entries = [{
+                '_type': 'url',
+                'url': smuggle_url(bc_url, {'Referer': url}),
+                'ie_key': 'BrightcoveLegacy'
+            } for bc_url in bc_urls]
+
+            return {
+                '_type': 'playlist',
+                'title': video_title,
+                'id': video_id,
+                'entries': entries,
+            }
+
+        # Look for Brightcove New Studio embeds
+        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
+        if bc_urls:
+            return self.playlist_from_matches(
+                bc_urls, video_id, video_title,
+                getter=lambda x: smuggle_url(x, {'referrer': url}),
+                ie='BrightcoveNew')
+
+        # Look for Nexx embeds
+        nexx_urls = NexxIE._extract_urls(webpage)
+        if nexx_urls:
+            return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
+
+        # Look for Nexx iFrame embeds
+        nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
+        if nexx_embed_urls:
+            return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
+
+        # Look for ThePlatform embeds
+        tp_urls = ThePlatformIE._extract_urls(webpage)
+        if tp_urls:
+            return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
+
+        # Look for embedded rtl.nl player
+        matches = re.findall(
+            r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
+            webpage)
+        if matches:
+            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
+
+        vimeo_urls = VimeoIE._extract_urls(url, webpage)
+        if vimeo_urls:
+            return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
+
+        vid_me_embed_url = self._search_regex(
+            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
+            webpage, 'vid.me embed', default=None)
+        if vid_me_embed_url is not None:
+            return self.url_result(vid_me_embed_url, 'Vidme')
+
+        # Look for YouTube embeds
+        youtube_urls = YoutubeIE._extract_urls(webpage)
+        if youtube_urls:
+            return self.playlist_from_matches(
+                youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
+
+        matches = DailymotionIE._extract_urls(webpage)
+        if matches:
+            return self.playlist_from_matches(matches, video_id, video_title)
+
+        # Look for embedded Dailymotion playlist player (#3822)
+        m = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
+        if m:
+            playlists = re.findall(
+                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
+            if playlists:
+                return self.playlist_from_matches(
+                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
+
+        # Look for DailyMail embeds
+        dailymail_urls = DailyMailIE._extract_urls(webpage)
+        if dailymail_urls:
+            return self.playlist_from_matches(
+                dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
+
+        # Look for Teachable embeds, must be before Wistia
+        teachable_url = TeachableIE._extract_url(webpage, url)
+        if teachable_url:
+            return self.url_result(teachable_url)
+
+        # Look for embedded Wistia player
+        wistia_urls = WistiaIE._extract_urls(webpage)
+        if wistia_urls:
+            playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
+            for entry in playlist['entries']:
+                entry.update({
+                    '_type': 'url_transparent',
+                    'uploader': video_uploader,
+                })
+            return playlist
+
+        # Look for SVT player
+        svt_url = SVTIE._extract_url(webpage)
+        if svt_url:
+            return self.url_result(svt_url, 'SVT')
+
+        # Look for Bandcamp pages with custom domain
+        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
+        if mobj is not None:
+            burl = unescapeHTML(mobj.group(1))
+            # Don't set the extractor because it can be a track URL or an album
+            return self.url_result(burl)
+
+        # Look for embedded Vevo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for embedded Viddler player
+        mobj = re.search(
+            r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for NYTimes player
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for Libsyn player
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for Ooyala videos
+        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage)
+                or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage)
+                or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage)
+                or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage)
+                or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
+        if mobj is not None:
+            embed_token = self._search_regex(
+                r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)',
+                webpage, 'ooyala embed token', default=None)
+            return OoyalaIE._build_url_result(smuggle_url(
+                mobj.group('ec'), {
+                    'domain': url,
+                    'embed_token': embed_token,
+                }))
+
+        # Look for multiple Ooyala embeds on SBN network websites
+        mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
+        if mobj is not None:
+            embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
+            if embeds:
+                return self.playlist_from_matches(
+                    embeds, video_id, video_title,
+                    getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')
+
+        # Look for Aparat videos
+        mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group(1), 'Aparat')
+
+        # Look for MPORA videos
+        mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group(1), 'Mpora')
+
+        # Look for embedded Facebook player
+        facebook_urls = FacebookIE._extract_urls(webpage)
+        if facebook_urls:
+            return self.playlist_from_matches(facebook_urls, video_id, video_title)
+
+        # Look for embedded VK player
+        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'VK')
+
+        # Look for embedded Odnoklassniki player
+        odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage)
+        if odnoklassniki_url:
+            return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
+
+        # Look for embedded ivi player
+        mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Ivi')
+
+        # Look for embedded Huffington Post player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'HuffPost')
+
+        # Look for embed.ly
+        mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+        mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
+        if mobj is not None:
+            return self.url_result(compat_urllib_parse_unquote(mobj.group('url')))
+
+        # Look for funnyordie embed
+        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
+        if matches:
+            return self.playlist_from_matches(
+                matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
+
+        # Look for BBC iPlayer embed
+        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
+        if matches:
+            return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')
+
+        # Look for embedded RUTV player
+        rutv_url = RUTVIE._extract_url(webpage)
+        if rutv_url:
+            return self.url_result(rutv_url, 'RUTV')
+
+        # Look for embedded TVC player
+        tvc_url = TVCIE._extract_url(webpage)
+        if tvc_url:
+            return self.url_result(tvc_url, 'TVC')
+
+        # Look for embedded SportBox player
+        sportbox_urls = SportBoxIE._extract_urls(webpage)
+        if sportbox_urls:
+            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
+
+        # Look for embedded XHamster player
+        xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
+        if xhamster_urls:
+            return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')
+
+        # Look for embedded TNAFlixNetwork player
+        tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
+        if tnaflix_urls:
+            return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())
+
+        # Look for embedded PornHub player
+        pornhub_urls = PornHubIE._extract_urls(webpage)
+        if pornhub_urls:
+            return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())
+
+        # Look for embedded DrTuber player
+        drtuber_urls = DrTuberIE._extract_urls(webpage)
+        if drtuber_urls:
+            return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())
+
+        # Look for embedded RedTube player
+        redtube_urls = RedTubeIE._extract_urls(webpage)
+        if redtube_urls:
+            return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())
+
+        # Look for embedded Tube8 player
+        tube8_urls = Tube8IE._extract_urls(webpage)
+        if tube8_urls:
+            return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key())
+
+        # Look for embedded Mofosex player
+        mofosex_urls = MofosexEmbedIE._extract_urls(webpage)
+        if mofosex_urls:
+            return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key())
+
+        # Look for embedded Spankwire player
+        spankwire_urls = SpankwireIE._extract_urls(webpage)
+        if spankwire_urls:
+            return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key())
+
+        # Look for embedded YouPorn player
+        youporn_urls = YouPornIE._extract_urls(webpage)
+        if youporn_urls:
+            return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key())
+
+        # Look for embedded Tvigle player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Tvigle')
+
+        # Look for embedded TED player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'TED')
+
+        # Look for embedded Ustream videos
+        ustream_url = UstreamIE._extract_url(webpage)
+        if ustream_url:
+            return self.url_result(ustream_url, UstreamIE.ie_key())
+
+        # Look for embedded arte.tv player
+        mobj = re.search(
+            r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+
+        # Look for embedded francetv player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for embedded smotri.com player
+        smotri_url = SmotriIE._extract_url(webpage)
+        if smotri_url:
+            return self.url_result(smotri_url, 'Smotri')
+
+        # Look for embedded Myvi.ru player
+        myvi_url = MyviIE._extract_url(webpage)
+        if myvi_url:
+            return self.url_result(myvi_url)
+
+        # Look for embedded soundcloud player
+        soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage)
+        if soundcloud_urls:
+            return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML)
+
+        # Look for tunein player
+        tunein_urls = TuneInBaseIE._extract_urls(webpage)
+        if tunein_urls:
+            return self.playlist_from_matches(tunein_urls, video_id, video_title)
+
+        # Look for embedded mtvservices player
+        mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
+        if mtvservices_url:
+            return self.url_result(mtvservices_url, ie='MTVServicesEmbedded')
+
+        # Look for embedded yahoo player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Yahoo')
+
+        # Look for embedded sbs.com.au player
+        mobj = re.search(
+            r'''(?x)
+            (?:
+                <meta\s+property="og:video"\s+content=|
+                <iframe[^>]+?src=
+            )
+            (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'SBS')
+
+        # Look for embedded Cinchcast player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Cinchcast')
+
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+            webpage)
+        if not mobj:
+            mobj = re.search(
+                r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
+                webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'MLB')
+
+        mobj = re.search(
+            r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
+            webpage)
+        if mobj is not None:
+            return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
+
+        mobj = re.search(
+            r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Livestream')
+
+        # Look for Zapiks embed
+        mobj = re.search(
+            r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Zapiks')
+
+        # Look for Kaltura embeds
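+        # The embedding page URL is smuggled as 'source_url' so the Kaltura
+        # extractor can use it as referrer where required.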
+        kaltura_urls = KalturaIE._extract_urls(webpage)
+        if kaltura_urls:
+            return self.playlist_from_matches(
+                kaltura_urls, video_id, video_title,
+                getter=lambda x: smuggle_url(x, {'source_url': url}),
+                ie=KalturaIE.ie_key())
+
+        # Look for EaglePlatform embeds
+        eagleplatform_url = EaglePlatformIE._extract_url(webpage)
+        if eagleplatform_url:
+            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
+
+        # Look for ClipYou (uses EaglePlatform) embeds
+        mobj = re.search(
+            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
+        if mobj is not None:
+            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
+
+        # Look for Pladform embeds
+        pladform_url = PladformIE._extract_url(webpage)
+        if pladform_url:
+            return self.url_result(pladform_url)
+
+        # Look for Videomore embeds
+        videomore_url = VideomoreIE._extract_url(webpage)
+        if videomore_url:
+            return self.url_result(videomore_url)
+
+        # Look for Webcaster embeds
+        webcaster_url = WebcasterFeedIE._extract_url(self, webpage)
+        if webcaster_url:
+            return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key())
+
+        # Look for Playwire embeds
+        mobj = re.search(
+            r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for 5min embeds
+        mobj = re.search(
+            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
+        if mobj is not None:
+            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
+
+        # Look for Crooks and Liars embeds
+        mobj = re.search(
+            r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
+        # Look for NBC Sports VPlayer embeds
+        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+        if nbc_sports_url:
+            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
+        # Look for NBC News embeds
+        nbc_news_embed_url = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage)
+        if nbc_news_embed_url:
+            return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews')
+
+        # Look for Google Drive embeds
+        google_drive_url = GoogleDriveIE._extract_url(webpage)
+        if google_drive_url:
+            return self.url_result(google_drive_url, 'GoogleDrive')
+
+        # Look for UDN embeds
+        mobj = re.search(
+            r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
+        if mobj is not None:
+            return self.url_result(
+                compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
+
+        # Look for Senate ISVP iframe
+        senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
+        if senate_isvp_url:
+            return self.url_result(senate_isvp_url, 'SenateISVP')
+
+        # Look for Kinja embeds
+        kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url)
+        if kinja_embed_urls:
+            return self.playlist_from_matches(
+                kinja_embed_urls, video_id, video_title)
+
+        # Look for OnionStudios embeds
+        onionstudios_url = OnionStudiosIE._extract_url(webpage)
+        if onionstudios_url:
+            return self.url_result(onionstudios_url)
+
+        # Look for ViewLift embeds
+        viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
+        if viewlift_url:
+            return self.url_result(viewlift_url)
+
+        # Look for JWPlatform embeds
+        jwplatform_urls = JWPlatformIE._extract_urls(webpage)
+        if jwplatform_urls:
+            return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key())
+
+        # Look for Digiteka embeds
+        digiteka_url = DigitekaIE._extract_url(webpage)
+        if digiteka_url:
+            return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key())
+
+        # Look for Arkena embeds
+        arkena_url = ArkenaIE._extract_url(webpage)
+        if arkena_url:
+            return self.url_result(arkena_url, ArkenaIE.ie_key())
+
+        # Look for Piksel embeds
+        piksel_url = PikselIE._extract_url(webpage)
+        if piksel_url:
+            return self.url_result(piksel_url, PikselIE.ie_key())
+
+        # Look for Limelight embeds
+        limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
+        if limelight_urls:
+            return self.playlist_result(
+                limelight_urls, video_id, video_title, video_description)
+
+        # Look for Anvato embeds
+        anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+        if anvato_urls:
+            return self.playlist_result(
+                anvato_urls, video_id, video_title, video_description)
+
+        # Look for AdobeTVVideo embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))),
+                'AdobeTVVideo')
+
+        # Look for Vine embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
+
+        # Look for VODPlatform embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform')
+
+        # Look for Mangomolo embeds
+        mobj = re.search(
+            r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//
+                (?:
+                    admin\.mangomolo\.com/analytics/index\.php/customers/embed|
+                    player\.mangomolo\.com/v1
+                )/
+                (?:
+                    video\?.*?\bid=(?P<video_id>\d+)|
+                    (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)
+                ).+?)\1''', webpage)
+        if mobj is not None:
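+            # url_transparent delegates extraction to the Mangomolo extractors
+            # while forcing the metadata scraped from this page onto the result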
+            info = {
+                '_type': 'url_transparent',
+                'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))),
+                'title': video_title,
+                'description': video_description,
+                'thumbnail': video_thumbnail,
+                'uploader': video_uploader,
+            }
+            video_id = mobj.group('video_id')
+            if video_id:
+                info.update({
+                    'ie_key': 'MangomoloVideo',
+                    'id': video_id,
+                })
+            else:
+                info.update({
+                    'ie_key': 'MangomoloLive',
+                    'id': mobj.group('channel_id'),
+                })
+            return info
+
+        # Look for Instagram embeds
+        instagram_embed_url = InstagramIE._extract_embed_url(webpage)
+        if instagram_embed_url is not None:
+            return self.url_result(
+                self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
+
+        # Look for LiveLeak embeds
+        liveleak_urls = LiveLeakIE._extract_urls(webpage)
+        if liveleak_urls:
+            return self.playlist_from_matches(liveleak_urls, video_id, video_title)
+
+        # Look for 3Q SDN embeds
+        threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
+        if threeqsdn_url:
+            return {
+                '_type': 'url_transparent',
+                'ie_key': ThreeQSDNIE.ie_key(),
+                'url': self._proto_relative_url(threeqsdn_url),
+                'title': video_title,
+                'description': video_description,
+                'thumbnail': video_thumbnail,
+                'uploader': video_uploader,
+            }
+
+        # Look for VBOX7 embeds
+        vbox7_url = Vbox7IE._extract_url(webpage)
+        if vbox7_url:
+            return self.url_result(vbox7_url, Vbox7IE.ie_key())
+
+        # Look for DBTV embeds
+        dbtv_urls = DBTVIE._extract_urls(webpage)
+        if dbtv_urls:
+            return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())
+
+        # Look for Videa embeds
+        videa_urls = VideaIE._extract_urls(webpage)
+        if videa_urls:
+            return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())
+
+        # Look for 20 minuten embeds
+        twentymin_urls = TwentyMinutenIE._extract_urls(webpage)
+        if twentymin_urls:
+            return self.playlist_from_matches(
+                twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
+
+        # Look for VideoPress embeds
+        videopress_urls = VideoPressIE._extract_urls(webpage)
+        if videopress_urls:
+            return self.playlist_from_matches(
+                videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key())
+
+        # Look for Rutube embeds
+        rutube_urls = RutubeIE._extract_urls(webpage)
+        if rutube_urls:
+            return self.playlist_from_matches(
+                rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
+
+        # Look for WashingtonPost embeds
+        wapo_urls = WashingtonPostIE._extract_urls(webpage)
+        if wapo_urls:
+            return self.playlist_from_matches(
+                wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+        # Look for Mediaset embeds
+        mediaset_urls = MediasetIE._extract_urls(self, webpage)
+        if mediaset_urls:
+            return self.playlist_from_matches(
+                mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
+        # Look for JOJ.sk embeds
+        joj_urls = JojIE._extract_urls(webpage)
+        if joj_urls:
+            return self.playlist_from_matches(
+                joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
+        # Look for megaphone.fm embeds
+        mpfn_urls = MegaphoneIE._extract_urls(webpage)
+        if mpfn_urls:
+            return self.playlist_from_matches(
+                mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+        # Look for vzaar embeds
+        vzaar_urls = VzaarIE._extract_urls(webpage)
+        if vzaar_urls:
+            return self.playlist_from_matches(
+                vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
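+        # Look for Channel 9 embeds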
+        channel9_urls = Channel9IE._extract_urls(webpage)
+        if channel9_urls:
+            return self.playlist_from_matches(
+                channel9_urls, video_id, video_title, ie=Channel9IE.ie_key())
+
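+        # Look for VShare embeds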
+        vshare_urls = VShareIE._extract_urls(webpage)
+        if vshare_urls:
+            return self.playlist_from_matches(
+                vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
+
+        # Look for Mediasite embeds
+        mediasite_urls = MediasiteIE._extract_urls(webpage)
+        if mediasite_urls:
+            entries = [
+                self.url_result(smuggle_url(
+                    compat_urlparse.urljoin(url, mediasite_url),
+                    {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
+                for mediasite_url in mediasite_urls]
+            return self.playlist_result(entries, video_id, video_title)
+
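+        # Look for Springboard Platform embeds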
+        springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage)
+        if springboardplatform_urls:
+            return self.playlist_from_matches(
+                springboardplatform_urls, video_id, video_title,
+                ie=SpringboardPlatformIE.ie_key())
+
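+        # Look for YapFiles embeds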
+        yapfiles_urls = YapFilesIE._extract_urls(webpage)
+        if yapfiles_urls:
+            return self.playlist_from_matches(
+                yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key())
+
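+        # Look for Vice embeds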
+        vice_urls = ViceIE._extract_urls(webpage)
+        if vice_urls:
+            return self.playlist_from_matches(
+                vice_urls, video_id, video_title, ie=ViceIE.ie_key())
+
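+        # Look for XFileShare embeds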
+        xfileshare_urls = XFileShareIE._extract_urls(webpage)
+        if xfileshare_urls:
+            return self.playlist_from_matches(
+                xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key())
+
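+        # Look for Cloudflare Stream embeds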
+        cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage)
+        if cloudflarestream_urls:
+            return self.playlist_from_matches(
+                cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
+
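+        # Look for PeerTube embeds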
+        peertube_urls = PeerTubeIE._extract_urls(webpage, url)
+        if peertube_urls:
+            return self.playlist_from_matches(
+                peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
+
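+        # Look for Indavideo embeds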
+        indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
+        if indavideo_urls:
+            return self.playlist_from_matches(
+                indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key())
+
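+        # Look for APA embeds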
+        apa_urls = APAIE._extract_urls(webpage)
+        if apa_urls:
+            return self.playlist_from_matches(
+                apa_urls, video_id, video_title, ie=APAIE.ie_key())
+
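+        # Look for Fox News embeds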
+        foxnews_urls = FoxNewsIE._extract_urls(webpage)
+        if foxnews_urls:
+            return self.playlist_from_matches(
+                foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key())
+
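+        # Look for share-videos.se embeds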
+        sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer(
+            r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
+            webpage)]
+        if sharevideos_urls:
+            return self.playlist_from_matches(
+                sharevideos_urls, video_id, video_title)
+
+        viqeo_urls = ViqeoIE._extract_urls(webpage)
+        if viqeo_urls:
+            return self.playlist_from_matches(
+                viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key())
+
+        expressen_urls = ExpressenIE._extract_urls(webpage)
+        if expressen_urls:
+            return self.playlist_from_matches(
+                expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key())
+
+        zype_urls = ZypeIE._extract_urls(webpage)
+        if zype_urls:
+            return self.playlist_from_matches(
+                zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
+
+        # Look for HTML5 media
+        entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
+        if entries:
+            if len(entries) == 1:
+                entries[0].update({
+                    'id': video_id,
+                    'title': video_title,
+                })
+            else:
+                for num, entry in enumerate(entries, start=1):
+                    entry.update({
+                        'id': '%s-%s' % (video_id, num),
+                        'title': '%s (%d)' % (video_title, num),
+                    })
+            for entry in entries:
+                self._sort_formats(entry['formats'])
+            return self.playlist_result(entries, video_id, video_title)
+
+        jwplayer_data = self._find_jwplayer_data(
+            webpage, video_id, transform_source=js_to_json)
+        if jwplayer_data:
+            try:
+                info = self._parse_jwplayer_data(
+                    jwplayer_data, video_id, require_title=False, base_url=url)
+                return merge_dicts(info, info_dict)
+            except ExtractorError:
+                # See https://github.com/ytdl-org/youtube-dl/pull/16735
+                pass
+
+        # Video.js embed
+        mobj = re.search(
+            r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+            webpage)
+        if mobj is not None:
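+            # the argument of .src() is a JavaScript literal: either a single
+            # source object or an array of them; js_to_json makes it parseable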
+            sources = self._parse_json(
+                mobj.group(1), video_id, transform_source=js_to_json,
+                fatal=False) or []
+            if not isinstance(sources, list):
+                sources = [sources]
+            formats = []
+            for source in sources:
+                src = source.get('src')
+                if not src or not isinstance(src, compat_str):
+                    continue
+                src = compat_urlparse.urljoin(url, src)
+                src_type = source.get('type')
+                if isinstance(src_type, compat_str):
+                    src_type = src_type.lower()
+                ext = determine_ext(src).lower()
+                if src_type == 'video/youtube':
+                    return self.url_result(src, YoutubeIE.ie_key())
+                if src_type == 'application/dash+xml' or ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        src, video_id, mpd_id='dash', fatal=False))
+                elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        src, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                else:
+                    formats.append({
+                        'url': src,
+                        'ext': (mimetype2ext(src_type)
+                                or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
+                    })
+            if formats:
+                self._sort_formats(formats)
+                info_dict['formats'] = formats
+                return info_dict
+
+        # Looking for http://schema.org/VideoObject
+        json_ld = self._search_json_ld(
+            webpage, video_id, default={}, expected_type='VideoObject')
+        if json_ld.get('url'):
+            return merge_dicts(json_ld, info_dict)
+
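+        # Heuristic for the brute-force regex searches below: accept YouTube
+        # and RTMP URLs outright, otherwise require a path whose extension is
+        # not a known image/subtitle/script type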
+        def check_video(vurl):
+            if YoutubeIE.suitable(vurl):
+                return True
+            if RtmpIE.suitable(vurl):
+                return True
+            vpath = compat_urlparse.urlparse(vurl).path
+            vext = determine_ext(vpath)
+            return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
+
+        def filter_video(urls):
+            return list(filter(check_video, urls))
+
+        # Start with something easy: JW Player in SWFObject
+        found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
+        if not found:
+            # Look for gorilla-vid style embedding
+            found = filter_video(re.findall(r'''(?sx)
+                (?:
+                    jw_plugins|
+                    JWPlayerOptions|
+                    jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
+                )
+                .*?
+                ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
+        if not found:
+            # Broaden the search a little bit
+            found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
+        if not found:
+            # Broaden the findall a little bit: JWPlayer JS loader
+            found = filter_video(re.findall(
+                r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
+        if not found:
+            # Flow player
+            found = filter_video(re.findall(r'''(?xs)
+                flowplayer\("[^"]+",\s*
+                    \{[^}]+?\}\s*,
+                    \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
+                        ["']?url["']?\s*:\s*["']([^"']+)["']
+            ''', webpage))
+        if not found:
+            # Cinerama player
+            found = re.findall(
+                r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
+        if not found:
+            # Try to find twitter cards info
+            # twitter:player:stream should be checked before twitter:player since
+            # it is expected to contain a raw stream (see
+            # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
+            found = filter_video(re.findall(
+                r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
+        if not found:
+            # We look for Open Graph info:
+            # We have to match any number of spaces between elements, as some sites try to align them (e.g. statigr.am)
+            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+            # We only look in og:video if the MIME type is a video; don't try if it's a Flash player
+            # (re.findall never returns None, so test the list for emptiness):
+            if m_video_type:
+                found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
+        if not found:
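+            # matches meta refresh values such as "0; url=http://..." with
+            # optional quoting around the URL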
+            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
+            found = re.search(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
+                webpage)
+            if not found:
+                # Look also in Refresh HTTP header
+                refresh_header = head_response.headers.get('Refresh')
+                if refresh_header:
+                    # In Python 2, response HTTP headers are bytestrings
+                    if sys.version_info < (3, 0) and isinstance(refresh_header, str):
+                        refresh_header = refresh_header.decode('iso-8859-1')
+                    found = re.search(REDIRECT_REGEX, refresh_header)
+            if found:
+                new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1)))
+                if new_url != url:
+                    self.report_following_redirect(new_url)
+                    return {
+                        '_type': 'url',
+                        'url': new_url,
+                    }
+                else:
+                    found = None
+
+        if not found:
+            # twitter:player is an https URL to an iframe player that may or may not
+            # be supported by youtube-dlc, so it is checked last (see
+            # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
+            embed_url = self._html_search_meta('twitter:player', webpage, default=None)
+            if embed_url and embed_url != url:
+                return self.url_result(embed_url)
+
+        if not found:
+            raise UnsupportedError(url)
+
+        entries = []
+        for video_url in orderedSet(found):
+            video_url = unescapeHTML(video_url)
+            video_url = video_url.replace('\\/', '/')
+            video_url = compat_urlparse.urljoin(url, video_url)
+            video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
+
+            # Sometimes, jwplayer extraction will result in a YouTube URL
+            if YoutubeIE.suitable(video_url):
+                entries.append(self.url_result(video_url, 'Youtube'))
+                continue
+
+            # strip the extension from the basename so it can serve as the video id:
+            video_id = os.path.splitext(video_id)[0]
+
+            entry_info_dict = {
+                'id': video_id,
+                'uploader': video_uploader,
+                'title': video_title,
+                'age_limit': age_limit,
+            }
+
+            if RtmpIE.suitable(video_url):
+                entry_info_dict.update({
+                    '_type': 'url_transparent',
+                    'ie_key': RtmpIE.ie_key(),
+                    'url': video_url,
+                })
+                entries.append(entry_info_dict)
+                continue
+
+            ext = determine_ext(video_url)
+            if ext == 'smil':
+                entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)
+            elif ext == 'xspf':
+                return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id)
+            elif ext == 'm3u8':
+                entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+            elif ext == 'mpd':
+                entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+            elif ext == 'f4m':
+                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
+            elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
+                # Just matching .ism/manifest is not enough to be reliably sure
+                # whether it's actually an ISM manifest or some other streaming
+                # manifest since there are various streaming URL formats
+                # possible (see [1]) as well as some other shenanigans like
+                # .smil/manifest URLs that actually serve an ISM (see [2]) and
+                # so on.
+                # Thus the most reasonable way to solve this is to delegate
+                # to generic extractor in order to look into the contents of
+                # the manifest itself.
+                # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats
+                # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest
+                entry_info_dict = self.url_result(
+                    smuggle_url(video_url, {'to_generic': True}),
+                    GenericIE.ie_key())
+            else:
+                entry_info_dict['url'] = video_url
+
+            if entry_info_dict.get('formats'):
+                self._sort_formats(entry_info_dict['formats'])
+
+            entries.append(entry_info_dict)
+
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            for num, e in enumerate(entries, start=1):
+                # 'url' results don't have a title
+                if e.get('title') is not None:
+                    e['title'] = '%s (%d)' % (e['title'], num)
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+            }
diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py
new file mode 100644
index 0000000..18a30fe
--- /dev/null
+++ b/youtube_dl/extractor/gfycat.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    qualities,
+    ExtractorError,
+)
+
+
+class GfycatIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\.]+)'
+    _TESTS = [{
+        'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher',
+        'info_dict': {
+            'id': 'DeadlyDecisiveGermanpinscher',
+            'ext': 'mp4',
+            'title': 'Ghost in the Shell',
+            'timestamp': 1410656006,
+            'upload_date': '20140914',
+            'uploader': 'anonymous',
+            'duration': 10.4,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa',
+        'info_dict': {
+            'id': 'JauntyTimelyAmazontreeboa',
+            'ext': 'mp4',
+            'title': 'JauntyTimelyAmazontreeboa',
+            'timestamp': 1411720126,
+            'upload_date': '20140926',
+            'uploader': 'anonymous',
+            'duration': 3.52,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'categories': list,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish',
+        'only_matching': True
+    }, {
+        'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull',
+        'only_matching': True
+    }, {
+        'url': 'https://gfycat.com/acceptablehappygoluckyharborporpoise-baseball',
+        'only_matching': True
+    }, {
+        'url': 'https://thumbs.gfycat.com/acceptablehappygoluckyharborporpoise-size_restricted.gif',
+        'only_matching': True
+    }, {
+        'url': 'https://giant.gfycat.com/acceptablehappygoluckyharborporpoise.mp4',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        gfy = self._download_json(
+            'https://api.gfycat.com/v1/gfycats/%s' % video_id,
+            video_id, 'Downloading video info')
+        if 'error' in gfy:
+            raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
+        gfy = gfy['gfyItem']
+
+        title = gfy.get('title') or gfy['gfyName']
+        description = gfy.get('description')
+        timestamp = int_or_none(gfy.get('createDate'))
+        uploader = gfy.get('userName')
+        view_count = int_or_none(gfy.get('views'))
+        like_count = int_or_none(gfy.get('likes'))
+        dislike_count = int_or_none(gfy.get('dislikes'))
+        age_limit = 18 if gfy.get('nsfw') == '1' else 0
+
+        width = int_or_none(gfy.get('width'))
+        height = int_or_none(gfy.get('height'))
+        fps = int_or_none(gfy.get('frameRate'))
+        num_frames = int_or_none(gfy.get('numFrames'))
+
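+        # float_or_none(value, scale) divides value by scale, i.e.
+        # duration = numFrames / frameRate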
+        duration = float_or_none(num_frames, fps) if num_frames and fps else None
+
+        categories = gfy.get('tags') or gfy.get('extraLemmas') or []
+
+        FORMATS = ('gif', 'webm', 'mp4')
+        quality = qualities(FORMATS)
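+        # qualities() ranks a format_id by its position in FORMATS, so mp4 is
+        # preferred over webm, and webm over gif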
+
+        formats = []
+        for format_id in FORMATS:
+            video_url = gfy.get('%sUrl' % format_id)
+            if not video_url:
+                continue
+            filesize = int_or_none(gfy.get('%sSize' % format_id))
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': width,
+                'height': height,
+                'fps': fps,
+                'filesize': filesize,
+                'quality': quality(format_id),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'categories': categories,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py
new file mode 100644
index 0000000..c647795
--- /dev/null
+++ b/youtube_dl/extractor/giantbomb.py
@@ -0,0 +1,90 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    qualities,
+    unescapeHTML,
+)
+
+
+class GiantBombIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/(?:videos|shows)/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
+    _TESTS = [{
+        'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
+        'md5': '132f5a803e7e0ab0e274d84bda1e77ae',
+        'info_dict': {
+            'id': '2300-9782',
+            'display_id': 'quick-look-destiny-the-dark-below',
+            'ext': 'mp4',
+            'title': 'Quick Look: Destiny: The Dark Below',
+            'description': 'md5:0aa3aaf2772a41b91d44c63f30dfad24',
+            'duration': 2399,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'https://www.giantbomb.com/shows/ben-stranding/2970-20212',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        video = json.loads(unescapeHTML(self._search_regex(
+            r'data-video="([^"]+)"', webpage, 'data-video')))
+
+        duration = int_or_none(video.get('lengthSeconds'))
+
+        quality = qualities([
+            'f4m_low', 'progressive_low', 'f4m_high',
+            'progressive_high', 'f4m_hd', 'progressive_hd'])
+
+        formats = []
+        for format_id, video_url in video['videoStreams'].items():
+            if format_id == 'f4m_stream':
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
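+                # some Akamai HDS servers refuse requests without an hdcore
+                # parameter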
+                f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)
+                if f4m_formats:
+                    f4m_formats[0]['quality'] = quality(format_id)
+                    formats.extend(f4m_formats)
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, display_id, ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': format_id,
+                    'quality': quality(format_id),
+                })
+
+        if not formats:
+            youtube_id = video.get('youtubeID')
+            if youtube_id:
+                return self.url_result(youtube_id, 'Youtube')
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py
new file mode 100644
index 0000000..5a9992a
--- /dev/null
+++ b/youtube_dl/extractor/giga.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    compat_str,
+    parse_duration,
+    parse_iso8601,
+    str_to_int,
+)
+
+
+class GigaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/',
+        'md5': '6bc5535e945e724640664632055a584f',
+        'info_dict': {
+            'id': '2622086',
+            'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss',
+            'ext': 'mp4',
+            'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss',
+            'description': 'md5:afdf5862241aded4718a30dff6a57baf',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 578,
+            'timestamp': 1414749706,
+            'upload_date': '20141031',
+            'uploader': 'Robin Schweiger',
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'],
+            webpage, 'video id')
+
+        playlist = self._download_json(
+            'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/'
+            % video_id, video_id)[0]
+
+        quality = qualities(['normal', 'hd720'])
+
+        formats = []
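+        # the playlist JSON keys the formats by stringified indices ('0', '1',
+        # ...); iterate until the first missing index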
+        for format_id in itertools.count(0):
+            fmt = playlist.get(compat_str(format_id))
+            if not fmt:
+                break
+            formats.append({
+                'url': fmt['src'],
+                'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]),
+                'quality': quality(fmt['quality']),
+            })
+        self._sort_formats(formats)
+
+        title = self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+        description = self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        duration = parse_duration(self._search_regex(
+            r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id),
+            webpage, 'duration', fatal=False))
+
+        timestamp = parse_iso8601(self._search_regex(
+            r'datetime="([^"]+)"', webpage, 'upload date', fatal=False))
+        uploader = self._search_regex(
+            r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False)
+
+        view_count = str_to_int(self._search_regex(
+            r'<span class="views"><strong>([\d.,]+)</strong>',
+            webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/gigya.py b/youtube_dl/extractor/gigya.py
new file mode 100644
index 0000000..4121784
--- /dev/null
+++ b/youtube_dl/extractor/gigya.py
@@ -0,0 +1,22 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    urlencode_postdata,
+)
+
+
+class GigyaBaseIE(InfoExtractor):
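+    # shared helper for extractors whose sites authenticate through Gigya's
+    # accounts.login API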
+    def _gigya_login(self, auth_data):
+        auth_info = self._download_json(
+            'https://accounts.eu1.gigya.com/accounts.login', None,
+            note='Logging in', errnote='Unable to log in',
+            data=urlencode_postdata(auth_data))
+
+        error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage')
+        if error_message:
+            raise ExtractorError(
+                'Unable to login: %s' % error_message, expected=True)
+        return auth_info
diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py
new file mode 100644
index 0000000..d94dfbf
--- /dev/null
+++ b/youtube_dl/extractor/glide.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GlideIE(InfoExtractor):
+    IE_DESC = 'Glide mobile video messages (glide.me)'
+    _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)'
+    _TEST = {
+        'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==',
+        'md5': '4466372687352851af2d131cfaa8a4c7',
+        'info_dict': {
+            'id': 'UZF8zlmuQbe4mr+7dCiQ0w==',
+            'ext': 'mp4',
+            'title': "Damon's Glide message",
+            'thumbnail': r're:^https?://.*?\.cloudfront\.net/.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<title>(.+?)</title>', webpage,
+            'title', default=None) or self._og_search_title(webpage)
+        video_url = self._proto_relative_url(self._search_regex(
+            r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
+            webpage, 'video URL', default=None,
+            group='url')) or self._og_search_video_url(webpage)
+        thumbnail = self._proto_relative_url(self._search_regex(
+            r'<img[^>]+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P<url>.+?)\1',
+            webpage, 'thumbnail url', default=None,
+            group='url')) or self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py
new file mode 100644
index 0000000..60d842d
--- /dev/null
+++ b/youtube_dl/extractor/globo.py
@@ -0,0 +1,240 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import hashlib
+import json
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    orderedSet,
+    str_or_none,
+)
+
+
+class GloboIE(InfoExtractor):
+    _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})'
+    _NETRC_MACHINE = 'globo'
+    _TESTS = [{
+        'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
+        'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
+        'info_dict': {
+            'id': '3607726',
+            'ext': 'mp4',
+            'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
+            'duration': 103.204,
+            'uploader': 'Globo.com',
+            'uploader_id': '265',
+        },
+    }, {
+        'url': 'http://globoplay.globo.com/v/4581987/',
+        'md5': 'f36a1ecd6a50da1577eee6dd17f67eff',
+        'info_dict': {
+            'id': '4581987',
+            'ext': 'mp4',
+            'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
+            'duration': 137.973,
+            'uploader': 'Rede Globo',
+            'uploader_id': '196',
+        },
+    }, {
+        'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
+        'only_matching': True,
+    }, {
+        'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
+        'only_matching': True,
+    }, {
+        'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
+        'only_matching': True,
+    }, {
+        'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
+        'only_matching': True,
+    }, {
+        'url': 'globo:3607726',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        try:
+            glb_id = (self._download_json(
+                'https://login.globo.com/api/authentication', None, data=json.dumps({
+                    'payload': {
+                        'email': email,
+                        'password': password,
+                        'serviceId': 4654,
+                    },
+                }).encode(), headers={
+                    'Content-Type': 'application/json; charset=utf-8',
+                }) or {}).get('glbId')
+            if glb_id:
+                self._set_cookie('.globo.com', 'GLBID', glb_id)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                resp = self._parse_json(e.cause.read(), None)
+                raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True)
+            raise
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://api.globovideos.com/videos/%s/playlist' % video_id,
+            video_id)['videos'][0]
+        if video.get('encrypted') is True:
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        title = video['title']
+
+        formats = []
+        subtitles = {}
+        for resource in video['resources']:
+            resource_id = resource.get('_id')
+            resource_url = resource.get('url')
+            resource_type = resource.get('type')
+            if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'):
+                continue
+
+            if resource_type == 'subtitle':
+                subtitles.setdefault(resource.get('language') or 'por', []).append({
+                    'url': resource_url,
+                })
+                continue
+
+            security = self._download_json(
+                'http://security.video.globo.com/videos/%s/hash' % video_id,
+                video_id, 'Downloading security hash for %s' % resource_id, query={
+                    'player': 'desktop',
+                    'version': '5.19.1',
+                    'resource_id': resource_id,
+                })
+
+            security_hash = security.get('hash')
+            if not security_hash:
+                message = security.get('message')
+                if message:
+                    raise ExtractorError(
+                        '%s returned error: %s' % (self.IE_NAME, message), expected=True)
+                continue
+
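+            # the security hash starts with a two-digit code (only 02/03/04/
+            # 12/13/14 are handled) that determines where its timestamp and
+            # MD5 parts sit; the signed hash re-embeds the timestamp plus
+            # random padding and an urlsafe-base64 MD5 of a salted concatenation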
+            hash_code = security_hash[:2]
+            padding = '%010d' % random.randint(1, 10000000000)
+            if hash_code in ('04', '14'):
+                received_time = security_hash[3:13]
+                received_md5 = security_hash[24:]
+                hash_prefix = security_hash[:23]
+            elif hash_code in ('02', '12', '03', '13'):
+                received_time = security_hash[2:12]
+                received_md5 = security_hash[22:]
+                padding += '1'
+                hash_prefix = '05' + security_hash[:22]
+
+            padded_sign_time = compat_str(int(received_time) + 86400) + padding
+            md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
+            signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
+            signed_hash = hash_prefix + padded_sign_time + signed_md5
+            signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '')
+
+            if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'):
+                formats.extend(self._extract_m3u8_formats(
+                    signed_url, resource_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'):
+                formats.extend(self._extract_mpd_formats(
+                    signed_url, resource_id, mpd_id='dash', fatal=False))
+            elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'):
+                formats.extend(self._extract_ism_formats(
+                    signed_url, resource_id, ism_id='mss', fatal=False))
+            else:
+                formats.append({
+                    'url': signed_url,
+                    'format_id': 'http-%s' % resource_id,
+                    'height': int_or_none(resource.get('height')),
+                })
+
+        self._sort_formats(formats)
+
+        duration = float_or_none(video.get('duration'), 1000)
+        uploader = video.get('channel')
+        uploader_id = str_or_none(video.get('channel_id'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class GloboArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?'
+
+    _VIDEOID_REGEXES = [
+        r'\bdata-video-id=["\'](\d{7,})',
+        r'\bdata-player-videosids=["\'](\d{7,})',
+        r'\bvideosIDs\s*:\s*["\']?(\d{7,})',
+        r'\bdata-id=["\'](\d{7,})',
+        r'<div[^>]+\bid=["\'](\d{7,})',
+    ]
+
+    _TESTS = [{
+        'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
+        'info_dict': {
+            'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes',
+            'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões',
+            'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12',
+        },
+        'playlist_count': 1,
+    }, {
+        'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html',
+        'info_dict': {
+            'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato',
+            'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF",
+            'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c',
+        },
+        'playlist_count': 6,
+    }, {
+        'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271',
+        'only_matching': True,
+    }]
+
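+    # defer to GloboIE whenever the URL points directly at a video id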
+    @classmethod
+    def suitable(cls, url):
+        return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_ids = []
+        for video_regex in self._VIDEOID_REGEXES:
+            video_ids.extend(re.findall(video_regex, webpage))
+        entries = [
+            self.url_result('globo:%s' % video_id, GloboIE.ie_key())
+            for video_id in orderedSet(video_ids)]
+        title = self._og_search_title(webpage, fatal=False)
+        description = self._html_search_meta('description', webpage)
+        return self.playlist_result(entries, display_id, title, description)
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
new file mode 100644
index 0000000..03cfba9
--- /dev/null
+++ b/youtube_dl/extractor/go.py
@@ -0,0 +1,268 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    int_or_none,
+    determine_ext,
+    parse_age_limit,
+    urlencode_postdata,
+    ExtractorError,
+)
+
+
+class GoIE(AdobePassIE):
+    _SITE_INFO = {
+        'abc': {
+            'brand': '001',
+            'requestor_id': 'ABC',
+        },
+        'freeform': {
+            'brand': '002',
+            'requestor_id': 'ABCFamily',
+        },
+        'watchdisneychannel': {
+            'brand': '004',
+            'resource_id': 'Disney',
+        },
+        'watchdisneyjunior': {
+            'brand': '008',
+            'resource_id': 'DisneyJunior',
+        },
+        'watchdisneyxd': {
+            'brand': '009',
+            'resource_id': 'DisneyXD',
+        },
+        'disneynow': {
+            'brand': '011',
+            'resource_id': 'Disney',
+        }
+    }
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?P<sub_domain>%s)\.)?go|
+                            (?P<sub_domain_2>abc|freeform|disneynow)
+                        )\.com/
+                        (?:
+                            (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)|
+                            (?:[^/]+/)*(?P<display_id>[^/?\#]+)
+                        )
+                    ''' % '|'.join(list(_SITE_INFO.keys()))
+    _TESTS = [{
+        'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
+        'info_dict': {
+            'id': 'VDKA3807643',
+            'ext': 'mp4',
+            'title': 'The Traitor in the White House',
+            'description': 'md5:05b009d2d145a1e85d25111bd37222e8',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'This content is no longer available.',
+    }, {
+        'url': 'http://watchdisneyxd.go.com/doraemon',
+        'info_dict': {
+            'title': 'Doraemon',
+            'id': 'SH55574025',
+        },
+        'playlist_mincount': 51,
+    }, {
+        'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood',
+        'info_dict': {
+            'id': 'VDKA3609139',
+            'ext': 'mp4',
+            'title': 'This Guilty Blood',
+            'description': 'md5:f18e79ad1c613798d95fdabfe96cd292',
+            'age_limit': 14,
+        },
+        'params': {
+            'geo_bypass_ip_block': '3.244.239.0/24',
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet',
+        'info_dict': {
+            'id': 'VDKA13435179',
+            'ext': 'mp4',
+            'title': 'The Bet',
+            'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404',
+            'age_limit': 14,
+        },
+        'params': {
+            'geo_bypass_ip_block': '3.244.239.0/24',
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
+        'only_matching': True,
+    }, {
+        'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland',
+        'only_matching': True,
+    }, {
+        # brand 004
+        'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915',
+        'only_matching': True,
+    }, {
+        # brand 008
+        'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013',
+        'only_matching': True,
+    }, {
+        'url': 'https://disneynow.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013',
+        'only_matching': True,
+    }]
+
+    def _extract_videos(self, brand, video_id='-1', show_id='-1'):
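+        # '-1' appears to act as a wildcard in the watchabc API path, so this
+        # can fetch either a single video or a whole show listing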
+        display_id = video_id if video_id != '-1' else show_id
+        return self._download_json(
+            'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id),
+            display_id)['video']
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        sub_domain = mobj.group('sub_domain') or mobj.group('sub_domain_2')
+        video_id, display_id = mobj.group('id', 'display_id')
+        site_info = self._SITE_INFO.get(sub_domain, {})
+        brand = site_info.get('brand')
+        if not video_id or not site_info:
+            webpage = self._download_webpage(url, display_id or video_id)
+            video_id = self._search_regex(
+                (
+                    # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
+                    # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
+                    r'data-video-id=["\']*(VDKA\w+)',
+                    # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet
+                    r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)'
+                ), webpage, 'video id', default=video_id)
+            if not site_info:
+                brand = self._search_regex(
+                    (r'data-brand=\s*["\']\s*(\d+)',
+                     r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand',
+                    default='004')
+                site_info = next(
+                    si for si in self._SITE_INFO.values()
+                    if si.get('brand') == brand)
+            if not video_id:
+                # show extraction works for Disney, DisneyJunior and DisneyXD
+                # ABC and Freeform have a different layout
+                show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id')
+                videos = self._extract_videos(brand, show_id=show_id)
+                show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False)
+                entries = []
+                for video in videos:
+                    entries.append(self.url_result(
+                        video['url'], 'Go', video.get('id'), video.get('title')))
+                entries.reverse()
+                return self.playlist_result(entries, show_id, show_title)
+        video_data = self._extract_videos(brand, video_id)[0]
+        video_id = video_data['id']
+        title = video_data['title']
+
+        formats = []
+        for asset in video_data.get('assets', {}).get('asset', []):
+            asset_url = asset.get('value')
+            if not asset_url:
+                continue
+            format_id = asset.get('format')
+            ext = determine_ext(asset_url)
+            if ext == 'm3u8':
+                video_type = video_data.get('type')
+                data = {
+                    'video_id': video_data['id'],
+                    'video_type': video_type,
+                    'brand': brand,
+                    'device': '001',
+                }
+                if video_data.get('accesslevel') == '1':
+                    requestor_id = site_info.get('requestor_id', 'DisneyChannels')
+                    resource = site_info.get('resource_id') or self._get_mvpd_resource(
+                        requestor_id, title, video_id, None)
+                    auth = self._extract_mvpd_auth(
+                        url, video_id, requestor_id, resource)
+                    data.update({
+                        'token': auth,
+                        'token_type': 'ap',
+                        'adobe_requestor_id': requestor_id,
+                    })
+                else:
+                    self._initialize_geo_bypass({'countries': ['US']})
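+                # exchange the request (with the Adobe Pass token when access
+                # is restricted) for an Uplynk session key to append to the
+                # m3u8 URL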
+                entitlement = self._download_json(
+                    'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
+                    video_id, data=urlencode_postdata(data))
+                errors = entitlement.get('errors', {}).get('errors', [])
+                if errors:
+                    for error in errors:
+                        if error.get('code') == 1002:
+                            self.raise_geo_restricted(
+                                error['message'], countries=['US'])
+                    error_message = ', '.join([error['message'] for error in errors])
+                    raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
+                asset_url += '?' + entitlement['uplynkData']['sessionKey']
+                formats.extend(self._extract_m3u8_formats(
+                    asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False))
+            else:
+                f = {
+                    'format_id': format_id,
+                    'url': asset_url,
+                    'ext': ext,
+                }
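+                # source URLs ('/mp4/source/' or '_source.mp4') are the
+                # original uploads; prefer them over the sized renditions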
+                if re.search(r'(?:/mp4/source/|_source\.mp4)', asset_url):
+                    f.update({
+                        'format_id': ('%s-' % format_id if format_id else '') + 'SOURCE',
+                        'preference': 1,
+                    })
+                else:
+                    mobj = re.search(r'/(\d+)x(\d+)/', asset_url)
+                    if mobj:
+                        height = int(mobj.group(2))
+                        f.update({
+                            'format_id': ('%s-' % format_id if format_id else '') + '%dP' % height,
+                            'width': int(mobj.group(1)),
+                            'height': height,
+                        })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for cc in video_data.get('closedcaption', {}).get('src', []):
+            cc_url = cc.get('value')
+            if not cc_url:
+                continue
+            ext = determine_ext(cc_url)
+            if ext == 'xml':
+                ext = 'ttml'
+            subtitles.setdefault(cc.get('lang'), []).append({
+                'url': cc_url,
+                'ext': ext,
+            })
+
+        thumbnails = []
+        for thumbnail in video_data.get('thumbnails', {}).get('thumbnail', []):
+            thumbnail_url = thumbnail.get('value')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int_or_none(thumbnail.get('width')),
+                'height': int_or_none(thumbnail.get('height')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('longdescription') or video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration', {}).get('value'), 1000),
+            'age_limit': parse_age_limit(video_data.get('tvrating', {}).get('rating')),
+            'episode_number': int_or_none(video_data.get('episodenumber')),
+            'series': video_data.get('show', {}).get('title'),
+            'season_number': int_or_none(video_data.get('season', {}).get('num')),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py
new file mode 100644 (file)
index 0000000..92efd16
--- /dev/null
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class GodTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?godtube\.com/watch/\?v=(?P<id>[\da-zA-Z]+)'
+    _TESTS = [
+        {
+            'url': 'https://www.godtube.com/watch/?v=0C0CNNNU',
+            'md5': '77108c1e4ab58f48031101a1a2119789',
+            'info_dict': {
+                'id': '0C0CNNNU',
+                'ext': 'mp4',
+                'title': 'Woman at the well.',
+                'duration': 159,
+                'timestamp': 1205712000,
+                'uploader': 'beverlybmusic',
+                'upload_date': '20080317',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        config = self._download_xml(
+            'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(),
+            video_id, 'Downloading player config XML')
+
+        video_url = config.find('file').text
+        uploader = config.find('author').text
+        timestamp = parse_iso8601(config.find('date').text)
+        duration = parse_duration(config.find('duration').text)
+        thumbnail = config.find('image').text
+
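+        # the title comes from a separate media XML, not the player config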
+        media = self._download_xml(
+            'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML')
+
+        title = media.find('title').text
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py
new file mode 100644 (file)
index 0000000..47a068e
--- /dev/null
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+)
+
+
+class GolemIE(InfoExtractor):
+    _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/'
+    _TEST = {
+        'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html',
+        'md5': 'c1a2c0a3c863319651c7c992c5ee29bf',
+        'info_dict': {
+            'id': '14095',
+            'format_id': 'high',
+            'ext': 'mp4',
+            'title': 'iPhone 6 und 6 Plus - Test',
+            'duration': 300.44,
+            'filesize': 65309548,
+        }
+    }
+
+    _PREFIX = 'http://video.golem.de'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        config = self._download_xml(
+            'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id)
+
+        info = {
+            'id': video_id,
+            'title': config.findtext('./title', 'golem'),
+            'duration': self._float(config.findtext('./playtime'), 'duration'),
+        }
+
+        formats = []
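+        # each child element of the config XML is one rendition; its tag name
+        # (e.g. 'high') doubles as the format_id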
+        for e in config:
+            url = e.findtext('./url')
+            if not url:
+                continue
+
+            formats.append({
+                'format_id': compat_str(e.tag),
+                'url': compat_urlparse.urljoin(self._PREFIX, url),
+                'height': self._int(e.get('height'), 'height'),
+                'width': self._int(e.get('width'), 'width'),
+                'filesize': self._int(e.findtext('filesize'), 'filesize'),
+                'ext': determine_ext(e.findtext('./filename')),
+            })
+        self._sort_formats(formats)
+        info['formats'] = formats
+
+        thumbnails = []
+        for e in config.findall('.//teaser'):
+            url = e.findtext('./url')
+            if not url:
+                continue
+            thumbnails.append({
+                'url': compat_urlparse.urljoin(self._PREFIX, url),
+                'width': self._int(e.get('width'), 'thumbnail width'),
+                'height': self._int(e.get('height'), 'thumbnail height'),
+            })
+        info['thumbnails'] = thumbnails
+
+        return info
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
new file mode 100644 (file)
index 0000000..589e4d5
--- /dev/null
@@ -0,0 +1,277 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    lowercase_escape,
+    update_url_query,
+)
+
+
+class GoogleDriveIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:docs|drive)\.google\.com/
+                                (?:
+                                    (?:uc|open)\?.*?id=|
+                                    file/d/
+                                )|
+                                video\.google\.com/get_player\?.*?docid=
+                            )
+                            (?P<id>[a-zA-Z0-9_-]{28,})
+                    '''
+    _TESTS = [{
+        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
+        'md5': '5c602afbbf2c1db91831f5d82f678554',
+        'info_dict': {
+            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny.mp4',
+            'duration': 45,
+        }
+    }, {
+        # video can't be watched anonymously due to view count limit reached,
+        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
+        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
+        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
+        'info_dict': {
+            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
+            'ext': 'mp4',
+            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
+        }
+    }, {
+        # video id is longer than 28 characters
+        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+        'info_dict': {
+            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
+            'ext': 'mp4',
+            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
+            'duration': 189,
+        },
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
+    }]
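+    # itag -> container mapping (same numbering as YouTube's legacy formats)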
+    _FORMATS_EXT = {
+        '5': 'flv',
+        '6': 'flv',
+        '13': '3gp',
+        '17': '3gp',
+        '18': 'mp4',
+        '22': 'mp4',
+        '34': 'flv',
+        '35': 'flv',
+        '36': '3gp',
+        '37': 'mp4',
+        '38': 'mp4',
+        '43': 'webm',
+        '44': 'webm',
+        '45': 'webm',
+        '46': 'webm',
+        '59': 'mp4',
+    }
+    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
+    _CAPTIONS_ENTRY_TAG = {
+        'subtitles': 'track',
+        'automatic_captions': 'target',
+    }
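+    # populated lazily by _download_subtitles_xml and shared by the subtitles
+    # and automatic captions methods below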
+    _caption_formats_ext = []
+    _captions_xml = None
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
+            webpage)
+        if mobj:
+            return 'https://drive.google.com/file/d/%s' % mobj.group('id')
+
+    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
+        if self._captions_xml:
+            return
+        self._captions_xml = self._download_xml(
+            self._BASE_URL_CAPTIONS, video_id, query={
+                'id': video_id,
+                'vid': subtitles_id,
+                'hl': hl,
+                'v': video_id,
+                'type': 'list',
+                'tlangs': '1',
+                'fmts': '1',
+                'vssids': '1',
+            }, note='Downloading subtitles XML',
+            errnote='Unable to download subtitles XML', fatal=False)
+        if self._captions_xml:
+            for f in self._captions_xml.findall('format'):
+                if f.attrib.get('fmt_code') and not f.attrib.get('default'):
+                    self._caption_formats_ext.append(f.attrib['fmt_code'])
+
+    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
+                              origin_lang_code=None):
+        if not subtitles_id or not caption_type:
+            return
+        captions = {}
+        for caption_entry in self._captions_xml.findall(
+                self._CAPTIONS_ENTRY_TAG[caption_type]):
+            caption_lang_code = caption_entry.attrib.get('lang_code')
+            if not caption_lang_code:
+                continue
+            caption_format_data = []
+            for caption_format in self._caption_formats_ext:
+                query = {
+                    'vid': subtitles_id,
+                    'v': video_id,
+                    'fmt': caption_format,
+                    'lang': (caption_lang_code if origin_lang_code is None
+                             else origin_lang_code),
+                    'type': 'track',
+                    'name': '',
+                    'kind': '',
+                }
+                if origin_lang_code is not None:
+                    query.update({'tlang': caption_lang_code})
+                caption_format_data.append({
+                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
+                    'ext': caption_format,
+                })
+            captions[caption_lang_code] = caption_format_data
+        return captions
+
+    def _get_subtitles(self, video_id, subtitles_id, hl):
+        if not subtitles_id or not hl:
+            return
+        self._download_subtitles_xml(video_id, subtitles_id, hl)
+        if not self._captions_xml:
+            return
+        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
+
+    def _get_automatic_captions(self, video_id, subtitles_id, hl):
+        if not subtitles_id or not hl:
+            return
+        self._download_subtitles_xml(video_id, subtitles_id, hl)
+        if not self._captions_xml:
+            return
+        track = self._captions_xml.find('track')
+        if track is None:
+            return
+        origin_lang_code = track.attrib.get('lang_code')
+        if not origin_lang_code:
+            return
+        return self._get_captions_by_type(
+            video_id, subtitles_id, 'automatic_captions', origin_lang_code)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://docs.google.com/file/d/%s' % video_id, video_id)
+
+        title = self._search_regex(
+            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
+            default=None) or self._og_search_title(webpage)
+        duration = int_or_none(self._search_regex(
+            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
+            default=None))
+
+        formats = []
+        fmt_stream_map = self._search_regex(
+            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
+            'fmt stream map', default='').split(',')
+        fmt_list = self._search_regex(
+            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
+            'fmt_list', default='').split(',')
+        if fmt_stream_map and fmt_list:
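+            # fmt_list entries are of the form '<format_id>/<width>x<height>[/...]',
+            # fmt_stream_map entries of the form '<format_id>|<url>'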
+            resolutions = {}
+            for fmt in fmt_list:
+                mobj = re.search(
+                    r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
+                if mobj:
+                    resolutions[mobj.group('format_id')] = (
+                        int(mobj.group('width')), int(mobj.group('height')))
+
+            for fmt_stream in fmt_stream_map:
+                fmt_stream_split = fmt_stream.split('|')
+                if len(fmt_stream_split) < 2:
+                    continue
+                format_id, format_url = fmt_stream_split[:2]
+                f = {
+                    'url': lowercase_escape(format_url),
+                    'format_id': format_id,
+                    # fall back to mp4 for itags missing from the table above
+                    'ext': self._FORMATS_EXT.get(format_id, 'mp4'),
+                }
+                resolution = resolutions.get(format_id)
+                if resolution:
+                    f.update({
+                        'width': resolution[0],
+                        'height': resolution[1],
+                    })
+                formats.append(f)
+
+        source_url = update_url_query(
+            'https://drive.google.com/uc', {
+                'id': video_id,
+                'export': 'download',
+            })
+        urlh = self._request_webpage(
+            source_url, video_id, note='Requesting source file',
+            errnote='Unable to request source file', fatal=False)
+        if urlh:
+            def add_source_format(src_url):
+                formats.append({
+                    'url': src_url,
+                    'ext': determine_ext(title, 'mp4').lower(),
+                    'format_id': 'source',
+                    'quality': 1,
+                })
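+            # a Content-Disposition header means the file was served directly;
+            # otherwise Drive returned its virus scan interstitial, whose
+            # confirm token has to be appended to the download URL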
+            if urlh.headers.get('Content-Disposition'):
+                add_source_format(source_url)
+            else:
+                confirmation_webpage = self._webpage_read_content(
+                    urlh, url, video_id, note='Downloading confirmation page',
+                    errnote='Unable to confirm download', fatal=False)
+                if confirmation_webpage:
+                    confirm = self._search_regex(
+                        r'confirm=([^&"\']+)', confirmation_webpage,
+                        'confirmation code', fatal=False)
+                    if confirm:
+                        add_source_format(update_url_query(source_url, {
+                            'confirm': confirm,
+                        }))
+
+        if not formats:
+            reason = self._search_regex(
+                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
+            if reason:
+                raise ExtractorError(reason, expected=True)
+
+        self._sort_formats(formats)
+
+        hl = self._search_regex(
+            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
+        subtitles_id = None
+        ttsurl = self._search_regex(
+            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
+        if ttsurl:
+            # the video Id for subtitles will be the last value in the ttsurl
+            # query string
+            subtitles_id = ttsurl.encode('utf-8').decode(
+                'unicode_escape').split('=')[-1]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'formats': formats,
+            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
+            'automatic_captions': self.extract_automatic_captions(
+                video_id, subtitles_id, hl),
+        }
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
new file mode 100644 (file)
index 0000000..6b927bb
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import codecs
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class GooglePlusIE(InfoExtractor):
+    IE_DESC = 'Google Plus'
+    _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
+    IE_NAME = 'plus.google'
+    _TEST = {
+        'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
+        'info_dict': {
+            'id': 'ZButuJc6CtH',
+            'ext': 'flv',
+            'title': '嘆きの天使 降臨',
+            'upload_date': '20120613',
+            'uploader': '井上ヨシマサ',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Step 1, Retrieve post webpage to extract further information
+        webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
+
+        title = self._og_search_description(webpage).splitlines()[0]
+        upload_date = unified_strdate(self._html_search_regex(
+            r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
+                    ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
+            webpage, 'upload date', fatal=False, flags=re.VERBOSE))
+        uploader = self._html_search_regex(
+            r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
+
+        # Step 2, Simulate clicking the image box to launch video
+        DOMAIN = 'https://plus.google.com/'
+        video_page = self._search_regex(
+            r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
+            webpage, 'video page URL')
+        if not video_page.startswith(DOMAIN):
+            video_page = DOMAIN + video_page
+
+        webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
+
+        def unicode_escape(s):
+            decoder = codecs.getdecoder('unicode_escape')
+            return re.sub(
+                r'\\u[0-9a-fA-F]{4,}',
+                lambda m: decoder(m.group(0))[0],
+                s)
+
+        # Extract video links of all sizes
+        formats = [{
+            'url': unicode_escape(video_url),
+            'ext': 'flv',
+            'width': int(width),
+            'height': int(height),
+        } for width, height, video_url in re.findall(
+            r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py
new file mode 100644 (file)
index 0000000..5279fa8
--- /dev/null
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import SearchInfoExtractor
+
+
+class GoogleSearchIE(SearchInfoExtractor):
+    IE_DESC = 'Google Video search'
+    _MAX_RESULTS = 1000
+    IE_NAME = 'video.google:search'
+    _SEARCH_KEY = 'gvsearch'
+    _TEST = {
+        'url': 'gvsearch15:python language',
+        'info_dict': {
+            'id': 'python language',
+            'title': 'python language',
+        },
+        'playlist_count': 15,
+    }
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+
+        entries = []
+        res = {
+            '_type': 'playlist',
+            'id': query,
+            'title': query,
+        }
+
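+        # page through results ten at a time until n entries are collected or
+        # the "next" link disappears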
+        for pagenum in itertools.count():
+            webpage = self._download_webpage(
+                'http://www.google.com/search',
+                'gvsearch:' + query,
+                note='Downloading result page %s' % (pagenum + 1),
+                query={
+                    'tbm': 'vid',
+                    'q': query,
+                    'start': pagenum * 10,
+                    'hl': 'en',
+                })
+
+            for hit_idx, mobj in enumerate(re.finditer(
+                    r'<h3 class="r"><a href="([^"]+)"', webpage)):
+
+                # Results without a numbered "vidthumb" element are playlists
+                # or other non-video hits; skip them
+                if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):
+                    continue
+
+                entries.append({
+                    '_type': 'url',
+                    'url': mobj.group(1)
+                })
+
+            if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
+                res['entries'] = entries[:n]
+                return res
diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py
new file mode 100644 (file)
index 0000000..377981d
--- /dev/null
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+)
+from ..utils import (
+    parse_duration,
+)
+
+
+class GoshgayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?goshgay\.com/video(?P<id>\d+?)($|/)'
+    _TEST = {
+        'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video',
+        'md5': '4b6db9a0a333142eb9f15913142b0ed1',
+        'info_dict': {
+            'id': '299069',
+            'ext': 'flv',
+            'title': 'DIESEL SFW XXX Video',
+            'thumbnail': r're:^http://.*\.jpg$',
+            'duration': 80,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h2>(.*?)<', webpage, 'title')
+        duration = parse_duration(self._html_search_regex(
+            r'<span class="duration">\s*-?\s*(.*?)</span>',
+            webpage, 'duration', fatal=False))
+
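+        # the stream URL and thumbnail are query-string encoded in the flash
+        # player's flashvars attribute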
+        flashvars = compat_parse_qs(self._html_search_regex(
+            r'<embed.+?id="flash-player-embed".+?flashvars="([^"]+)"',
+            webpage, 'flashvars'))
+        thumbnail = flashvars.get('url_bigthumb', [None])[0]
+        video_url = flashvars['flv_url'][0]
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py
new file mode 100644 (file)
index 0000000..73dc62c
--- /dev/null
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GPUTechConfIE(InfoExtractor):
+    _VALID_URL = r'https?://on-demand\.gputechconf\.com/gtc/2015/video/S(?P<id>\d+)\.html'
+    _TEST = {
+        'url': 'http://on-demand.gputechconf.com/gtc/2015/video/S5156.html',
+        'md5': 'a8862a00a0fd65b8b43acc5b8e33f798',
+        'info_dict': {
+            'id': '5156',
+            'ext': 'mp4',
+            'title': 'Coordinating More Than 3 Million CUDA Threads for Social Network Analysis',
+            'duration': 1219,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        root_path = self._search_regex(
+            r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path',
+            default='http://evt.dispeak.com/nvidia/events/gtc15/')
+        xml_file_id = self._search_regex(
+            r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id')
+
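+        # defer to the DigitallySpeaking extractor, which handles the
+        # conference's metadata XML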
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': '%sxml/%s.xml' % (root_path, xml_file_id),
+            'ie_key': 'DigitallySpeaking',
+        }
diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py
new file mode 100644 (file)
index 0000000..a6da909
--- /dev/null
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class GrouponIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?groupon\.com/deals/(?P<id>[^/?#&]+)'
+
+    _TEST = {
+        'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
+        'info_dict': {
+            'id': 'bikram-yoga-huntington-beach-2',
+            'title': '$49 for 10 Yoga Classes or One Month of Unlimited Classes at Bikram Yoga Huntington Beach ($180 Value)',
+            'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
+        },
+        'playlist': [{
+            'md5': '42428ce8a00585f9bc36e49226eae7a1',
+            'info_dict': {
+                'id': 'fk6OhWpXgIQ',
+                'ext': 'mp4',
+                'title': 'Bikram Yoga Huntington Beach | Orange County !tubGNycTo@9Uxg82uESj4i61EYX8nyuf',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 45,
+                'upload_date': '20160405',
+                'uploader_id': 'groupon',
+                'uploader': 'Groupon',
+            },
+            'add_ie': ['Youtube'],
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }
+
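+    # provider name -> (video URL template, extractor key)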
+    _PROVIDERS = {
+        'ooyala': ('ooyala:%s', 'Ooyala'),
+        'youtube': ('%s', 'Youtube'),
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        payload = self._parse_json(self._search_regex(
+            r'(?:var\s+|window\.)payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id)
+        videos = payload['carousel'].get('dealVideos', [])
+        entries = []
+        for v in videos:
+            provider = v.get('provider')
+            video_id = v.get('media') or v.get('id') or v.get('baseURL')
+            if not provider or not video_id:
+                continue
+            url_pattern, ie_key = self._PROVIDERS.get(
+                provider.lower(), (None, None))
+            if not url_pattern:
+                self.report_warning(
+                    '%s: Unsupported video provider %s, skipping video' %
+                    (playlist_id, provider))
+                continue
+            entries.append(self.url_result(url_pattern % video_id, ie_key))
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'entries': entries,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py
new file mode 100644 (file)
index 0000000..68df748
--- /dev/null
@@ -0,0 +1,175 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    xpath_element,
+    int_or_none,
+    parse_duration,
+    urljoin,
+)
+
+
+class HBOBaseIE(InfoExtractor):
+    _FORMATS_INFO = {
+        'pro7': {
+            'width': 1280,
+            'height': 720,
+        },
+        '1920': {
+            'width': 1280,
+            'height': 720,
+        },
+        'pro6': {
+            'width': 768,
+            'height': 432,
+        },
+        '640': {
+            'width': 768,
+            'height': 432,
+        },
+        'pro5': {
+            'width': 640,
+            'height': 360,
+        },
+        'highwifi': {
+            'width': 640,
+            'height': 360,
+        },
+        'high3g': {
+            'width': 640,
+            'height': 360,
+        },
+        'medwifi': {
+            'width': 400,
+            'height': 224,
+        },
+        'med3g': {
+            'width': 400,
+            'height': 224,
+        },
+    }
+
+    def _extract_info(self, url, display_id):
+        video_data = self._download_xml(url, display_id)
+        video_id = xpath_text(video_data, 'id', fatal=True)
+        episode_title = title = xpath_text(video_data, 'title', fatal=True)
+        series = xpath_text(video_data, 'program')
+        if series:
+            title = '%s - %s' % (series, title)
+
+        formats = []
+        for source in xpath_element(video_data, 'videos', 'sources', True):
+            if source.tag == 'size':
+                path = xpath_text(source, './/path')
+                if not path:
+                    continue
+                width = source.attrib.get('width')
+                format_info = self._FORMATS_INFO.get(width, {})
+                height = format_info.get('height')
+                fmt = {
+                    'url': path,
+                    'format_id': 'http%s' % ('-%dp' % height if height else ''),
+                    'width': format_info.get('width'),
+                    'height': height,
+                }
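+                # rtmp(e) paths carry app and playpath components which the
+                # RTMP downloader needs split out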
+                rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path)
+                if rtmp:
+                    fmt.update({
+                        'url': rtmp.group('url'),
+                        'play_path': rtmp.group('playpath'),
+                        'app': rtmp.group('app'),
+                        'ext': 'flv',
+                        'format_id': fmt['format_id'].replace('http', 'rtmp'),
+                    })
+                formats.append(fmt)
+            else:
+                video_url = source.text
+                if not video_url:
+                    continue
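+                # tarball/hls/dash sources point at a .tar archive; the actual
+                # manifests live alongside it and are addressed by rewriting
+                # the extension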
+                if source.tag == 'tarball':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url.replace('.tar', '/base_index_w8.m3u8'),
+                        video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                elif source.tag == 'hls':
+                    m3u8_formats = self._extract_m3u8_formats(
+                        video_url.replace('.tar', '/base_index.m3u8'),
+                        video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+                    for f in m3u8_formats:
+                        if f.get('vcodec') == 'none' and not f.get('tbr'):
+                            f['tbr'] = int_or_none(self._search_regex(
+                                r'-(\d+)k/', f['url'], 'tbr', default=None))
+                    formats.extend(m3u8_formats)
+                elif source.tag == 'dash':
+                    formats.extend(self._extract_mpd_formats(
+                        video_url.replace('.tar', '/manifest.mpd'),
+                        video_id, mpd_id='dash', fatal=False))
+                else:
+                    format_info = self._FORMATS_INFO.get(source.tag, {})
+                    formats.append({
+                        'format_id': 'http-%s' % source.tag,
+                        'url': video_url,
+                        'width': format_info.get('width'),
+                        'height': format_info.get('height'),
+                    })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        card_sizes = xpath_element(video_data, 'titleCardSizes')
+        if card_sizes is not None:
+            for size in card_sizes:
+                path = xpath_text(size, 'path')
+                if not path:
+                    continue
+                width = int_or_none(size.get('width'))
+                thumbnails.append({
+                    'id': width,
+                    'url': path,
+                    'width': width,
+                })
+
+        subtitles = None
+        caption_url = xpath_text(video_data, 'captionUrl')
+        if caption_url:
+            subtitles = {
+                'en': [{
+                    'url': caption_url,
+                    'ext': 'ttml'
+                }],
+            }
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': parse_duration(xpath_text(video_data, 'duration/tv14')),
+            'series': series,
+            'episode': episode_title,
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
+        }
+
+
+class HBOIE(HBOBaseIE):
+    IE_NAME = 'hbo'
+    _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?:video|embed)(?:/[^/]+)*/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'https://www.hbo.com/video/game-of-thrones/seasons/season-8/videos/trailer',
+        'md5': '8126210656f433c452a21367f9ad85b3',
+        'info_dict': {
+            'id': '22113301',
+            'ext': 'mp4',
+            'title': 'Game of Thrones - Trailer',
+        },
+        'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'],
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
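+        # the page's data-state JSON carries the path to the video metadata
+        # XML consumed by _extract_info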
+        location_path = self._parse_json(self._html_search_regex(
+            r'data-state="({.+?})"', webpage, 'state'), display_id)['video']['locationUrl']
+        return self._extract_info(urljoin(url, location_path), display_id)
diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py
new file mode 100644 (file)
index 0000000..18c2520
--- /dev/null
@@ -0,0 +1,135 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    HEADRequest,
+    KNOWN_EXTENSIONS,
+    sanitized_Request,
+    str_to_int,
+    urlencode_postdata,
+    urlhandle_detect_ext,
+)
+
+
+class HearThisAtIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
+    _PLAYLIST_URL = 'https://hearthis.at/playlist.php'
+    _TESTS = [{
+        'url': 'https://hearthis.at/moofi/dr-kreep',
+        'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
+        'info_dict': {
+            'id': '150939',
+            'ext': 'wav',
+            'title': 'Moofi - Dr. Kreep',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1421564134,
+            'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP',
+            'upload_date': '20150118',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'duration': 71,
+            'categories': ['Experimental'],
+        }
+    }, {
+        # 'download' link redirects to the original webpage
+        'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/',
+        'md5': '5980ceb7c461605d30f1f039df160c6e',
+        'info_dict': {
+            'id': '811296',
+            'ext': 'mp3',
+            'title': 'TwitchSF - DJ Jim Hopkins -  Totally Bitchin\' 80\'s Dance Mix!',
+            'description': 'Listen to DJ Jim Hopkins -  Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance',
+            'upload_date': '20160328',
+            'timestamp': 1459186146,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'comment_count': int,
+            'view_count': int,
+            'like_count': int,
+            'duration': 4360,
+            'categories': ['Dance'],
+        },
+    }]
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        display_id = '{artist:s} - {title:s}'.format(**m.groupdict())
+
+        webpage = self._download_webpage(url, display_id)
+        track_id = self._search_regex(
+            r'intTrackId\s*=\s*(\d+)', webpage, 'track ID')
+
+        payload = urlencode_postdata({'tracks[]': track_id})
+        req = sanitized_Request(self._PLAYLIST_URL, payload)
+        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+
+        track = self._download_json(req, track_id, 'Downloading playlist')[0]
+        title = '{artist:s} - {title:s}'.format(**track)
+
+        categories = None
+        if track.get('category'):
+            categories = [track['category']]
+
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        meta_span = r'<span[^>]+class="%s".*?</i>([^<]+)</span>'
+        view_count = str_to_int(self._search_regex(
+            meta_span % 'plays_count', webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._search_regex(
+            meta_span % 'likes_count', webpage, 'like count', fatal=False))
+        comment_count = str_to_int(self._search_regex(
+            meta_span % 'comment_count', webpage, 'comment count', fatal=False))
+        duration = str_to_int(self._search_regex(
+            r'data-length="(\d+)', webpage, 'duration', fatal=False))
+        timestamp = str_to_int(self._search_regex(
+            r'<span[^>]+class="calctime"[^>]+data-time="(\d+)', webpage, 'timestamp', fatal=False))
+
+        formats = []
+        mp3_url = self._search_regex(
+            r'(?s)<a class="player-link"\s+(?:[a-zA-Z0-9_:-]+="[^"]+"\s+)*?data-mp3="([^"]+)"',
+            webpage, 'mp3 URL', fatal=False)
+        if mp3_url:
+            formats.append({
+                'format_id': 'mp3',
+                'vcodec': 'none',
+                'acodec': 'mp3',
+                'url': mp3_url,
+            })
+        download_path = self._search_regex(
+            r'<a class="[^"]*download_fct[^"]*"\s+href="([^"]+)"',
+            webpage, 'download URL', default=None)
+        if download_path:
+            download_url = compat_urlparse.urljoin(url, download_path)
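+            # the download URL does not reveal the container, so sniff the
+            # extension from a HEAD request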
+            ext_req = HEADRequest(download_url)
+            ext_handle = self._request_webpage(
+                ext_req, display_id, note='Determining extension')
+            ext = urlhandle_detect_ext(ext_handle)
+            if ext in KNOWN_EXTENSIONS:
+                formats.append({
+                    'format_id': 'download',
+                    'vcodec': 'none',
+                    'ext': ext,
+                    'url': download_url,
+                    'preference': 2,  # Usually better quality
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': track_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'like_count': like_count,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py
new file mode 100644 (file)
index 0000000..cbe564a
--- /dev/null
@@ -0,0 +1,172 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from .youtube import YoutubeIE
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    NO_DEFAULT,
+    parse_iso8601,
+    smuggle_url,
+    xpath_text,
+)
+
+
+class HeiseIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html'
+    _TESTS = [{
+        # kaltura embed
+        'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html',
+        'info_dict': {
+            'id': '1_kkrq94sm',
+            'ext': 'mp4',
+            'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone",
+            'timestamp': 1512734959,
+            'upload_date': '20171208',
+            'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # YouTube embed
+        'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html',
+        'md5': 'e403d2b43fea8e405e88e3f8623909f1',
+        'info_dict': {
+            'id': '6kmWbXleKW4',
+            'ext': 'mp4',
+            'title': 'NEU IM SEPTEMBER | Netflix',
+            'description': 'md5:2131f3c7525e540d5fd841de938bd452',
+            'upload_date': '20170830',
+            'uploader': 'Netflix Deutschland, Österreich und Schweiz',
+            'uploader_id': 'netflixdach',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html',
+        'info_dict': {
+            'id': '1_ntrmio2s',
+            'ext': 'mp4',
+            'title': "nachgehakt: Wie sichert das c't-Tool Restric'tor Windows 10 ab?",
+            'description': 'md5:47e8ffb6c46d85c92c310a512d6db271',
+            'timestamp': 1512470717,
+            'upload_date': '20171205',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html',
+        'info_dict': {
+            'id': '1_59mk80sf',
+            'ext': 'mp4',
+            'title': "c't uplink 20.8: Staubsaugerroboter Xiaomi Vacuum 2, AR-Brille Meta 2 und Android rooten",
+            'description': 'md5:f50fe044d3371ec73a8f79fcebd74afc',
+            'timestamp': 1517567237,
+            'upload_date': '20180202',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.heise.de/newsticker/meldung/c-t-uplink-Owncloud-Tastaturen-Peilsender-Smartphone-2404251.html?wt_mc=rss.ho.beitrag.atom',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.heise.de/ct/ausgabe/2016-12-Spiele-3214137.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
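+        # the title lives in different places depending on the article type:
+        # meta tags, the JW player container or the article headline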
+        def extract_title(default=NO_DEFAULT):
+            title = self._html_search_meta(
+                ('fulltitle', 'title'), webpage, default=None)
+            if not title or title == "c't":
+                title = self._search_regex(
+                    r'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"',
+                    webpage, 'title', default=None)
+            if not title:
+                title = self._html_search_regex(
+                    r'<h1[^>]+\bclass=["\']article_page_title[^>]+>(.+?)<',
+                    webpage, 'title', default=default)
+            return title
+
+        title = extract_title(default=None)
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'description', webpage)
+
+        def _make_kaltura_result(kaltura_url):
+            return {
+                '_type': 'url_transparent',
+                'url': smuggle_url(kaltura_url, {'source_url': url}),
+                'ie_key': KalturaIE.ie_key(),
+                'title': title,
+                'description': description,
+            }
+
+        kaltura_url = KalturaIE._extract_url(webpage)
+        if kaltura_url:
+            return _make_kaltura_result(kaltura_url)
+
+        kaltura_id = self._search_regex(
+            r'entry-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'kaltura id',
+            default=None, group='id')
+        if kaltura_id:
+            return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id)
+
+        yt_urls = YoutubeIE._extract_urls(webpage)
+        if yt_urls:
+            return self.playlist_from_matches(
+                yt_urls, video_id, title, ie=YoutubeIE.ie_key())
+
+        title = extract_title()
+
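+        # no Kaltura or YouTube embed found: fall back to the legacy JW player
+        # feed, which is addressed by container and sequence IDs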
+        container_id = self._search_regex(
+            r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"',
+            webpage, 'container ID')
+
+        sequenz_id = self._search_regex(
+            r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"',
+            webpage, 'sequenz ID')
+
+        doc = self._download_xml(
+            'http://www.heise.de/videout/feed', video_id, query={
+                'container': container_id,
+                'sequenz': sequenz_id,
+            })
+
+        formats = []
+        for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'):
+            label = source_node.attrib['label']
+            height = int_or_none(self._search_regex(
+                r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+            video_url = source_node.attrib['file']
+            ext = determine_ext(video_url, '')
+            formats.append({
+                'url': video_url,
+                'format_note': label,
+                'format_id': '%s_%s' % (ext, label),
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': (xpath_text(doc, './/{http://rss.jwpcdn.com/}image')
+                          or self._og_search_thumbnail(webpage)),
+            'timestamp': parse_iso8601(
+                self._html_search_meta('date', webpage)),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py
new file mode 100644 (file)
index 0000000..fae4251
--- /dev/null
@@ -0,0 +1,76 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    merge_dicts,
+    remove_end,
+    unified_timestamp,
+)
+
+
+class HellPornoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/',
+        'md5': 'f0a46ebc0bed0c72ae8fe4629f7de5f3',
+        'info_dict': {
+            'id': '149116',
+            'display_id': 'dixie-is-posing-with-naked-ass-very-erotic',
+            'ext': 'mp4',
+            'title': 'Dixie is posing with naked ass very erotic',
+            'description': 'md5:9a72922749354edb1c4b6e540ad3d215',
+            'categories': list,
+            'thumbnail': r're:https?://.*\.jpg$',
+            'duration': 240,
+            'timestamp': 1398762720,
+            'upload_date': '20140429',
+            'view_count': int,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'http://hellporno.net/v/186271/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = remove_end(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
+
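+        # formats come straight from the HTML5 <video> element; the remaining
+        # metadata is scraped separately and merged below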
+        info = self._parse_html5_media_entries(url, webpage, display_id)[0]
+        self._sort_formats(info['formats'])
+
+        video_id = self._search_regex(
+            (r'chs_object\s*=\s*["\'](\d+)',
+             r'params\[["\']video_id["\']\]\s*=\s*(\d+)'), webpage, 'video id',
+            default=display_id)
+        description = self._search_regex(
+            r'class=["\']desc_video_view_v2[^>]+>([^<]+)', webpage,
+            'description', fatal=False)
+        categories = [
+            c.strip()
+            for c in self._html_search_meta(
+                'keywords', webpage, 'categories', default='').split(',')
+            if c.strip()]
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, fatal=False))
+        timestamp = unified_timestamp(self._og_search_property(
+            'video:release_date', webpage, fatal=False))
+        view_count = int_or_none(self._search_regex(
+            r'>Views\s+(\d+)', webpage, 'view count', fatal=False))
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'categories': categories,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'age_limit': 18,
+        })
diff --git a/youtube_dl/extractor/helsinki.py b/youtube_dl/extractor/helsinki.py
new file mode 100644 (file)
index 0000000..575fb33
--- /dev/null
@@ -0,0 +1,43 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class HelsinkiIE(InfoExtractor):
+    IE_DESC = 'helsinki.fi'
+    _VALID_URL = r'https?://video\.helsinki\.fi/Arkisto/flash\.php\?id=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://video.helsinki.fi/Arkisto/flash.php?id=20258',
+        'info_dict': {
+            'id': '20258',
+            'ext': 'mp4',
+            'title': 'Tietotekniikkafoorumi-iltapäivä',
+            'description': 'md5:f5c904224d43c133225130fe156a5ee0',
+        },
+        'params': {
+            'skip_download': True,  # RTMP
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
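+        # the page inlines a jwplayer("player").setup() call whose JS object
+        # lists the sources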
+        params = self._parse_json(self._html_search_regex(
+            r'(?s)jwplayer\("player"\).setup\((\{.*?\})\);',
+            webpage, 'player code'), video_id, transform_source=js_to_json)
+        formats = [{
+            'url': s['file'],
+            'ext': 'mp4',
+        } for s in params['sources']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage).replace('Video: ', ''),
+            'description': self._og_search_description(webpage),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
new file mode 100644 (file)
index 0000000..86a93de
--- /dev/null
@@ -0,0 +1,39 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class HentaiStigmaIE(InfoExtractor):
+    _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/',
+        'md5': '4e3d07422a68a4cc363d8f57c8bf0d23',
+        'info_dict': {
+            'id': 'inyouchuu-etsu-bonus',
+            'ext': 'mp4',
+            'title': 'Inyouchuu Etsu Bonus',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h2[^>]+class="posttitle"[^>]*><a[^>]*>([^<]+)</a>',
+            webpage, 'title')
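+        # the actual player is embedded via an iframe wrapper page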
+        wrap_url = self._html_search_regex(
+            r'<iframe[^>]+src="([^"]+mp4)"', webpage, 'wrapper url')
+        wrap_webpage = self._download_webpage(wrap_url, video_id)
+
+        video_url = self._html_search_regex(
+            r'file\s*:\s*"([^"]+)"', wrap_webpage, 'video url')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py
new file mode 100644 (file)
index 0000000..a4f3325
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class HGTVComShowIE(InfoExtractor):
+    IE_NAME = 'hgtv.com:show'
+    _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        # data-module="video"
+        'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-season-4-videos',
+        'info_dict': {
+            'id': 'flip-or-flop-full-episodes-season-4-videos',
+            'title': 'Flip or Flop Full Episodes',
+        },
+        'playlist_mincount': 15,
+    }, {
+        # data-deferred-module="video"
+        'url': 'http://www.hgtv.com/shows/good-bones/episodes/an-old-victorian-house-gets-a-new-facelift',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        config = self._parse_json(
+            self._search_regex(
+                r'(?s)data-(?:deferred-)?module=["\']video["\'][^>]*>.*?<script[^>]+type=["\']text/x-config["\'][^>]*>(.+?)</script',
+                webpage, 'video config'),
+            display_id)['channels'][0]
+
+        entries = [
+            self.url_result(video['releaseUrl'])
+            for video in config['videos'] if video.get('releaseUrl')]
+
+        return self.playlist_result(
+            entries, display_id, config.get('title'), config.get('description'))
diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py
new file mode 100644 (file)
index 0000000..f26f802
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class HiDiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<title>[^/]+)/(?P<key>[^/?#&]+)'
+    # Using X-Forwarded-For results in 403 HTTP error for HLS fragments,
+    # so disabling geo bypass completely
+    _GEO_BYPASS = False
+    _NETRC_MACHINE = 'hidive'
+    _LOGIN_URL = 'https://www.hidive.com/account/login'
+
+    _TESTS = [{
+        'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001',
+        'info_dict': {
+            'id': 'the-comic-artist-and-his-assistants/s01e001',
+            'ext': 'mp4',
+            'title': 'the-comic-artist-and-his-assistants/s01e001',
+            'series': 'the-comic-artist-and-his-assistants',
+            'season_number': 1,
+            'episode_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires Authentication',
+    }]
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        webpage = self._download_webpage(self._LOGIN_URL, None)
+        form = self._search_regex(
+            r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>',
+            webpage, 'login form')
+        data = self._hidden_inputs(form)
+        data.update({
+            'Email': email,
+            'Password': password,
+        })
+        self._download_webpage(
+            self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data))
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title, key = mobj.group('title', 'key')
+        video_id = '%s/%s' % (title, key)
+
+        settings = self._download_json(
+            'https://www.hidive.com/play/settings', video_id,
+            data=urlencode_postdata({
+                'Title': title,
+                'Key': key,
+                'PlayerId': 'f4f895ce1ca713ba263b91caeb1daa2d08904783',
+            }))
+
+        restriction = settings.get('restrictionReason')
+        if restriction == 'RegionRestricted':
+            self.raise_geo_restricted()
+
+        if restriction and restriction != 'None':
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, restriction), expected=True)
+
+        formats = []
+        subtitles = {}
+        for rendition_id, rendition in settings['renditions'].items():
+            bitrates = rendition.get('bitrates')
+            if not isinstance(bitrates, dict):
+                continue
+            m3u8_url = url_or_none(bitrates.get('hls'))
+            if not m3u8_url:
+                continue
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='%s-hls' % rendition_id, fatal=False))
+            cc_files = rendition.get('ccFiles')
+            if not isinstance(cc_files, list):
+                continue
+            for cc_file in cc_files:
+                if not isinstance(cc_file, list) or len(cc_file) < 3:
+                    continue
+                cc_lang = cc_file[0]
+                cc_url = url_or_none(cc_file[2])
+                if not isinstance(cc_lang, compat_str) or not cc_url:
+                    continue
+                subtitles.setdefault(cc_lang, []).append({
+                    'url': cc_url,
+                })
+        self._sort_formats(formats)
+
+        season_number = int_or_none(self._search_regex(
+            r's(\d+)', key, 'season number', default=None))
+        episode_number = int_or_none(self._search_regex(
+            r'e(\d+)', key, 'episode number', default=None))
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'subtitles': subtitles,
+            'formats': formats,
+            'series': title,
+            'season_number': season_number,
+            'episode_number': episode_number,
+        }
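+
+
+# Illustrative only: for a key of the form used in the test above,
+# e.g. 's01e001', the two regexes yield season 1 and episode 1:
+#
+#     import re
+#     int(re.search(r's(\d+)', 's01e001').group(1))  # -> 1
+#     int(re.search(r'e(\d+)', 's01e001').group(1))  # -> 1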
diff --git a/youtube_dl/extractor/historicfilms.py b/youtube_dl/extractor/historicfilms.py
new file mode 100644 (file)
index 0000000..56343e9
--- /dev/null
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class HistoricFilmsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?historicfilms\.com/(?:tapes/|play)(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.historicfilms.com/tapes/4728',
+        'md5': 'd4a437aec45d8d796a38a215db064e9a',
+        'info_dict': {
+            'id': '4728',
+            'ext': 'mov',
+            'title': 'Historic Films: GP-7',
+            'description': 'md5:1a86a0f3ac54024e419aba97210d959a',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2096,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        tape_id = self._search_regex(
+            [r'class="tapeId"[^>]*>([^<]+)<', r'tapeId\s*:\s*"([^"]+)"'],
+            webpage, 'tape id')
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._html_search_meta(
+            'thumbnailUrl', webpage, 'thumbnails') or self._og_search_thumbnail(webpage)
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration'))
+
+        video_url = 'http://www.historicfilms.com/video/%s_%s_web.mov' % (tape_id, video_id)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
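+
+
+# Sketch of the URL template above with the test's video id 4728 and a
+# hypothetical tape id 'T1234':
+#
+#     'http://www.historicfilms.com/video/%s_%s_web.mov' % ('T1234', '4728')
+#     # -> 'http://www.historicfilms.com/video/T1234_4728_web.mov'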
diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py
new file mode 100644 (file)
index 0000000..3e5ff26
--- /dev/null
@@ -0,0 +1,214 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class HitboxIE(InfoExtractor):
+    IE_NAME = 'hitbox'
+    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.hitbox.tv/video/203213',
+        'info_dict': {
+            'id': '203213',
+            'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy',
+            'alt_title': 'hitboxlive - Aug 9th #6',
+            'description': '',
+            'ext': 'mp4',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 215.1666,
+            'resolution': 'HD 720p',
+            'uploader': 'hitboxlive',
+            'view_count': int,
+            'timestamp': 1407576133,
+            'upload_date': '20140809',
+            'categories': ['Live Show'],
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.smashcast.tv/hitboxlive/videos/203213',
+        'only_matching': True,
+    }]
+
+    def _extract_metadata(self, url, video_id):
+        thumb_base = 'https://edge.sf.hitbox.tv'
+        metadata = self._download_json(
+            '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON')
+
+        date = 'media_live_since'
+        media_type = 'livestream'
+        if metadata.get('media_type') == 'video':
+            media_type = 'video'
+            date = 'media_date_added'
+
+        video_meta = metadata.get(media_type, [])[0]
+        title = video_meta.get('media_status')
+        alt_title = video_meta.get('media_title')
+        description = clean_html(
+            video_meta.get('media_description')
+            or video_meta.get('media_description_md'))
+        duration = float_or_none(video_meta.get('media_duration'))
+        uploader = video_meta.get('media_user_name')
+        views = int_or_none(video_meta.get('media_views'))
+        timestamp = parse_iso8601(video_meta.get(date), ' ')
+        categories = [video_meta.get('category_name')]
+        thumbs = [{
+            'url': thumb_base + video_meta.get('media_thumbnail'),
+            'width': 320,
+            'height': 180
+        }, {
+            'url': thumb_base + video_meta.get('media_thumbnail_large'),
+            'width': 768,
+            'height': 432
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'alt_title': alt_title,
+            'description': description,
+            'ext': 'mp4',
+            'thumbnails': thumbs,
+            'duration': duration,
+            'uploader': uploader,
+            'view_count': views,
+            'timestamp': timestamp,
+            'categories': categories,
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        player_config = self._download_json(
+            'https://www.smashcast.tv/api/player/config/video/%s' % video_id,
+            video_id, 'Downloading video JSON')
+
+        formats = []
+        for video in player_config['clip']['bitrates']:
+            label = video.get('label')
+            if label == 'Auto':
+                continue
+            video_url = video.get('url')
+            if not video_url:
+                continue
+            bitrate = int_or_none(video.get('bitrate'))
+            if determine_ext(video_url) == 'm3u8':
+                if not video_url.startswith('http'):
+                    continue
+                formats.append({
+                    'url': video_url,
+                    'ext': 'mp4',
+                    'tbr': bitrate,
+                    'format_note': label,
+                    'protocol': 'm3u8_native',
+                })
+            else:
+                formats.append({
+                    'url': video_url,
+                    'tbr': bitrate,
+                    'format_note': label,
+                })
+        self._sort_formats(formats)
+
+        metadata = self._extract_metadata(
+            'https://www.smashcast.tv/api/media/video', video_id)
+        metadata['formats'] = formats
+
+        return metadata
+
+
+class HitboxLiveIE(HitboxIE):
+    IE_NAME = 'hitbox:live'
+    _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.hitbox.tv/dimak',
+        'info_dict': {
+            'id': 'dimak',
+            'ext': 'mp4',
+            'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e',
+            'timestamp': int,
+            'upload_date': compat_str,
+            'title': compat_str,
+            'uploader': 'Dimak',
+        },
+        'params': {
+            # live
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.smashcast.tv/dimak',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        player_config = self._download_json(
+            'https://www.smashcast.tv/api/player/config/live/%s' % video_id,
+            video_id)
+
+        formats = []
+        cdns = player_config.get('cdns')
+        servers = []
+        for cdn in cdns:
+            # Subscribe URLs are not playable
+            if cdn.get('rtmpSubscribe') is True:
+                continue
+            base_url = cdn.get('netConnectionUrl')
+            host = re.search(r'.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1)
+            if base_url not in servers:
+                servers.append(base_url)
+                for stream in cdn.get('bitrates'):
+                    label = stream.get('label')
+                    if label == 'Auto':
+                        continue
+                    stream_url = stream.get('url')
+                    if not stream_url:
+                        continue
+                    bitrate = int_or_none(stream.get('bitrate'))
+                    if stream.get('provider') == 'hls' or determine_ext(stream_url) == 'm3u8':
+                        if not stream_url.startswith('http'):
+                            continue
+                        formats.append({
+                            'url': stream_url,
+                            'ext': 'mp4',
+                            'tbr': bitrate,
+                            'format_note': label,
+                            'rtmp_live': True,
+                        })
+                    else:
+                        formats.append({
+                            'url': '%s/%s' % (base_url, stream_url),
+                            'ext': 'mp4',
+                            'tbr': bitrate,
+                            'rtmp_live': True,
+                            'format_note': host,
+                            'page_url': url,
+                            'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf',
+                        })
+        self._sort_formats(formats)
+
+        metadata = self._extract_metadata(
+            'https://www.smashcast.tv/api/media/live', video_id)
+        metadata['formats'] = formats
+        metadata['is_live'] = True
+        metadata['title'] = self._live_title(metadata.get('title'))
+
+        return metadata
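+
+
+# Sketch of the suitable() override above, using the test URLs: a
+# /videos/ URL is claimed by HitboxIE, so HitboxLiveIE declines it even
+# though its looser _VALID_URL would also match:
+#
+#     HitboxIE.suitable('https://www.smashcast.tv/hitboxlive/videos/203213')      # True
+#     HitboxLiveIE.suitable('https://www.smashcast.tv/hitboxlive/videos/203213')  # False
+#     HitboxLiveIE.suitable('https://www.smashcast.tv/dimak')                     # True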
diff --git a/youtube_dl/extractor/hitrecord.py b/youtube_dl/extractor/hitrecord.py
new file mode 100644 (file)
index 0000000..fd5dc29
--- /dev/null
@@ -0,0 +1,68 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    float_or_none,
+    int_or_none,
+    try_get,
+)
+
+
+class HitRecordIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hitrecord\.org/records/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://hitrecord.org/records/2954362',
+        'md5': 'fe1cdc2023bce0bbb95c39c57426aa71',
+        'info_dict': {
+            'id': '2954362',
+            'ext': 'mp4',
+            'title': 'A Very Different World (HITRECORD x ACLU)',
+            'description': 'md5:e62defaffab5075a5277736bead95a3d',
+            'duration': 139.327,
+            'timestamp': 1471557582,
+            'upload_date': '20160818',
+            'uploader': 'Zuzi.C12',
+            'uploader_id': '362811',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'tags': list,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://hitrecord.org/api/web/records/%s' % video_id, video_id)
+
+        title = video['title']
+        video_url = video['source_url']['mp4_url']
+
+        tags = None
+        tags_list = try_get(video, lambda x: x['tags'], list)
+        if tags_list:
+            tags = [
+                t['text']
+                for t in tags_list
+                if isinstance(t, dict) and t.get('text')
+                and isinstance(t['text'], compat_str)]
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': clean_html(video.get('body')),
+            'duration': float_or_none(video.get('duration'), 1000),
+            'timestamp': int_or_none(video.get('created_at_i')),
+            'uploader': try_get(
+                video, lambda x: x['user']['username'], compat_str),
+            'uploader_id': try_get(
+                video, lambda x: compat_str(x['user']['id'])),
+            'view_count': int_or_none(video.get('total_views_count')),
+            'like_count': int_or_none(video.get('hearts_count')),
+            'comment_count': int_or_none(video.get('comments_count')),
+            'tags': tags,
+        }
diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py
new file mode 100644 (file)
index 0000000..1f3502b
--- /dev/null
@@ -0,0 +1,191 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+    parse_count,
+    str_or_none,
+    try_get,
+    unified_strdate,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class HKETVIE(InfoExtractor):
+    IE_NAME = 'hketv'
+    IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau'
+    _GEO_BYPASS = False
+    _GEO_COUNTRIES = ['HK']
+    _VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.hkedcity.net/etv/resource/2932360618',
+        'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7',
+        'info_dict': {
+            'id': '2932360618',
+            'ext': 'mp4',
+            'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)',
+            'description': 'md5:d5286d05219ef50e0613311cbe96e560',
+            'upload_date': '20181024',
+            'duration': 900,
+            'subtitles': 'count:2',
+        },
+        'skip': 'Geo restricted to HK',
+    }, {
+        'url': 'https://www.hkedcity.net/etv/resource/972641418',
+        'md5': '1ed494c1c6cf7866a8290edad9b07dc9',
+        'info_dict': {
+            'id': '972641418',
+            'ext': 'mp4',
+            'title': '衣冠楚楚 (天使系列之一)',
+            'description': 'md5:10bb3d659421e74f58e5db5691627b0f',
+            'upload_date': '20070109',
+            'duration': 907,
+            'subtitles': {},
+        },
+        'params': {
+            'geo_verification_proxy': '<HK proxy here>',
+        },
+        'skip': 'Geo restricted to HK',
+    }]
+
+    _CC_LANGS = {
+        '中文(繁體中文)': 'zh-Hant',
+        '中文(简体中文)': 'zh-Hans',
+        'English': 'en',
+        'Bahasa Indonesia': 'id',
+        '\u0939\u093f\u0928\u094d\u0926\u0940': 'hi',
+        '\u0928\u0947\u092a\u093e\u0932\u0940': 'ne',
+        'Tagalog': 'tl',
+        '\u0e44\u0e17\u0e22': 'th',
+        '\u0627\u0631\u062f\u0648': 'ur',
+    }
+    _FORMAT_HEIGHTS = {
+        'SD': 360,
+        'HD': 720,
+    }
+    _APPS_BASE_URL = 'https://apps.hkedcity.net'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = (
+            self._html_search_meta(
+                ('ed_title', 'search.ed_title'), webpage, default=None)
+            or self._search_regex(
+                r'data-favorite_title_(?:eng|chi)=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                webpage, 'title', default=None, group='id')
+            or self._html_search_regex(
+                r'<h1>([^<]+)</h1>', webpage, 'title', default=None)
+            or self._og_search_title(webpage)
+        )
+
+        file_id = self._search_regex(
+            r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);',
+            webpage, 'file ID')
+        curr_url = self._search_regex(
+            r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";',
+            webpage, 'curr URL')
+        data = {
+            'action': 'get_info',
+            'curr_url': curr_url,
+            'file_id': file_id,
+            'video_url': file_id,
+        }
+
+        response = self._download_json(
+            self._APPS_BASE_URL + '/media/play/handler.php', video_id,
+            data=urlencode_postdata(data),
+            headers=merge_dicts({
+                'Content-Type': 'application/x-www-form-urlencoded'},
+                self.geo_verification_headers()))
+
+        result = response['result']
+
+        if not response.get('success') or not response.get('access'):
+            error = clean_html(response.get('access_err_msg'))
+            if 'Video streaming is not available in your country' in error:
+                self.raise_geo_restricted(
+                    msg=error, countries=self._GEO_COUNTRIES)
+            else:
+                raise ExtractorError(error, expected=True)
+
+        formats = []
+
+        width = int_or_none(result.get('width'))
+        height = int_or_none(result.get('height'))
+
+        playlist0 = result['playlist'][0]
+        for fmt in playlist0['sources']:
+            file_url = urljoin(self._APPS_BASE_URL, fmt.get('file'))
+            if not file_url:
+                continue
+            # If we ever wanted to provide the final resolved URL that
+            # does not require cookies, albeit with a shorter lifespan:
+            #     urlh = self._downloader.urlopen(file_url)
+            #     resolved_url = urlh.geturl()
+            label = fmt.get('label')
+            h = self._FORMAT_HEIGHTS.get(label)
+            w = h * width // height if h and width and height else None
+            formats.append({
+                'format_id': label,
+                'ext': fmt.get('type'),
+                'url': file_url,
+                'width': w,
+                'height': h,
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        tracks = try_get(playlist0, lambda x: x['tracks'], list) or []
+        for track in tracks:
+            if not isinstance(track, dict):
+                continue
+            track_kind = str_or_none(track.get('kind'))
+            if not track_kind or not isinstance(track_kind, compat_str):
+                continue
+            if track_kind.lower() not in ('captions', 'subtitles'):
+                continue
+            track_url = urljoin(self._APPS_BASE_URL, track.get('file'))
+            if not track_url:
+                continue
+            track_label = track.get('label')
+            lang = self._CC_LANGS.get(track_label, track_label)
+            subtitles.setdefault(lang, []).append({
+                'url': self._proto_relative_url(track_url),
+                'ext': 'srt',
+            })
+
+        # Likes
+        emotion = self._download_json(
+            'https://emocounter.hkedcity.net/handler.php', video_id,
+            data=urlencode_postdata({
+                'action': 'get_emotion',
+                'data[bucket_id]': 'etv',
+                'data[identifier]': video_id,
+            }),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'},
+            fatal=False) or {}
+        like_count = int_or_none(try_get(
+            emotion, lambda x: x['data']['emotion_data'][0]['count']))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': self._html_search_meta(
+                'description', webpage, fatal=False),
+            'upload_date': unified_strdate(self._html_search_meta(
+                'ed_date', webpage, fatal=False), day_first=False),
+            'duration': int_or_none(result.get('length')),
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')),
+            'view_count': parse_count(result.get('view_count')),
+            'like_count': like_count,
+        }
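+
+
+# Worked example of the width derivation above: with hypothetical source
+# dimensions 1280x720, the 'SD' label (height 360) gives
+# w = 360 * 1280 // 720 == 640, and 'HD' (height 720) gives 1280.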
diff --git a/youtube_dl/extractor/hornbunny.py b/youtube_dl/extractor/hornbunny.py
new file mode 100644 (file)
index 0000000..c458a95
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class HornBunnyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hornbunny\.com/videos/(?P<title_dash>[a-z-]+)-(?P<id>\d+)\.html'
+    _TEST = {
+        'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html',
+        'md5': 'e20fd862d1894b67564c96f180f43924',
+        'info_dict': {
+            'id': '5227',
+            'ext': 'mp4',
+            'title': 'panty slut jerk off instruction',
+            'duration': 550,
+            'age_limit': 18,
+            'view_count': int,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
+
+        duration = parse_duration(self._search_regex(
+            r'<strong>Runtime:</strong>\s*([0-9:]+)</div>',
+            webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._search_regex(
+            r'<strong>Views:</strong>\s*(\d+)</div>',
+            webpage, 'view count', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'view_count': view_count,
+            'age_limit': 18,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
new file mode 100644 (file)
index 0000000..4703e18
--- /dev/null
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_b64decode
+from ..utils import (
+    ExtractorError,
+    HEADRequest,
+    sanitized_Request,
+    urlencode_postdata,
+)
+
+
+class HotNewHipHopIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hotnewhiphop\.com/.*\.(?P<id>.*)\.html'
+    _TEST = {
+        'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html',
+        'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96',
+        'info_dict': {
+            'id': '1435540',
+            'ext': 'mp3',
+            'title': 'Freddie Gibbs - Lay It Down'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_url_base64 = self._search_regex(
+            r'data-path="(.*?)"', webpage, 'video URL', default=None)
+
+        if video_url_base64 is None:
+            video_url = self._search_regex(
+                r'"contentUrl" content="(.*?)"', webpage, 'content URL')
+            return self.url_result(video_url, ie='Youtube')
+
+        reqdata = urlencode_postdata([
+            ('mediaType', 's'),
+            ('mediaId', video_id),
+        ])
+        r = sanitized_Request(
+            'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata)
+        r.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        mkd = self._download_json(
+            r, video_id, note='Requesting media key',
+            errnote='Could not download media key')
+        if 'mediaKey' not in mkd:
+            raise ExtractorError('Did not get a media key')
+
+        redirect_url = compat_b64decode(video_url_base64).decode('utf-8')
+        redirect_req = HEADRequest(redirect_url)
+        req = self._request_webpage(
+            redirect_req, video_id,
+            note='Resolving final URL', errnote='Could not resolve final URL')
+        video_url = req.geturl()
+        if video_url.endswith('.html'):
+            raise ExtractorError('Redirect failed')
+
+        video_title = self._og_search_title(webpage).strip()
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
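+
+
+# Sketch of the data-path decoding above (the sample value is
+# hypothetical):
+#
+#     from youtube_dl.compat import compat_b64decode
+#     compat_b64decode('aHR0cDovL2V4YW1wbGUuY29tL3gubXAz').decode('utf-8')
+#     # -> 'http://example.com/x.mp3'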
diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py
new file mode 100644 (file)
index 0000000..f97eefa
--- /dev/null
@@ -0,0 +1,210 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import hmac
+import re
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    try_get,
+    url_or_none,
+)
+
+
+class HotStarBaseIE(InfoExtractor):
+    _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee'
+
+    def _call_api_impl(self, path, video_id, query):
+        st = int(time.time())
+        exp = st + 6000
+        auth = 'st=%d~exp=%d~acl=/*' % (st, exp)
+        auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest()
+        response = self._download_json(
+            'https://api.hotstar.com/' + path, video_id, headers={
+                'hotstarauth': auth,
+                'x-country-code': 'IN',
+                'x-platform-code': 'JIO',
+            }, query=query)
+        if response['statusCode'] != 'OK':
+            raise ExtractorError(
+                response['body']['message'], expected=True)
+        return response['body']['results']
+
+    def _call_api(self, path, video_id, query_name='contentId'):
+        return self._call_api_impl(path, video_id, {
+            query_name: video_id,
+            'tas': 10000,
+        })
+
+    def _call_api_v2(self, path, video_id):
+        return self._call_api_impl(
+            '%s/in/contents/%s' % (path, video_id), video_id, {
+                'desiredConfig': 'encryption:plain;ladder:phone,tv;package:hls,dash',
+                'client': 'mweb',
+                'clientVersion': '6.18.0',
+                'deviceId': compat_str(uuid.uuid4()),
+                'osName': 'Windows',
+                'osVersion': '10',
+            })
+
+
+class HotStarIE(HotStarBaseIE):
+    IE_NAME = 'hotstar'
+    _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})'
+    _TESTS = [{
+        # contentData
+        'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273',
+        'info_dict': {
+            'id': '1000076273',
+            'ext': 'mp4',
+            'title': 'Can You Not Spread Rumours?',
+            'description': 'md5:c957d8868e9bc793ccb813691cc4c434',
+            'timestamp': 1447248600,
+            'upload_date': '20151111',
+            'duration': 381,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        # contentDetail
+        'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.hotstar.com/1000000515',
+        'only_matching': True,
+    }, {
+        # only available via api v2
+        'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        app_state = self._parse_json(self._search_regex(
+            r'<script>window\.APP_STATE\s*=\s*({.+?})</script>',
+            webpage, 'app state'), video_id)
+        video_data = {}
+        getters = list(
+            lambda x, k=k: x['initialState']['content%s' % k]['content']
+            for k in ('Data', 'Detail')
+        )
+        for v in app_state.values():
+            content = try_get(v, getters, dict)
+            if content and content.get('contentId') == video_id:
+                video_data = content
+                break
+
+        title = video_data['title']
+
+        if video_data.get('drmProtected'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        headers = {'Referer': url}
+        formats = []
+        geo_restricted = False
+        playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets']
+        for playback_set in playback_sets:
+            if not isinstance(playback_set, dict):
+                continue
+            format_url = url_or_none(playback_set.get('playbackUrl'))
+            if not format_url:
+                continue
+            format_url = re.sub(
+                r'(?<=//staragvod)(\d)', r'web\1', format_url)
+            tags = str_or_none(playback_set.get('tagsCombination')) or ''
+            if tags and 'encryption:plain' not in tags:
+                continue
+            ext = determine_ext(format_url)
+            try:
+                if 'package:hls' in tags or ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native',
+                        m3u8_id='hls', headers=headers))
+                elif 'package:dash' in tags or ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        format_url, video_id, mpd_id='dash', headers=headers))
+                elif ext == 'f4m':
+                    # produce broken files
+                    pass
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'width': int_or_none(playback_set.get('width')),
+                        'height': int_or_none(playback_set.get('height')),
+                    })
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                    geo_restricted = True
+                continue
+        if not formats and geo_restricted:
+            self.raise_geo_restricted(countries=['IN'])
+        self._sort_formats(formats)
+
+        for f in formats:
+            f.setdefault('http_headers', {}).update(headers)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')),
+            'formats': formats,
+            'channel': video_data.get('channelName'),
+            'channel_id': video_data.get('channelId'),
+            'series': video_data.get('showName'),
+            'season': video_data.get('seasonName'),
+            'season_number': int_or_none(video_data.get('seasonNo')),
+            'season_id': video_data.get('seasonId'),
+            'episode': title,
+            'episode_number': int_or_none(video_data.get('episodeNo')),
+        }
+
+
+class HotStarPlaylistIE(HotStarBaseIE):
+    IE_NAME = 'hotstar:playlist'
+    _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26',
+        'info_dict': {
+            'id': '3_2_26',
+        },
+        'playlist_mincount': 20,
+    }, {
+        'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')
+
+        entries = [
+            self.url_result(
+                'https://www.hotstar.com/%s' % video['contentId'],
+                ie=HotStarIE.ie_key(), video_id=video['contentId'])
+            for video in collection['assets']['items']
+            if video.get('contentId')]
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
new file mode 100644 (file)
index 0000000..7e36b85
--- /dev/null
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class HowcastIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
+        'md5': '7d45932269a288149483144f01b99789',
+        'info_dict': {
+            'id': '390161',
+            'ext': 'mp4',
+            'title': 'How to Tie a Square Knot Properly',
+            'description': 'md5:dbe792e5f6f1489027027bf2eba188a3',
+            'timestamp': 1276081287,
+            'upload_date': '20100609',
+            'duration': 56.823,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        embed_code = self._search_regex(
+            r'<iframe[^>]+src="[^"]+\bembed_code=([^\b]+)\b',
+            webpage, 'ooyala embed code')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'Ooyala',
+            'url': 'ooyala:%s' % embed_code,
+            'id': video_id,
+            'timestamp': parse_iso8601(self._html_search_meta(
+                'article:published_time', webpage, 'timestamp')),
+        }
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
new file mode 100644 (file)
index 0000000..cf90ab3
--- /dev/null
@@ -0,0 +1,90 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    find_xpath_attr,
+    int_or_none,
+    js_to_json,
+    unescapeHTML,
+    determine_ext,
+)
+
+
+class HowStuffWorksIE(InfoExtractor):
+    _VALID_URL = r'https?://[\da-z-]+\.(?:howstuffworks|stuff(?:(?:youshould|theydontwantyouto)know|toblowyourmind|momnevertoldyou)|(?:brain|car)stuffshow|fwthinking|geniusstuff)\.com/(?:[^/]+/)*(?:\d+-)?(?P<id>.+?)-video\.htm'
+    _TESTS = [
+        {
+            'url': 'http://www.stufftoblowyourmind.com/videos/optical-illusions-video.htm',
+            'md5': '76646a5acc0c92bf7cd66751ca5db94d',
+            'info_dict': {
+                'id': '855410',
+                'ext': 'mp4',
+                'title': 'Your Trickster Brain: Optical Illusions -- Science on the Web',
+                'description': 'md5:e374ff9561f6833ad076a8cc0a5ab2fb',
+            },
+        },
+        {
+            'url': 'http://shows.howstuffworks.com/more-shows/why-does-balloon-stick-to-hair-video.htm',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        clip_js = self._search_regex(
+            r'(?s)var clip = ({.*?});', webpage, 'clip info')
+        clip_info = self._parse_json(
+            clip_js, display_id, transform_source=js_to_json)
+
+        video_id = clip_info['content_id']
+        formats = []
+        m3u8_url = clip_info.get('m3u8')
+        if m3u8_url and determine_ext(m3u8_url) == 'm3u8':
+            formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True))
+        flv_url = clip_info.get('flv_url')
+        if flv_url:
+            formats.append({
+                'url': flv_url,
+                'format_id': 'flv',
+            })
+        for video in clip_info.get('mp4', []):
+            formats.append({
+                'url': video['src'],
+                'format_id': 'mp4-%s' % video['bitrate'],
+                'vbr': int_or_none(video['bitrate'].rstrip('k')),
+            })
+
+        if not formats:
+            smil = self._download_xml(
+                'http://services.media.howstuffworks.com/videos/%s/smil-service.smil' % video_id,
+                video_id, 'Downloading video SMIL')
+
+            http_base = find_xpath_attr(
+                smil,
+                './{0}head/{0}meta'.format('{http://www.w3.org/2001/SMIL20/Language}'),
+                'name',
+                'httpBase').get('content')
+
+            URL_SUFFIX = '?v=2.11.3&fp=LNX 11,2,202,356&r=A&g=A'
+
+            for video in smil.findall(
+                    './{0}body/{0}switch/{0}video'.format('{http://www.w3.org/2001/SMIL20/Language}')):
+                vbr = int_or_none(video.attrib['system-bitrate'], scale=1000)
+                formats.append({
+                    'url': '%s/%s%s' % (http_base, video.attrib['src'], URL_SUFFIX),
+                    'format_id': '%dk' % vbr,
+                    'vbr': vbr,
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': '%s' % video_id,
+            'display_id': display_id,
+            'title': unescapeHTML(clip_info['clip_title']),
+            'description': unescapeHTML(clip_info.get('caption')),
+            'thumbnail': clip_info.get('video_still_url'),
+            'duration': int_or_none(clip_info.get('duration')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/hrfensehen.py b/youtube_dl/extractor/hrfensehen.py
new file mode 100644 (file)
index 0000000..805345e
--- /dev/null
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from ..utils import int_or_none, unified_timestamp, unescapeHTML
+from .common import InfoExtractor
+
+
+class HRFernsehenIE(InfoExtractor):
+    IE_NAME = 'hrfernsehen'
+    _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html'
+
+    _TESTS = [{
+        'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html',
+        'md5': '5c4e0ba94677c516a2f65a84110fc536',
+        'info_dict': {
+            'id': '130546',
+            'ext': 'mp4',
+            'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / '
+                           'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / '
+                           'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music',
+            'subtitles': {'de': [{
+                'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt'
+            }]},
+            'timestamp': 1598470200,
+            'upload_date': '20200826',
+            'thumbnails': [{
+                'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg',
+                'id': '0'
+            }, {
+                'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
+                'id': '1'
+            }],
+            'title': 'hessenschau vom 26.08.2020'
+        }
+    }, {
+        'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html',
+        'only_matching': True
+    }]
+
+    _GEO_COUNTRIES = ['DE']
+
+    def extract_airdate(self, loader_data):
+        airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate')
+
+        if airdate_str is None:
+            return None
+
+        return unified_timestamp(airdate_str)
+
+    def extract_formats(self, loader_data):
+        stream_formats = []
+        for stream_obj in loader_data["videoResolutionLevels"]:
+            stream_format = {
+                'format_id': str(stream_obj['verticalResolution']) + "p",
+                'height': stream_obj['verticalResolution'],
+                'url': stream_obj['url'],
+            }
+
+            quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit',
+                                            stream_obj['url'])
+            if quality_information:
+                stream_format['width'] = int_or_none(quality_information.group(1))
+                stream_format['height'] = int_or_none(quality_information.group(2))
+                stream_format['fps'] = int_or_none(quality_information.group(3))
+                stream_format['tbr'] = int_or_none(quality_information.group(4))
+
+            stream_formats.append(stream_format)
+
+        self._sort_formats(stream_formats)
+        return stream_formats
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta(
+            ['og:title', 'twitter:title', 'name'], webpage)
+        description = self._html_search_meta(
+            ['description'], webpage)
+
+        loader_str = unescapeHTML(self._search_regex(
+            r"data-hr-mediaplayer-loader='([^']*)'", webpage, 'player loader'))
+        loader_data = json.loads(loader_str)
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': self.extract_formats(loader_data),
+            'timestamp': self.extract_airdate(loader_data)
+        }
+
+        if "subtitle" in loader_data:
+            info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]}
+
+        thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()]))
+        if len(thumbnails) > 0:
+            info["thumbnails"] = [{"url": t} for t in thumbnails]
+
+        return info
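+
+
+# Illustrative run of the quality regex above against the naming scheme
+# seen in the test's subtitle URL ('..._512x288-25p-500kbit.vtt'):
+#
+#     import re
+#     m = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit',
+#                   'hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt')
+#     m.groups()  # -> ('512', '288', '25', '500')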
diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py
new file mode 100644 (file)
index 0000000..23f7b1f
--- /dev/null
@@ -0,0 +1,208 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    sanitized_Request,
+    try_get,
+)
+
+
+class HRTiBaseIE(InfoExtractor):
+    """
+        Base Information Extractor for Croatian Radiotelevision
+        video on demand site https://hrti.hrt.hr
+        Reverse engineered from the JavaScript app in app.min.js
+    """
+    _NETRC_MACHINE = 'hrti'
+
+    _APP_LANGUAGE = 'hr'
+    _APP_VERSION = '1.1'
+    _APP_PUBLICATION_ID = 'all_in_one'
+    _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+
+    def _initialize_api(self):
+        init_data = {
+            'application_publication_id': self._APP_PUBLICATION_ID
+        }
+
+        uuid = self._download_json(
+            self._API_URL, None, note='Downloading uuid',
+            errnote='Unable to download uuid',
+            data=json.dumps(init_data).encode('utf-8'))['uuid']
+
+        app_data = {
+            'uuid': uuid,
+            'application_publication_id': self._APP_PUBLICATION_ID,
+            'application_version': self._APP_VERSION
+        }
+
+        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
+        req.get_method = lambda: 'PUT'
+
+        resources = self._download_json(
+            req, None, note='Downloading session information',
+            errnote='Unable to download session information')
+
+        self._session_id = resources['session_id']
+
+        modules = resources['modules']
+
+        self._search_url = modules['vod_catalog']['resources']['search']['uri'].format(
+            language=self._APP_LANGUAGE,
+            application_id=self._APP_PUBLICATION_ID)
+
+        self._login_url = (modules['user']['resources']['login']['uri']
+                           + '/format/json').format(session_id=self._session_id)
+
+        self._logout_url = modules['user']['resources']['logout']['uri']
+
+    def _login(self):
+        username, password = self._get_login_info()
+        # TODO: figure out authentication with cookies
+        if username is None or password is None:
+            self.raise_login_required()
+
+        auth_data = {
+            'username': username,
+            'password': password,
+        }
+
+        try:
+            auth_info = self._download_json(
+                self._login_url, None, note='Logging in', errnote='Unable to log in',
+                data=json.dumps(auth_data).encode('utf-8'))
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
+                auth_info = self._parse_json(e.cause.read().encode('utf-8'), None)
+            else:
+                raise
+
+        error_message = auth_info.get('error', {}).get('message')
+        if error_message:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error_message),
+                expected=True)
+
+        self._token = auth_info['secure_streaming_token']
+
+    def _real_initialize(self):
+        self._initialize_api()
+        self._login()
+
+
+class HRTiIE(HRTiBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            hrti:(?P<short_id>[0-9]+)|
+                            https?://
+                                hrti\.hrt\.hr/(?:\#/)?video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd',
+        'info_dict': {
+            'id': '2181385',
+            'display_id': 'republika-dokumentarna-serija-16-hd',
+            'ext': 'mp4',
+            'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)',
+            'description': 'md5:48af85f620e8e0e1df4096270568544f',
+            'duration': 2922,
+            'view_count': int,
+            'average_rating': int,
+            'episode_number': int,
+            'season_number': int,
+            'age_limit': 12,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        'url': 'https://hrti.hrt.hr/#/video/show/2181385/',
+        'only_matching': True,
+    }, {
+        'url': 'hrti:2181385',
+        'only_matching': True,
+    }, {
+        'url': 'https://hrti.hrt.hr/video/show/3873068/cuvar-dvorca-dramska-serija-14',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('short_id') or mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        video = self._download_json(
+            '%s/video_id/%s/format/json' % (self._search_url, video_id),
+            display_id, 'Downloading video metadata JSON')['video'][0]
+
+        title_info = video['title']
+        title = title_info['title_long']
+
+        movie = video['video_assets']['movie'][0]
+        m3u8_url = movie['url'].format(TOKEN=self._token)
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = clean_html(title_info.get('summary_long'))
+        age_limit = parse_age_limit(video.get('parental_control', {}).get('rating'))
+        view_count = int_or_none(video.get('views'))
+        average_rating = int_or_none(video.get('user_rating'))
+        duration = int_or_none(movie.get('duration'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'average_rating': average_rating,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
+
+
+class HRTiPlaylistIE(HRTiBaseIE):
+    _VALID_URL = r'https?://hrti\.hrt\.hr/(?:#/)?video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?'
+    _TESTS = [{
+        'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena',
+        'info_dict': {
+            'id': '212',
+            'title': 'ekumena',
+        },
+        'playlist_mincount': 8,
+        'skip': 'Requires account credentials',
+    }, {
+        'url': 'https://hrti.hrt.hr/#/video/list/category/212/',
+        'only_matching': True,
+    }, {
+        'url': 'https://hrti.hrt.hr/video/list/category/212/ekumena',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category_id = mobj.group('id')
+        display_id = mobj.group('display_id') or category_id
+
+        response = self._download_json(
+            '%s/category_id/%s/format/json' % (self._search_url, category_id),
+            display_id, 'Downloading video metadata JSON')
+
+        video_ids = try_get(
+            response, lambda x: x['video_listings'][0]['alternatives'][0]['list'],
+            list) or [video['id'] for video in response.get('videos', []) if video.get('id')]
+
+        entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids]
+
+        return self.playlist_result(entries, category_id, display_id)
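+
+
+# Sketch of the token substitution done in HRTiIE above (URL and token
+# are hypothetical):
+#
+#     'https://example.invalid/master.m3u8?token={TOKEN}'.format(TOKEN='abc')
+#     # -> 'https://example.invalid/master.m3u8?token=abc'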
diff --git a/youtube_dl/extractor/huajiao.py b/youtube_dl/extractor/huajiao.py
new file mode 100644 (file)
index 0000000..4ca275d
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class HuajiaoIE(InfoExtractor):
+    IE_DESC = '花椒直播'
+    _VALID_URL = r'https?://(?:www\.)?huajiao\.com/l/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.huajiao.com/l/38941232',
+        'md5': 'd08bf9ac98787d24d1e4c0283f2d372d',
+        'info_dict': {
+            'id': '38941232',
+            'ext': 'mp4',
+            'title': '#新人求关注#',
+            'description': 're:.*',
+            'duration': 2424.0,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1475866459,
+            'upload_date': '20161007',
+            'uploader': 'Penny_余姿昀',
+            'uploader_id': '75206005',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        feed_json = self._search_regex(
+            r'var\s+feed\s*=\s*({.+})', webpage, 'feed json')
+        feed = self._parse_json(feed_json, video_id)
+
+        description = self._html_search_meta(
+            'description', webpage, 'description', fatal=False)
+
+        def get(section, field):
+            return feed.get(section, {}).get(field)
+
+        return {
+            'id': video_id,
+            'title': feed['feed']['formated_title'],
+            'description': description,
+            'duration': parse_duration(get('feed', 'duration')),
+            'thumbnail': get('feed', 'image'),
+            'timestamp': parse_iso8601(feed.get('creatime'), ' '),
+            'uploader': get('author', 'nickname'),
+            'uploader_id': get('author', 'uid'),
+            'formats': self._extract_m3u8_formats(
+                feed['feed']['m3u8'], video_id, 'mp4', 'm3u8_native'),
+        }
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
new file mode 100644 (file)
index 0000000..97e36f0
--- /dev/null
@@ -0,0 +1,96 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    parse_duration,
+    unified_strdate,
+)
+
+
+class HuffPostIE(InfoExtractor):
+    IE_DESC = 'Huffington Post'
+    _VALID_URL = r'''(?x)
+        https?://(embed\.)?live\.huffingtonpost\.com/
+        (?:
+            r/segment/[^/]+/|
+            HPLEmbedPlayer/\?segmentId=
+        )
+        (?P<id>[0-9a-f]+)'''
+
+    _TEST = {
+        'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
+        'md5': '55f5e8981c1c80a64706a44b74833de8',
+        'info_dict': {
+            'id': '52dd3e4b02a7602131000677',
+            'ext': 'mp4',
+            'title': 'Legalese It! with @MikeSacksHP',
+            'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more.  ',
+            'duration': 1549,
+            'upload_date': '20140124',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['HTTP Error 404: Not Found'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        api_url = 'http://embed.live.huffingtonpost.com/api/segments/%s.json' % video_id
+        data = self._download_json(api_url, video_id)['data']
+
+        video_title = data['title']
+        duration = parse_duration(data.get('running_time'))
+        upload_date = unified_strdate(
+            data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))
+        description = data.get('description')
+
+        thumbnails = []
+        for image_url in filter(None, data['images'].values()):
+            m = re.match(r'.*-([0-9]+x[0-9]+)\.', image_url)
+            if not m:
+                continue
+            thumbnails.append({
+                'url': image_url,
+                'resolution': m.group(1),
+            })
+
+        formats = []
+        sources = data.get('sources', {})
+        live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items())
+        for key, source_url in live_sources:
+            ext = determine_ext(source_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    source_url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
+            else:
+                formats.append({
+                    'format': key,
+                    'format_id': key.replace('/', '.'),
+                    'ext': 'mp4',
+                    'url': source_url,
+                    'vcodec': 'none' if key.startswith('audio/') else None,
+                })
+
+        if not formats and data.get('fivemin_id'):
+            return self.url_result('5min:%s' % data['fivemin_id'])
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': description,
+            'formats': formats,
+            'duration': duration,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py
new file mode 100644 (file)
index 0000000..3fdaac5
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    urlencode_postdata,
+)
+
+
+class HungamaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?hungama\.com/
+                        (?:
+                            (?:video|movie)/[^/]+/|
+                            tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.hungama.com/video/krishna-chants/39349649/',
+        'md5': 'a845a6d1ebd08d80c1035126d49bd6a0',
+        'info_dict': {
+            'id': '2931166',
+            'ext': 'mp4',
+            'title': 'Lucky Ali - Kitni Haseen Zindagi',
+            'track': 'Kitni Haseen Zindagi',
+            'artist': 'Lucky Ali',
+            'album': 'Aks',
+            'release_year': 2000,
+        }
+    }, {
+        'url': 'https://www.hungama.com/movie/kahaani-2/44129919/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        info = self._search_json_ld(webpage, video_id)
+
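+        # the stream URL comes from an AJAX endpoint that expects form-encoded POST data and an XHR header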
+        m3u8_url = self._download_json(
+            'https://www.hungama.com/index.php', video_id,
+            data=urlencode_postdata({'content_id': video_id}), headers={
+                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                'X-Requested-With': 'XMLHttpRequest',
+            }, query={
+                'c': 'common',
+                'm': 'get_video_mdn_url',
+            })['stream_url']
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        info.update({
+            'id': video_id,
+            'formats': formats,
+        })
+        return info
+
+
+class HungamaSongIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/',
+        'md5': 'a845a6d1ebd08d80c1035126d49bd6a0',
+        'info_dict': {
+            'id': '2931166',
+            'ext': 'mp4',
+            'title': 'Lucky Ali - Kitni Haseen Zindagi',
+            'track': 'Kitni Haseen Zindagi',
+            'artist': 'Lucky Ali',
+            'album': 'Aks',
+            'release_year': 2000,
+        }
+    }
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        data = self._download_json(
+            'https://www.hungama.com/audio-player-data/track/%s' % audio_id,
+            audio_id, query={'_country': 'IN'})[0]
+
+        track = data['song_name']
+        artist = data.get('singer_name')
+
+        m3u8_url = self._download_json(
+            data.get('file') or data['preview_link'],
+            audio_id)['response']['media_url']
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        title = '%s - %s' % (artist, track) if artist else track
+        thumbnail = data.get('img_src') or data.get('album_image')
+
+        return {
+            'id': audio_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'track': track,
+            'artist': artist,
+            'album': data.get('album_name'),
+            'release_year': int_or_none(data.get('date')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py
new file mode 100644 (file)
index 0000000..9ca28d6
--- /dev/null
@@ -0,0 +1,49 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class HypemIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P<id>[0-9a-z]{5})'
+    _TEST = {
+        'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME',
+        'md5': 'b9cc91b5af8995e9f0c1cee04c575828',
+        'info_dict': {
+            'id': '1v6ga',
+            'ext': 'mp3',
+            'title': 'Tame',
+            'uploader': 'BODYWORK',
+            'timestamp': 1371810457,
+            'upload_date': '20130621',
+        }
+    }
+
+    def _real_extract(self, url):
+        track_id = self._match_id(url)
+
+        response = self._download_webpage(url, track_id)
+
+        track = self._parse_json(self._html_search_regex(
+            r'(?s)<script\s+type="application/json"\s+id="displayList-data">(.+?)</script>',
+            response, 'tracks'), track_id)['tracks'][0]
+
+        track_id = track['id']
+        title = track['song']
+
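+        # the serve/source endpoint resolves the track id and key to the actual MP3 URL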
+        final_url = self._download_json(
+            'http://hypem.com/serve/source/%s/%s' % (track_id, track['key']),
+            track_id, 'Downloading metadata', headers={
+                'Content-Type': 'application/json'
+            })['url']
+
+        return {
+            'id': track_id,
+            'url': final_url,
+            'ext': 'mp3',
+            'title': title,
+            'uploader': track.get('artist'),
+            'duration': int_or_none(track.get('time')),
+            'timestamp': int_or_none(track.get('ts')),
+            'track': title,
+        }
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
new file mode 100644 (file)
index 0000000..a96ea80
--- /dev/null
@@ -0,0 +1,232 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class IGNIE(InfoExtractor):
+    """
+    Extractor for some of the IGN sites, like www.ign.com, es.ign.com and de.ign.com.
+    Some videos of it.ign.com are also supported.
+    """
+
+    _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)'
+    IE_NAME = 'ign.com'
+
+    _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s'
+    _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']'
+
+    _TESTS = [
+        {
+            'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+            'md5': 'febda82c4bafecd2d44b6e1a18a595f8',
+            'info_dict': {
+                'id': '8f862beef863986b2785559b9e1aa599',
+                'ext': 'mp4',
+                'title': 'The Last of Us Review',
+                'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
+                'timestamp': 1370440800,
+                'upload_date': '20130605',
+                'uploader_id': 'cberidon@ign.com',
+            }
+        },
+        {
+            'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
+            'info_dict': {
+                'id': '100-little-things-in-gta-5-that-will-blow-your-mind',
+            },
+            'playlist': [
+                {
+                    'info_dict': {
+                        'id': '5ebbd138523268b93c9141af17bec937',
+                        'ext': 'mp4',
+                        'title': 'GTA 5 Video Review',
+                        'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
+                        'timestamp': 1379339880,
+                        'upload_date': '20130916',
+                        'uploader_id': 'danieljkrupa@gmail.com',
+                    },
+                },
+                {
+                    'info_dict': {
+                        'id': '638672ee848ae4ff108df2a296418ee2',
+                        'ext': 'mp4',
+                        'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+                        'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
+                        'timestamp': 1386878820,
+                        'upload_date': '20131212',
+                        'uploader_id': 'togilvie@ign.com',
+                    },
+                },
+            ],
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
+            'md5': '618fedb9c901fd086f6f093564ef8558',
+            'info_dict': {
+                'id': '078fdd005f6d3c02f63d795faa1b984f',
+                'ext': 'mp4',
+                'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
+                'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.',
+                'timestamp': 1408047180,
+                'upload_date': '20140814',
+                'uploader_id': 'jamesduggan1990@gmail.com',
+            },
+        },
+        {
+            'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
+            'only_matching': True,
+        },
+        {
+            # videoId pattern
+            'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
+            'only_matching': True,
+        },
+    ]
+
+    def _find_video_id(self, webpage):
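+        # known page markups, tried in order; _search_regex accepts a list of patterns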
+        res_id = [
+            r'"video_id"\s*:\s*"(.*?)"',
+            r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
+            r'data-video-id="(.+?)"',
+            r'<object id="vid_(.+?)"',
+            r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+            r'videoId&quot;\s*:\s*&quot;(.+?)&quot;',
+            r'videoId["\']\s*:\s*["\']([^"\']+?)["\']',
+        ]
+        return self._search_regex(res_id, webpage, 'video id', default=None)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        name_or_id = mobj.group('name_or_id')
+        page_type = mobj.group('type')
+        webpage = self._download_webpage(url, name_or_id)
+        if page_type != 'video':
+            multiple_urls = re.findall(
+                r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',
+                webpage)
+            if multiple_urls:
+                entries = [self.url_result(u, ie='IGN') for u in multiple_urls]
+                return {
+                    '_type': 'playlist',
+                    'id': name_or_id,
+                    'entries': entries,
+                }
+
+        video_id = self._find_video_id(webpage)
+        if not video_id:
+            return self.url_result(self._search_regex(
+                self._EMBED_RE, webpage, 'embed url'))
+        return self._get_video_info(video_id)
+
+    def _get_video_info(self, video_id):
+        api_data = self._download_json(
+            self._API_URL_TEMPLATE % video_id, video_id)
+
+        formats = []
+        m3u8_url = api_data['refs'].get('m3uUrl')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+        f4m_url = api_data['refs'].get('f4mUrl')
+        if f4m_url:
+            formats.extend(self._extract_f4m_formats(
+                f4m_url, video_id, f4m_id='hds', fatal=False))
+        for asset in api_data['assets']:
+            formats.append({
+                'url': asset['url'],
+                'tbr': asset.get('actual_bitrate_kbps'),
+                'fps': asset.get('frame_rate'),
+                'height': int_or_none(asset.get('height')),
+                'width': int_or_none(asset.get('width')),
+            })
+        self._sort_formats(formats)
+
+        thumbnails = [{
+            'url': thumbnail['url']
+        } for thumbnail in api_data.get('thumbnails', [])]
+
+        metadata = api_data['metadata']
+
+        return {
+            'id': api_data.get('videoId') or video_id,
+            'title': metadata.get('longTitle') or metadata.get('name') or metadata['title'],
+            'description': metadata.get('description'),
+            'timestamp': parse_iso8601(metadata.get('publishDate')),
+            'duration': int_or_none(metadata.get('duration')),
+            'display_id': metadata.get('slug') or video_id,
+            'uploader_id': metadata.get('creator'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+
+class OneUPIE(IGNIE):
+    _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html'
+    IE_NAME = '1up.com'
+
+    _TESTS = [{
+        'url': 'http://gamevideos.1up.com/video/id/34976.html',
+        'md5': 'c9cc69e07acb675c31a16719f909e347',
+        'info_dict': {
+            'id': '34976',
+            'ext': 'mp4',
+            'title': 'Sniper Elite V2 - Trailer',
+            'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826',
+            'timestamp': 1313099220,
+            'upload_date': '20110811',
+            'uploader_id': 'IGN',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        result = super(OneUPIE, self)._real_extract(url)
+        result['id'] = mobj.group('name_or_id')
+        return result
+
+
+class PCMagIE(IGNIE):
+    _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)'
+    IE_NAME = 'pcmag'
+
+    _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]'
+
+    _TESTS = [{
+        'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
+        'md5': '212d6154fd0361a2781075f1febbe9ad',
+        'info_dict': {
+            'id': 'ee10d774b508c9b8ec07e763b9125b91',
+            'ext': 'mp4',
+            'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?',
+            'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3',
+            'timestamp': 1420571160,
+            'upload_date': '20150106',
+            'uploader_id': 'cozzipix@gmail.com',
+        }
+    }, {
+        'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp',
+        'md5': '94130c1ca07ba0adb6088350681f16c1',
+        'info_dict': {
+            'id': '042e560ba94823d43afcb12ddf7142ca',
+            'ext': 'mp4',
+            'title': 'HTC\'s Weird New Re Camera - What\'s New Now',
+            'description': 'md5:53433c45df96d2ea5d0fda18be2ca908',
+            'timestamp': 1412953920,
+            'upload_date': '20141010',
+            'uploader_id': 'chris_snyder@pcmag.com',
+        }
+    }]
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
new file mode 100644 (file)
index 0000000..a313019
--- /dev/null
@@ -0,0 +1,147 @@
+from __future__ import unicode_literals
+
+import base64
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    mimetype2ext,
+    parse_duration,
+    qualities,
+    try_get,
+    url_or_none,
+)
+
+
+class ImdbIE(InfoExtractor):
+    IE_NAME = 'imdb'
+    IE_DESC = 'Internet Movie Database trailers'
+    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.imdb.com/video/imdb/vi2524815897',
+        'info_dict': {
+            'id': '2524815897',
+            'ext': 'mp4',
+            'title': 'No. 2',
+            'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
+            'duration': 152,
+        }
+    }, {
+        'url': 'http://www.imdb.com/video/_/vi2524815897',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/videoplayer/vi1562949145',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
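+        # the playback data endpoint takes a base64-encoded JSON descriptor as its 'key' query parameter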
+        data = self._download_json(
+            'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
+            query={
+                'key': base64.b64encode(json.dumps({
+                    'type': 'VIDEO_PLAYER',
+                    'subType': 'FORCE_LEGACY',
+                    'id': 'vi%s' % video_id,
+                }).encode()).decode(),
+            })[0]
+
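+        # qualities() ranks the listed definitions from worst to best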
+        quality = qualities(('SD', '480p', '720p', '1080p'))
+        formats = []
+        for encoding in data['videoLegacyEncodings']:
+            if not encoding or not isinstance(encoding, dict):
+                continue
+            video_url = url_or_none(encoding.get('url'))
+            if not video_url:
+                continue
+            ext = mimetype2ext(encoding.get(
+                'mimeType')) or determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    preference=1, m3u8_id='hls', fatal=False))
+                continue
+            format_id = encoding.get('definition')
+            formats.append({
+                'format_id': format_id,
+                'url': video_url,
+                'ext': ext,
+                'quality': quality(format_id),
+            })
+        self._sort_formats(formats)
+
+        webpage = self._download_webpage(
+            'https://www.imdb.com/video/vi' + video_id, video_id)
+        video_metadata = self._parse_json(self._search_regex(
+            r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
+            'video metadata'), video_id)
+
+        video_info = video_metadata.get('VIDEO_INFO')
+        if video_info and isinstance(video_info, dict):
+            info = try_get(
+                video_info, lambda x: x[list(video_info.keys())[0]][0], dict)
+        else:
+            info = {}
+
+        title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title',
+            default=None) or info['videoTitle']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'alt_title': info.get('videoSubTitle'),
+            'formats': formats,
+            'description': info.get('videoDescription'),
+            'thumbnail': url_or_none(try_get(
+                video_metadata, lambda x: x['videoSlate']['source'])),
+            'duration': parse_duration(info.get('videoRuntime')),
+        }
+
+
+class ImdbListIE(InfoExtractor):
+    IE_NAME = 'imdb:list'
+    IE_DESC = 'Internet Movie Database lists'
+    _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P<id>\d{9})(?!/videoplayer/vi\d+)'
+    _TEST = {
+        'url': 'https://www.imdb.com/list/ls009921623/',
+        'info_dict': {
+            'id': '009921623',
+            'title': 'The Bourne Legacy',
+            'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.',
+        },
+        'playlist_count': 8,
+    }
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        webpage = self._download_webpage(url, list_id)
+        entries = [
+            self.url_result('http://www.imdb.com' + m, 'Imdb')
+            for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)]
+
+        list_title = self._html_search_regex(
+            r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>',
+            webpage, 'list title')
+        list_description = self._html_search_regex(
+            r'<div[^>]+class="[^"]*list-description[^"]*"[^>]*><p>(.*?)</p>',
+            webpage, 'list description')
+
+        return self.playlist_result(entries, list_id, list_title, list_description)
diff --git a/youtube_dl/extractor/imggaming.py b/youtube_dl/extractor/imggaming.py
new file mode 100644 (file)
index 0000000..e11f920
--- /dev/null
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    try_get,
+)
+
+
+class ImgGamingBaseIE(InfoExtractor):
+    _API_BASE = 'https://dce-frontoffice.imggaming.com/api/v2/'
+    _API_KEY = '857a1e5d-e35e-4fdf-805b-a87b6f8364bf'
+    _HEADERS = None
+    _MANIFEST_HEADERS = {'Accept-Encoding': 'identity'}
+    _REALM = None
+    _VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?'
+
+    def _real_initialize(self):
+        self._HEADERS = {
+            'Realm': 'dce.' + self._REALM,
+            'x-api-key': self._API_KEY,
+        }
+
+        email, password = self._get_login_info()
+        if email is None:
+            self.raise_login_required()
+
+        p_headers = self._HEADERS.copy()
+        p_headers['Content-Type'] = 'application/json'
+        self._HEADERS['Authorization'] = 'Bearer ' + self._download_json(
+            self._API_BASE + 'login',
+            None, 'Logging in', data=json.dumps({
+                'id': email,
+                'secret': password,
+            }).encode(), headers=p_headers)['authorisationToken']
+
+    def _call_api(self, path, media_id):
+        return self._download_json(
+            self._API_BASE + path + media_id, media_id, headers=self._HEADERS)
+
+    def _extract_dve_api_url(self, media_id, media_type):
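+        # VOD streams are looked up by path, live streams by event id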
+        stream_path = 'stream'
+        if media_type == 'video':
+            stream_path += '/vod/'
+        else:
+            stream_path += '?eventId='
+        try:
+            return self._call_api(
+                stream_path, media_id)['playerUrlCallback']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                raise ExtractorError(
+                    self._parse_json(e.cause.read().decode(), media_id)['messages'][0],
+                    expected=True)
+            raise
+
+    def _real_extract(self, url):
+        domain, media_type, media_id, playlist_id = re.match(self._VALID_URL, url).groups()
+
+        if playlist_id:
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % media_id)
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+                media_type, media_id = 'playlist', playlist_id
+
+        if media_type == 'playlist':
+            playlist = self._call_api('vod/playlist/', media_id)
+            entries = []
+            for video in try_get(playlist, lambda x: x['videos']['vods']) or []:
+                video_id = str_or_none(video.get('id'))
+                if not video_id:
+                    continue
+                entries.append(self.url_result(
+                    'https://%s/video/%s' % (domain, video_id),
+                    self.ie_key(), video_id))
+            return self.playlist_result(
+                entries, media_id, playlist.get('title'),
+                playlist.get('description'))
+
+        dve_api_url = self._extract_dve_api_url(media_id, media_type)
+        video_data = self._download_json(dve_api_url, media_id)
+        is_live = media_type == 'live'
+        if is_live:
+            title = self._live_title(self._call_api('event/', media_id)['title'])
+        else:
+            title = video_data['name']
+
+        formats = []
+        for proto in ('hls', 'dash'):
+            media_url = video_data.get(proto + 'Url') or try_get(video_data, lambda x: x[proto]['url'])
+            if not media_url:
+                continue
+            if proto == 'hls':
+                m3u8_formats = self._extract_m3u8_formats(
+                    media_url, media_id, 'mp4', 'm3u8' if is_live else 'm3u8_native',
+                    m3u8_id='hls', fatal=False, headers=self._MANIFEST_HEADERS)
+                for f in m3u8_formats:
+                    f.setdefault('http_headers', {}).update(self._MANIFEST_HEADERS)
+                    formats.append(f)
+            else:
+                formats.extend(self._extract_mpd_formats(
+                    media_url, media_id, mpd_id='dash', fatal=False,
+                    headers=self._MANIFEST_HEADERS))
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for subtitle in video_data.get('subtitles', []):
+            subtitle_url = subtitle.get('url')
+            if not subtitle_url:
+                continue
+            subtitles.setdefault(subtitle.get('lang', 'en_US'), []).append({
+                'url': subtitle_url,
+            })
+
+        return {
+            'id': media_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': video_data.get('thumbnailUrl'),
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'tags': video_data.get('tags'),
+            'is_live': is_live,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py
new file mode 100644 (file)
index 0000000..4dc7b0b
--- /dev/null
@@ -0,0 +1,154 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    ExtractorError,
+)
+
+
+class ImgurIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+    _TESTS = [{
+        'url': 'https://i.imgur.com/A61SaA1.gifv',
+        'info_dict': {
+            'id': 'A61SaA1',
+            'ext': 'mp4',
+            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
+        },
+    }, {
+        'url': 'https://imgur.com/A61SaA1',
+        'only_matching': True,
+    }, {
+        'url': 'https://i.imgur.com/crGpqCV.mp4',
+        'only_matching': True,
+    }, {
+        # no title
+        'url': 'https://i.imgur.com/jxBXAMC.gifv',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id)
+
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, default=None))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, default=None))
+
+        video_elements = self._search_regex(
+            r'(?s)<div class="video-elements">(.*?)</div>',
+            webpage, 'video elements', default=None)
+        if not video_elements:
+            raise ExtractorError(
+                'No sources found for video %s. Maybe an image?' % video_id,
+                expected=True)
+
+        formats = []
+        for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
+            formats.append({
+                'format_id': m.group('type').partition('/')[2],
+                'url': self._proto_relative_url(m.group('src')),
+                'ext': mimetype2ext(m.group('type')),
+                'width': width,
+                'height': height,
+                'http_headers': {
+                    'User-Agent': 'youtube-dlc (like wget)',
+                },
+            })
+
+        gif_json = self._search_regex(
+            r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
+            webpage, 'GIF code', fatal=False)
+        if gif_json:
+            gifd = self._parse_json(
+                gif_json, video_id, transform_source=js_to_json)
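+            # keep the raw GIF as a low-preference fallback to the mp4/webm sources above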
+            formats.append({
+                'format_id': 'gif',
+                'preference': -10,
+                'width': width,
+                'height': height,
+                'ext': 'gif',
+                'acodec': 'none',
+                'vcodec': 'gif',
+                'container': 'gif',
+                'url': self._proto_relative_url(gifd['gifUrl']),
+                'filesize': gifd.get('size'),
+                'http_headers': {
+                    'User-Agent': 'youtube-dlc (like wget)',
+                },
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': self._og_search_title(webpage, default=video_id),
+        }
+
+
+class ImgurGalleryIE(InfoExtractor):
+    IE_NAME = 'imgur:gallery'
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://imgur.com/gallery/Q95ko',
+        'info_dict': {
+            'id': 'Q95ko',
+            'title': 'Adding faces make every GIF better',
+        },
+        'playlist_count': 25,
+    }, {
+        'url': 'http://imgur.com/topic/Aww/ll5Vk',
+        'only_matching': True,
+    }, {
+        'url': 'https://imgur.com/gallery/YcAQlkx',
+        'info_dict': {
+            'id': 'YcAQlkx',
+            'ext': 'mp4',
+            'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
+        }
+    }, {
+        'url': 'http://imgur.com/topic/Funny/N8rOudd',
+        'only_matching': True,
+    }, {
+        'url': 'http://imgur.com/r/aww/VQcQPhM',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        gallery_id = self._match_id(url)
+
+        data = self._download_json(
+            'https://imgur.com/gallery/%s.json' % gallery_id,
+            gallery_id)['data']['image']
+
+        if data.get('is_album'):
+            entries = [
+                self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash'])
+                for image in data['album_images']['images'] if image.get('hash')]
+            return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description'))
+
+        return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id)
+
+
+class ImgurAlbumIE(ImgurGalleryIE):
+    IE_NAME = 'imgur:album'
+    _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://imgur.com/a/j6Orj',
+        'info_dict': {
+            'id': 'j6Orj',
+            'title': 'A Literary Analysis of "Star Wars: The Force Awakens"',
+        },
+        'playlist_count': 12,
+    }]
diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py
new file mode 100644 (file)
index 0000000..12695af
--- /dev/null
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    strip_or_none,
+    xpath_attr,
+    xpath_text,
+)
+
+
+class InaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)'
+    _TESTS = [{
+        'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
+        'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
+        'info_dict': {
+            'id': 'I12055569',
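+        # 'creatime' is a space-delimited date/time string, hence the ' ' delimiter for parse_iso8601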
+            'ext': 'mp4',
+            'title': 'François Hollande "Je crois que c\'est clair"',
+            'description': 'md5:3f09eb072a06cb286b8f7e4f77109663',
+        }
+    }, {
+        'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ina.fr/audio/P16173408',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ina.fr/video/P16173408-video.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        info_doc = self._download_xml(
+            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
+        item = info_doc.find('channel/item')
+        title = xpath_text(item, 'title', fatal=True)
+        media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/')
+        content = item.find(media_ns_xpath('content'))
+
+        get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url')
+        formats = []
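+        # fixed-size variants exposed by the MRSS feed: bq (400x300), mq (512x384), hq (768x576)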
+        for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)):
+            q_url = get_furl(q)
+            if not q_url:
+                continue
+            formats.append({
+                'format_id': q,
+                'url': q_url,
+                'width': w,
+                'height': h,
+            })
+        if not formats:
+            furl = get_furl('player') or content.attrib['url']
+            ext = determine_ext(furl)
+            formats = [{
+                'url': furl,
+                'vcodec': 'none' if ext == 'mp3' else None,
+                'ext': ext,
+            }]
+
+        thumbnails = []
+        for thumbnail in content.findall(media_ns_xpath('thumbnail')):
+            thumbnail_url = thumbnail.get('url')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'height': int_or_none(thumbnail.get('height')),
+                'width': int_or_none(thumbnail.get('width')),
+            })
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': strip_or_none(xpath_text(item, 'description')),
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py
new file mode 100644 (file)
index 0000000..d5b258a
--- /dev/null
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+
+
+class IncIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?inc\.com/(?:[^/]+/)+(?P<id>[^.]+)\.html'
+    _TESTS = [{
+        'url': 'http://www.inc.com/tip-sheet/bill-gates-says-these-5-books-will-make-you-smarter.html',
+        'md5': '7416739c9c16438c09fa35619d6ba5cb',
+        'info_dict': {
+            'id': '1_wqig47aq',
+            'ext': 'mov',
+            'title': 'Bill Gates Says These 5 Books Will Make You Smarter',
+            'description': 'md5:bea7ff6cce100886fc1995acb743237e',
+            'timestamp': 1474414430,
+            'upload_date': '20160920',
+            'uploader_id': 'video@inc.com',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # div with id=kaltura_player_1_kqs38cgm
+        'url': 'https://www.inc.com/oscar-raymundo/richard-branson-young-entrepeneurs.html',
+        'info_dict': {
+            'id': '1_kqs38cgm',
+            'ext': 'mp4',
+            'title': 'Branson: "In the end, you have to say, Screw it. Just do it."',
+            'description': 'md5:21b832d034f9af5191ca5959da5e9cb6',
+            'timestamp': 1364403232,
+            'upload_date': '20130327',
+            'uploader_id': 'incdigital@inc.com',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        partner_id = self._search_regex(
+            r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage,
+            'partner id', default='1034971')
+
+        kaltura_id = self._search_regex(
+            r'id=(["\'])kaltura_player_(?P<id>.+?)\1', webpage, 'kaltura id',
+            default=None, group='id') or self._parse_json(self._search_regex(
+                r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'),
+            display_id)['vid_kaltura_id']
+
+        return self.url_result(
+            'kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key())
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
new file mode 100644 (file)
index 0000000..4c16243
--- /dev/null
@@ -0,0 +1,128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
+    update_url_query,
+)
+
+
+class IndavideoEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)'
+    _TESTS = [{
+        'url': 'http://indavideo.hu/player/video/1bdc3c6d80/',
+        'md5': 'c8a507a1c7410685f83a06eaeeaafeab',
+        'info_dict': {
+            'id': '1837039',
+            'ext': 'mp4',
+            'title': 'Cicatánc',
+            'description': '',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'cukiajanlo',
+            'uploader_id': '83729',
+            'timestamp': 1439193826,
+            'upload_date': '20150810',
+            'duration': 72,
+            'age_limit': 0,
+            'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'],
+        },
+    }, {
+        'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1',
+        'only_matching': True,
+    }, {
+        'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1',
+        'only_matching': True,
+    }]
+
+    # Some example URLs covered by the generic extractor:
+    #   http://indavideo.hu/video/Vicces_cica_1
+    #   http://index.indavideo.hu/video/2015_0728_beregszasz
+    #   http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko
+    #   http://erotika.indavideo.hu/video/Amator_tini_punci
+    #   http://film.indavideo.hu/video/f_hrom_nagymamm_volt
+    #   http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
+            video_id)['data']
+
+        title = video['title']
+
+        video_urls = []
+
+        video_files = video.get('video_files')
+        if isinstance(video_files, list):
+            video_urls.extend(video_files)
+        elif isinstance(video_files, dict):
+            video_urls.extend(video_files.values())
+
+        video_file = video.get('video_file')
+        if video_file:
+            video_urls.append(video_file)
+        video_urls = list(set(video_urls))
+
+        video_prefix = video_urls[0].rsplit('/', 1)[0]
+
+        for flv_file in video.get('flv_files', []):
+            flv_url = '%s/%s' % (video_prefix, flv_file)
+            if flv_url not in video_urls:
+                video_urls.append(flv_url)
+
+        filesh = video.get('filesh')
+
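+        # 'filesh', when present, maps video height to a token that must be appended to the URL query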
+        formats = []
+        for video_url in video_urls:
+            height = int_or_none(self._search_regex(
+                r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None))
+            if filesh:
+                if not height:
+                    continue
+                token = filesh.get(compat_str(height))
+                if token is None:
+                    continue
+                video_url = update_url_query(video_url, {'token': token})
+            formats.append({
+                'url': video_url,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        timestamp = video.get('date')
+        if timestamp:
+            # upload date is in CEST
+            timestamp = parse_iso8601(timestamp + ' +0200', ' ')
+
+        thumbnails = [{
+            'url': self._proto_relative_url(thumbnail)
+        } for thumbnail in video.get('thumbnails', [])]
+
+        tags = [tag['title'] for tag in video.get('tags') or []]
+
+        return {
+            'id': video.get('id') or video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnails': thumbnails,
+            'uploader': video.get('user_name'),
+            'uploader_id': video.get('user_id'),
+            'timestamp': timestamp,
+            'duration': int_or_none(video.get('length')),
+            'age_limit': parse_age_limit(video.get('age_limit')),
+            'tags': tags,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
new file mode 100644 (file)
index 0000000..18249cf
--- /dev/null
@@ -0,0 +1,136 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_unquote,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    update_url_query,
+)
+from .bokecc import BokeCCBaseIE
+
+
+class InfoQIE(BokeCCBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
+        'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
+        'info_dict': {
+            'id': 'A-Few-of-My-Favorite-Python-Things',
+            'ext': 'mp4',
+            'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
+            'title': 'A Few of My Favorite [Python] Things',
+        },
+    }, {
+        'url': 'http://www.infoq.com/fr/presentations/changez-avis-sur-javascript',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.infoq.com/cn/presentations/openstack-continued-delivery',
+        'md5': '4918d0cca1497f2244572caf626687ef',
+        'info_dict': {
+            'id': 'openstack-continued-delivery',
+            'title': 'OpenStack持续交付之路',
+            'ext': 'flv',
+            'description': 'md5:308d981fb28fa42f49f9568322c683ff',
+        },
+    }, {
+        'url': 'https://www.infoq.com/presentations/Simple-Made-Easy',
+        'md5': '0e34642d4d9ef44bf86f66f6399672db',
+        'info_dict': {
+            'id': 'Simple-Made-Easy',
+            'title': 'Simple Made Easy',
+            'ext': 'mp3',
+            'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b',
+        },
+        'params': {
+            'format': 'bestaudio',
+        },
+    }]
+
+    def _extract_rtmp_video(self, webpage):
+        # The server URL is hardcoded
+        video_url = 'rtmpe://video.infoq.com/cfx/st/'
+
+        # Extract video URL
+        encoded_id = self._search_regex(
+            r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id', default=None)
+
+        real_id = compat_urllib_parse_unquote(compat_b64decode(encoded_id).decode('utf-8'))
+        playpath = 'mp4:' + real_id
+
+        return [{
+            'format_id': 'rtmp_video',
+            'url': video_url,
+            'ext': determine_ext(playpath),
+            'play_path': playpath,
+        }]
+
+    def _extract_cf_auth(self, webpage):
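+        # the page embeds CloudFront signed-URL parameters (Policy, Signature, Key-Pair-Id)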
+        policy = self._search_regex(r'InfoQConstants\.scp\s*=\s*\'([^\']+)\'', webpage, 'policy')
+        signature = self._search_regex(r'InfoQConstants\.scs\s*=\s*\'([^\']+)\'', webpage, 'signature')
+        key_pair_id = self._search_regex(r'InfoQConstants\.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id')
+        return {
+            'Policy': policy,
+            'Signature': signature,
+            'Key-Pair-Id': key_pair_id,
+        }
+
+    def _extract_http_video(self, webpage):
+        http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL')
+        http_video_url = update_url_query(http_video_url, self._extract_cf_auth(webpage))
+        return [{
+            'format_id': 'http_video',
+            'url': http_video_url,
+        }]
+
+    def _extract_http_audio(self, webpage, video_id):
+        fields = self._hidden_inputs(webpage)
+        http_audio_url = fields.get('filename')
+        if not http_audio_url:
+            return []
+
+        # base URL is found in the Location header in the response returned by
+        # GET https://www.infoq.com/mp3download.action?filename=... when logged in.
+        http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url)
+        http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))
+
+        # the audio file sometimes seems to be missing even though there is a
+        # download link, so probe the URL to make sure
+        if not self._is_valid_url(http_audio_url, video_id):
+            return []
+
+        return [{
+            'format_id': 'http_audio',
+            'url': http_audio_url,
+            'vcodec': 'none',
+        }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+        video_description = self._html_search_meta('description', webpage, 'description')
+
+        if '/cn/' in url:
+            # for Chinese videos, an HTTP video URL exists but always fails with 403
+            formats = self._extract_bokecc_formats(webpage, video_id)
+        else:
+            formats = (
+                self._extract_rtmp_video(webpage)
+                + self._extract_http_video(webpage)
+                + self._extract_http_audio(webpage, video_id))
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': video_description,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
new file mode 100644 (file)
index 0000000..b061850
--- /dev/null
@@ -0,0 +1,428 @@
+from __future__ import unicode_literals
+
+import itertools
+import hashlib
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_HTTPError,
+)
+from ..utils import (
+    ExtractorError,
+    get_element_by_attribute,
+    int_or_none,
+    lowercase_escape,
+    std_headers,
+    try_get,
+    url_or_none,
+)
+
+
+class InstagramIE(InfoExtractor):
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv)/(?P<id>[^/?#&]+))'
+    _TESTS = [{
+        'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
+        'md5': '0d2da106a9d2631273e192b372806516',
+        'info_dict': {
+            'id': 'aye83DjauH',
+            'ext': 'mp4',
+            'title': 'Video by naomipq',
+            'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1371748545,
+            'upload_date': '20130620',
+            'uploader_id': 'naomipq',
+            'uploader': 'Naomi Leonor Phan-Quang',
+            'like_count': int,
+            'comment_count': int,
+            'comments': list,
+        },
+    }, {
+        # missing description
+        'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
+        'info_dict': {
+            'id': 'BA-pQFBG8HZ',
+            'ext': 'mp4',
+            'title': 'Video by britneyspears',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1453760977,
+            'upload_date': '20160125',
+            'uploader_id': 'britneyspears',
+            'uploader': 'Britney Spears',
+            'like_count': int,
+            'comment_count': int,
+            'comments': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # multi video post
+        'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/',
+        'playlist': [{
+            'info_dict': {
+                'id': 'BQ0dSaohpPW',
+                'ext': 'mp4',
+                'title': 'Video 1',
+            },
+        }, {
+            'info_dict': {
+                'id': 'BQ0dTpOhuHT',
+                'ext': 'mp4',
+                'title': 'Video 2',
+            },
+        }, {
+            'info_dict': {
+                'id': 'BQ0dT7RBFeF',
+                'ext': 'mp4',
+                'title': 'Video 3',
+            },
+        }],
+        'info_dict': {
+            'id': 'BQ0eAlwhDrw',
+            'title': 'Post by instagram',
+            'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
+        },
+    }, {
+        'url': 'https://instagram.com/p/-Cmh1cukG2/',
+        'only_matching': True,
+    }, {
+        'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.instagram.com/tv/aye83DjauH/',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_embed_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+        blockquote_el = get_element_by_attribute(
+            'class', 'instagram-media', webpage)
+        if blockquote_el is None:
+            return
+
+        mobj = re.search(
+            r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
+        if mobj:
+            return mobj.group('link')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        url = mobj.group('url')
+
+        webpage = self._download_webpage(url, video_id)
+
+        (video_url, description, thumbnail, timestamp, uploader,
+         uploader_id, like_count, comment_count, comments, height,
+         width) = [None] * 11
+
+        shared_data = self._parse_json(
+            self._search_regex(
+                r'window\._sharedData\s*=\s*({.+?});',
+                webpage, 'shared data', default='{}'),
+            video_id, fatal=False)
+        if shared_data:
+            media = try_get(
+                shared_data,
+                (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
+                 lambda x: x['entry_data']['PostPage'][0]['media']),
+                dict)
+            if media:
+                video_url = media.get('video_url')
+                height = int_or_none(media.get('dimensions', {}).get('height'))
+                width = int_or_none(media.get('dimensions', {}).get('width'))
+                description = try_get(
+                    media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                    compat_str) or media.get('caption')
+                thumbnail = media.get('display_src')
+                timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
+                uploader = media.get('owner', {}).get('full_name')
+                uploader_id = media.get('owner', {}).get('username')
+
+                def get_count(key, kind):
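+                    # newer GraphQL payloads use edge_media_* counters, older ones plain '<kind>s' dicts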
+                    return int_or_none(try_get(
+                        media, (lambda x: x['edge_media_%s' % key]['count'],
+                                lambda x: x['%ss' % kind]['count'])))
+                like_count = get_count('preview_like', 'like')
+                comment_count = get_count('to_comment', 'comment')
+
+                comments = [{
+                    'author': comment.get('user', {}).get('username'),
+                    'author_id': comment.get('user', {}).get('id'),
+                    'id': comment.get('id'),
+                    'text': comment.get('text'),
+                    'timestamp': int_or_none(comment.get('created_at')),
+                } for comment in media.get(
+                    'comments', {}).get('nodes', []) if comment.get('text')]
+                if not video_url:
+                    edges = try_get(
+                        media, lambda x: x['edge_sidecar_to_children']['edges'],
+                        list) or []
+                    if edges:
+                        entries = []
+                        for edge_num, edge in enumerate(edges, start=1):
+                            node = try_get(edge, lambda x: x['node'], dict)
+                            if not node:
+                                continue
+                            node_video_url = url_or_none(node.get('video_url'))
+                            if not node_video_url:
+                                continue
+                            entries.append({
+                                'id': node.get('shortcode') or node['id'],
+                                'title': 'Video %d' % edge_num,
+                                'url': node_video_url,
+                                'thumbnail': node.get('display_url'),
+                                'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
+                                'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
+                                'view_count': int_or_none(node.get('video_view_count')),
+                            })
+                        return self.playlist_result(
+                            entries, video_id,
+                            'Post by %s' % uploader_id if uploader_id else None,
+                            description)
+
+        if not video_url:
+            video_url = self._og_search_video_url(webpage, secure=False)
+
+        formats = [{
+            'url': video_url,
+            'width': width,
+            'height': height,
+        }]
+
+        if not uploader_id:
+            uploader_id = self._search_regex(
+                r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
+                webpage, 'uploader id', fatal=False)
+
+        if not description:
+            description = self._search_regex(
+                r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
+            if description is not None:
+                description = lowercase_escape(description)
+
+        if not thumbnail:
+            thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'ext': 'mp4',
+            'title': 'Video by %s' % uploader_id,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader_id': uploader_id,
+            'uploader': uploader,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'comments': comments,
+        }
+
+
+class InstagramPlaylistIE(InfoExtractor):
+    # Superclass for handling any GraphQL-based query that results in a
+    # playlist (e.g. a user profile or a hashtag).
+
+    _gis_tmpl = None  # caches the GIS signature template that last worked
+
+    def _parse_graphql(self, webpage, item_id):
+        # Reads a webpage and returns its GraphQL data.
+        return self._parse_json(
+            self._search_regex(
+                r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'),
+            item_id)
+
+    def _extract_graphql(self, data, url):
+        # Parses GraphQL queries containing videos and generates a playlist.
+        def get_count(suffix):
+            return int_or_none(try_get(
+                node, lambda x: x['edge_media_' + suffix]['count']))
+
+        uploader_id = self._match_id(url)
+        csrf_token = data['config']['csrf_token']
+        rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8'
+
+        cursor = ''
+        for page_num in itertools.count(1):
+            variables = {
+                'first': 12,
+                'after': cursor,
+            }
+            variables.update(self._query_vars_for(data))
+            variables = json.dumps(variables)
+
+            if self._gis_tmpl:
+                gis_tmpls = [self._gis_tmpl]
+            else:
+                gis_tmpls = [
+                    '%s' % rhx_gis,
+                    '',
+                    '%s:%s' % (rhx_gis, csrf_token),
+                    '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
+                ]
+
+            # Try each way of generating a GIS signature in turn; use the
+            # first one that works and cache it for future requests (an
+            # illustrative sketch follows this class)
+            for gis_tmpl in gis_tmpls:
+                try:
+                    json_data = self._download_json(
+                        'https://www.instagram.com/graphql/query/', uploader_id,
+                        'Downloading JSON page %d' % page_num, headers={
+                            'X-Requested-With': 'XMLHttpRequest',
+                            'X-Instagram-GIS': hashlib.md5(
+                                ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(),
+                        }, query={
+                            'query_hash': self._QUERY_HASH,
+                            'variables': variables,
+                        })
+                    media = self._parse_timeline_from(json_data)
+                    self._gis_tmpl = gis_tmpl
+                    break
+                except ExtractorError as e:
+                    # if it's an error caused by a bad query, and there are
+                    # more GIS templates to try, ignore it and keep trying
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                        if gis_tmpl != gis_tmpls[-1]:
+                            continue
+                    raise
+
+            edges = media.get('edges')
+            if not edges or not isinstance(edges, list):
+                break
+
+            for edge in edges:
+                node = edge.get('node')
+                if not node or not isinstance(node, dict):
+                    continue
+                if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
+                    continue
+                video_id = node.get('shortcode')
+                if not video_id:
+                    continue
+
+                info = self.url_result(
+                    'https://instagram.com/p/%s/' % video_id,
+                    ie=InstagramIE.ie_key(), video_id=video_id)
+
+                description = try_get(
+                    node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                    compat_str)
+                thumbnail = node.get('thumbnail_src') or node.get('display_src')
+                timestamp = int_or_none(node.get('taken_at_timestamp'))
+
+                comment_count = get_count('to_comment')
+                like_count = get_count('preview_like')
+                view_count = int_or_none(node.get('video_view_count'))
+
+                info.update({
+                    'description': description,
+                    'thumbnail': thumbnail,
+                    'timestamp': timestamp,
+                    'comment_count': comment_count,
+                    'like_count': like_count,
+                    'view_count': view_count,
+                })
+
+                yield info
+
+            page_info = media.get('page_info')
+            if not page_info or not isinstance(page_info, dict):
+                break
+
+            has_next_page = page_info.get('has_next_page')
+            if not has_next_page:
+                break
+
+            cursor = page_info.get('end_cursor')
+            if not cursor or not isinstance(cursor, compat_str):
+                break
+
+    def _real_extract(self, url):
+        user_or_tag = self._match_id(url)
+        webpage = self._download_webpage(url, user_or_tag)
+        data = self._parse_graphql(webpage, user_or_tag)
+
+        self._set_cookie('instagram.com', 'ig_pr', '1')
+
+        return self.playlist_result(
+            self._extract_graphql(data, url), user_or_tag, user_or_tag)
+
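+
+# Editor's sketch (not part of the original patch): how the X-Instagram-GIS
+# header used by InstagramPlaylistIE._extract_graphql above is derived. The
+# template and variables below are illustrative placeholder values only.
+def _instagram_gis_example():
+    variables = json.dumps({'first': 12, 'after': '', 'id': '42'})
+    gis_tmpl = '3c7ca9dcefcf966d11dacf1f151335e8'  # the rhx_gis fallback above
+    return hashlib.md5(
+        ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest()
+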
+
+class InstagramUserIE(InstagramPlaylistIE):
+    _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])'
+    IE_DESC = 'Instagram user profile'
+    IE_NAME = 'instagram:user'
+    _TEST = {
+        'url': 'https://instagram.com/porsche',
+        'info_dict': {
+            'id': 'porsche',
+            'title': 'porsche',
+        },
+        'playlist_count': 5,
+        'params': {
+            'extract_flat': True,
+            'skip_download': True,
+            'playlistend': 5,
+        }
+    }
+
+    _QUERY_HASH = '42323d64886122307be10013ad2dcc44'
+
+    @staticmethod
+    def _parse_timeline_from(data):
+        # extracts the media timeline data from a GraphQL result
+        return data['data']['user']['edge_owner_to_timeline_media']
+
+    @staticmethod
+    def _query_vars_for(data):
+        # returns a dictionary of variables to add to the timeline query based
+        # on the GraphQL of the original page
+        return {
+            'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id']
+        }
+
+
+class InstagramTagIE(InstagramPlaylistIE):
+    _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)'
+    IE_DESC = 'Instagram hashtag search'
+    IE_NAME = 'instagram:tag'
+    _TEST = {
+        'url': 'https://instagram.com/explore/tags/lolcats',
+        'info_dict': {
+            'id': 'lolcats',
+            'title': 'lolcats',
+        },
+        'playlist_count': 50,
+        'params': {
+            'extract_flat': True,
+            'skip_download': True,
+            'playlistend': 50,
+        }
+    }
+
+    _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314'
+
+    @staticmethod
+    def _parse_timeline_from(data):
+        # extracts the media timeline data from a GraphQL result
+        return data['data']['hashtag']['edge_hashtag_to_media']
+
+    @staticmethod
+    def _query_vars_for(data):
+        # returns a dictionary of variables to add to the timeline query based
+        # on the GraphQL of the original page
+        return {
+            'tag_name':
+                data['entry_data']['TagPage'][0]['graphql']['hashtag']['name']
+        }
diff --git a/youtube_dl/extractor/internazionale.py b/youtube_dl/extractor/internazionale.py
new file mode 100644 (file)
index 0000000..676e8e2
--- /dev/null
@@ -0,0 +1,85 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_timestamp
+
+
+class InternazionaleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood',
+        'md5': '3e39d32b66882c1218e305acbf8348ca',
+        'info_dict': {
+            'id': '265968',
+            'display_id': 'richard-linklater-racconta-una-scena-di-boyhood',
+            'ext': 'mp4',
+            'title': 'Richard Linklater racconta una scena di Boyhood',
+            'description': 'md5:efb7e5bbfb1a54ae2ed5a4a015f0e665',
+            'timestamp': 1424354635,
+            'upload_date': '20150219',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }, {
+        'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi',
+        'md5': '9db8663704cab73eb972d1cee0082c79',
+        'info_dict': {
+            'id': '761344',
+            'display_id': 'telefono-stare-con-noi-stessi',
+            'ext': 'mp4',
+            'title': 'Usiamo il telefono per evitare di stare con noi stessi',
+            'description': 'md5:75ccfb0d6bcefc6e7428c68b4aa1fe44',
+            'timestamp': 1535528954,
+            'upload_date': '20180829',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        DATA_RE = r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1'
+
+        title = self._search_regex(
+            DATA_RE % 'video-title', webpage, 'title', default=None,
+            group='value') or self._og_search_title(webpage)
+
+        video_id = self._search_regex(
+            DATA_RE % 'job-id', webpage, 'video id', group='value')
+        video_path = self._search_regex(
+            DATA_RE % 'video-path', webpage, 'video path', group='value')
+        video_available_abroad = self._search_regex(
+            DATA_RE % 'video-available_abroad', webpage,
+            'video available abroad', default='1', group='value')
+        video_available_abroad = video_available_abroad == '1'
+
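+        # Geo-limited videos are served from video-ita.internazionale.it
+        # rather than video.internazionale.it; the trailing dot is completed
+        # with the manifest extension (m3u8/mpd) below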
+        video_base = 'https://video%s.internazionale.it/%s/%s.' % \
+            ('' if video_available_abroad else '-ita', video_path, video_id)
+
+        formats = self._extract_m3u8_formats(
+            video_base + 'm3u8', display_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+        formats.extend(self._extract_mpd_formats(
+            video_base + 'mpd', display_id, mpd_id='dash', fatal=False))
+        self._sort_formats(formats)
+
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage, 'timestamp'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+            'timestamp': timestamp,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
new file mode 100644 (file)
index 0000000..59b0a90
--- /dev/null
@@ -0,0 +1,64 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
+class InternetVideoArchiveIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?'
+
+    _TEST = {
+        'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false',
+        'info_dict': {
+            'id': '194487',
+            'ext': 'mp4',
+            'title': 'Kick-Ass 2',
+            'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
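+    # Kept for embedding extractors (e.g. videodetective) that build the
+    # legacy configuration URL from an embed query string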
+    @staticmethod
+    def _build_json_url(query):
+        return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query
+
+    def _real_extract(self, url):
+        query = compat_parse_qs(compat_urlparse.urlparse(url).query)
+        video_id = query['publishedid'][0]
+        data = self._download_json(
+            'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx',
+            video_id, data=json.dumps({
+                'customerid': query['customerid'][0],
+                'publishedid': video_id,
+            }).encode())
+        title = data['Title']
+        formats = self._extract_m3u8_formats(
+            data['VideoUrl'], video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False)
+        file_url = formats[0]['url']
+        if '.ism/' in file_url:
+            replace_url = lambda x: re.sub(r'\.ism/[^?]+', '.ism/' + x, file_url)
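+            # e.g. (illustrative URL) replace_url('Manifest') rewrites
+            # 'https://host/a.ism/manifest(format=m3u8-aapl)?x=1' into
+            # 'https://host/a.ism/Manifest?x=1'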
+            formats.extend(self._extract_f4m_formats(
+                replace_url('.f4m'), video_id, f4m_id='hds', fatal=False))
+            formats.extend(self._extract_mpd_formats(
+                replace_url('.mpd'), video_id, mpd_id='dash', fatal=False))
+            formats.extend(self._extract_ism_formats(
+                replace_url('Manifest'), video_id, ism_id='mss', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': data.get('PosterUrl'),
+            'description': data.get('Description'),
+        }
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
new file mode 100644 (file)
index 0000000..53a550c
--- /dev/null
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+)
+
+
+class IPrimaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _GEO_BYPASS = False
+
+    _TESTS = [{
+        'url': 'https://prima.iprima.cz/particka/92-epizoda',
+        'info_dict': {
+            'id': 'p51388',
+            'ext': 'mp4',
+            'title': 'Partička (92)',
+            'description': 'md5:859d53beae4609e6dd7796413f1b6cac',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 download
+        },
+    }, {
+        'url': 'https://cnn.iprima.cz/videa/70-epizoda',
+        'info_dict': {
+            'id': 'p681554',
+            'ext': 'mp4',
+            'title': 'HLAVNÍ ZPRÁVY 3.5.2020',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 download
+        },
+    }, {
+        'url': 'http://play.iprima.cz/particka/particka-92',
+        'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1',
+        'only_matching': True,
+    }, {
+        # iframe api.play-backend.iprima.cz
+        'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2',
+        'only_matching': True,
+    }, {
+        # iframe prima.iprima.cz
+        'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.iprima.cz/filmy/desne-rande',
+        'only_matching': True,
+    }, {
+        'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby',
+        'only_matching': True,
+    }, {
+        'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy',
+        'only_matching': True,
+    }, {
+        'url': 'https://cool.iprima.cz/derava-silnice-nevadi',
+        'only_matching': True,
+    }, {
+        'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi',
+        'only_matching': True,
+    }, {
+        'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._search_regex(
+            r'<h1>([^<]+)', webpage, 'title')
+
+        video_id = self._search_regex(
+            (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)',
+             r'data-product="([^"]+)">',
+             r'id=["\']player-(p\d+)"',
+             r'playerId\s*:\s*["\']player-(p\d+)'),
+            webpage, 'real id')
+
+        playerpage = self._download_webpage(
+            'http://play.iprima.cz/prehravac/init',
+            video_id, note='Downloading player', query={
+                '_infuse': 1,
+                '_ts': round(time.time()),
+                'productId': video_id,
+            }, headers={'Referer': url})
+
+        formats = []
+
+        def extract_formats(format_url, format_key=None, lang=None):
+            ext = determine_ext(format_url)
+            new_formats = []
+            if format_key == 'hls' or ext == 'm3u8':
+                new_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False)
+            elif format_key == 'dash' or ext == 'mpd':
+                # DASH extraction is disabled here (early return); the code
+                # below is currently unreachable and kept for reference
+                return
+                new_formats = self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False)
+            if lang:
+                for f in new_formats:
+                    if not f.get('language'):
+                        f['language'] = lang
+            formats.extend(new_formats)
+
+        options = self._parse_json(
+            self._search_regex(
+                r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]',
+                playerpage, 'player options', default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+        if options:
+            for key, tracks in options.get('tracks', {}).items():
+                if not isinstance(tracks, list):
+                    continue
+                for track in tracks:
+                    src = track.get('src')
+                    if src:
+                        extract_formats(src, key.lower(), track.get('lang'))
+
+        if not formats:
+            for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage):
+                extract_formats(src)
+
+        if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage:
+            self.raise_geo_restricted(countries=['CZ'])
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'formats': formats,
+            'description': self._og_search_description(webpage, default=None),
+        }
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
new file mode 100644 (file)
index 0000000..cd11aa7
--- /dev/null
@@ -0,0 +1,394 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import itertools
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    clean_html,
+    decode_packed_codes,
+    get_element_by_id,
+    get_element_by_attribute,
+    ExtractorError,
+    ohdave_rsa_encrypt,
+    remove_start,
+)
+
+
+def md5_text(text):
+    return hashlib.md5(text.encode('utf-8')).hexdigest()
+
+
+class IqiyiSDK(object):
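+    # Pure-Python reimplementation of the string transforms performed by the
+    # obfuscated iQiyi login SDK JS; each method rewrites self.target in place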
+    def __init__(self, target, ip, timestamp):
+        self.target = target
+        self.ip = ip
+        self.timestamp = timestamp
+
+    @staticmethod
+    def split_sum(data):
+        return compat_str(sum(map(lambda p: int(p, 16), list(data))))
+
+    @staticmethod
+    def digit_sum(num):
+        if isinstance(num, int):
+            num = compat_str(num)
+        return compat_str(sum(map(int, num)))
+
+    def even_odd(self):
+        even = self.digit_sum(compat_str(self.timestamp)[::2])
+        odd = self.digit_sum(compat_str(self.timestamp)[1::2])
+        return even, odd
+
+    def preprocess(self, chunksize):
+        self.target = md5_text(self.target)
+        chunks = []
+        for i in range(32 // chunksize):
+            chunks.append(self.target[chunksize * i:chunksize * (i + 1)])
+        if 32 % chunksize:
+            chunks.append(self.target[32 - 32 % chunksize:])
+        return chunks, list(map(int, self.ip.split('.')))
+
+    def mod(self, modulus):
+        chunks, ip = self.preprocess(32)
+        self.target = chunks[0] + ''.join(map(lambda p: compat_str(p % modulus), ip))
+
+    def split(self, chunksize):
+        modulus_map = {
+            4: 256,
+            5: 10,
+            8: 100,
+        }
+
+        chunks, ip = self.preprocess(chunksize)
+        ret = ''
+        for i in range(len(chunks)):
+            ip_part = compat_str(ip[i] % modulus_map[chunksize]) if i < 4 else ''
+            if chunksize == 8:
+                ret += ip_part + chunks[i]
+            else:
+                ret += chunks[i] + ip_part
+        self.target = ret
+
+    def handle_input16(self):
+        self.target = md5_text(self.target)
+        self.target = self.split_sum(self.target[:16]) + self.target + self.split_sum(self.target[16:])
+
+    def handle_input8(self):
+        self.target = md5_text(self.target)
+        ret = ''
+        for i in range(4):
+            part = self.target[8 * i:8 * (i + 1)]
+            ret += self.split_sum(part) + part
+        self.target = ret
+
+    def handleSum(self):
+        self.target = md5_text(self.target)
+        self.target = self.split_sum(self.target) + self.target
+
+    def date(self, scheme):
+        self.target = md5_text(self.target)
+        d = time.localtime(self.timestamp)
+        strings = {
+            'y': compat_str(d.tm_year),
+            'm': '%02d' % d.tm_mon,
+            'd': '%02d' % d.tm_mday,
+        }
+        self.target += ''.join(map(lambda c: strings[c], list(scheme)))
+
+    def split_time_even_odd(self):
+        even, odd = self.even_odd()
+        self.target = odd + md5_text(self.target) + even
+
+    def split_time_odd_even(self):
+        even, odd = self.even_odd()
+        self.target = even + md5_text(self.target) + odd
+
+    def split_ip_time_sum(self):
+        chunks, ip = self.preprocess(32)
+        self.target = compat_str(sum(ip)) + chunks[0] + self.digit_sum(self.timestamp)
+
+    def split_time_ip_sum(self):
+        chunks, ip = self.preprocess(32)
+        self.target = self.digit_sum(self.timestamp) + chunks[0] + compat_str(sum(ip))
+
+
+class IqiyiSDKInterpreter(object):
+    def __init__(self, sdk_code):
+        self.sdk_code = sdk_code
+
+    def run(self, target, ip, timestamp):
+        self.sdk_code = decode_packed_codes(self.sdk_code)
+
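+        # The unpacked SDK JS is a chain of calls of the form
+        # 'input=funcName(input...)'; collect the function names in order and
+        # replay them with the IqiyiSDK reimplementations above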
+        functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
+
+        sdk = IqiyiSDK(target, ip, timestamp)
+
+        other_functions = {
+            'handleSum': sdk.handleSum,
+            'handleInput8': sdk.handle_input8,
+            'handleInput16': sdk.handle_input16,
+            'splitTimeEvenOdd': sdk.split_time_even_odd,
+            'splitTimeOddEven': sdk.split_time_odd_even,
+            'splitIpTimeSum': sdk.split_ip_time_sum,
+            'splitTimeIpSum': sdk.split_time_ip_sum,
+        }
+        for function in functions:
+            if re.match(r'mod\d+', function):
+                sdk.mod(int(function[3:]))
+            elif re.match(r'date[ymd]{3}', function):
+                sdk.date(function[4:])
+            elif re.match(r'split\d+', function):
+                sdk.split(int(function[5:]))
+            elif function in other_functions:
+                other_functions[function]()
+            else:
+                raise ExtractorError('Unknown function %s' % function)
+
+        return sdk.target
+
+
+class IqiyiIE(InfoExtractor):
+    IE_NAME = 'iqiyi'
+    IE_DESC = '爱奇艺'
+
+    _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html'
+
+    _NETRC_MACHINE = 'iqiyi'
+
+    _TESTS = [{
+        'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
+        # MD5 checksum differs on my machine and Travis CI
+        'info_dict': {
+            'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'ext': 'mp4',
+            'title': '美国德州空中惊现奇异云团 酷似UFO',
+        }
+    }, {
+        'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'md5': 'b7dc800a4004b1b57749d9abae0472da',
+        'info_dict': {
+            'id': 'e3f585b550a280af23c98b6cb2be19fb',
+            'ext': 'mp4',
+            # This can be either Simplified Chinese or Traditional Chinese
+            'title': r're:^(?:名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇|名偵探柯南 國語版:第752集 迫近灰原秘密的黑影 下篇)$',
+        },
+        'skip': 'Geo-restricted to China',
+    }, {
+        'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://yule.iqiyi.com/pcb.html',
+        'info_dict': {
+            'id': '4a0af228fddb55ec96398a364248ed7f',
+            'ext': 'mp4',
+            'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰',
+        },
+    }, {
+        # VIP-only video. The first 2 parts (6 minutes) are available without login
+        # MD5 sums omitted as values are different on Travis CI and my machine
+        'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
+        'info_dict': {
+            'id': 'f3cf468b39dddb30d676f89a91200dc1',
+            'ext': 'mp4',
+            'title': '泰坦尼克号',
+        },
+        'skip': 'Geo-restricted to China',
+    }, {
+        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+        'info_dict': {
+            'id': '202918101',
+            'title': '灌篮高手 国语版',
+        },
+        'playlist_count': 101,
+    }, {
+        'url': 'http://www.pps.tv/w_19rrbav0ph.html',
+        'only_matching': True,
+    }]
+
+    _FORMATS_MAP = {
+        '96': 1,    # 216p, 240p
+        '1': 2,     # 336p, 360p
+        '2': 3,     # 480p, 504p
+        '21': 4,    # 504p
+        '4': 5,     # 720p
+        '17': 5,    # 720p
+        '5': 6,     # 1072p, 1080p
+        '18': 7,    # 1080p
+    }
+
+    def _real_initialize(self):
+        self._login()
+
+    @staticmethod
+    def _rsa_fun(data):
+        # public key extracted from http://static.iqiyi.com/js/qiyiV2/20160129180840/jobs/i18n/i18nIndex.js
+        N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
+        e = 65537
+
+        return ohdave_rsa_encrypt(data, e, N)
+
+    def _login(self):
+        username, password = self._get_login_info()
+
+        # No authentication to be performed
+        if not username:
+            return True
+
+        data = self._download_json(
+            'http://kylin.iqiyi.com/get_token', None,
+            note='Getting token for login', errnote='Unable to get token for login')
+        sdk = data['sdk']
+        timestamp = int(time.time())
+        target = '/apis/reglogin/login.action?lang=zh_TW&area_code=null&email=%s&passwd=%s&agenttype=1&from=undefined&keeplogin=0&piccode=&fromurl=&_pos=1' % (
+            username, self._rsa_fun(password.encode('utf-8')))
+
+        interp = IqiyiSDKInterpreter(sdk)
+        sign = interp.run(target, data['ip'], timestamp)
+
+        validation_params = {
+            'target': target,
+            'server': 'BEA3AA1908656AABCCFF76582C4C6660',
+            'token': data['token'],
+            'bird_src': 'f8d91d57af224da7893dd397d52d811a',
+            'sign': sign,
+            'bird_t': timestamp,
+        }
+        validation_result = self._download_json(
+            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None,
+            note='Validate credentials', errnote='Unable to validate credentials')
+
+        MSG_MAP = {
+            'P00107': 'please login via the web interface and enter the CAPTCHA code',
+            'P00117': 'bad username or password',
+        }
+
+        code = validation_result['code']
+        if code != 'A00000':
+            msg = MSG_MAP.get(code)
+            if not msg:
+                msg = 'error %s' % code
+                if validation_result.get('msg'):
+                    msg += ': ' + validation_result['msg']
+            self._downloader.report_warning('unable to log in: ' + msg)
+            return False
+
+        return True
+
+    def get_raw_data(self, tvid, video_id):
+        tm = int(time.time() * 1000)
+
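+        # Request signature: MD5 over millisecond timestamp + static key + tvid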
+        key = 'd5fb4bd9d50c4be6948c97edd7254b0e'
+        sc = md5_text(compat_str(tm) + key + tvid)
+        params = {
+            'tvid': tvid,
+            'vid': video_id,
+            'src': '76f90cbd92f94a2e925d83e8ccd22cb7',
+            'sc': sc,
+            't': tm,
+        }
+
+        return self._download_json(
+            'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id),
+            video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='),
+            query=params, headers=self.geo_verification_headers())
+
+    def _extract_playlist(self, webpage):
+        PAGE_SIZE = 50
+
+        links = re.findall(
+            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
+            webpage)
+        if not links:
+            return
+
+        album_id = self._search_regex(
+            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+        album_title = self._search_regex(
+            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+        entries = list(map(self.url_result, links))
+
+        # Start from page 2: the links on the first page are already in webpage
+        for page_num in itertools.count(2):
+            pagelist_page = self._download_webpage(
+                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+                album_id,
+                note='Downloading playlist page %d' % page_num,
+                errnote='Failed to download playlist page %d' % page_num)
+            pagelist = self._parse_json(
+                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+            vlist = pagelist['data']['vlist']
+            for item in vlist:
+                entries.append(self.url_result(item['vurl']))
+            if len(vlist) < PAGE_SIZE:
+                break
+
+        return self.playlist_result(entries, album_id, album_title)
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(
+            url, 'temp_id', note='Downloading video page')
+
+        # There's no simple way to determine whether a URL is a playlist or
+        # not, and individual videos sometimes contain playlist links, so
+        # treat the URL as a single video first
+        tvid = self._search_regex(
+            r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None)
+        if tvid is None:
+            playlist_result = self._extract_playlist(webpage)
+            if playlist_result:
+                return playlist_result
+            raise ExtractorError('Can\'t find any video')
+
+        video_id = self._search_regex(
+            r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+
+        formats = []
+        for _ in range(5):
+            raw_data = self.get_raw_data(tvid, video_id)
+
+            if raw_data['code'] != 'A00000':
+                if raw_data['code'] == 'A00111':
+                    self.raise_geo_restricted()
+                raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+            data = raw_data['data']
+
+            for stream in data['vidl']:
+                if 'm3utx' not in stream:
+                    continue
+                vd = compat_str(stream['vd'])
+                formats.append({
+                    'url': stream['m3utx'],
+                    'format_id': vd,
+                    'ext': 'mp4',
+                    'preference': self._FORMATS_MAP.get(vd, -1),
+                    'protocol': 'm3u8_native',
+                })
+
+            if formats:
+                break
+
+            self._sleep(5, video_id)
+
+        self._sort_formats(formats)
+        title = (get_element_by_id('widget-videotitle', webpage)
+                 or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))
+                 or self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py
new file mode 100644 (file)
index 0000000..d5a3f6f
--- /dev/null
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class Ir90TvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*'
+    _TESTS = [{
+        'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
+        'md5': '411dbd94891381960cb9e13daa47a869',
+        'info_dict': {
+            'id': '95719',
+            'ext': 'mp4',
+            'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = remove_start(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'), '90tv.ir :: ')
+
+        video_url = self._search_regex(
+            r'<source[^>]+src="([^"]+)"', webpage, 'video url')
+
+        thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)
+
+        return {
+            'url': video_url,
+            'id': video_id,
+            'title': title,
+            'video_url': video_url,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
new file mode 100644 (file)
index 0000000..ad2f4ec
--- /dev/null
@@ -0,0 +1,312 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import uuid
+import xml.etree.ElementTree as etree
+import json
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveNewIE
+from ..compat import (
+    compat_str,
+    compat_etree_register_namespace,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    extract_attributes,
+    int_or_none,
+    merge_dicts,
+    parse_duration,
+    smuggle_url,
+    url_or_none,
+    xpath_with_ns,
+    xpath_element,
+    xpath_text,
+)
+
+
+class ITVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
+    _GEO_COUNTRIES = ['GB']
+    _TESTS = [{
+        'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
+        'info_dict': {
+            'id': '2a2936a0053',
+            'ext': 'flv',
+            'title': 'Home Movie',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # unavailable via data-playlist-url
+        'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
+        'only_matching': True,
+    }, {
+        # InvalidVodcrid
+        'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
+        'only_matching': True,
+    }, {
+        # ContentUnavailable
+        'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        params = extract_attributes(self._search_regex(
+            r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
+
+        ns_map = {
+            'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
+            'tem': 'http://tempuri.org/',
+            'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
+            'com': 'http://schemas.itv.com/2009/05/Common',
+        }
+        for ns, full_ns in ns_map.items():
+            compat_etree_register_namespace(ns, full_ns)
+
+        def _add_ns(name):
+            return xpath_with_ns(name, ns_map)
+
+        def _add_sub_element(element, name):
+            return etree.SubElement(element, _add_ns(name))
+
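+        # Derive the production id; e.g. a hub video id like '2a2936a0053'
+        # maps to '2/2936/0053#001' when the page carries no explicit id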
+        production_id = (
+            params.get('data-video-autoplay-id')
+            or '%s#001' % (
+                params.get('data-video-episode-id')
+                or video_id.replace('a', '/')))
+
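+        # Build the SOAP GetPlaylist request; abridged, the envelope looks like:
+        #   <soapenv:Envelope><soapenv:Header/><soapenv:Body>
+        #     <tem:GetPlaylist>
+        #       <tem:request><itv:ProductionId>...</itv:ProductionId>...</tem:request>
+        #       <tem:userInfo>...</tem:userInfo><tem:siteInfo>...</tem:siteInfo>
+        #       <tem:deviceInfo>...</tem:deviceInfo><tem:playerInfo>...</tem:playerInfo>
+        #     </tem:GetPlaylist>
+        #   </soapenv:Body></soapenv:Envelope>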
+        req_env = etree.Element(_add_ns('soapenv:Envelope'))
+        _add_sub_element(req_env, 'soapenv:Header')
+        body = _add_sub_element(req_env, 'soapenv:Body')
+        get_playlist = _add_sub_element(body, 'tem:GetPlaylist')
+        request = _add_sub_element(get_playlist, 'tem:request')
+        _add_sub_element(request, 'itv:ProductionId').text = production_id
+        _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
+        vodcrid = _add_sub_element(request, 'itv:Vodcrid')
+        _add_sub_element(vodcrid, 'com:Id')
+        _add_sub_element(request, 'itv:Partition')
+        user_info = _add_sub_element(get_playlist, 'tem:userInfo')
+        _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
+        _add_sub_element(user_info, 'itv:DM')
+        _add_sub_element(user_info, 'itv:RevenueScienceValue')
+        _add_sub_element(user_info, 'itv:SessionId')
+        _add_sub_element(user_info, 'itv:SsoToken')
+        _add_sub_element(user_info, 'itv:UserToken')
+        site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
+        _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
+        _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
+        _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
+        _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
+        _add_sub_element(site_info, 'itv:Category')
+        _add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
+        _add_sub_element(site_info, 'itv:Site').text = 'ItvCom'
+        device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
+        _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'
+        player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
+        _add_sub_element(player_info, 'itv:Version').text = '2'
+
+        headers = self.geo_verification_headers()
+        headers.update({
+            'Content-Type': 'text/xml; charset=utf-8',
+            'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
+        })
+
+        info = self._search_json_ld(webpage, video_id, default={})
+        formats = []
+        subtitles = {}
+
+        def extract_subtitle(sub_url):
+            ext = determine_ext(sub_url, 'ttml')
+            subtitles.setdefault('en', []).append({
+                'url': sub_url,
+                'ext': 'ttml' if ext == 'xml' else ext,
+            })
+
+        resp_env = self._download_xml(
+            params['data-playlist-url'], video_id,
+            headers=headers, data=etree.tostring(req_env), fatal=False)
+        if resp_env:
+            playlist = xpath_element(resp_env, './/Playlist')
+            if playlist is None:
+                fault_code = xpath_text(resp_env, './/faultcode')
+                fault_string = xpath_text(resp_env, './/faultstring')
+                if fault_code == 'InvalidGeoRegion':
+                    self.raise_geo_restricted(
+                        msg=fault_string, countries=self._GEO_COUNTRIES)
+                elif fault_code not in (
+                        'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
+                    raise ExtractorError(
+                        '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
+                info.update({
+                    'title': self._og_search_title(webpage),
+                    'episode_title': params.get('data-video-episode'),
+                    'series': params.get('data-video-title'),
+                })
+            else:
+                title = xpath_text(playlist, 'EpisodeTitle', default=None)
+                info.update({
+                    'title': title,
+                    'episode_title': title,
+                    'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
+                    'series': xpath_text(playlist, 'ProgrammeTitle'),
+                    'duration': parse_duration(xpath_text(playlist, 'Duration')),
+                })
+                video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
+                media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
+                rtmp_url = media_files.attrib['base']
+
+                for media_file in media_files.findall('MediaFile'):
+                    play_path = xpath_text(media_file, 'URL')
+                    if not play_path:
+                        continue
+                    tbr = int_or_none(media_file.get('bitrate'), 1000)
+                    f = {
+                        'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
+                        'play_path': play_path,
+                        # Providing this swfVfy player URL avoids truncated downloads
+                        'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+                        'page_url': url,
+                        'tbr': tbr,
+                        'ext': 'flv',
+                    }
+                    app = self._search_regex(
+                        'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+                    if app:
+                        f.update({
+                            'url': rtmp_url.split('?', 1)[0],
+                            'app': app,
+                        })
+                    else:
+                        f['url'] = rtmp_url
+                    formats.append(f)
+
+                for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
+                    if caption_url.text:
+                        extract_subtitle(caption_url.text)
+
+        ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
+        hmac = params.get('data-video-hmac')
+        if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url):
+            headers = self.geo_verification_headers()
+            headers.update({
+                'Accept': 'application/vnd.itv.vod.playlist.v2+json',
+                'Content-Type': 'application/json',
+                'hmac': hmac.upper(),
+            })
+            ios_playlist = self._download_json(
+                ios_playlist_url, video_id, data=json.dumps({
+                    'user': {
+                        'itvUserId': '',
+                        'entitlements': [],
+                        'token': ''
+                    },
+                    'device': {
+                        'manufacturer': 'Safari',
+                        'model': '5',
+                        'os': {
+                            'name': 'Windows NT',
+                            'version': '6.1',
+                            'type': 'desktop'
+                        }
+                    },
+                    'client': {
+                        'version': '4.1',
+                        'id': 'browser'
+                    },
+                    'variantAvailability': {
+                        'featureset': {
+                            'min': ['hls', 'aes', 'outband-webvtt'],
+                            'max': ['hls', 'aes', 'outband-webvtt']
+                        },
+                        'platformTag': 'dotcom'
+                    }
+                }).encode(), headers=headers, fatal=False)
+            if ios_playlist:
+                video_data = ios_playlist.get('Playlist', {}).get('Video', {})
+                ios_base_url = video_data.get('Base')
+                for media_file in video_data.get('MediaFiles', []):
+                    href = media_file.get('Href')
+                    if not href:
+                        continue
+                    if ios_base_url:
+                        href = ios_base_url + href
+                    ext = determine_ext(href)
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            href, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    else:
+                        formats.append({
+                            'url': href,
+                        })
+                subs = video_data.get('Subtitles')
+                if isinstance(subs, list):
+                    for sub in subs:
+                        if not isinstance(sub, dict):
+                            continue
+                        href = url_or_none(sub.get('Href'))
+                        if href:
+                            extract_subtitle(href)
+                if not info.get('duration'):
+                    info['duration'] = parse_duration(video_data.get('Duration'))
+
+        self._sort_formats(formats)
+
+        info.update({
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+
+        webpage_info = self._search_json_ld(webpage, video_id, default={})
+        if not webpage_info.get('title'):
+            webpage_info['title'] = self._html_search_regex(
+                r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
+                webpage, 'title', default=None) or self._og_search_title(
+                webpage, default=None) or self._html_search_meta(
+                'twitter:title', webpage, 'title',
+                default=None) or webpage_info['episode']
+
+        return merge_dicts(info, webpage_info)
+
+
+class ITVBTCCIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
+        'info_dict': {
+            'id': 'btcc-2018-all-the-action-from-brands-hatch',
+            'title': 'BTCC 2018: All the action from Brands Hatch',
+        },
+        'playlist_mincount': 9,
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result(
+                smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
+                    # ITV does not like some GB IP ranges, so here are some
+                    # IP blocks it accepts
+                    'geo_ip_blocks': [
+                        '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
+                    ],
+                    'referrer': url,
+                }),
+                ie=BrightcoveNewIE.ie_key(), video_id=video_id)
+            for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)]
+
+        title = self._og_search_title(webpage, fatal=False)
+
+        return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
new file mode 100644 (file)
index 0000000..b9cb5a8
--- /dev/null
@@ -0,0 +1,271 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+import sys
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+)
+
+
+class IviIE(InfoExtractor):
+    IE_DESC = 'ivi.ru'
+    IE_NAME = 'ivi'
+    _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)'
+    _GEO_BYPASS = False
+    _GEO_COUNTRIES = ['RU']
+    _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c'
+    _LIGHT_URL = 'https://api.ivi.ru/light/'
+
+    _TESTS = [
+        # Single movie
+        {
+            'url': 'http://www.ivi.ru/watch/53141',
+            'md5': '6ff5be2254e796ed346251d117196cf4',
+            'info_dict': {
+                'id': '53141',
+                'ext': 'mp4',
+                'title': 'Иван Васильевич меняет профессию',
+                'description': 'md5:b924063ea1677c8fe343d8a72ac2195f',
+                'duration': 5498,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'skip': 'Only works from Russia',
+        },
+        # Serial's series
+        {
+            'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549',
+            'md5': '221f56b35e3ed815fde2df71032f4b3e',
+            'info_dict': {
+                'id': '9549',
+                'ext': 'mp4',
+                'title': 'Двое из ларца - Дело Гольдберга (1 часть)',
+                'series': 'Двое из ларца',
+                'season': 'Сезон 1',
+                'season_number': 1,
+                'episode': 'Дело Гольдберга (1 часть)',
+                'episode_number': 1,
+                'duration': 2655,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'skip': 'Only works from Russia',
+        },
+        {
+            # with MP4-HD720 format
+            'url': 'http://www.ivi.ru/watch/146500',
+            'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e',
+            'info_dict': {
+                'id': '146500',
+                'ext': 'mp4',
+                'title': 'Кукла',
+                'description': 'md5:ffca9372399976a2d260a407cc74cce6',
+                'duration': 5599,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'skip': 'Only works from Russia',
+        },
+        {
+            'url': 'https://www.ivi.tv/watch/33560/',
+            'only_matching': True,
+        },
+    ]
+
+    # Sorted by quality
+    _KNOWN_FORMATS = (
+        'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi',
+        'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data = json.dumps({
+            'method': 'da.content.get',
+            'params': [
+                video_id, {
+                    'site': 's%d',
+                    'referrer': 'http://www.ivi.ru/watch/%s' % video_id,
+                    'contentid': video_id
+                }
+            ]
+        })
+
+        bundled = hasattr(sys, 'frozen')
+
+        for site in (353, 183):
+            content_data = (data % site).encode()
+            if site == 353:
+                if bundled:
+                    continue
+                try:
+                    from Cryptodome.Cipher import Blowfish
+                    from Cryptodome.Hash import CMAC
+                    pycryptodomex_found = True
+                except ImportError:
+                    pycryptodomex_found = False
+                    continue
+
+                timestamp = (self._download_json(
+                    self._LIGHT_URL, video_id,
+                    'Downloading timestamp JSON', data=json.dumps({
+                        'method': 'da.timestamp.get',
+                        'params': []
+                    }).encode(), fatal=False) or {}).get('result')
+                if not timestamp:
+                    continue
+
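+                # Sign the request: CMAC (Blowfish) over the timestamp
+                # concatenated with the JSON request body, keyed with
+                # _LIGHT_KEY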
+                query = {
+                    'ts': timestamp,
+                    'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(),
+                }
+            else:
+                query = {}
+
+            video_json = self._download_json(
+                self._LIGHT_URL, video_id,
+                'Downloading video JSON', data=content_data, query=query)
+
+            error = video_json.get('error')
+            if error:
+                origin = error.get('origin')
+                message = error.get('message') or error.get('user_message')
+                extractor_msg = 'Unable to download video %s'
+                if origin == 'NotAllowedForLocation':
+                    self.raise_geo_restricted(message, self._GEO_COUNTRIES)
+                elif origin == 'NoRedisValidData':
+                    extractor_msg = 'Video %s does not exist'
+                elif site == 353:
+                    continue
+                elif bundled:
+                    raise ExtractorError(
+                        'This feature does not work from bundled exe. Run youtube-dlc from sources.',
+                        expected=True)
+                elif not pycryptodomex_found:
+                    raise ExtractorError(
+                        'pycryptodomex not found. Please install it.',
+                        expected=True)
+                elif message:
+                    extractor_msg += ': ' + message
+                raise ExtractorError(extractor_msg % video_id, expected=True)
+            else:
+                break
+
+        result = video_json['result']
+        title = result['title']
+
+        quality = qualities(self._KNOWN_FORMATS)
+
+        formats = []
+        for f in result.get('files', []):
+            f_url = f.get('url')
+            content_format = f.get('content_format')
+            if not f_url or not content_format or '-MDRM-' in content_format or '-FPS-' in content_format:
+                continue
+            formats.append({
+                'url': f_url,
+                'format_id': content_format,
+                'quality': quality(content_format),
+                'filesize': int_or_none(f.get('size_in_bytes')),
+            })
+        self._sort_formats(formats)
+
+        compilation = result.get('compilation')
+        episode = title if compilation else None
+
+        title = '%s - %s' % (compilation, title) if compilation is not None else title
+
+        thumbnails = [{
+            'url': preview['url'],
+            'id': preview.get('content_format'),
+        } for preview in result.get('preview', []) if preview.get('url')]
+
+        webpage = self._download_webpage(url, video_id)
+
+        season = self._search_regex(
+            r'<li[^>]+class="season active"[^>]*><a[^>]+>([^<]+)',
+            webpage, 'season', default=None)
+        season_number = int_or_none(self._search_regex(
+            r'<li[^>]+class="season active"[^>]*><a[^>]+data-season(?:-index)?="(\d+)"',
+            webpage, 'season number', default=None))
+
+        episode_number = int_or_none(self._search_regex(
+            r'[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)',
+            webpage, 'episode number', default=None))
+
+        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
+            'description', webpage, 'description', default=None)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'series': compilation,
+            'season': season,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'thumbnails': thumbnails,
+            'description': description,
+            'duration': int_or_none(result.get('duration')),
+            'formats': formats,
+        }
+
+
+class IviCompilationIE(InfoExtractor):
+    IE_DESC = 'ivi.ru compilations'
+    IE_NAME = 'ivi:compilation'
+    _VALID_URL = r'https?://(?:www\.)?ivi\.ru/watch/(?!\d+)(?P<compilationid>[a-z\d_-]+)(?:/season(?P<seasonid>\d+))?$'
+    _TESTS = [{
+        'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa',
+        'info_dict': {
+            'id': 'dvoe_iz_lartsa',
+            'title': 'Двое из ларца (2006 - 2008)',
+        },
+        'playlist_mincount': 24,
+    }, {
+        'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/season1',
+        'info_dict': {
+            'id': 'dvoe_iz_lartsa/season1',
+            'title': 'Двое из ларца (2006 - 2008) 1 сезон',
+        },
+        'playlist_mincount': 12,
+    }]
+
+    def _extract_entries(self, html, compilation_id):
+        return [
+            self.url_result(
+                'http://www.ivi.ru/watch/%s/%s' % (compilation_id, serie), IviIE.ie_key())
+            for serie in re.findall(
+                r'<a\b[^>]+\bhref=["\']/watch/%s/(\d+)["\']' % compilation_id, html)]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        compilation_id = mobj.group('compilationid')
+        season_id = mobj.group('seasonid')
+
+        if season_id is not None:  # Season link
+            season_page = self._download_webpage(
+                url, compilation_id, 'Downloading season %s web page' % season_id)
+            playlist_id = '%s/season%s' % (compilation_id, season_id)
+            playlist_title = self._html_search_meta('title', season_page, 'title')
+            entries = self._extract_entries(season_page, compilation_id)
+        else:  # Compilation link
+            compilation_page = self._download_webpage(url, compilation_id, 'Downloading compilation web page')
+            playlist_id = compilation_id
+            playlist_title = self._html_search_meta('title', compilation_page, 'title')
+            seasons = re.findall(
+                r'<a href="/watch/%s/season(\d+)' % compilation_id, compilation_page)
+            if not seasons:  # No seasons in this compilation
+                entries = self._extract_entries(compilation_page, compilation_id)
+            else:
+                entries = []
+                for season_id in seasons:
+                    season_page = self._download_webpage(
+                        'http://www.ivi.ru/watch/%s/season%s' % (compilation_id, season_id),
+                        compilation_id, 'Downloading season %s web page' % season_id)
+                    entries.extend(self._extract_entries(season_page, compilation_id))
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py
new file mode 100644 (file)
index 0000000..3ca824f
--- /dev/null
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlencode,
+    compat_urlparse,
+)
+from ..utils import qualities
+
+
+class IvideonIE(InfoExtractor):
+    IE_NAME = 'ivideon'
+    IE_DESC = 'Ivideon TV'
+    _VALID_URL = r'https?://(?:www\.)?ivideon\.com/tv/(?:[^/]+/)*camera/(?P<id>\d+-[\da-f]+)/(?P<camera_id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.ivideon.com/tv/camera/100-916ca13b5c4ad9f564266424a026386d/0/',
+        'info_dict': {
+            'id': '100-916ca13b5c4ad9f564266424a026386d',
+            'ext': 'flv',
+            'title': 're:^Касса [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'Основное предназначение - запись действий кассиров. Плюс общий вид.',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.ivideon.com/tv/camera/100-c4ee4cb9ede885cf62dfbe93d7b53783/589824/?lang=ru',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ivideon.com/tv/map/22.917923/-31.816406/16/camera/100-e7bc16c7d4b5bbd633fd5350b66dfa9a/0',
+        'only_matching': True,
+    }]
+
+    _QUALITIES = ('low', 'mid', 'hi')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        server_id, camera_id = mobj.group('id'), mobj.group('camera_id')
+        camera_name, description = None, None
+        camera_url = compat_urlparse.urljoin(
+            url, '/tv/camera/%s/%s/' % (server_id, camera_id))
+
+        webpage = self._download_webpage(camera_url, server_id, fatal=False)
+        if webpage:
+            config_string = self._search_regex(
+                r'var\s+config\s*=\s*({.+?});', webpage, 'config', default=None)
+            if config_string:
+                config = self._parse_json(config_string, server_id, fatal=False)
+                camera_info = (config or {}).get('ivTvAppOptions', {}).get('currentCameraInfo')
+                if camera_info:
+                    camera_name = camera_info.get('camera_name')
+                    description = camera_info.get('misc', {}).get('description')
+            if not camera_name:
+                camera_name = self._html_search_meta(
+                    'name', webpage, 'camera name', default=None) or self._search_regex(
+                    r'<h1[^>]+class="b-video-title"[^>]*>([^<]+)', webpage, 'camera name', default=None)
+
+        quality = qualities(self._QUALITIES)
+
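+        # Build one FLV format per known quality level; streams are requested
+        # through the public 'demo' session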
+        formats = [{
+            'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({
+                'server': server_id,
+                'camera': camera_id,
+                'sessionId': 'demo',
+                'q': quality(format_id),
+            }),
+            'format_id': format_id,
+            'ext': 'flv',
+            'quality': quality(format_id),
+        } for format_id in self._QUALITIES]
+        self._sort_formats(formats)
+
+        return {
+            'id': server_id,
+            'title': self._live_title(camera_name or server_id),
+            'description': description,
+            'is_live': True,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py
new file mode 100644 (file)
index 0000000..907d5fc
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    int_or_none,
+    mimetype2ext,
+    remove_end,
+    url_or_none,
+)
+
+
+class IwaraIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD',
+        # md5 is unstable
+        'info_dict': {
+            'id': 'amVwUl1EHpAD9RD',
+            'ext': 'mp4',
+            'title': '【MMD R-18】ガールフレンド carry_me_off',
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO',
+        'md5': '7e5f1f359cd51a027ba4a7b7710a50f0',
+        'info_dict': {
+            'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc',
+            'ext': 'mp4',
+            'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4',
+            'age_limit': 18,
+        },
+        'add_ie': ['GoogleDrive'],
+    }, {
+        'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq',
+        # md5 is unstable
+        'info_dict': {
+            'id': '6liAP9s2Ojc',
+            'ext': 'mp4',
+            'age_limit': 18,
+            'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)',
+            'description': 'md5:590c12c0df1443d833fbebe05da8c47a',
+            'upload_date': '20160910',
+            'uploader': 'aMMDsork',
+            'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A',
+        },
+        'add_ie': ['Youtube'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage, urlh = self._download_webpage_handle(url, video_id)
+
+        hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname
+        # ecchi is 'sexy' in Japanese
+        age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0
+
+        video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id)
+
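+        # An empty API response means the video is hosted elsewhere (the tests
+        # cover Google Drive and YouTube); defer to the embedded iframe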
+        if not video_data:
+            iframe_url = self._html_search_regex(
+                r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1',
+                webpage, 'iframe URL', group='url')
+            return {
+                '_type': 'url_transparent',
+                'url': iframe_url,
+                'age_limit': age_limit,
+            }
+
+        title = remove_end(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'), ' | Iwara')
+
+        formats = []
+        for a_format in video_data:
+            format_uri = url_or_none(a_format.get('uri'))
+            if not format_uri:
+                continue
+            format_id = a_format.get('resolution')
+            height = int_or_none(self._search_regex(
+                r'(\d+)p', format_id, 'height', default=None))
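+            # Only a resolution label like '1080p' is exposed; derive width
+            # assuming a 16:9 aspect ratio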
+            formats.append({
+                'url': self._proto_relative_url(format_uri, 'https:'),
+                'format_id': format_id,
+                'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
+                'height': height,
+                'width': int_or_none(height / 9.0 * 16.0 if height else None),
+                'quality': 1 if format_id == 'Source' else 0,
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
new file mode 100644 (file)
index 0000000..f8fca6c
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    get_element_by_id,
+    int_or_none,
+    parse_iso8601,
+    str_to_int,
+)
+
+
+class IzleseneIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://(?:(?:www|m)\.)?izlesene\.com/
+        (?:video|embedplayer)/(?:[^/]+/)?(?P<id>[0-9]+)
+        '''
+    _TESTS = [
+        {
+            'url': 'http://www.izlesene.com/video/sevincten-cildirtan-dogum-gunu-hediyesi/7599694',
+            'md5': '4384f9f0ea65086734b881085ee05ac2',
+            'info_dict': {
+                'id': '7599694',
+                'ext': 'mp4',
+                'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi',
+                'description': 'md5:253753e2655dde93f59f74b572454f6d',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'uploader_id': 'pelikzzle',
+                'timestamp': int,
+                'upload_date': '20140702',
+                'duration': 95.395,
+                'age_limit': 0,
+            }
+        },
+        {
+            'url': 'http://www.izlesene.com/video/tarkan-dortmund-2006-konseri/17997',
+            'md5': '97f09b6872bffa284cb7fa4f6910cb72',
+            'info_dict': {
+                'id': '17997',
+                'ext': 'mp4',
+                'title': 'Tarkan Dortmund 2006 Konseri',
+                'thumbnail': r're:^https://.*\.jpg',
+                'uploader_id': 'parlayankiz',
+                'timestamp': int,
+                'upload_date': '20061112',
+                'duration': 253.666,
+                'age_limit': 0,
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage('http://www.izlesene.com/video/%s' % video_id, video_id)
+
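+        # Stream metadata is embedded in the page as a JS 'videoObj' literal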
+        video = self._parse_json(
+            self._search_regex(
+                r'videoObj\s*=\s*({.+?})\s*;\s*\n', webpage, 'streams'),
+            video_id)
+
+        title = video.get('videoTitle') or self._og_search_title(webpage)
+
+        formats = []
+        for stream in video['media']['level']:
+            source_url = stream.get('source')
+            if not source_url or not isinstance(source_url, compat_str):
+                continue
+            ext = determine_ext(source_url, 'mp4')
+            quality = stream.get('value')
+            height = int_or_none(quality)
+            formats.append({
+                'format_id': '%sp' % quality if quality else 'sd',
+                'url': compat_urllib_parse_unquote(source_url),
+                'ext': ext,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = video.get('posterURL') or self._proto_relative_url(
+            self._og_search_thumbnail(webpage), scheme='http:')
+
+        uploader = self._html_search_regex(
+            r"adduserUsername\s*=\s*'([^']+)';",
+            webpage, 'uploader', fatal=False)
+        timestamp = parse_iso8601(self._html_search_meta(
+            'uploadDate', webpage, 'upload date'))
+
+        duration = float_or_none(video.get('duration') or self._html_search_regex(
+            r'videoduration["\']?\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+            webpage, 'duration', fatal=False, group='value'), scale=1000)
+
+        view_count = str_to_int(get_element_by_id('videoViewCount', webpage))
+        comment_count = self._html_search_regex(
+            r'comment_count\s*=\s*\'([^\']+)\';',
+            webpage, 'comment_count', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader_id': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': int_or_none(view_count),
+            'comment_count': int_or_none(comment_count),
+            'age_limit': self._family_friendly_search(webpage),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py
new file mode 100644 (file)
index 0000000..490efa8
--- /dev/null
@@ -0,0 +1,187 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+
+from ..compat import compat_str
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    try_get,
+)
+
+
+class JamendoIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            licensing\.jamendo\.com/[^/]+|
+                            (?:www\.)?jamendo\.com
+                        )
+                        /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))?
+                    '''
+    _TESTS = [{
+        'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
+        'md5': '6e9e82ed6db98678f171c25a8ed09ffd',
+        'info_dict': {
+            'id': '196219',
+            'display_id': 'stories-from-emona-i',
+            'ext': 'flac',
+            'title': 'Maya Filipič - Stories from Emona I',
+            'artist': 'Maya Filipič',
+            'track': 'Stories from Emona I',
+            'duration': 210,
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1217438117,
+            'upload_date': '20080730',
+        }
+    }, {
+        'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        track_id, display_id = self._VALID_URL_RE.match(url).groups()
+        webpage = self._download_webpage(
+            'https://www.jamendo.com/track/' + track_id, track_id)
+        models = self._parse_json(self._html_search_regex(
+            r"data-bundled-models='([^']+)",
+            webpage, 'bundled models'), track_id)
+        track = models['track']['models'][0]
+        title = track_name = track['name']
+        get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
+        artist = get_model('artist')
+        artist_name = artist.get('name')
+        if artist_name:
+            title = '%s - %s' % (artist_name, title)
+        album = get_model('album')
+
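+        # Formats are listed from lowest to highest quality, so the
+        # enumerate() index doubles as the quality score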
+        formats = [{
+            'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
+                   % (sub_domain, track_id, format_id),
+            'format_id': format_id,
+            'ext': ext,
+            'quality': quality,
+        } for quality, (format_id, sub_domain, ext) in enumerate((
+            ('mp31', 'mp3l', 'mp3'),
+            ('mp32', 'mp3d', 'mp3'),
+            ('ogg1', 'ogg', 'ogg'),
+            ('flac', 'flac', 'flac'),
+        ))]
+        self._sort_formats(formats)
+
+        urls = []
+        thumbnails = []
+        for _, covers in track.get('cover', {}).items():
+            for cover_id, cover_url in covers.items():
+                if not cover_url or cover_url in urls:
+                    continue
+                urls.append(cover_url)
+                size = int_or_none(cover_id.lstrip('size'))
+                thumbnails.append({
+                    'id': cover_id,
+                    'url': cover_url,
+                    'width': size,
+                    'height': size,
+                })
+
+        tags = []
+        for tag in track.get('tags', []):
+            tag_name = tag.get('name')
+            if not tag_name:
+                continue
+            tags.append(tag_name)
+
+        stats = track.get('stats') or {}
+
+        return {
+            'id': track_id,
+            'display_id': display_id,
+            'thumbnails': thumbnails,
+            'title': title,
+            'description': track.get('description'),
+            'duration': int_or_none(track.get('duration')),
+            'artist': artist_name,
+            'track': track_name,
+            'album': album.get('name'),
+            'formats': formats,
+            'license': '-'.join(track.get('licenseCC') or []) or None,
+            'timestamp': int_or_none(track.get('dateCreated')),
+            'view_count': int_or_none(stats.get('listenedAll')),
+            'like_count': int_or_none(stats.get('favorited')),
+            'average_rating': int_or_none(stats.get('averageNote')),
+            'tags': tags,
+        }
+
+
+class JamendoAlbumIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
+        'info_dict': {
+            'id': '121486',
+            'title': 'Duck On Cover',
+            'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239',
+        },
+        'playlist': [{
+            'md5': 'e1a2fcb42bda30dfac990212924149a8',
+            'info_dict': {
+                'id': '1032333',
+                'ext': 'flac',
+                'title': 'Shearer - Warmachine',
+                'artist': 'Shearer',
+                'track': 'Warmachine',
+                'timestamp': 1368089771,
+                'upload_date': '20130509',
+            }
+        }, {
+            'md5': '1f358d7b2f98edfe90fd55dac0799d50',
+            'info_dict': {
+                'id': '1032330',
+                'ext': 'flac',
+                'title': 'Shearer - Without Your Ghost',
+                'artist': 'Shearer',
+                'track': 'Without Your Ghost',
+                'timestamp': 1368089771,
+                'upload_date': '20130509',
+            }
+        }],
+        'params': {
+            'playlistend': 2
+        }
+    }
+
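+    # The API guards requests with an X-Jam-Call header: a SHA-1 digest of the
+    # request path concatenated with a random token, with the token echoed
+    # back alongside the digest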
+    def _call_api(self, resource, resource_id):
+        path = '/api/%ss' % resource
+        rand = compat_str(random.random())
+        return self._download_json(
+            'https://www.jamendo.com' + path, resource_id, query={
+                'id[]': resource_id,
+            }, headers={
+                'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
+            })[0]
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+        album = self._call_api('album', album_id)
+        album_name = album.get('name')
+
+        entries = []
+        for track in album.get('tracks', []):
+            track_id = track.get('id')
+            if not track_id:
+                continue
+            track_id = compat_str(track_id)
+            entries.append({
+                '_type': 'url_transparent',
+                'url': 'https://www.jamendo.com/track/' + track_id,
+                'ie_key': JamendoIE.ie_key(),
+                'id': track_id,
+                'album': album_name,
+            })
+
+        return self.playlist_result(
+            entries, album_id, album_name,
+            clean_html(try_get(album, lambda x: x['description']['en'], compat_str)))
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
new file mode 100644 (file)
index 0000000..e9f4ed7
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class JeuxVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
+
+    _TESTS = [{
+        'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
+        'md5': '046e491afb32a8aaac1f44dd4ddd54ee',
+        'info_dict': {
+            'id': '114765',
+            'ext': 'mp4',
+            'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité',
+            'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.',
+        },
+    }, {
+        'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title = mobj.group(1)
+        webpage = self._download_webpage(url, title)
+        title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
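+        # The player config URL sits in a data-src attribute; its query string
+        # carries the numeric video id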
+        config_url = self._html_search_regex(
+            r'data-src(?:set-video)?="(/contenu/medias/video\.php.*?)"',
+            webpage, 'config URL')
+        config_url = 'http://www.jeuxvideo.com' + config_url
+
+        video_id = self._search_regex(
+            r'id=(\d+)',
+            config_url, 'video ID')
+
+        config = self._download_json(
+            config_url, title, 'Downloading JSON config')
+
+        formats = [{
+            'url': source['file'],
+            'format_id': source['label'],
+            'resolution': source['label'],
+        } for source in reversed(config['sources'])]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+            'thumbnail': config.get('image'),
+        }
diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py
new file mode 100644 (file)
index 0000000..6376181
--- /dev/null
@@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    try_get,
+)
+
+
+class JojIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        joj:|
+                        https?://media\.joj\.sk/embed/
+                    )
+                    (?P<id>[^/?#^]+)
+                '''
+    _TESTS = [{
+        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'info_dict': {
+            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
+            'ext': 'mp4',
+            'title': 'NOVÉ BÝVANIE',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 3118,
+        }
+    }, {
+        'url': 'https://media.joj.sk/embed/9i1cxv',
+        'only_matching': True,
+    }, {
+        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'only_matching': True,
+    }, {
+        'url': 'joj:9i1cxv',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://media.joj.sk/embed/%s' % video_id, video_id)
+
+        title = self._search_regex(
+            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<title>(?P<title>[^<]+)'), webpage, 'title',
+            default=None, group='title') or self._og_search_title(webpage)
+
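+        # The embed page inlines the stream map as a JS object assigned to
+        # either 'src' or 'bitrates'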
+        bitrates = self._parse_json(
+            self._search_regex(
+                r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+
+        formats = []
+        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
+            if isinstance(format_url, compat_str):
+                height = self._search_regex(
+                    r'(\d+)[pP]\.', format_url, 'height', default=None)
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%sp' % height if height else None,
+                    'height': int_or_none(height),
+                })
+        if not formats:
+            playlist = self._download_xml(
+                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
+                video_id)
+            for file_el in playlist.findall('./files/file'):
+                path = file_el.get('path')
+                if not path:
+                    continue
+                format_id = file_el.get('id') or file_el.get('label')
+                formats.append({
+                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
+                        'dat/', '', 1),
+                    'format_id': format_id,
+                    'height': int_or_none(self._search_regex(
+                        r'(\d+)[pP]', format_id or path, 'height',
+                        default=None)),
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/jove.py b/youtube_dl/extractor/jove.py
new file mode 100644 (file)
index 0000000..27e0e37
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class JoveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?jove\.com/video/(?P<id>[0-9]+)'
+    _CHAPTERS_URL = 'http://www.jove.com/video-chapters?videoid={video_id:}'
+    _TESTS = [
+        {
+            'url': 'http://www.jove.com/video/2744/electrode-positioning-montage-transcranial-direct-current',
+            'md5': '93723888d82dbd6ba8b3d7d0cd65dd2b',
+            'info_dict': {
+                'id': '2744',
+                'ext': 'mp4',
+                'title': 'Electrode Positioning and Montage in Transcranial Direct Current Stimulation',
+                'description': 'md5:015dd4509649c0908bc27f049e0262c6',
+                'thumbnail': r're:^https?://.*\.png$',
+                'upload_date': '20110523',
+            }
+        },
+        {
+            'url': 'http://www.jove.com/video/51796/culturing-caenorhabditis-elegans-axenic-liquid-media-creation',
+            'md5': '914aeb356f416811d911996434811beb',
+            'info_dict': {
+                'id': '51796',
+                'ext': 'mp4',
+                'title': 'Culturing Caenorhabditis elegans in Axenic Liquid Media and Creation of Transgenic Worms by Microparticle Bombardment',
+                'description': 'md5:35ff029261900583970c4023b70f1dc9',
+                'thumbnail': r're:^https?://.*\.png$',
+                'upload_date': '20140802',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        chapters_id = self._html_search_regex(
+            r'/video-chapters\?videoid=([0-9]+)', webpage, 'chapters id')
+
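+        # The root element of the chapters XML carries the direct video URL in
+        # its 'video' attribute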
+        chapters_xml = self._download_xml(
+            self._CHAPTERS_URL.format(video_id=chapters_id),
+            video_id, note='Downloading chapters XML',
+            errnote='Failed to download chapters XML')
+
+        video_url = chapters_xml.attrib.get('video')
+        if not video_url:
+            raise ExtractorError('Failed to get the video URL')
+
+        title = self._html_search_meta('citation_title', webpage, 'title')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'<div id="section_body_summary"><p class="jove_content">(.+?)</p>',
+            webpage, 'description', fatal=False)
+        publish_date = unified_strdate(self._html_search_meta(
+            'citation_publication_date', webpage, 'publish date', fatal=False))
+        comment_count = int_or_none(self._html_search_regex(
+            r'<meta name="num_comments" content="(\d+) Comments?"',
+            webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+            'upload_date': publish_date,
+            'comment_count': comment_count,
+        }
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py
new file mode 100644 (file)
index 0000000..c34b5f5
--- /dev/null
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unsmuggle_url
+
+
+class JWPlatformIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+    _TESTS = [{
+        'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
+        'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
+        'info_dict': {
+            'id': 'nPripu9l',
+            'ext': 'mov',
+            'title': 'Big Buck Bunny Trailer',
+            'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
+            'upload_date': '20081127',
+            'timestamp': 1227796140,
+        }
+    }, {
+        'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = JWPlatformIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})',
+            webpage)
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
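+        # Embedding extractors may smuggle in geo-restriction countries to
+        # enable geo bypass for the media request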
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
+        video_id = self._match_id(url)
+        json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id)
+        return self._parse_jwplayer_data(json_data, video_id)
diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py
new file mode 100644 (file)
index 0000000..32935bb
--- /dev/null
@@ -0,0 +1,147 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    strip_or_none,
+    unified_timestamp,
+    update_url_query,
+)
+
+
+class KakaoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P<id>\d+|[^?#&]+@my)'
+    _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/'
+
+    _TESTS = [{
+        'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083',
+        'md5': '702b2fbdeb51ad82f5c904e8c0766340',
+        'info_dict': {
+            'id': '301965083',
+            'ext': 'mp4',
+            'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
+            'uploader_id': 2671005,
+            'uploader': '그랑그랑이',
+            'timestamp': 1488160199,
+            'upload_date': '20170227',
+        }
+    }, {
+        'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180',
+        'md5': 'a8917742069a4dd442516b86e7d66529',
+        'info_dict': {
+            'id': '300103180',
+            'ext': 'mp4',
+            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
+            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
+            'uploader_id': 2653210,
+            'uploader': '쇼! 음악중심',
+            'timestamp': 1485684628,
+            'upload_date': '20170129',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        display_id = video_id[:-3] if video_id.endswith('@my') else video_id
+        api_base = self._API_BASE_TMPL % video_id
+
+        player_header = {
+            'Referer': update_url_query(
+                'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, {
+                    'service': 'kakao_tv',
+                    'autoplay': '1',
+                    'profile': 'HIGH',
+                    'wmode': 'transparent',
+                })
+        }
+
+        query = {
+            'player': 'monet_html5',
+            'referer': url,
+            'uuid': '',
+            'service': 'kakao_tv',
+            'section': '',
+            'dteType': 'PC',
+            'fields': ','.join([
+                '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title',
+                'description', 'channelId', 'createTime', 'duration', 'playCount',
+                'likeCount', 'commentCount', 'tagList', 'channel', 'name',
+                'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault',
+                'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label'])
+        }
+
+        impress = self._download_json(
+            api_base + 'impress', display_id, 'Downloading video info',
+            query=query, headers=player_header)
+
+        clip_link = impress['clipLink']
+        clip = clip_link['clip']
+
+        title = clip.get('title') or clip_link.get('displayTitle')
+
+        query['tid'] = impress.get('tid', '')
+
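+        # Every quality profile needs its own 'raw/videolocation' call to
+        # resolve the actual download URL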
+        formats = []
+        for fmt in clip.get('videoOutputList', []):
+            try:
+                profile_name = fmt['profile']
+                if profile_name == 'AUDIO':
+                    continue
+                query.update({
+                    'profile': profile_name,
+                    'fields': '-*,url',
+                })
+                fmt_url_json = self._download_json(
+                    api_base + 'raw/videolocation', display_id,
+                    'Downloading video URL for profile %s' % profile_name,
+                    query=query, headers=player_header, fatal=False)
+
+                if fmt_url_json is None:
+                    continue
+
+                fmt_url = fmt_url_json['url']
+                formats.append({
+                    'url': fmt_url,
+                    'format_id': profile_name,
+                    'width': int_or_none(fmt.get('width')),
+                    'height': int_or_none(fmt.get('height')),
+                    'format_note': fmt.get('label'),
+                    'filesize': int_or_none(fmt.get('filesize')),
+                    'tbr': int_or_none(fmt.get('kbps')),
+                })
+            except KeyError:
+                pass
+        self._sort_formats(formats)
+
+        thumbs = []
+        for thumb in clip.get('clipChapterThumbnailList', []):
+            thumbs.append({
+                'url': thumb.get('thumbnailUrl'),
+                'id': compat_str(thumb.get('timeInSec')),
+                'preference': -1 if thumb.get('isDefault') else 0
+            })
+        top_thumbnail = clip.get('thumbnailUrl')
+        if top_thumbnail:
+            thumbs.append({
+                'url': top_thumbnail,
+                'preference': 10,
+            })
+
+        return {
+            'id': display_id,
+            'title': title,
+            'description': strip_or_none(clip.get('description')),
+            'uploader': clip_link.get('channel', {}).get('name'),
+            'uploader_id': clip_link.get('channelId'),
+            'thumbnails': thumbs,
+            'timestamp': unified_timestamp(clip_link.get('createTime')),
+            'duration': int_or_none(clip.get('duration')),
+            'view_count': int_or_none(clip.get('playCount')),
+            'like_count': int_or_none(clip.get('likeCount')),
+            'comment_count': int_or_none(clip.get('commentCount')),
+            'formats': formats,
+            'tags': clip.get('tagList'),
+        }
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
new file mode 100644 (file)
index 0000000..49d1346
--- /dev/null
@@ -0,0 +1,377 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urlparse,
+    compat_parse_qs,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    unsmuggle_url,
+    smuggle_url,
+)
+
+
+class KalturaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                (?:
+                    kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
+                    https?://
+                        (?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/
+                        (?:
+                            (?:
+                                # flash player
+                                index\.php/(?:kwidget|extwidget/preview)|
+                                # html5 player
+                                html5/html5lib/[^/]+/mwEmbedFrame\.php
+                            )
+                        )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
+                )
+                '''
+    _SERVICE_URL = 'http://cdnapi.kaltura.com'
+    _SERVICE_BASE = '/api_v3/index.php'
+    # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php
+    _CAPTION_TYPES = {
+        1: 'srt',
+        2: 'ttml',
+        3: 'vtt',
+    }
+    _TESTS = [
+        {
+            'url': 'kaltura:269692:1_1jc2y3e4',
+            'md5': '3adcbdb3dcc02d647539e53f284ba171',
+            'info_dict': {
+                'id': '1_1jc2y3e4',
+                'ext': 'mp4',
+                'title': 'Straight from the Heart',
+                'upload_date': '20131219',
+                'uploader_id': 'mlundberg@wolfgangsvault.com',
+                'description': 'The Allman Brothers Band, 12/16/1981',
+                'thumbnail': 're:^https?://.*/thumbnail/.*',
+                'timestamp': int,
+            },
+        },
+        {
+            'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342',
+            'only_matching': True,
+        },
+        {
+            # video with subtitles
+            'url': 'kaltura:111032:1_cw786r8q',
+            'only_matching': True,
+        },
+        {
+            # video with ttml subtitles (no fileExt)
+            'url': 'kaltura:1926081:0_l5ye1133',
+            'info_dict': {
+                'id': '0_l5ye1133',
+                'ext': 'mp4',
+                'title': 'What Can You Do With Python?',
+                'upload_date': '20160221',
+                'uploader_id': 'stork',
+                'thumbnail': 're:^https?://.*/thumbnail/.*',
+                'timestamp': int,
+                'subtitles': {
+                    'en': [{
+                        'ext': 'ttml',
+                    }],
+                },
+            },
+            'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/',
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.kaltura.com/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.kaltura.com:443/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto',
+            'only_matching': True,
+        },
+        {
+            # unavailable source format
+            'url': 'kaltura:513551:1_66x4rg7o',
+            'only_matching': True,
+        }
+    ]
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = KalturaIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
+        finditer = (
+            list(re.finditer(
+                r"""(?xs)
+                    kWidget\.(?:thumb)?[Ee]mbed\(
+                    \{.*?
+                        (?P<q1>['"])wid(?P=q1)\s*:\s*
+                        (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
+                        (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s*
+                        (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
+                """, webpage)
+            or list(re.finditer(
+                r'''(?xs)
+                    (?P<q1>["'])
+                        (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
+                    (?P=q1).*?
+                    (?:
+                        (?:
+                            entry_?[Ii]d|
+                            (?P<q2>["'])entry_?[Ii]d(?P=q2)
+                        )\s*:\s*|
+                        \[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s*
+                    )
+                    (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
+                ''', webpage))
+            or list(re.finditer(
+                r'''(?xs)
+                    <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])
+                      (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)
+                      (?:(?!(?P=q1)).)*
+                      [?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+)
+                      (?:(?!(?P=q1)).)*
+                    (?P=q1)
+                ''', webpage))
+        )
+        urls = []
+        for mobj in finditer:
+            embed_info = mobj.groupdict()
+            for k, v in embed_info.items():
+                if v:
+                    embed_info[k] = v.strip()
+            url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
+            escaped_pid = re.escape(embed_info['partner_id'])
+            service_mobj = re.search(
+                r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
+                webpage)
+            if service_mobj:
+                url = smuggle_url(url, {'service_url': service_mobj.group('id')})
+            urls.append(url)
+        return urls
+
+    def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):
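+        # Kaltura multirequest: the first action carries the shared request
+        # parameters, every following action is flattened into the query as
+        # 'N:key' pairs, and placeholders like '{1:result:ks}' reference the
+        # results of earlier actions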
+        params = actions[0]
+        if len(actions) > 1:
+            for i, a in enumerate(actions[1:], start=1):
+                for k, v in a.items():
+                    params['%d:%s' % (i, k)] = v
+
+        data = self._download_json(
+            (service_url or self._SERVICE_URL) + self._SERVICE_BASE,
+            video_id, query=params, *args, **kwargs)
+
+        status = data if len(actions) == 1 else data[0]
+        if status.get('objectType') == 'KalturaAPIException':
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, status['message']))
+
+        return data
+
+    def _get_video_info(self, video_id, partner_id, service_url=None):
+        actions = [
+            {
+                'action': 'null',
+                'apiVersion': '3.1.5',
+                'clientTag': 'kdp:v3.8.5',
+                'format': 1,  # JSON, 2 = XML, 3 = PHP
+                'service': 'multirequest',
+            },
+            {
+                'expiry': 86400,
+                'service': 'session',
+                'action': 'startWidgetSession',
+                'widgetId': '_%s' % partner_id,
+            },
+            {
+                'action': 'get',
+                'entryId': video_id,
+                'service': 'baseentry',
+                'ks': '{1:result:ks}',
+                'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId',
+                'responseProfile:type': 1,
+            },
+            {
+                'action': 'getbyentryid',
+                'entryId': video_id,
+                'service': 'flavorAsset',
+                'ks': '{1:result:ks}',
+            },
+            {
+                'action': 'list',
+                'filter:entryIdEqual': video_id,
+                'service': 'caption_captionasset',
+                'ks': '{1:result:ks}',
+            },
+        ]
+        return self._kaltura_api_call(
+            video_id, actions, service_url, note='Downloading video info JSON')
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        mobj = re.match(self._VALID_URL, url)
+        partner_id, entry_id = mobj.group('partner_id', 'id')
+        ks = None
+        captions = None
+        if partner_id and entry_id:
+            _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'))
+        else:
+            path, query = mobj.group('path', 'query')
+            if not path and not query:
+                raise ExtractorError('Invalid URL', expected=True)
+            params = {}
+            if query:
+                params = compat_parse_qs(query)
+            if path:
+                splitted_path = path.split('/')
+                params.update(dict(zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))
+            if 'wid' in params:
+                partner_id = params['wid'][0][1:]
+            elif 'p' in params:
+                partner_id = params['p'][0]
+            elif 'partner_id' in params:
+                partner_id = params['partner_id'][0]
+            else:
+                raise ExtractorError('Invalid URL', expected=True)
+            if 'entry_id' in params:
+                entry_id = params['entry_id'][0]
+                _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id)
+            elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
+                reference_id = params['flashvars[referenceId]'][0]
+                webpage = self._download_webpage(url, reference_id)
+                entry_data = self._parse_json(self._search_regex(
+                    r'window\.kalturaIframePackageData\s*=\s*({.*});',
+                    webpage, 'kalturaIframePackageData'),
+                    reference_id)['entryResult']
+                info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
+                entry_id = info['id']
+                # Unfortunately, data returned in kalturaIframePackageData lacks
+                # captions so we will try requesting the complete data using
+                # regular approach since we now know the entry_id
+                try:
+                    _, info, flavor_assets, captions = self._get_video_info(
+                        entry_id, partner_id)
+                except ExtractorError:
+                    # Regular scenario failed but we already have everything
+                    # extracted apart from captions and can process at least
+                    # with this
+                    pass
+            else:
+                raise ExtractorError('Invalid URL', expected=True)
+            ks = params.get('flashvars[ks]', [None])[0]
+
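+        # Kaltura wants the embedding page's origin (scheme://host),
+        # base64-encoded, appended as a 'referrer' parameter when signing URLs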
+        source_url = smuggled_data.get('source_url')
+        if source_url:
+            referrer = base64.b64encode(
+                '://'.join(compat_urlparse.urlparse(source_url)[:2])
+                .encode('utf-8')).decode('utf-8')
+        else:
+            referrer = None
+
+        def sign_url(unsigned_url):
+            if ks:
+                unsigned_url += '/ks/%s' % ks
+            if referrer:
+                unsigned_url += '?referrer=%s' % referrer
+            return unsigned_url
+
+        data_url = info['dataUrl']
+        if '/flvclipper/' in data_url:
+            data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
+
+        formats = []
+        for f in flavor_assets:
+            # Continue if asset is not ready
+            if f.get('status') != 2:
+                continue
+            # Original format that's not available (e.g. kaltura:1926081:0_c03e1b5g)
+            # skip for now.
+            if f.get('fileExt') == 'chun':
+                continue
+            # DRM-protected video, cannot be decrypted
+            if f.get('fileExt') == 'wvm':
+                continue
+            if not f.get('fileExt'):
+                # QT indicates QuickTime; some videos have broken fileExt
+                if f.get('containerFormat') == 'qt':
+                    f['fileExt'] = 'mov'
+                else:
+                    f['fileExt'] = 'mp4'
+            video_url = sign_url(
+                '%s/flavorId/%s' % (data_url, f['id']))
+            format_id = '%(fileExt)s-%(bitrate)s' % f
+            # Source format may not be available (e.g. kaltura:513551:1_66x4rg7o)
+            if f.get('isOriginal') is True and not self._is_valid_url(
+                    video_url, entry_id, format_id):
+                continue
+            # audio-only has no videoCodecId (e.g. kaltura:1926081:0_c03e1b5g
+            # -f mp4-56)
+            vcodec = 'none' if 'videoCodecId' not in f and f.get(
+                'frameRate') == 0 else f.get('videoCodecId')
+            formats.append({
+                'format_id': format_id,
+                'ext': f.get('fileExt'),
+                'tbr': int_or_none(f['bitrate']),
+                'fps': int_or_none(f.get('frameRate')),
+                'filesize_approx': int_or_none(f.get('size'), invscale=1024),
+                'container': f.get('containerFormat'),
+                'vcodec': vcodec,
+                'height': int_or_none(f.get('height')),
+                'width': int_or_none(f.get('width')),
+                'url': video_url,
+            })
+        if '/playManifest/' in data_url:
+            m3u8_url = sign_url(data_url.replace(
+                'format/url', 'format/applehttp'))
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, entry_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        if captions:
+            for caption in captions.get('objects', []):
+                # Continue if caption is not ready
+                if caption.get('status') != 2:
+                    continue
+                if not caption.get('id'):
+                    continue
+                caption_format = int_or_none(caption.get('format'))
+                subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
+                    'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
+                    'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml',
+                })
+
+        return {
+            'id': entry_id,
+            'title': info['name'],
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': clean_html(info.get('description')),
+            'thumbnail': info.get('thumbnailUrl'),
+            'duration': info.get('duration'),
+            'timestamp': info.get('createdAt'),
+            'uploader_id': info.get('userId') if info.get('userId') != 'None' else None,
+            'view_count': info.get('plays'),
+        }
diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py
new file mode 100644 (file)
index 0000000..6c3498c
--- /dev/null
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    srt_subtitles_timecode,
+)
+
+
+class KanalPlayIE(InfoExtractor):
+    IE_DESC = 'Kanal 5/9/11 Play'
+    _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277',
+        'info_dict': {
+            'id': '3270012277',
+            'ext': 'flv',
+            'title': 'Saknar både dusch och avlopp',
+            'description': 'md5:6023a95832a06059832ae93bc3c7efb7',
+            'duration': 2636.36,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199',
+        'only_matching': True,
+    }]
+
+    def _fix_subtitles(self, subs):
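+        # Render the JSON cue list as an SRT document (cue times are given
+        # in milliseconds)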
+        return '\r\n\r\n'.join(
+            '%s\r\n%s --> %s\r\n%s'
+            % (
+                num,
+                srt_subtitles_timecode(item['startMillis'] / 1000.0),
+                srt_subtitles_timecode(item['endMillis'] / 1000.0),
+                item['text'],
+            ) for num, item in enumerate(subs, 1))
+
+    def _get_subtitles(self, channel_id, video_id):
+        subs = self._download_json(
+            'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id),
+            video_id, 'Downloading subtitles JSON', fatal=False)
+        return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {}
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        channel_id = mobj.group('channel_id')
+
+        video = self._download_json(
+            'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id),
+            video_id)
+
+        reasons_for_no_streams = video.get('reasonsForNoStreams')
+        if reasons_for_no_streams:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)),
+                expected=True)
+
+        title = video['title']
+        description = video.get('description')
+        duration = float_or_none(video.get('length'), 1000)
+        thumbnail = video.get('posterUrl')
+
+        stream_base_url = video['streamBaseUrl']
+
+        formats = [{
+            'url': stream_base_url,
+            'play_path': stream['source'],
+            'ext': 'flv',
+            'tbr': float_or_none(stream.get('bitrate'), 1000),
+            'rtmp_real_time': True,
+        } for stream in video['streams']]
+        self._sort_formats(formats)
+
+        subtitles = {}
+        if video.get('hasSubtitle'):
+            subtitles = self.extract_subtitles(channel_id, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py
new file mode 100644 (file)
index 0000000..a677ff4
--- /dev/null
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+import re
+import hashlib
+
+from .common import InfoExtractor
+
+def _md5(s):
+    return hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+class KankanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml'
+
+    _TEST = {
+        'url': 'http://yinyue.kankan.com/vod/48/48863.shtml',
+        'md5': '29aca1e47ae68fc28804aca89f29507e',
+        'info_dict': {
+            'id': '48863',
+            'ext': 'flv',
+            'title': 'Ready To Go',
+        },
+        'skip': 'Only available from China',
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, 'video title')
+        surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0)
+        gcids = re.findall(r'http://.+?/.+?/(.+?)/', surls)
+        gcid = gcids[-1]
+
+        info_url = 'http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid
+        video_info_page = self._download_webpage(
+            info_url, video_id, 'Downloading video url info')
+        ip = self._search_regex(r'ip:"(.+?)"', video_info_page, 'video url ip')
+        path = self._search_regex(r'path:"(.+?)"', video_info_page, 'video url path')
+        param1 = self._search_regex(r'param1:(\d+)', video_info_page, 'param1')
+        param2 = self._search_regex(r'param2:(\d+)', video_info_page, 'param2')
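+        # The CDN expects an MD5 signature over a static salt and both
+        # request params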
+        key = _md5('xl_mp43651' + param1 + param2)
+        video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+        }
diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py
new file mode 100644 (file)
index 0000000..bfccf89
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class KaraoketvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?karaoketv\.co\.il/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F',
+        'info_dict': {
+            'id': '58356',
+            'ext': 'flv',
+            'title': 'קריוקי של איזון',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
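+        # The video sits two iframes deep: the page embeds an api_play page,
+        # which in turn embeds the video-cdn player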
+        api_page_url = self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.karaoke\.co\.il/api_play\.php\?.+?)\1',
+            webpage, 'API play URL', group='url')
+
+        api_page = self._download_webpage(api_page_url, video_id)
+        video_cdn_url = self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.video-cdn\.com/embed/iframe/.+?)\1',
+            api_page, 'video cdn URL', group='url')
+
+        video_cdn = self._download_webpage(video_cdn_url, video_id)
+        play_path = self._parse_json(
+            self._search_regex(
+                r'var\s+options\s*=\s*({.+?});', video_cdn, 'options'),
+            video_id)['clip']['url']
+
+        settings = self._parse_json(
+            self._search_regex(
+                r'var\s+settings\s*=\s*({.+?});', video_cdn, 'servers', default='{}'),
+            video_id, fatal=False) or {}
+
+        servers = settings.get('servers')
+        if not servers or not isinstance(servers, list):
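+            # Fall back to the known default server when the player settings
+            # carry no server list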
+            servers = ('wowzail.video-cdn.com:80/vodcdn', )
+
+        formats = [{
+            'url': 'rtmp://%s' % server if not server.startswith('rtmp') else server,
+            'play_path': play_path,
+            'app': 'vodcdn',
+            'page_url': video_cdn_url,
+            'player_url': 'http://www.video-cdn.com/assets/flowplayer/flowplayer.commercial-3.2.18.swf',
+            'rtmp_real_time': True,
+            'ext': 'flv',
+        } for server in servers]
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644 (file)
index 0000000..7b291e0
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    xpath_with_ns,
+    xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': r're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': r're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = (self._html_search_meta('title', webpage, default=None)
+                 or self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))
+
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        # Server returns malformed headers
+        # Force Accept-Encoding: * to prevent gzipped results
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands,
+            headers={'Accept-Encoding': '*'})
+
+        NS_MAP = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
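+        # The playlist XML follows the JWPlayer flash format; read the first
+        # track entry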
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+        uploader = xpath_text(
+            item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>',
+            webpage, 'description')
+
+        thumbnail = self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
new file mode 100644 (file)
index 0000000..c3eb74c
--- /dev/null
@@ -0,0 +1,133 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..aes import aes_decrypt_text
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    strip_or_none,
+    url_or_none,
+)
+
+
+class KeezMoviesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.keezmovies.com/video/arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money-18070681',
+        'md5': '2ac69cdb882055f71d82db4311732a1a',
+        'info_dict': {
+            'id': '18070681',
+            'display_id': 'arab-wife-want-it-so-bad-i-see-she-thirsty-and-has-tiny-money',
+            'ext': 'mp4',
+            'title': 'Arab wife want it so bad I see she thirsty and has tiny money.',
+            'thumbnail': None,
+            'view_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://www.keezmovies.com/video/18070681',
+        'only_matching': True,
+    }]
+
+    def _extract_info(self, url, fatal=True):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = (mobj.group('display_id')
+                      if 'display_id' in mobj.groupdict()
+                      else None) or mobj.group('id')
+
+        webpage = self._download_webpage(
+            url, display_id, headers={'Cookie': 'age_verified=1'})
+
+        formats = []
+        format_urls = set()
+
+        title = None
+        thumbnail = None
+        duration = None
+        encrypted = False
+
+        def extract_format(format_url, height=None):
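+            # Deduplicate URLs and derive tbr/height hints from path
+            # components such as _1500k_ or _720p_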
+            format_url = url_or_none(format_url)
+            if not format_url or not format_url.startswith(('http', '//')):
+                return
+            if format_url in format_urls:
+                return
+            format_urls.add(format_url)
+            tbr = int_or_none(self._search_regex(
+                r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
+            if not height:
+                height = int_or_none(self._search_regex(
+                    r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
+            if encrypted:
+                # The URL is AES-encrypted with the video title as the key
+                format_url = aes_decrypt_text(
+                    format_url, title, 32).decode('utf-8')
+            formats.append({
+                'url': format_url,
+                'format_id': '%dp' % height if height else None,
+                'height': height,
+                'tbr': tbr,
+            })
+
+        flashvars = self._parse_json(
+            self._search_regex(
+                r'flashvars\s*=\s*({.+?});', webpage,
+                'flashvars', default='{}'),
+            display_id, fatal=False)
+
+        if flashvars:
+            title = flashvars.get('video_title')
+            thumbnail = flashvars.get('image_url')
+            duration = int_or_none(flashvars.get('video_duration'))
+            encrypted = flashvars.get('encrypted') is True
+            for key, value in flashvars.items():
+                mobj = re.search(r'quality_(\d+)[pP]', key)
+                if mobj:
+                    extract_format(value, int(mobj.group(1)))
+            video_url = flashvars.get('video_url')
+            if video_url and determine_ext(video_url, None):
+                extract_format(video_url)
+
+        video_url = self._html_search_regex(
+            r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
+            webpage, 'video url', default=None, group='url')
+        if video_url:
+            extract_format(compat_urllib_parse_unquote(video_url))
+
+        if not formats:
+            if 'title="This video is no longer available"' in webpage:
+                raise ExtractorError(
+                    'Video %s is no longer available' % video_id, expected=True)
+
+        try:
+            self._sort_formats(formats)
+        except ExtractorError:
+            if fatal:
+                raise
+
+        if not title:
+            title = self._html_search_regex(
+                r'<h1[^>]*>([^<]+)', webpage, 'title')
+
+        return webpage, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': strip_or_none(title),
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': 18,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        webpage, info = self._extract_info(url, fatal=False)
+        if not info['formats']:
+            return self.url_result(url, 'Generic')
+        info['view_count'] = str_to_int(self._search_regex(
+            r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False))
+        return info
diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py
new file mode 100644 (file)
index 0000000..93a98e1
--- /dev/null
@@ -0,0 +1,93 @@
+from __future__ import unicode_literals
+
+from .canvas import CanvasIE
+from .common import InfoExtractor
+
+
+class KetnetIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes',
+        'md5': '6bdeb65998930251bbd1c510750edba9',
+        'info_dict': {
+            'id': 'zomerse-filmpjes',
+            'ext': 'mp4',
+            'title': 'Gluur mee op de filmset en op Pennenzakkenrock',
+            'description': 'Gluur mee met Ghost Rockers op de filmset',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        # mzid in playerConfig instead of sources
+        'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook',
+        'md5': '90139b746a0a9bd7bb631283f6e2a64e',
+        'info_dict': {
+            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
+            'ext': 'flv',
+            'title': 'Nachtwacht: De Greystook',
+            'description': 'md5:1db3f5dc4c7109c821261e7512975be7',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1468.03,
+        },
+        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
+    }, {
+        'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life',
+        'only_matching': True,
+    }, {
+        # mzsource, geo restricted to Belgium
+        'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        config = self._parse_json(
+            self._search_regex(
+                r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage,
+                'player config'),
+            video_id)
+
+        mzid = config.get('mzid')
+        if mzid:
+            return self.url_result(
+                'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid,
+                CanvasIE.ie_key(), video_id=mzid)
+
+        title = config['title']
+
+        formats = []
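+        # Check both the plain source and the mzsource variants (the latter
+        # may be geo restricted)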
+        for source_key in ('', 'mz'):
+            source = config.get('%ssource' % source_key)
+            if not isinstance(source, dict):
+                continue
+            for format_id, format_url in source.items():
+                if format_id == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id=format_id,
+                        fatal=False))
+                elif format_id == 'hds':
+                    formats.extend(self._extract_f4m_formats(
+                        format_url, video_id, f4m_id=format_id, fatal=False))
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': config.get('description'),
+            'thumbnail': config.get('image'),
+            'series': config.get('program'),
+            'episode': config.get('episode'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
new file mode 100644 (file)
index 0000000..61739ef
--- /dev/null
@@ -0,0 +1,82 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+)
+
+
+class KhanAcademyIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])'
+    IE_NAME = 'KhanAcademy'
+
+    _TESTS = [{
+        'url': 'http://www.khanacademy.org/video/one-time-pad',
+        'md5': '7b391cce85e758fb94f763ddc1bbb979',
+        'info_dict': {
+            'id': 'one-time-pad',
+            'ext': 'webm',
+            'title': 'The one-time pad',
+            'description': 'The perfect cipher',
+            'duration': 176,
+            'uploader': 'Brit Cruise',
+            'uploader_id': 'khanacademy',
+            'upload_date': '20120411',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
+        'info_dict': {
+            'id': 'cryptography',
+            'title': 'Journey into cryptography',
+            'description': 'How have humans protected their secret messages through history? What has changed today?',
+        },
+        'playlist_mincount': 3,
+    }]
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        if m.group('key') == 'video':
+            data = self._download_json(
+                'http://api.khanacademy.org/api/v1/videos/' + video_id,
+                video_id, 'Downloading video info')
+
+            upload_date = unified_strdate(data['date_added'])
+            uploader = ', '.join(data['author_names'])
+            return {
+                '_type': 'url_transparent',
+                'url': data['url'],
+                'id': video_id,
+                'title': data['title'],
+                'thumbnail': data['image_url'],
+                'duration': data['duration'],
+                'description': data['description'],
+                'uploader': uploader,
+                'upload_date': upload_date,
+            }
+        else:
+            # topic
+            data = self._download_json(
+                'http://api.khanacademy.org/api/v1/topic/' + video_id,
+                video_id, 'Downloading topic info')
+
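+            # A topic lists child videos and sub-topics; emit each as a URL
+            # result so the matching extractor handles it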
+            entries = [
+                {
+                    '_type': 'url',
+                    'url': c['url'],
+                    'id': c['id'],
+                    'title': c['title'],
+                }
+                for c in data['children'] if c['kind'] in ('Video', 'Topic')]
+
+            return {
+                '_type': 'playlist',
+                'id': video_id,
+                'title': data['title'],
+                'description': data['description'],
+                'entries': entries,
+            }
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
new file mode 100644 (file)
index 0000000..d4da8f4
--- /dev/null
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class KickStarterIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?kickstarter\.com/projects/(?P<id>[^/]*)/.*'
+    _TESTS = [{
+        'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant/description',
+        'md5': 'c81addca81327ffa66c642b5d8b08cab',
+        'info_dict': {
+            'id': '1404461844',
+            'ext': 'mp4',
+            'title': 'Intersection: The Story of Josh Grant by Kyle Cowling',
+            'description': (
+                'A unique motocross documentary that examines the '
+                'life and mind of one of sports most elite athletes: Josh Grant.'
+            ),
+        },
+    }, {
+        'note': 'Embedded video (not using the native kickstarter video service)',
+        'url': 'https://www.kickstarter.com/projects/597507018/pebble-e-paper-watch-for-iphone-and-android/posts/659178',
+        'info_dict': {
+            'id': '78704821',
+            'ext': 'mp4',
+            'uploader_id': 'pebble',
+            'uploader': 'Pebble Technology',
+            'title': 'Pebble iOS Notifications',
+        },
+        'add_ie': ['Vimeo'],
+    }, {
+        'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html',
+        'info_dict': {
+            'id': '1420158244',
+            'ext': 'mp4',
+            'title': 'Power Drive 2000',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<title>\s*(.*?)(?:\s*&mdash;\s*Kickstarter)?\s*</title>',
+            webpage, 'title')
+        video_url = self._search_regex(
+            r'data-video-url="(.*?)"',
+            webpage, 'video URL', default=None)
+        if video_url is None:  # No native Kickstarter video; look for embedded videos
+            return {
+                '_type': 'url_transparent',
+                'ie_key': 'Generic',
+                'url': smuggle_url(url, {'to_generic': True}),
+                'title': title,
+            }
+
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            thumbnail = self._html_search_regex(
+                r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"',
+                webpage, 'thumbnail image', fatal=False)
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py
new file mode 100644 (file)
index 0000000..79e3026
--- /dev/null
@@ -0,0 +1,221 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    strip_or_none,
+    try_get,
+    unescapeHTML,
+    urljoin,
+)
+
+
+class KinjaEmbedIE(InfoExtractor):
+    IE_NAME = 'kinja:embed'
+    _DOMAIN_REGEX = r'''(?:[^.]+\.)?
+        (?:
+            avclub|
+            clickhole|
+            deadspin|
+            gizmodo|
+            jalopnik|
+            jezebel|
+            kinja|
+            kotaku|
+            lifehacker|
+            splinternews|
+            the(?:inventory|onion|root|takeout)
+        )\.com'''
+    _COMMON_REGEX = r'''/
+        (?:
+            ajax/inset|
+            embed/video
+        )/iframe\?.*?\bid='''
+    _VALID_URL = r'''(?x)https?://%s%s
+        (?P<type>
+            fb|
+            imgur|
+            instagram|
+            jwp(?:layer)?-video|
+            kinjavideo|
+            mcp|
+            megaphone|
+            ooyala|
+            soundcloud(?:-playlist)?|
+            tumblr-post|
+            twitch-stream|
+            twitter|
+            ustream-channel|
+            vimeo|
+            vine|
+            youtube-(?:list|video)
+        )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX)
+    _TESTS = [{
+        'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E',
+        'only_matching': True,
+    }, {
+        'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE',
+        'only_matching': True,
+    }]
+    _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform')
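+    # Maps each embed type to a (URL template or prefix, extractor) pair
+    # used to delegate extraction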
+    _PROVIDER_MAP = {
+        'fb': ('facebook.com/video.php?v=', 'Facebook'),
+        'imgur': ('imgur.com/', 'Imgur'),
+        'instagram': ('instagram.com/p/', 'Instagram'),
+        'jwplayer-video': _JWPLATFORM_PROVIDER,
+        'jwp-video': _JWPLATFORM_PROVIDER,
+        'megaphone': ('player.megaphone.fm/', 'Generic'),
+        'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'),
+        'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'),
+        'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'),
+        'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'),
+        'twitch-stream': ('twitch.tv/', 'TwitchStream'),
+        'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'),
+        'ustream-channel': ('ustream.tv/embed/', 'Ustream'),
+        'vimeo': ('vimeo.com/', 'Vimeo'),
+        'vine': ('vine.co/v/', 'Vine'),
+        'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'),
+        'youtube-video': ('youtube.com/embed/', 'Youtube'),
+    }
+
+    @staticmethod
+    def _extract_urls(webpage, url):
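+        # Used by the generic extractor to discover embedded Kinja players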
+        return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer(
+            r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX),
+            webpage)]
+
+    def _real_extract(self, url):
+        video_type, video_id = re.match(self._VALID_URL, url).groups()
+
+        provider = self._PROVIDER_MAP.get(video_type)
+        if provider:
+            video_id = compat_urllib_parse_unquote(video_id)
+            if video_type == 'tumblr-post':
+                video_id, blog = video_id.split('-', 1)
+                result_url = provider[0] % (blog, video_id)
+            elif video_type == 'youtube-list':
+                video_id, playlist_id = video_id.split('/')
+                result_url = provider[0] % (video_id, playlist_id)
+            else:
+                if video_type == 'ooyala':
+                    video_id = video_id.split('/')[0]
+                result_url = provider[0] + video_id
+            return self.url_result('http://' + result_url, provider[1])
+
+        if video_type == 'kinjavideo':
+            data = self._download_json(
+                'https://kinja.com/api/core/video/views/videoById',
+                video_id, query={'videoId': video_id})['data']
+            title = data['title']
+
+            formats = []
+            for k in ('signedPlaylist', 'streaming'):
+                m3u8_url = data.get(k + 'Url')
+                if m3u8_url:
+                    formats.extend(self._extract_m3u8_formats(
+                        m3u8_url, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+            self._sort_formats(formats)
+
+            thumbnail = None
+            poster = data.get('poster') or {}
+            poster_id = poster.get('id')
+            if poster_id:
+                thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg')
+
+            return {
+                'id': video_id,
+                'title': title,
+                'description': strip_or_none(data.get('description')),
+                'formats': formats,
+                'tags': data.get('tags'),
+                'timestamp': int_or_none(try_get(
+                    data, lambda x: x['postInfo']['publishTimeMillis']), 1000),
+                'thumbnail': thumbnail,
+                'uploader': data.get('network'),
+            }
+        else:
+            video_data = self._download_json(
+                'https://api.vmh.univision.com/metadata/v1/content/' + video_id,
+                video_id)['videoMetadata']
+            iptc = video_data['photoVideoMetadataIPTC']
+            title = iptc['title']['en']
+            fmg = video_data.get('photoVideoMetadata_fmg') or {}
+            tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com'
+            data = self._download_json(
+                tvss_domain + '/api/v3/video-auth/url-signature-tokens',
+                video_id, query={'mcpids': video_id})['data'][0]
+            formats = []
+
+            rendition_url = data.get('renditionUrl')
+            if rendition_url:
+                formats = self._extract_m3u8_formats(
+                    rendition_url, video_id, 'mp4',
+                    'm3u8_native', m3u8_id='hls', fatal=False)
+
+            fallback_rendition_url = data.get('fallbackRenditionUrl')
+            if fallback_rendition_url:
+                formats.append({
+                    'format_id': 'fallback',
+                    'tbr': int_or_none(self._search_regex(
+                        r'_(\d+)\.mp4', fallback_rendition_url,
+                        'bitrate', default=None)),
+                    'url': fallback_rendition_url,
+                })
+
+            self._sort_formats(formats)
+
+            return {
+                'id': video_id,
+                'title': title,
+                'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str),
+                'uploader': fmg.get('network'),
+                'duration': int_or_none(iptc.get('fileDuration')),
+                'formats': formats,
+                'description': try_get(iptc, lambda x: x['description']['en'], compat_str),
+                'timestamp': parse_iso8601(iptc.get('dateReleased')),
+            }
diff --git a/youtube_dl/extractor/kinopoisk.py b/youtube_dl/extractor/kinopoisk.py
new file mode 100644 (file)
index 0000000..9e8d01f
--- /dev/null
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    dict_get,
+    int_or_none,
+)
+
+
+class KinoPoiskIE(InfoExtractor):
+    _GEO_COUNTRIES = ['RU']
+    _VALID_URL = r'https?://(?:www\.)?kinopoisk\.ru/film/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.kinopoisk.ru/film/81041/watch/',
+        'md5': '4f71c80baea10dfa54a837a46111d326',
+        'info_dict': {
+            'id': '81041',
+            'ext': 'mp4',
+            'title': 'Алеша попович и тугарин змей',
+            'description': 'md5:43787e673d68b805d0aa1df5a5aea701',
+            'thumbnail': r're:^https?://.*',
+            'duration': 4533,
+            'age_limit': 12,
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }, {
+        'url': 'https://www.kinopoisk.ru/film/81041',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://ott-widget.kinopoisk.ru/v1/kp/', video_id,
+            query={'kpId': video_id})
+
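+        # The OTT widget embeds its player models as JSON inside an
+        # application/json <script> tag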
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)<script[^>]+\btype=["\']application/json[^>]+>(.+?)<',
+                webpage, 'data'),
+            video_id)['models']
+
+        film = data['filmStatus']
+        title = film.get('title') or film['originalTitle']
+
+        formats = self._extract_m3u8_formats(
+            data['playlistEntity']['uri'], video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = dict_get(
+            film, ('description', 'shortDescription'))
+        thumbnail = film.get('coverUrl') or film.get('posterUrl')
+        duration = int_or_none(film.get('duration'))
+        age_limit = int_or_none(film.get('restrictionAge'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py
new file mode 100644 (file)
index 0000000..dd42bb2
--- /dev/null
@@ -0,0 +1,124 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    url_or_none,
+)
+
+
+class KonserthusetPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:konserthusetplay|rspoplay)\.se/\?.*\bm=(?P<id>[^&]+)'
+    _TESTS = [{
+        'url': 'http://www.konserthusetplay.se/?m=CKDDnlCY-dhWAAqiMERd-A',
+        'md5': 'e3fd47bf44e864bd23c08e487abe1967',
+        'info_dict': {
+            'id': 'CKDDnlCY-dhWAAqiMERd-A',
+            'ext': 'mp4',
+            'title': 'Orkesterns instrument: Valthornen',
+            'description': 'md5:f10e1f0030202020396a4d712d2fa827',
+            'thumbnail': 're:^https?://.*$',
+            'duration': 398.76,
+        },
+    }, {
+        'url': 'http://rspoplay.se/?m=elWuEH34SMKvaO4wO_cHBw',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        e = self._search_regex(
+            r'https?://csp\.picsearch\.com/rest\?.*\be=(.+?)[&"\']', webpage, 'e')
+
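+        # The REST endpoint returns JSONP; trim the padding down to the bare
+        # JSON object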
+        rest = self._download_json(
+            'http://csp.picsearch.com/rest?e=%s&containerId=mediaplayer&i=object' % e,
+            video_id, transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        media = rest['media']
+        player_config = media['playerconfig']
+        playlist = player_config['playlist']
+
+        source = next(f for f in playlist if f.get('bitrates') or f.get('provider'))
+
+        FORMAT_ID_REGEX = r'_([^_]+)_h264m\.mp4'
+
+        formats = []
+
+        m3u8_url = source.get('url')
+        if m3u8_url and determine_ext(m3u8_url) == 'm3u8':
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        fallback_url = source.get('fallbackUrl')
+        fallback_format_id = None
+        if fallback_url:
+            fallback_format_id = self._search_regex(
+                FORMAT_ID_REGEX, fallback_url, 'format id', default=None)
+
+        connection_url = (
+            player_config.get('rtmp', {}).get('netConnectionUrl')
+            or player_config.get(
+                'plugins', {}).get('bwcheck', {}).get('netConnectionUrl'))
+        if connection_url:
+            for f in source['bitrates']:
+                video_url = f.get('url')
+                if not video_url:
+                    continue
+                format_id = self._search_regex(
+                    FORMAT_ID_REGEX, video_url, 'format id', default=None)
+                f_common = {
+                    'vbr': int_or_none(f.get('bitrate')),
+                    'width': int_or_none(f.get('width')),
+                    'height': int_or_none(f.get('height')),
+                }
+                f = f_common.copy()
+                f.update({
+                    'url': connection_url,
+                    'play_path': video_url,
+                    'format_id': 'rtmp-%s' % format_id if format_id else 'rtmp',
+                    'ext': 'flv',
+                })
+                formats.append(f)
+                if format_id and format_id == fallback_format_id:
+                    f = f_common.copy()
+                    f.update({
+                        'url': fallback_url,
+                        'format_id': 'http-%s' % format_id if format_id else 'http',
+                    })
+                    formats.append(f)
+
+        if not formats and fallback_url:
+            formats.append({
+                'url': fallback_url,
+            })
+
+        self._sort_formats(formats)
+
+        title = player_config.get('title') or media['title']
+        description = player_config.get('mediaInfo', {}).get('description')
+        thumbnail = media.get('image')
+        duration = float_or_none(media.get('duration'), 1000)
+
+        subtitles = {}
+        captions = source.get('captionsAvailableLanguages')
+        if isinstance(captions, dict):
+            for lang, subtitle_url in captions.items():
+                subtitle_url = url_or_none(subtitle_url)
+                if lang != 'none' and subtitle_url:
+                    subtitles.setdefault(lang, []).append({'url': subtitle_url})
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py
new file mode 100644 (file)
index 0000000..d27d052
--- /dev/null
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+)
+
+
+class KrasViewIE(InfoExtractor):
+    IE_DESC = 'Красвью'
+    _VALID_URL = r'https?://krasview\.ru/(?:video|embed)/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://krasview.ru/video/512228',
+        'md5': '3b91003cf85fc5db277870c8ebd98eae',
+        'info_dict': {
+            'id': '512228',
+            'ext': 'mp4',
+            'title': 'Снег, лёд, заносы',
+            'description': 'Снято в городе Нягань, в Ханты-Мансийском автономном округе.',
+            'duration': 27,
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+        'params': {
+            'skip_download': 'Not accessible from Travis CI server',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        flashvars = json.loads(js_to_json(self._search_regex(
+            r'video_Init\(({.+?})', webpage, 'flashvars')))
+
+        video_url = flashvars['url']
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = flashvars.get('image') or self._og_search_thumbnail(webpage)
+        duration = int_or_none(flashvars.get('duration'))
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, 'video width', default=None))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, 'video height', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'width': width,
+            'height': height,
+        }
diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py
new file mode 100644 (file)
index 0000000..a574408
--- /dev/null
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class Ku6IE(InfoExtractor):
+    _VALID_URL = r'https?://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html'
+    _TEST = {
+        'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html',
+        'md5': '01203549b9efbb45f4b87d55bdea1ed1',
+        'info_dict': {
+            'id': 'JG-8yS14xzBr4bCn1pu0xw',
+            'ext': 'f4v',
+            'title': 'techniques test',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h1 title=.*>(.*?)</h1>', webpage, 'title')
+        dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id
+        jsonData = self._download_json(dataUrl, video_id)
+        downloadUrl = jsonData['data']['f']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': downloadUrl
+        }
diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py
new file mode 100644 (file)
index 0000000..6a7e3ba
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    timeconvert,
+    update_url_query,
+    xpath_text,
+)
+
+
+class KUSIIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
+    _TESTS = [{
+        'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right',
+        'md5': '4e76ce8e53660ce9697d06c0ba6fc47d',
+        'info_dict': {
+            'id': '12689020',
+            'ext': 'mp4',
+            'title': "Turko Files: Refused to Help, It Ain't Right!",
+            'duration': 223.586,
+            'upload_date': '20160826',
+            'timestamp': 1472233118,
+            'thumbnail': r're:^https?://.*\.jpg$'
+        },
+    }, {
+        'url': 'http://kusi.com/video?clipId=12203019',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        clip_id = mobj.group('clipId')
+        video_id = clip_id or mobj.group('path')
+
+        webpage = self._download_webpage(url, video_id)
+
+        if clip_id is None:
+            video_id = clip_id = self._html_search_regex(
+                r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id')
+
+        affiliate_id = self._search_regex(
+            r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id')
+
+        # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf
+        xml_url = update_url_query('http://www.kusi.com/build.asp', {
+            'buildtype': 'buildfeaturexmlrequest',
+            'featureType': 'Clip',
+            'featureid': clip_id,
+            'affiliateno': affiliate_id,
+            'clientgroupid': '1',
+            'rnd': int(round(random.random() * 1000000)),
+        })
+
+        doc = self._download_xml(xml_url, video_id)
+
+        video_title = xpath_text(doc, 'HEADLINE', fatal=True)
+        duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
+        description = xpath_text(doc, 'ABSTRACT')
+        thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
+        creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
+
+        quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')
+        formats = []
+        for quality in quality_options:
+            formats.append({
+                'url': compat_urllib_parse_unquote_plus(quality.attrib['url']),
+                'height': int_or_none(quality.attrib.get('height')),
+                'width': int_or_none(quality.attrib.get('width')),
+                'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': description,
+            'duration': duration,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'timestamp': createtion_time,
+        }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
new file mode 100644 (file)
index 0000000..cc5b2a1
--- /dev/null
@@ -0,0 +1,352 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    get_element_by_id,
+    clean_html,
+    ExtractorError,
+    InAdvancePagedList,
+    remove_start,
+)
+
+
+class KuwoBaseIE(InfoExtractor):
+    _FORMATS = [
+        {'format': 'ape', 'ext': 'ape', 'preference': 100},
+        {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
+        {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
+        {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
+        {'format': 'wma', 'ext': 'wma', 'preference': 20},
+        {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
+    ]
+
+    def _get_formats(self, song_id, tolerate_ip_deny=False):
+        formats = []
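+        # Probe every known quality tier; the anti-leech endpoint returns a
+        # plain URL, or 'IPDeny' when the client region is blocked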
+        for file_format in self._FORMATS:
+            query = {
+                'format': file_format['ext'],
+                'br': file_format.get('br', ''),
+                'rid': 'MUSIC_%s' % song_id,
+                'type': 'convert_url',
+                'response': 'url'
+            }
+
+            song_url = self._download_webpage(
+                'http://antiserver.kuwo.cn/anti.s',
+                song_id, note='Download %s url info' % file_format['format'],
+                query=query, headers=self.geo_verification_headers(),
+            )
+
+            if song_url == 'IPDeny' and not tolerate_ip_deny:
+                raise ExtractorError('This song is blocked in this region', expected=True)
+
+            if song_url.startswith(('http://', 'https://')):
+                formats.append({
+                    'url': song_url,
+                    'format_id': file_format['format'],
+                    'format': file_format['format'],
+                    'preference': file_format['preference'],
+                    'abr': file_format.get('abr'),
+                })
+
+        return formats
+
+
+class KuwoIE(KuwoBaseIE):
+    IE_NAME = 'kuwo:song'
+    IE_DESC = '酷我音乐'
+    _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/yinyue/635632/',
+        'info_dict': {
+            'id': '635632',
+            'ext': 'ape',
+            'title': '爱我别走',
+            'creator': '张震岳',
+            'upload_date': '20080122',
+            'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c'
+        },
+        'skip': 'this song has been offline because of copyright issues',
+    }, {
+        'url': 'http://www.kuwo.cn/yinyue/6446136/',
+        'info_dict': {
+            'id': '6446136',
+            'ext': 'mp3',
+            'title': '心',
+            'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c',
+            'creator': 'IU',
+            'upload_date': '20150518',
+        },
+        'params': {
+            'format': 'mp3-320',
+        },
+    }, {
+        'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+        webpage, urlh = self._download_webpage_handle(
+            url, song_id, note='Download song detail info',
+            errnote='Unable to get song detail info')
+        if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
+            raise ExtractorError('this song has been offline because of copyright issues', expected=True)
+
+        song_name = self._html_search_regex(
+            r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name')
+        singer_name = remove_start(self._html_search_regex(
+            r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">',
+            webpage, 'singer name', fatal=False), '歌手')
+        lrc_content = clean_html(get_element_by_id('lrcContent', webpage))
+        if lrc_content == '暂无':     # indicates no lyrics
+            lrc_content = None
+
+        formats = self._get_formats(song_id)
+        self._sort_formats(formats)
+
+        album_id = self._html_search_regex(
+            r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
+            webpage, 'album id', fatal=False)
+
+        publish_time = None
+        if album_id is not None:
+            album_info_page = self._download_webpage(
+                'http://www.kuwo.cn/album/%s/' % album_id, song_id,
+                note='Download album detail info',
+                errnote='Unable to get album detail info')
+
+            publish_time = self._html_search_regex(
+                r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page,
+                'publish time', fatal=False)
+            if publish_time:
+                publish_time = publish_time.replace('-', '')
+
+        return {
+            'id': song_id,
+            'title': song_name,
+            'creator': singer_name,
+            'upload_date': publish_time,
+            'description': lrc_content,
+            'formats': formats,
+        }
+
+
+class KuwoAlbumIE(InfoExtractor):
+    IE_NAME = 'kuwo:album'
+    IE_DESC = '酷我音乐 - 专辑'
+    _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/album/502294/',
+        'info_dict': {
+            'id': '502294',
+            'title': 'Made\xa0Series\xa0《M》',
+            'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f',
+        },
+        'playlist_count': 2,
+    }
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, album_id, note='Download album info',
+            errnote='Unable to get album info')
+
+        album_name = self._html_search_regex(
+            r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
+            'album name')
+        album_intro = remove_start(
+            clean_html(get_element_by_id('intro', webpage)),
+            '%s简介:' % album_name)
+
+        entries = [
+            self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
+                webpage)
+        ]
+        return self.playlist_result(entries, album_id, album_name, album_intro)
+
+
+class KuwoChartIE(InfoExtractor):
+    IE_NAME = 'kuwo:chart'
+    IE_DESC = '酷我音乐 - 排行榜'
+    _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+)\.htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
+        'info_dict': {
+            'id': '香港中文龙虎榜',
+        },
+        'playlist_mincount': 7,
+    }
+
+    def _real_extract(self, url):
+        chart_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, chart_id, note='Download chart info',
+            errnote='Unable to get chart info')
+
+        entries = [
+            self.url_result(song_url, 'Kuwo') for song_url in re.findall(
+                r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage)
+        ]
+        return self.playlist_result(entries, chart_id)
+
+
+class KuwoSingerIE(InfoExtractor):
+    IE_NAME = 'kuwo:singer'
+    IE_DESC = '酷我音乐 - 歌手'
+    _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
+        'info_dict': {
+            'id': 'bruno+mars',
+            'title': 'Bruno\xa0Mars',
+        },
+        'playlist_mincount': 329,
+    }, {
+        'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
+        'info_dict': {
+            'id': 'Ali',
+            'title': 'Ali',
+        },
+        'playlist_mincount': 95,
+        'skip': 'Regularly stalls travis build',  # See https://travis-ci.org/ytdl-org/youtube-dl/jobs/78878540
+    }]
+
+    PAGE_SIZE = 15
+
+    def _real_extract(self, url):
+        singer_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, singer_id, note='Download singer info',
+            errnote='Unable to get singer info')
+
+        singer_name = self._html_search_regex(
+            r'<h1>([^<]+)</h1>', webpage, 'singer name')
+
+        artist_id = self._html_search_regex(
+            r'data-artistid="(\d+)"', webpage, 'artist id')
+
+        page_count = int(self._html_search_regex(
+            r'data-page="(\d+)"', webpage, 'page count'))
+
+        def page_func(page_num):
+            webpage = self._download_webpage(
+                'http://www.kuwo.cn/artist/contentMusicsAjax',
+                singer_id, note='Download song list page #%d' % (page_num + 1),
+                errnote='Unable to get song list page #%d' % (page_num + 1),
+                query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE})
+
+            return [
+                self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo')
+                for song_url in re.findall(
+                    r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)',
+                    webpage)
+            ]
+
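+        # InAdvancePagedList fetches the PAGE_SIZE-entry pages lazily: with
+        # e.g. 329 songs at 15 per page, at most ceil(329 / 15) == 22 requests
+        # are made, and only for the pages that are actually consumed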
+        entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE)
+
+        return self.playlist_result(entries, singer_id, singer_name)
+
+
+class KuwoCategoryIE(InfoExtractor):
+    IE_NAME = 'kuwo:category'
+    IE_DESC = '酷我音乐 - 分类'
+    _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?)\.htm'
+    _TEST = {
+        'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
+        'info_dict': {
+            'id': '86375',
+            'title': '八十年代精选',
+            'description': '这些都是属于八十年代的回忆!',
+        },
+        'playlist_mincount': 24,
+    }
+
+    def _real_extract(self, url):
+        category_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, category_id, note='Download category info',
+            errnote='Unable to get category info')
+
+        category_name = self._html_search_regex(
+            r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')
+
+        category_desc = remove_start(
+            (get_element_by_id('intro', webpage) or '').strip(),
+            '%s简介:' % category_name)
+        if category_desc == '暂无':
+            category_desc = None
+
+        jsonm = self._parse_json(self._html_search_regex(
+            r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
+
+        entries = [
+            self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo')
+            for song in jsonm['musiclist']
+        ]
+        return self.playlist_result(entries, category_id, category_name, category_desc)
+
+
+class KuwoMvIE(KuwoBaseIE):
+    IE_NAME = 'kuwo:mv'
+    IE_DESC = '酷我音乐 - MV'
+    _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/'
+    _TEST = {
+        'url': 'http://www.kuwo.cn/mv/6480076/',
+        'info_dict': {
+            'id': '6480076',
+            'ext': 'mp4',
+            'title': 'My HouseMV',
+            'creator': '2PM',
+        },
+        # In this video, music URLs (anti.s) are blocked outside China and
+        # USA, while the MV URL (mvurl) is available globally, so force the MV
+        # URL for consistent results in different countries
+        'params': {
+            'format': 'mv',
+        },
+    }
+    _FORMATS = KuwoBaseIE._FORMATS + [
+        {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
+        {'format': 'mp4', 'ext': 'mp4', 'preference': 200},
+    ]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, song_id, note='Download mv detail info: %s' % song_id,
+            errnote='Unable to get mv detail info: %s' % song_id)
+
+        mobj = re.search(
+            r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
+            webpage)
+        if mobj:
+            song_name = mobj.group('song')
+            singer_name = mobj.group('singer')
+        else:
+            raise ExtractorError('Unable to find song or singer names')
+
+        formats = self._get_formats(song_id, tolerate_ip_deny=True)
+
+        mv_url = self._download_webpage(
+            'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id,
+            song_id, note='Download %s MV URL' % song_id)
+        formats.append({
+            'url': mv_url,
+            'format_id': 'mv',
+        })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': song_id,
+            'title': song_name,
+            'creator': singer_name,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py
new file mode 100644 (file)
index 0000000..c3b4ffa
--- /dev/null
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    smuggle_url,
+)
+
+
+class LA7IE(InfoExtractor):
+    IE_NAME = 'la7.it'
+    _VALID_URL = r'''(?x)(https?://)?(?:
+        (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
+        tg\.la7\.it/repliche-tgla7\?id=
+    )(?P<id>.+)'''
+
+    _TESTS = [{
+        # 'src' is a plain URL
+        'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
+        'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
+        'info_dict': {
+            'id': '0_42j6wd36',
+            'ext': 'mp4',
+            'title': 'Inc.Cool8',
+            'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto  atletico',
+            'thumbnail': 're:^https?://.*',
+            'uploader_id': 'kdla7pillole@iltrovatore.it',
+            'timestamp': 1443814869,
+            'upload_date': '20151002',
+        },
+    }, {
+        # 'src' is a dictionary
+        'url': 'http://tg.la7.it/repliche-tgla7?id=189080',
+        'md5': '6b0d8888d286e39870208dfeceaf456b',
+        'info_dict': {
+            'id': '189080',
+            'ext': 'mp4',
+            'title': 'TG LA7',
+        },
+    }, {
+        'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_data = self._parse_json(
+            self._search_regex(
+                [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'],
+                webpage, 'player data'),
+            video_id, transform_source=js_to_json)
+
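+        # Delegate extraction to the Kaltura extractor: in 'kaltura:103:<vid>',
+        # 103 is LA7's Kaltura partner ID, and smuggle_url() carries the
+        # non-standard Kaltura service URL along with the entry reference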
+        return {
+            '_type': 'url_transparent',
+            'url': smuggle_url('kaltura:103:%s' % player_data['vid'], {
+                'service_url': 'http://nkdam.iltrovatore.it',
+            }),
+            'id': video_id,
+            'title': player_data['title'],
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': player_data.get('poster'),
+            'ie_key': 'Kaltura',
+        }
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
new file mode 100644 (file)
index 0000000..fa21736
--- /dev/null
@@ -0,0 +1,265 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+    urlencode_postdata,
+    xpath_element,
+    xpath_text,
+    update_url_query,
+    js_to_json,
+)
+
+
+class Laola1TvEmbedIE(InfoExtractor):
+    IE_NAME = 'laola1tv:embed'
+    _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)'
+    _TESTS = [{
+        # flashvars.premium = "false";
+        'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024',
+        'info_dict': {
+            'id': '708065',
+            'ext': 'mp4',
+            'title': 'MA Long CHN - FAN Zhendong CHN',
+            'uploader': 'ITTF - International Table Tennis Federation',
+            'upload_date': '20161211',
+        },
+    }]
+
+    def _extract_token_url(self, stream_access_url, video_id, data):
+        return self._download_json(
+            self._proto_relative_url(stream_access_url, 'https:'), video_id,
+            headers={
+                'Content-Type': 'application/json',
+            }, data=json.dumps(data).encode())['data']['stream-access'][0]
+
+    def _extract_formats(self, token_url, video_id):
+        token_doc = self._download_xml(
+            token_url, video_id, 'Downloading token',
+            headers=self.geo_verification_headers())
+
+        token_attrib = xpath_element(token_doc, './/token').attrib
+
+        if token_attrib['status'] != '0':
+            raise ExtractorError(
+                'Token error: %s' % token_attrib['comment'], expected=True)
+
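+        # the Akamai edge authentication token is passed along as the 'hdnea'
+        # query parameter of the format URL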
+        formats = self._extract_akamai_formats(
+            '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
+            video_id)
+        self._sort_formats(formats)
+        return formats
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        flash_vars = self._search_regex(
+            r'(?s)flashvars\s*=\s*({.+?});', webpage, 'flash vars')
+
+        def get_flashvar(x, *args, **kwargs):
+            flash_var = self._search_regex(
+                r'%s\s*:\s*"([^"]+)"' % x,
+                flash_vars, x, default=None)
+            if not flash_var:
+                flash_var = self._search_regex([
+                    r'flashvars\.%s\s*=\s*"([^"]+)"' % x,
+                    r'%s\s*=\s*"([^"]+)"' % x],
+                    webpage, x, *args, **kwargs)
+            return flash_var
+
+        hd_doc = self._download_xml(
+            'http://www.laola1.tv/server/hd_video.php', video_id, query={
+                'play': get_flashvar('streamid'),
+                'partner': get_flashvar('partnerid'),
+                'portal': get_flashvar('portalid'),
+                'lang': get_flashvar('sprache'),
+                'v5ident': '',
+            })
+
+        _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k)
+        title = _v('title', fatal=True)
+
+        token_url = None
+        premium = get_flashvar('premium', default=None)
+        if premium:
+            token_url = update_url_query(
+                _v('url', fatal=True), {
+                    'timestamp': get_flashvar('timestamp'),
+                    'auth': get_flashvar('auth'),
+                })
+        else:
+            data_abo = urlencode_postdata(
+                dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))
+            stream_access_url = update_url_query(
+                'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', {
+                    'videoId': _v('id'),
+                    'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'),
+                    'label': _v('label'),
+                    'area': _v('area'),
+                })
+            token_url = self._extract_token_url(stream_access_url, video_id, data_abo)
+
+        formats = self._extract_formats(token_url, video_id)
+
+        categories_str = _v('meta_sports')
+        categories = categories_str.split(',') if categories_str else []
+        is_live = _v('islive') == 'true'
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'upload_date': unified_strdate(_v('time_date')),
+            'uploader': _v('meta_organisation'),
+            'categories': categories,
+            'is_live': is_live,
+            'formats': formats,
+        }
+
+
+class Laola1TvBaseIE(Laola1TvEmbedIE):
+    def _extract_video(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        if 'Dieser Livestream ist bereits beendet.' in webpage:
+            raise ExtractorError('This live stream has already finished.', expected=True)
+
+        conf = self._parse_json(self._search_regex(
+            r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'),
+            display_id,
+            transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s)))
+        video_id = conf['videoid']
+
+        config = self._download_json(conf['configUrl'], video_id, query={
+            'videoid': video_id,
+            'partnerid': conf['partnerid'],
+            'language': conf.get('language', ''),
+            'portal': conf.get('portalid', ''),
+        })
+        error = config.get('error')
+        if error:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        video_data = config['video']
+        title = video_data['title']
+        is_live = video_data.get('isLivestream') and video_data.get('isLive')
+        meta = video_data.get('metaInformation')
+        sports = meta.get('sports')
+        categories = sports.split(',') if sports else []
+
+        token_url = self._extract_token_url(
+            video_data['streamAccess'], video_id,
+            video_data['abo']['required'])
+
+        formats = self._extract_formats(token_url, video_id)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('image'),
+            'categories': categories,
+            'formats': formats,
+            'is_live': is_live,
+        }
+
+
+class Laola1TvIE(Laola1TvBaseIE):
+    IE_NAME = 'laola1tv'
+    _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html',
+        'info_dict': {
+            'id': '227883',
+            'display_id': 'straubing-tigers-koelner-haie',
+            'ext': 'flv',
+            'title': 'Straubing Tigers - Kölner Haie',
+            'upload_date': '20140912',
+            'is_live': False,
+            'categories': ['Eishockey'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie',
+        'info_dict': {
+            'id': '464602',
+            'display_id': 'straubing-tigers-koelner-haie',
+            'ext': 'flv',
+            'title': 'Straubing Tigers - Kölner Haie',
+            'upload_date': '20160129',
+            'is_live': False,
+            'categories': ['Eishockey'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde',
+        'info_dict': {
+            'id': '487850',
+            'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde',
+            'ext': 'flv',
+            'title': 'Belogorie BELGOROD - TRENTINO Diatec',
+            'upload_date': '20160322',
+            'uploader': 'CEV - Europäischer Volleyball Verband',
+            'is_live': True,
+            'categories': ['Volleyball'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'This live stream has already finished.',
+    }]
+
+    def _real_extract(self, url):
+        return self._extract_video(url)
+
+
+class EHFTVIE(Laola1TvBaseIE):
+    IE_NAME = 'ehftv'
+    _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761',
+        'info_dict': {
+            'id': '1166761',
+            'display_id': 'paris-saint-germain-handball-pge-vive-kielce',
+            'ext': 'mp4',
+            'title': 'Paris Saint-Germain Handball - PGE Vive Kielce',
+            'is_live': False,
+            'categories': ['Handball'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        return self._extract_video(url)
+
+
+class ITTFIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        return self.url_result(
+            update_url_query('https://www.laola1.tv/titanplayer.php', {
+                'videoid': self._match_id(url),
+                'type': 'V',
+                'lang': 'en',
+                'portal': 'int',
+                'customer': 1024,
+            }), Laola1TvEmbedIE.ie_key())
diff --git a/youtube_dl/extractor/lci.py b/youtube_dl/extractor/lci.py
new file mode 100644 (file)
index 0000000..920872f
--- /dev/null
@@ -0,0 +1,26 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LCIIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lci\.fr/[^/]+/[\w-]+-(?P<id>\d+)\.html'
+    _TEST = {
+        'url': 'http://www.lci.fr/international/etats-unis-a-j-62-hillary-clinton-reste-sans-voix-2001679.html',
+        'md5': '2fdb2538b884d4d695f9bd2bde137e6c',
+        'info_dict': {
+            'id': '13244802',
+            'ext': 'mp4',
+            'title': 'Hillary Clinton et sa quinte de toux, en plein meeting',
+            'description': 'md5:a4363e3a960860132f8124b62f4a01c9',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        wat_id = self._search_regex(
+            (r'data-watid=[\'"](\d+)', r'idwat["\']?\s*:\s*["\']?(\d+)'),
+            webpage, 'wat id')
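+        # hand over to the Wat extractor via its internal 'wat:<id>' URL scheme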
+        return self.url_result('wat:' + wat_id, 'Wat', wat_id)
diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py
new file mode 100644 (file)
index 0000000..ade27a9
--- /dev/null
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .arkena import ArkenaIE
+
+
+class LcpPlayIE(ArkenaIE):
+    _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+'
+    _TESTS = [{
+        'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0',
+        'md5': 'b8bd9298542929c06c1c15788b1f277a',
+        'info_dict': {
+            'id': '327336',
+            'ext': 'mp4',
+            'title': '327336',
+            'timestamp': 1456391602,
+            'upload_date': '20160225',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+
+class LcpIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P<id>[^/]+)'
+
+    _TESTS = [{
+        # arkena embed
+        'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
+        'md5': 'b8bd9298542929c06c1c15788b1f277a',
+        'info_dict': {
+            'id': 'd56d03e9',
+            'ext': 'mp4',
+            'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche',
+            'description': 'md5:96ad55009548da9dea19f4120c6c16a8',
+            'timestamp': 1456488895,
+            'upload_date': '20160226',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # dailymotion live stream
+        'url': 'http://www.lcp.fr/le-direct',
+        'info_dict': {
+            'id': 'xji3qy',
+            'ext': 'mp4',
+            'title': 'La Chaine Parlementaire (LCP), Live TNT',
+            'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b',
+            'uploader': 'LCP',
+            'uploader_id': 'xbz33d',
+            'timestamp': 1308923058,
+            'upload_date': '20110624',
+        },
+        'params': {
+            # m3u8 live stream
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.lcp.fr/emissions/277792-les-volontaires',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        play_url = self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL,
+            webpage, 'play iframe', default=None, group='url')
+
+        if not play_url:
+            return self.url_result(url, 'Generic')
+
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, fatal=True)
+        description = self._html_search_meta(
+            ('description', 'twitter:description'), webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': LcpPlayIE.ie_key(),
+            'url': play_url,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py
new file mode 100644 (file)
index 0000000..81b5d41
--- /dev/null
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    determine_protocol,
+    parse_duration,
+    int_or_none,
+)
+
+
+class Lecture2GoIE(InfoExtractor):
+    _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473',
+        'md5': 'ac02b570883020d208d405d5a3fd2f7f',
+        'info_dict': {
+            'id': '17473',
+            'ext': 'mp4',
+            'title': '2 - Endliche Automaten und reguläre Sprachen',
+            'creator': 'Frank Heitmann',
+            'duration': 5220,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(r'<em[^>]+class="title">(.+)</em>', webpage, 'title')
+
+        formats = []
+        for video_url in set(re.findall(r'var\s+playerUri\d+\s*=\s*"([^"]+)"', webpage)):
+            ext = determine_ext(video_url)
+            protocol = determine_protocol({'url': video_url})
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds'))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls'))
+            else:
+                if protocol == 'rtmp':
+                    continue  # XXX: currently broken
+                formats.append({
+                    'format_id': protocol,
+                    'url': video_url,
+                })
+
+        self._sort_formats(formats)
+
+        creator = self._html_search_regex(
+            r'<div[^>]+id="description">([^<]+)</div>', webpage, 'creator', fatal=False)
+        duration = parse_duration(self._html_search_regex(
+            r'Duration:\s*</em>\s*<em[^>]*>([^<]+)</em>', webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'Views:\s*</em>\s*<em[^>]+>(\d+)</em>', webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'creator': creator,
+            'duration': duration,
+            'view_count': view_count,
+        }
diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py
new file mode 100644 (file)
index 0000000..1b2dcef
--- /dev/null
@@ -0,0 +1,243 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    str_or_none,
+    url_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class LecturioBaseIE(InfoExtractor):
+    _API_BASE_URL = 'https://app.lecturio.com/api/en/latest/html5/'
+    _LOGIN_URL = 'https://app.lecturio.com/en/login'
+    _NETRC_MACHINE = 'lecturio'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        # Sets some cookies
+        _, urlh = self._download_webpage_handle(
+            self._LOGIN_URL, None, 'Downloading login popup')
+
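+        # A successful login redirects away from the login page, so checking
+        # the final URL of the response reveals whether a session is active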
+        def is_logged(url_handle):
+            return self._LOGIN_URL not in url_handle.geturl()
+
+        # Already logged in
+        if is_logged(urlh):
+            return
+
+        login_form = {
+            'signin[email]': username,
+            'signin[password]': password,
+            'signin[remember]': 'on',
+        }
+
+        response, urlh = self._download_webpage_handle(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata(login_form))
+
+        # Logged in successfully
+        if is_logged(urlh):
+            return
+
+        errors = self._html_search_regex(
+            r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response,
+            'errors', default=None)
+        if errors:
+            raise ExtractorError('Unable to login: %s' % errors, expected=True)
+        raise ExtractorError('Unable to log in')
+
+
+class LecturioIE(LecturioBaseIE):
+    _VALID_URL = r'''(?x)
+                    https://
+                        (?:
+                            app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))|
+                            (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos',
+        'md5': '9a42cf1d8282a6311bf7211bbde26fde',
+        'info_dict': {
+            'id': '39634',
+            'ext': 'mp4',
+            'title': 'Important Concepts and Terms — Introduction to Microbiology',
+        },
+        'skip': 'Requires lecturio account credentials',
+    }, {
+        'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag',
+        'only_matching': True,
+    }, {
+        'url': 'https://app.lecturio.com/#/lecture/c/6434/39634',
+        'only_matching': True,
+    }]
+
+    _CC_LANGS = {
+        'Arabic': 'ar',
+        'Bulgarian': 'bg',
+        'German': 'de',
+        'English': 'en',
+        'Spanish': 'es',
+        'Persian': 'fa',
+        'French': 'fr',
+        'Japanese': 'ja',
+        'Polish': 'pl',
+        'Pashto': 'ps',
+        'Russian': 'ru',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        nt = mobj.group('nt') or mobj.group('nt_de')
+        lecture_id = mobj.group('id')
+        display_id = nt or lecture_id
+        api_path = 'lectures/' + lecture_id if lecture_id else 'lecture/' + nt + '.json'
+        video = self._download_json(
+            self._API_BASE_URL + api_path, display_id)
+        title = video['title'].strip()
+        if not lecture_id:
+            pid = video.get('productId') or video.get('uid')
+            if pid:
+                spid = pid.split('_')
+                if spid and len(spid) == 2:
+                    lecture_id = spid[1]
+
+        formats = []
+        for format_ in video['content']['media']:
+            if not isinstance(format_, dict):
+                continue
+            file_ = format_.get('file')
+            if not file_:
+                continue
+            ext = determine_ext(file_)
+            if ext == 'smil':
+                # smil contains only broken RTMP formats anyway
+                continue
+            file_url = url_or_none(file_)
+            if not file_url:
+                continue
+            label = str_or_none(format_.get('label'))
+            filesize = int_or_none(format_.get('fileSize'))
+            f = {
+                'url': file_url,
+                'format_id': label,
+                'filesize': float_or_none(filesize, invscale=1000)
+            }
+            if label:
+                mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label)
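+                # e.g. a label of the form '720p (hd)' yields height 720 and
+                # format_id 'hd'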
+                if mobj:
+                    f.update({
+                        'format_id': mobj.group(2),
+                        'height': int(mobj.group(1)),
+                    })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        automatic_captions = {}
+        captions = video.get('captions') or []
+        for cc in captions:
+            cc_url = cc.get('url')
+            if not cc_url:
+                continue
+            cc_label = cc.get('translatedCode')
+            lang = cc.get('languageCode') or self._search_regex(
+                r'/([a-z]{2})_', cc_url, 'lang',
+                default=cc_label.split()[0] if cc_label else 'en')
+            original_lang = self._search_regex(
+                r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang',
+                default=None)
+            sub_dict = (automatic_captions
+                        if (cc_label and 'auto-translated' in cc_label) or original_lang
+                        else subtitles)
+            sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({
+                'url': cc_url,
+            })
+
+        return {
+            'id': lecture_id or nt,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'automatic_captions': automatic_captions,
+        }
+
+
+class LecturioCourseIE(LecturioBaseIE):
+    _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P<nt>[^/?#&]+)\.course|(?:#/)?course/c/(?P<id>\d+))'
+    _TESTS = [{
+        'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/',
+        'info_dict': {
+            'id': 'microbiology-introduction',
+            'title': 'Microbiology: Introduction',
+            'description': 'md5:13da8500c25880c6016ae1e6d78c386a',
+        },
+        'playlist_count': 45,
+        'skip': 'Requires lecturio account credentials',
+    }, {
+        'url': 'https://app.lecturio.com/#/course/c/6434',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        nt, course_id = re.match(self._VALID_URL, url).groups()
+        display_id = nt or course_id
+        api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json'
+        course = self._download_json(
+            self._API_BASE_URL + api_path, display_id)
+        entries = []
+        for lecture in course.get('lectures', []):
+            lecture_id = str_or_none(lecture.get('id'))
+            lecture_url = lecture.get('url')
+            if lecture_url:
+                lecture_url = urljoin(url, lecture_url)
+            else:
+                lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id)
+            entries.append(self.url_result(
+                lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))
+        return self.playlist_result(
+            entries, display_id, course.get('title'),
+            clean_html(course.get('description')))
+
+
+class LecturioDeCourseIE(LecturioBaseIE):
+    _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs'
+    _TEST = {
+        'url': 'https://www.lecturio.de/jura/grundrechte.kurs',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        entries = []
+        for mobj in re.finditer(
+                r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>',
+                webpage):
+            lecture_url = urljoin(url, mobj.group('url'))
+            lecture_id = mobj.group('id')
+            entries.append(self.url_result(
+                lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))
+
+        title = self._search_regex(
+            r'<h1[^>]*>([^<]+)', webpage, 'title', default=None)
+
+        return self.playlist_result(entries, display_id, title)
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
new file mode 100644 (file)
index 0000000..7dc0ad7
--- /dev/null
@@ -0,0 +1,368 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+import hashlib
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_ord,
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    determine_ext,
+    encode_data_uri,
+    ExtractorError,
+    int_or_none,
+    orderedSet,
+    parse_iso8601,
+    str_or_none,
+    url_basename,
+    urshift,
+)
+
+
+class LeIE(InfoExtractor):
+    IE_DESC = '乐视网'
+    _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|(?:sports\.le|(?:www\.)?lesports)\.com/(?:match|video))/(?P<id>\d+)\.html'
+    _GEO_COUNTRIES = ['CN']
+    _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
+
+    _TESTS = [{
+        'url': 'http://www.le.com/ptv/vplay/22005890.html',
+        'md5': 'edadcfe5406976f42f9f266057ee5e40',
+        'info_dict': {
+            'id': '22005890',
+            'ext': 'mp4',
+            'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家',
+            'description': 'md5:a9cb175fd753e2962176b7beca21a47c',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'url': 'http://www.le.com/ptv/vplay/1415246.html',
+        'info_dict': {
+            'id': '1415246',
+            'ext': 'mp4',
+            'title': '美人天下01',
+            'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'note': 'This video is available only in Mainland China, thus a proxy is needed',
+        'url': 'http://www.le.com/ptv/vplay/1118082.html',
+        'md5': '2424c74948a62e5f31988438979c5ad1',
+        'info_dict': {
+            'id': '1118082',
+            'ext': 'mp4',
+            'title': '与龙共舞 完整版',
+            'description': 'md5:7506a5eeb1722bb9d4068f85024e3986',
+        },
+        'params': {
+            'hls_prefer_native': True,
+        },
+    }, {
+        'url': 'http://sports.le.com/video/25737697.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.lesports.com/match/1023203003.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://sports.le.com/match/1023203003.html',
+        'only_matching': True,
+    }]
+
+    # ror() and calc_time_key() are reverse-engineered from an embedded SWF file, LetvPlayer.swf
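+    # ror(x, n) performs a 32-bit rotate-right of x by n bits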
+    def ror(self, param1, param2):
+        _loc3_ = 0
+        while _loc3_ < param2:
+            param1 = urshift(param1, 1) + ((param1 & 1) << 31)
+            _loc3_ += 1
+        return param1
+
+    def calc_time_key(self, param1):
+        _loc2_ = 185025305
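+        # rotate the timestamp right by 185025305 % 17 == 8 bits,
+        # then XOR it with the constant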
+        return self.ror(param1, _loc2_ % 17) ^ _loc2_
+
+    # see M3U8Encryption class in KLetvPlayer.swf
+    @staticmethod
+    def decrypt_m3u8(encrypted_data):
+        if encrypted_data[:5].decode('utf-8').lower() != 'vc_01':
+            return encrypted_data
+        encrypted_data = encrypted_data[5:]
+
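+        # descramble: split every byte into two nibbles, move the last 11
+        # nibbles of the sequence to the front, then reassemble bytes from
+        # consecutive nibble pairs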
+        _loc4_ = bytearray(2 * len(encrypted_data))
+        for idx, val in enumerate(encrypted_data):
+            b = compat_ord(val)
+            _loc4_[2 * idx] = b // 16
+            _loc4_[2 * idx + 1] = b % 16
+        idx = len(_loc4_) - 11
+        _loc4_ = _loc4_[idx:] + _loc4_[:idx]
+        _loc7_ = bytearray(len(encrypted_data))
+        for i in range(len(encrypted_data)):
+            _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
+
+        return bytes(_loc7_)
+
+    def _check_errors(self, play_json):
+        # Check for errors
+        playstatus = play_json['msgs']['playstatus']
+        if playstatus['status'] == 0:
+            flag = playstatus['flag']
+            if flag == 1:
+                self.raise_geo_restricted()
+            else:
+                raise ExtractorError('Generic error. flag = %d' % flag, expected=True)
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        page = self._download_webpage(url, media_id)
+
+        play_json_flash = self._download_json(
+            'http://player-pc.le.com/mms/out/video/playJson',
+            media_id, 'Downloading flash playJson data', query={
+                'id': media_id,
+                'platid': 1,
+                'splatid': 105,
+                'format': 1,
+                'source': 1000,
+                'tkey': self.calc_time_key(int(time.time())),
+                'domain': 'www.le.com',
+                'region': 'cn',
+            },
+            headers=self.geo_verification_headers())
+        self._check_errors(play_json_flash)
+
+        def get_flash_urls(media_url, format_id):
+            nodes_data = self._download_json(
+                media_url, media_id,
+                'Download JSON metadata for format %s' % format_id,
+                query={
+                    'm3v': 1,
+                    'format': 1,
+                    'expect': 3,
+                    'tss': 'ios',
+                })
+
+            req = self._request_webpage(
+                nodes_data['nodelist'][0]['location'], media_id,
+                note='Downloading m3u8 information for format %s' % format_id)
+
+            m3u8_data = self.decrypt_m3u8(req.read())
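+            # expose the decrypted playlist as a data: URI so the native HLS
+            # downloader can consume it without re-fetching the scrambled file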
+
+            return {
+                'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
+            }
+
+        extracted_formats = []
+        formats = []
+        playurl = play_json_flash['msgs']['playurl']
+        play_domain = playurl['domain'][0]
+
+        for format_id, format_data in playurl.get('dispatch', {}).items():
+            if format_id in extracted_formats:
+                continue
+            extracted_formats.append(format_id)
+
+            media_url = play_domain + format_data[0]
+            for protocol, format_url in get_flash_urls(media_url, format_id).items():
+                f = {
+                    'url': format_url,
+                    'ext': determine_ext(format_data[1]),
+                    'format_id': '%s-%s' % (protocol, format_id),
+                    'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+                    'quality': int_or_none(format_id),
+                }
+
+                if format_id[-1:] == 'p':
+                    f['height'] = int_or_none(format_id[:-1])
+
+                formats.append(f)
+        self._sort_formats(formats, ('height', 'quality', 'format_id'))
+
+        publish_time = parse_iso8601(self._html_search_regex(
+            r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
+            delimiter=' ', timezone=datetime.timedelta(hours=8))
+        description = self._html_search_meta('description', page, fatal=False)
+
+        return {
+            'id': media_id,
+            'formats': formats,
+            'title': playurl['title'],
+            'thumbnail': playurl['pic'],
+            'description': description,
+            'timestamp': publish_time,
+        }
+
+
+class LePlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)'
+
+    _TESTS = [{
+        'url': 'http://www.le.com/tv/46177.html',
+        'info_dict': {
+            'id': '46177',
+            'title': '美人天下',
+            'description': 'md5:395666ff41b44080396e59570dbac01c'
+        },
+        'playlist_count': 35
+    }, {
+        'url': 'http://tv.le.com/izt/wuzetian/index.html',
+        'info_dict': {
+            'id': 'wuzetian',
+            'title': '武媚娘传奇',
+            'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
+        },
+        # This playlist contains some extra videos other than the drama itself
+        'playlist_mincount': 96
+    }, {
+        'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
+        # This series is moved to http://www.le.com/tv/10005297.html
+        'only_matching': True,
+    }, {
+        'url': 'http://www.le.com/comic/92063.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
+        'only_matching': True,
+    }]
+
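+    # individual video pages are left to LeIE; everything else matching the
+    # pattern is treated as a playlist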
+    @classmethod
+    def suitable(cls, url):
+        return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        page = self._download_webpage(url, playlist_id)
+
+        # Currently old domain names are still used in playlists
+        media_ids = orderedSet(re.findall(
+            r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
+        entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
+                   for media_id in media_ids]
+
+        keywords = self._html_search_meta('keywords', page, fatal=False)
+        title = keywords.split(',')[0] if keywords else None
+        description = self._html_search_meta('description', page, fatal=False)
+
+        return self.playlist_result(entries, playlist_id, playlist_title=title,
+                                    playlist_description=description)
+
+
+class LetvCloudIE(InfoExtractor):
+    # Most of *.letv.com was changed to *.le.com on 2016/01/02,
+    # but yuntv.letv.com was kept, so the extractor name is kept as well
+    IE_DESC = '乐视云'
+    _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
+
+    _TESTS = [{
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=467623dedf',
+        'md5': '26450599afd64c513bc77030ad15db44',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_467623dedf',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_467623dedf',
+        },
+    }, {
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=ec93197892&pu=2c7cd40209&auto_play=1&gpcflag=1&width=640&height=360',
+        'md5': 'e03d9cc8d9c13191e1caf277e42dbd31',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_ec93197892',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_ec93197892',
+        },
+    }, {
+        'url': 'http://yuntv.letv.com/bcloud.html?uu=p7jnfw5hw9&vu=187060b6fd',
+        'md5': 'cb988699a776b22d4a41b9d43acfb3ac',
+        'info_dict': {
+            'id': 'p7jnfw5hw9_187060b6fd',
+            'ext': 'mp4',
+            'title': 'Video p7jnfw5hw9_187060b6fd',
+        },
+    }]
+
+    @staticmethod
+    def sign_data(obj):
+        if obj['cf'] == 'flash':
+            salt = '2f9d6924b33a165a6d8b5d3d42f4f987'
+            items = ['cf', 'format', 'ran', 'uu', 'ver', 'vu']
+        elif obj['cf'] == 'html5':
+            salt = 'fbeh5player12c43eccf2bec3300344'
+            items = ['cf', 'ran', 'uu', 'bver', 'vu']
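+        # the signature is md5(key1 + value1 + key2 + value2 + ... + salt),
+        # with keys taken in the fixed order given above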
+        input_data = ''.join([item + obj[item] for item in items]) + salt
+        obj['sign'] = hashlib.md5(input_data.encode('utf-8')).hexdigest()
+
+    def _get_formats(self, cf, uu, vu, media_id):
+        def get_play_json(cf, timestamp):
+            data = {
+                'cf': cf,
+                'ver': '2.2',
+                'bver': 'firefox44.0',
+                'format': 'json',
+                'uu': uu,
+                'vu': vu,
+                'ran': compat_str(timestamp),
+            }
+            self.sign_data(data)
+            return self._download_json(
+                'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data),
+                media_id, 'Downloading playJson data for type %s' % cf)
+
+        play_json = get_play_json(cf, time.time())
+        # The server time may be different from local time
+        if play_json.get('code') == 10071:
+            play_json = get_play_json(cf, play_json['timestamp'])
+
+        if not play_json.get('data'):
+            if play_json.get('message'):
+                raise ExtractorError('Letv cloud said: %s' % play_json['message'], expected=True)
+            elif play_json.get('code'):
+                raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True)
+            else:
+                raise ExtractorError('Letv cloud returned an unknown error')
+
+        def b64decode(s):
+            return compat_b64decode(s).decode('utf-8')
+
+        formats = []
+        for media in play_json['data']['video_info']['media'].values():
+            play_url = media['play_url']
+            url = b64decode(play_url['main_url'])
+            decoded_url = b64decode(url_basename(url))
+            formats.append({
+                'url': url,
+                'ext': determine_ext(decoded_url),
+                'format_id': str_or_none(play_url.get('vtype')),
+                'format_note': str_or_none(play_url.get('definition')),
+                'width': int_or_none(play_url.get('vwidth')),
+                'height': int_or_none(play_url.get('vheight')),
+            })
+
+        return formats
+
+    def _real_extract(self, url):
+        uu_mobj = re.search(r'uu=([\w]+)', url)
+        vu_mobj = re.search(r'vu=([\w]+)', url)
+
+        if not uu_mobj or not vu_mobj:
+            raise ExtractorError('Invalid URL: %s' % url, expected=True)
+
+        uu = uu_mobj.group(1)
+        vu = vu_mobj.group(1)
+        media_id = uu + '_' + vu
+
+        formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': media_id,
+            'title': 'Video %s' % media_id,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py
new file mode 100644 (file)
index 0000000..1e3c19d
--- /dev/null
@@ -0,0 +1,149 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import uuid
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+)
+
+
+class LEGOIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lego\.com/(?P<locale>[a-z]{2}-[a-z]{2})/(?:[^/]+/)*videos/(?:[^/]+/)*[^/?#]+-(?P<id>[0-9a-f]{32})'
+    _TESTS = [{
+        'url': 'http://www.lego.com/en-us/videos/themes/club/blocumentary-kawaguchi-55492d823b1b4d5e985787fa8c2973b1',
+        'md5': 'f34468f176cfd76488767fc162c405fa',
+        'info_dict': {
+            'id': '55492d82-3b1b-4d5e-9857-87fa8c2973b1_en-US',
+            'ext': 'mp4',
+            'title': 'Blocumentary Great Creations: Akiyuki Kawaguchi',
+            'description': 'Blocumentary Great Creations: Akiyuki Kawaguchi',
+        },
+    }, {
+        # geo-restricted but the contentUrl contain a valid url
+        'url': 'http://www.lego.com/nl-nl/videos/themes/nexoknights/episode-20-kingdom-of-heroes-13bdc2299ab24d9685701a915b3d71e7##sp=399',
+        'md5': 'c7420221f7ffd03ff056f9db7f8d807c',
+        'info_dict': {
+            'id': '13bdc229-9ab2-4d96-8570-1a915b3d71e7_nl-NL',
+            'ext': 'mp4',
+            'title': 'Aflevering 20:  Helden van het koninkrijk',
+            'description': 'md5:8ee499aac26d7fa8bcb0cedb7f9c3941',
+            'age_limit': 5,
+        },
+    }, {
+        # with subtitle
+        'url': 'https://www.lego.com/nl-nl/kids/videos/classic/creative-storytelling-the-little-puppy-aa24f27c7d5242bc86102ebdc0f24cba',
+        'info_dict': {
+            'id': 'aa24f27c-7d52-42bc-8610-2ebdc0f24cba_nl-NL',
+            'ext': 'mp4',
+            'title': 'De kleine puppy',
+            'description': 'md5:5b725471f849348ac73f2e12cfb4be06',
+            'age_limit': 1,
+            'subtitles': {
+                'nl': [{
+                    'ext': 'srt',
+                    'url': r're:^https://.+\.srt$',
+                }],
+            },
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
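+    # quality name -> (audio bitrate in kbps, height, width); see the
+    # abr/height/width assignment below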
+    _QUALITIES = {
+        'Lowest': (64, 180, 320),
+        'Low': (64, 270, 480),
+        'Medium': (96, 360, 640),
+        'High': (128, 540, 960),
+        'Highest': (128, 720, 1280),
+    }
+
+    def _real_extract(self, url):
+        locale, video_id = re.match(self._VALID_URL, url).groups()
+        countries = [locale.split('-')[1].upper()]
+        self._initialize_geo_bypass({
+            'countries': countries,
+        })
+
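+        # the endpoint answers HTTP 451 (Unavailable For Legal Reasons) when
+        # the video is geo-restricted in the requested country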
+        try:
+            item = self._download_json(
+                # https://contentfeed.services.lego.com/api/v2/item/[VIDEO_ID]?culture=[LOCALE]&contentType=Video
+                'https://services.slingshot.lego.com/mediaplayer/v2',
+                video_id, query={
+                    'videoId': '%s_%s' % (uuid.UUID(video_id), locale),
+                }, headers=self.geo_verification_headers())
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 451:
+                self.raise_geo_restricted(countries=countries)
+            raise
+
+        video = item['Video']
+        video_id = video['Id']
+        title = video['Title']
+
+        q = qualities(['Lowest', 'Low', 'Medium', 'High', 'Highest'])
+        formats = []
+        for video_source in item.get('VideoFormats', []):
+            video_source_url = video_source.get('Url')
+            if not video_source_url:
+                continue
+            video_source_format = video_source.get('Format')
+            if video_source_format == 'F4M':
+                formats.extend(self._extract_f4m_formats(
+                    video_source_url, video_id,
+                    f4m_id=video_source_format, fatal=False))
+            elif video_source_format == 'M3U8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_source_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=video_source_format, fatal=False))
+            else:
+                video_source_quality = video_source.get('Quality')
+                format_id = []
+                for v in (video_source_format, video_source_quality):
+                    if v:
+                        format_id.append(v)
+                f = {
+                    'format_id': '-'.join(format_id),
+                    'quality': q(video_source_quality),
+                    'url': video_source_url,
+                }
+                quality = self._QUALITIES.get(video_source_quality)
+                if quality:
+                    f.update({
+                        'abr': quality[0],
+                        'height': quality[1],
+                        'width': quality[2],
+                    })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        sub_file_id = video.get('SubFileId')
+        if sub_file_id and sub_file_id != '00000000-0000-0000-0000-000000000000':
+            net_storage_path = video.get('NetstoragePath')
+            invariant_id = video.get('InvariantId')
+            video_file_id = video.get('VideoFileId')
+            video_version = video.get('VideoVersion')
+            if net_storage_path and invariant_id and video_file_id and video_version:
+                subtitles.setdefault(locale[:2], []).append({
+                    'url': 'https://lc-mediaplayerns-live-s.legocdn.com/public/%s/%s_%s_%s_%s_sub.srt' % (net_storage_path, invariant_id, video_file_id, locale, video_version),
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('Description'),
+            'thumbnail': video.get('GeneratedCoverImage') or video.get('GeneratedThumbnail'),
+            'duration': int_or_none(video.get('Length')),
+            'formats': formats,
+            'subtitles': subtitles,
+            'age_limit': int_or_none(video.get('AgeFrom')),
+            'season': video.get('SeasonTitle'),
+            'season_number': int_or_none(video.get('Season')) or None,
+            'episode_number': int_or_none(video.get('Episode')) or None,
+        }
diff --git a/youtube_dl/extractor/lemonde.py b/youtube_dl/extractor/lemonde.py
new file mode 100644 (file)
index 0000000..3306892
--- /dev/null
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LemondeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?lemonde\.fr/(?:[^/]+/)*(?P<id>[^/]+)\.html'
+    _TESTS = [{
+        'url': 'http://www.lemonde.fr/police-justice/video/2016/01/19/comprendre-l-affaire-bygmalion-en-cinq-minutes_4849702_1653578.html',
+        'md5': 'da120c8722d8632eec6ced937536cc98',
+        'info_dict': {
+            'id': 'lqm3kl',
+            'ext': 'mp4',
+            'title': "Comprendre l'affaire Bygmalion en 5 minutes",
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 309,
+            'upload_date': '20160119',
+            'timestamp': 1453194778,
+            'uploader_id': '3pmkp',
+        },
+    }, {
+        # standard iframe embed
+        'url': 'http://www.lemonde.fr/les-decodeurs/article/2016/10/18/tout-comprendre-du-ceta-le-petit-cousin-du-traite-transatlantique_5015920_4355770.html',
+        'info_dict': {
+            'id': 'uzsxms',
+            'ext': 'mp4',
+            'title': "CETA : quelles suites pour l'accord commercial entre l'Europe et le Canada ?",
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 325,
+            'upload_date': '20161021',
+            'timestamp': 1477044540,
+            'uploader_id': '3pmkp',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://redaction.actu.lemonde.fr/societe/video/2016/01/18/calais-debut-des-travaux-de-defrichement-dans-la-jungle_4849233_3224.html',
+        'only_matching': True,
+    }, {
+        # YouTube embeds
+        'url': 'http://www.lemonde.fr/pixels/article/2016/12/09/pourquoi-pewdiepie-superstar-de-youtube-a-menace-de-fermer-sa-chaine_5046649_4408996.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        digiteka_url = self._proto_relative_url(self._search_regex(
+            r'url\s*:\s*(["\'])(?P<url>(?:https?://)?//(?:www\.)?(?:digiteka\.net|ultimedia\.com)/deliver/.+?)\1',
+            webpage, 'digiteka url', group='url', default=None))
+
+        if digiteka_url:
+            return self.url_result(digiteka_url, 'Digiteka')
+
+        return self.url_result(url, 'Generic')
diff --git a/youtube_dl/extractor/lenta.py b/youtube_dl/extractor/lenta.py
new file mode 100644 (file)
index 0000000..2ebd4e5
--- /dev/null
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LentaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/',
+        'info_dict': {
+            'id': '964400',
+            'ext': 'mp4',
+            'title': 'Надежду Савченко задержали',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 61,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # EaglePlatform iframe embed
+        'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+        'info_dict': {
+            'id': '227304',
+            'ext': 'mp4',
+            'title': 'Навальный вышел на свободу',
+            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 87,
+            'view_count': int,
+            'age_limit': 0,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
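+        # Lenta.ru serves its own videos through EaglePlatform; the numeric
+        # id is exposed as `vid` in the embedded player config.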
+        video_id = self._search_regex(
+            r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id',
+            default=None)
+        if video_id:
+            return self.url_result(
+                'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id,
+                ie='EaglePlatform', video_id=video_id)
+
+        return self.url_result(url, ie='Generic')
diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py
new file mode 100644 (file)
index 0000000..03f2051
--- /dev/null
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    parse_filesize,
+)
+
+
+class LibraryOfCongressIE(InfoExtractor):
+    IE_NAME = 'loc'
+    IE_DESC = 'Library of Congress'
+    _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)'
+    _TESTS = [{
+        # embedded via <div class="media-player"
+        'url': 'http://loc.gov/item/90716351/',
+        'md5': '6ec0ae8f07f86731b1b2ff70f046210a',
+        'info_dict': {
+            'id': '90716351',
+            'ext': 'mp4',
+            'title': "Pa's trip to Mars",
+            'duration': 0,
+            'view_count': int,
+        },
+    }, {
+        # webcast embedded via mediaObjectId
+        'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
+        'info_dict': {
+            'id': '5578',
+            'ext': 'mp4',
+            'title': 'Help! Preservation Training Needs Here, There & Everywhere',
+            'duration': 3765,
+            'view_count': int,
+            'subtitles': 'mincount:1',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # with direct download links
+        'url': 'https://www.loc.gov/item/78710669/',
+        'info_dict': {
+            'id': '78710669',
+            'ext': 'mp4',
+            'title': 'La vie et la passion de Jesus-Christ',
+            'duration': 0,
+            'view_count': int,
+            'formats': 'mincount:4',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.loc.gov/item/ihas.200197114/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.loc.gov/item/afc1981005_afs20503/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        media_id = self._search_regex(
+            (r'id=(["\'])media-player-(?P<id>.+?)\1',
+             r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
+             r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
+             r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1',
+             r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'),
+            webpage, 'media id', group='id')
+
+        data = self._download_json(
+            'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
+            media_id)['mediaObject']
+
+        derivative = data['derivatives'][0]
+        media_url = derivative['derivativeUrl']
+
+        title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
+            webpage)
+
+        # The following algorithm was extracted from the setAVSource js
+        # function found in the webpage
+        media_url = media_url.replace('rtmp', 'https')
+
+        is_video = data.get('mediaType', 'v').lower() == 'v'
+        ext = determine_ext(media_url)
+        if ext not in ('mp4', 'mp3'):
+            media_url += '.mp4' if is_video else '.mp3'
+
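+        # Streams under "/vod/mp4:" also have an HLS rendition at
+        # "/hls-vod/media/..."; a progressive URL is obtained by dropping
+        # the "mp4:"/"mp3:" application prefix from the path.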
+        formats = []
+        if '/vod/mp4:' in media_url:
+            formats.append({
+                'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8',
+                'format_id': 'hls',
+                'ext': 'mp4',
+                'protocol': 'm3u8_native',
+                'quality': 1,
+            })
+        http_format = {
+            'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url),
+            'format_id': 'http',
+            'quality': 1,
+        }
+        if not is_video:
+            http_format['vcodec'] = 'none'
+        formats.append(http_format)
+
+        download_urls = set()
+        for m in re.finditer(
+                r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<', webpage):
+            format_id = m.group('id').lower()
+            if format_id in ('gif', 'jpeg'):
+                continue
+            download_url = m.group('url')
+            if download_url in download_urls:
+                continue
+            download_urls.add(download_url)
+            formats.append({
+                'url': download_url,
+                'format_id': format_id,
+                'filesize_approx': parse_filesize(m.group('size')),
+            })
+
+        self._sort_formats(formats)
+
+        duration = float_or_none(data.get('duration'))
+        view_count = int_or_none(data.get('viewCount'))
+
+        subtitles = {}
+        cc_url = data.get('ccUrl')
+        if cc_url:
+            subtitles.setdefault('en', []).append({
+                'url': cc_url,
+                'ext': 'ttml',
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py
new file mode 100644 (file)
index 0000000..2cf4442
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    parse_duration,
+    strip_or_none,
+    unified_strdate,
+)
+
+
+class LibsynIE(InfoExtractor):
+    _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
+
+    _TESTS = [{
+        'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
+        'md5': '2a55e75496c790cdeb058e7e6c087746',
+        'info_dict': {
+            'id': '6385796',
+            'ext': 'mp3',
+            'title': "Champion Minded - Developing a Growth Mindset",
+            # description fetched using another request:
+            # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796
+            # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
+            'upload_date': '20180320',
+            'thumbnail': 're:^https?://.*',
+        },
+    }, {
+        'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/',
+        'md5': '6c5cb21acd622d754d3b1a92b582ce42',
+        'info_dict': {
+            'id': '3727166',
+            'ext': 'mp3',
+            'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career',
+            'upload_date': '20150818',
+            'thumbnail': 're:^https?://.*',
+        }
+    }]
+
+    def _real_extract(self, url):
+        url, video_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, video_id)
+
+        data = self._parse_json(self._search_regex(
+            r'var\s+playlistItem\s*=\s*({.+?});',
+            webpage, 'JSON data block'), video_id)
+
+        episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage)
+        if not episode_title:
+            episode_title = self._search_regex(
+                [r'data-title="([^"]+)"', r'<title>(.+?)</title>'],
+                webpage, 'episode title')
+        episode_title = episode_title.strip()
+
+        podcast_title = strip_or_none(clean_html(self._search_regex(
+            r'<h3>([^<]+)</h3>', webpage, 'podcast title',
+            default=None) or get_element_by_class('podcast-title', webpage)))
+
+        title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
+
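+        # The player config can expose the same episode under up to three
+        # URLs (libsyn CDN, main, download); keep whichever are present.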
+        formats = []
+        for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
+            f_url = data.get(k)
+            if not f_url:
+                continue
+            formats.append({
+                'url': f_url,
+                'format_id': format_id,
+            })
+
+        description = self._html_search_regex(
+            r'<p\s+id="info_text_body">(.+?)</p>', webpage,
+            'description', default=None)
+        if description:
+            # Strip non-breaking and normal spaces
+            description = description.replace('\u00A0', ' ').strip()
+        release_date = unified_strdate(self._search_regex(
+            r'<div class="release_date">Released: ([^<]+)<',
+            webpage, 'release date', default=None) or data.get('release_date'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': data.get('thumbnail_url'),
+            'upload_date': release_date,
+            'duration': parse_duration(data.get('duration')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
new file mode 100644 (file)
index 0000000..42e263b
--- /dev/null
@@ -0,0 +1,239 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    remove_end,
+)
+
+
+class LifeNewsIE(InfoExtractor):
+    IE_NAME = 'life'
+    IE_DESC = 'Life.ru'
+    _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)'
+
+    _TESTS = [{
+        # single video embedded via video/source
+        'url': 'https://life.ru/t/новости/98736',
+        'md5': '77c95eaefaca216e32a76a343ad89d23',
+        'info_dict': {
+            'id': '98736',
+            'ext': 'mp4',
+            'title': 'Мужчина нашел дома архив оборонного завода',
+            'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+            'timestamp': 1344154740,
+            'upload_date': '20120805',
+            'view_count': int,
+        }
+    }, {
+        # single video embedded via iframe
+        'url': 'https://life.ru/t/новости/152125',
+        'md5': '77d19a6f0886cd76bdbf44b4d971a273',
+        'info_dict': {
+            'id': '152125',
+            'ext': 'mp4',
+            'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
+            'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+            'timestamp': 1427961840,
+            'upload_date': '20150402',
+            'view_count': int,
+        }
+    }, {
+        # two videos embedded via iframe
+        'url': 'https://life.ru/t/новости/153461',
+        'info_dict': {
+            'id': '153461',
+            'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
+            'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+            'timestamp': 1430825520,
+            'view_count': int,
+        },
+        'playlist': [{
+            'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+            'info_dict': {
+                'id': '153461-video1',
+                'ext': 'mp4',
+                'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
+                'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+                'timestamp': 1430825520,
+                'upload_date': '20150505',
+            },
+        }, {
+            'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322',
+            'info_dict': {
+                'id': '153461-video2',
+                'ext': 'mp4',
+                'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
+                'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+                'timestamp': 1430825520,
+                'upload_date': '20150505',
+            },
+        }],
+    }, {
+        'url': 'https://life.ru/t/новости/213035',
+        'only_matching': True,
+    }, {
+        'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461',
+        'only_matching': True,
+    }, {
+        'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_urls = re.findall(
+            r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
+
+        iframe_links = re.findall(
+            r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']',
+            webpage)
+
+        if not video_urls and not iframe_links:
+            raise ExtractorError('No media links available for %s' % video_id)
+
+        title = remove_end(
+            self._og_search_title(webpage),
+            ' - Life.ru')
+
+        description = self._og_search_description(webpage)
+
+        view_count = self._html_search_regex(
+            r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>',
+            webpage, 'view count', fatal=False, group='value')
+
+        timestamp = parse_iso8601(self._search_regex(
+            r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1',
+            webpage, 'upload date', fatal=False, group='value'))
+
+        common_info = {
+            'description': description,
+            'view_count': int_or_none(view_count),
+            'timestamp': timestamp,
+        }
+
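+        # A page may contain several videos, embedded either directly via
+        # <video>/<source> or through embed.life.ru iframes; direct links
+        # become plain entries, while iframes are delegated via
+        # url_transparent so LifeEmbedIE resolves the actual formats.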
+        def make_entry(video_id, video_url, index=None):
+            cur_info = dict(common_info)
+            cur_info.update({
+                'id': video_id if not index else '%s-video%s' % (video_id, index),
+                'url': video_url,
+                'title': title if not index else '%s (Видео %s)' % (title, index),
+            })
+            return cur_info
+
+        def make_video_entry(video_id, video_url, index=None):
+            video_url = compat_urlparse.urljoin(url, video_url)
+            return make_entry(video_id, video_url, index)
+
+        def make_iframe_entry(video_id, video_url, index=None):
+            video_url = self._proto_relative_url(video_url, 'http:')
+            cur_info = make_entry(video_id, video_url, index)
+            cur_info['_type'] = 'url_transparent'
+            return cur_info
+
+        if len(video_urls) == 1 and not iframe_links:
+            return make_video_entry(video_id, video_urls[0])
+
+        if len(iframe_links) == 1 and not video_urls:
+            return make_iframe_entry(video_id, iframe_links[0])
+
+        entries = []
+
+        if video_urls:
+            for num, video_url in enumerate(video_urls, 1):
+                entries.append(make_video_entry(video_id, video_url, num))
+
+        if iframe_links:
+            for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1):
+                entries.append(make_iframe_entry(video_id, iframe_link, num))
+
+        playlist = common_info.copy()
+        playlist.update(self.playlist_result(entries, video_id, title, description))
+        return playlist
+
+
+class LifeEmbedIE(InfoExtractor):
+    IE_NAME = 'life:embed'
+    _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P<id>[\da-f]{32})'
+
+    _TESTS = [{
+        'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
+        'md5': 'b889715c9e49cb1981281d0e5458fbbe',
+        'info_dict': {
+            'id': 'e50c2dec2867350528e2574c899b8291',
+            'ext': 'mp4',
+            'title': 'e50c2dec2867350528e2574c899b8291',
+            'thumbnail': r're:http://.*\.jpg',
+        }
+    }, {
+        # with 1080p
+        'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        thumbnail = None
+        formats = []
+
+        def extract_m3u8(manifest_url):
+            formats.extend(self._extract_m3u8_formats(
+                manifest_url, video_id, 'mp4',
+                entry_protocol='m3u8_native', m3u8_id='m3u8'))
+
+        def extract_original(original_url):
+            formats.append({
+                'url': original_url,
+                'format_id': determine_ext(original_url, None),
+                'preference': 1,
+            })
+
+        playlist = self._parse_json(
+            self._search_regex(
+                r'options\s*=\s*({.+?});', webpage, 'options', default='{}'),
+            video_id).get('playlist', {})
+        if playlist:
+            master = playlist.get('master')
+            if isinstance(master, compat_str) and determine_ext(master) == 'm3u8':
+                extract_m3u8(compat_urlparse.urljoin(url, master))
+            original = playlist.get('original')
+            if isinstance(original, compat_str):
+                extract_original(original)
+            thumbnail = playlist.get('image')
+
+        # Old rendition fallback
+        if not formats:
+            for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
+                video_url = compat_urlparse.urljoin(url, video_url)
+                if determine_ext(video_url) == 'm3u8':
+                    extract_m3u8(video_url)
+                else:
+                    extract_original(video_url)
+
+        self._sort_formats(formats)
+
+        thumbnail = thumbnail or self._search_regex(
+            r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
new file mode 100644 (file)
index 0000000..39f74d2
--- /dev/null
@@ -0,0 +1,358 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    smuggle_url,
+    try_get,
+    unsmuggle_url,
+    ExtractorError,
+)
+
+
+class LimelightBaseIE(InfoExtractor):
+    _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
+
+    @classmethod
+    def _extract_urls(cls, webpage, source_url):
+        lm = {
+            'Media': 'media',
+            'Channel': 'channel',
+            'ChannelList': 'channel_list',
+        }
+
+        def smuggle(url):
+            return smuggle_url(url, {'source_url': source_url})
+
+        entries = []
+        for kind, video_id in re.findall(
+                r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
+                webpage):
+            entries.append(cls.url_result(
+                smuggle('limelight:%s:%s' % (lm[kind], video_id)),
+                'Limelight%s' % kind, video_id))
+        for mobj in re.finditer(
+                # As per [1] the class attribute should be exactly
+                # LimelightEmbeddedPlayerFlash, but numerous examples have
+                # been seen that don't exactly match it (e.g. [2]).
+                # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage
+                # 2. http://www.sedona.com/FacilitatorTraining2017
+                r'''(?sx)
+                    <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*?
+                        <param[^>]+
+                            name=(["\'])flashVars\2[^>]+
+                            value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32})
+                ''', webpage):
+            kind, video_id = mobj.group('kind'), mobj.group('id')
+            entries.append(cls.url_result(
+                smuggle('limelight:%s:%s' % (kind, video_id)),
+                'Limelight%s' % kind.capitalize(), video_id))
+        # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page)
+        for video_id in re.findall(
+                r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})',
+                webpage):
+            entries.append(cls.url_result(
+                smuggle('limelight:media:%s' % video_id),
+                LimelightMediaIE.ie_key(), video_id))
+        return entries
+
+    def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
+        headers = {}
+        if referer:
+            headers['Referer'] = referer
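+        # A 403 from the PlaylistService carries a JSON body explaining the
+        # denial (e.g. CountryDisabled for geo-blocked media).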
+        try:
+            return self._download_json(
+                self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
+                item_id, 'Downloading PlaylistService %s JSON' % method,
+                fatal=fatal, headers=headers)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission']
+                if error == 'CountryDisabled':
+                    self.raise_geo_restricted()
+                raise ExtractorError(error, expected=True)
+            raise
+
+    def _extract(self, item_id, pc_method, mobile_method, referer=None):
+        pc = self._call_playlist_service(item_id, pc_method, referer=referer)
+        mobile = self._call_playlist_service(
+            item_id, mobile_method, fatal=False, referer=referer)
+        return pc, mobile
+
+    def _extract_info(self, pc, mobile, i, referer):
+        get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {}
+        pc_item = get_item(pc, 'playlistItems')
+        mobile_item = get_item(mobile, 'mediaList')
+        video_id = pc_item.get('mediaId') or mobile_item['mediaId']
+        title = pc_item.get('title') or mobile_item['title']
+
+        formats = []
+        urls = []
+        for stream in pc_item.get('streams', []):
+            stream_url = stream.get('url')
+            if not stream_url or stream.get('drmProtected') or stream_url in urls:
+                continue
+            urls.append(stream_url)
+            ext = determine_ext(stream_url)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    stream_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                fmt = {
+                    'url': stream_url,
+                    'abr': float_or_none(stream.get('audioBitRate')),
+                    'fps': float_or_none(stream.get('videoFrameRate')),
+                    'ext': ext,
+                }
+                width = int_or_none(stream.get('videoWidthInPixels'))
+                height = int_or_none(stream.get('videoHeightInPixels'))
+                vbr = float_or_none(stream.get('videoBitRate'))
+                if width or height or vbr:
+                    fmt.update({
+                        'width': width,
+                        'height': height,
+                        'vbr': vbr,
+                    })
+                else:
+                    fmt['vcodec'] = 'none'
+                rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url)
+                if rtmp:
+                    format_id = 'rtmp'
+                    if stream.get('videoBitRate'):
+                        format_id += '-%d' % int_or_none(stream['videoBitRate'])
+                    http_format_id = format_id.replace('rtmp', 'http')
+
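+                    # Some RTMP streams are mirrored on plain HTTP hosts;
+                    # probe the known CDN mapping and only add the HTTP
+                    # variant when the URL actually resolves.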
+                    CDN_HOSTS = (
+                        ('delvenetworks.com', 'cpl.delvenetworks.com'),
+                        ('video.llnw.net', 's2.content.video.llnw.net'),
+                    )
+                    for cdn_host, http_host in CDN_HOSTS:
+                        if cdn_host not in rtmp.group('host').lower():
+                            continue
+                        http_url = 'http://%s/%s' % (http_host, rtmp.group('playpath')[4:])
+                        urls.append(http_url)
+                        if self._is_valid_url(http_url, video_id, http_format_id):
+                            http_fmt = fmt.copy()
+                            http_fmt.update({
+                                'url': http_url,
+                                'format_id': http_format_id,
+                            })
+                            formats.append(http_fmt)
+                            break
+
+                    fmt.update({
+                        'url': rtmp.group('url'),
+                        'play_path': rtmp.group('playpath'),
+                        'app': rtmp.group('app'),
+                        'ext': 'flv',
+                        'format_id': format_id,
+                    })
+                formats.append(fmt)
+
+        for mobile_url in mobile_item.get('mobileUrls', []):
+            media_url = mobile_url.get('mobileUrl')
+            format_id = mobile_url.get('targetMediaPlatform')
+            if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls:
+                continue
+            urls.append(media_url)
+            ext = determine_ext(media_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    media_url, video_id, f4m_id=format_id, fatal=False))
+            else:
+                formats.append({
+                    'url': media_url,
+                    'format_id': format_id,
+                    'preference': -1,
+                    'ext': ext,
+                })
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for flag in mobile_item.get('flags') or []:
+            if flag == 'ClosedCaptions':
+                closed_captions = self._call_playlist_service(
+                    video_id, 'getClosedCaptionsDetailsByMediaId',
+                    False, referer) or []
+                for cc in closed_captions:
+                    cc_url = cc.get('webvttFileUrl')
+                    if not cc_url:
+                        continue
+                    lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
+                    subtitles.setdefault(lang, []).append({
+                        'url': cc_url,
+                    })
+                break
+
+        get_meta = lambda x: pc_item.get(x) or mobile_item.get(x)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': get_meta('description'),
+            'formats': formats,
+            'duration': float_or_none(get_meta('durationInMilliseconds'), 1000),
+            'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'),
+            'subtitles': subtitles,
+        }
+
+
+class LimelightMediaIE(LimelightBaseIE):
+    IE_NAME = 'limelight'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            limelight:media:|
+                            https?://
+                                (?:
+                                    link\.videoplatform\.limelight\.com/media/|
+                                    assets\.delvenetworks\.com/player/loader\.swf
+                                )
+                                \?.*?\bmediaId=
+                        )
+                        (?P<id>[a-z0-9]{32})
+                    '''
+    _TESTS = [{
+        'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
+        'info_dict': {
+            'id': '3ffd040b522b4485b6d84effc750cd86',
+            'ext': 'mp4',
+            'title': 'HaP and the HB Prince Trailer',
+            'description': 'md5:8005b944181778e313d95c1237ddb640',
+            'thumbnail': r're:^https?://.*\.jpeg$',
+            'duration': 144.23,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # video with subtitles
+        'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
+        'md5': '2fa3bad9ac321e23860ca23bc2c69e3d',
+        'info_dict': {
+            'id': 'a3e00274d4564ec4a9b29b9466432335',
+            'ext': 'mp4',
+            'title': '3Play Media Overview Video',
+            'thumbnail': r're:^https?://.*\.jpeg$',
+            'duration': 78.101,
+            # TODO: extract all languages that were accessible via API
+            # 'subtitles': 'mincount:9',
+            'subtitles': 'mincount:1',
+        },
+    }, {
+        'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
+        'only_matching': True,
+    }]
+    _PLAYLIST_SERVICE_PATH = 'media'
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        video_id = self._match_id(url)
+        source_url = smuggled_data.get('source_url')
+        self._initialize_geo_bypass({
+            'countries': smuggled_data.get('geo_countries'),
+        })
+
+        pc, mobile = self._extract(
+            video_id, 'getPlaylistByMediaId',
+            'getMobilePlaylistByMediaId', source_url)
+
+        return self._extract_info(pc, mobile, 0, source_url)
+
+
+class LimelightChannelIE(LimelightBaseIE):
+    IE_NAME = 'limelight:channel'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            limelight:channel:|
+                            https?://
+                                (?:
+                                    link\.videoplatform\.limelight\.com/media/|
+                                    assets\.delvenetworks\.com/player/loader\.swf
+                                )
+                                \?.*?\bchannelId=
+                        )
+                        (?P<id>[a-z0-9]{32})
+                    '''
+    _TESTS = [{
+        'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
+        'info_dict': {
+            'id': 'ab6a524c379342f9b23642917020c082',
+            'title': 'Javascript Sample Code',
+            'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'http://assets.delvenetworks.com/player/loader.swf?channelId=ab6a524c379342f9b23642917020c082',
+        'only_matching': True,
+    }]
+    _PLAYLIST_SERVICE_PATH = 'channel'
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        channel_id = self._match_id(url)
+        source_url = smuggled_data.get('source_url')
+
+        pc, mobile = self._extract(
+            channel_id, 'getPlaylistByChannelId',
+            'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1',
+            source_url)
+
+        entries = [
+            self._extract_info(pc, mobile, i, source_url)
+            for i in range(len(pc['playlistItems']))]
+
+        return self.playlist_result(
+            entries, channel_id, pc.get('title'), mobile.get('description'))
+
+
+class LimelightChannelListIE(LimelightBaseIE):
+    IE_NAME = 'limelight:channel_list'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            limelight:channel_list:|
+                            https?://
+                                (?:
+                                    link\.videoplatform\.limelight\.com/media/|
+                                    assets\.delvenetworks\.com/player/loader\.swf
+                                )
+                                \?.*?\bchannelListId=
+                        )
+                        (?P<id>[a-z0-9]{32})
+                    '''
+    _TESTS = [{
+        'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
+        'info_dict': {
+            'id': '301b117890c4465c8179ede21fd92e2b',
+            'title': 'Website - Hero Player',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'https://assets.delvenetworks.com/player/loader.swf?channelListId=301b117890c4465c8179ede21fd92e2b',
+        'only_matching': True,
+    }]
+    _PLAYLIST_SERVICE_PATH = 'channel_list'
+
+    def _real_extract(self, url):
+        channel_list_id = self._match_id(url)
+
+        channel_list = self._call_playlist_service(
+            channel_list_id, 'getMobileChannelListById')
+
+        entries = [
+            self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel')
+            for channel in channel_list['channelList']]
+
+        return self.playlist_result(
+            entries, channel_list_id, channel_list['title'])
diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py
new file mode 100644 (file)
index 0000000..7f5fa44
--- /dev/null
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class LineTVIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)'
+
+    _TESTS = [{
+        'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246',
+        'info_dict': {
+            'id': '793123_ep1-1',
+            'ext': 'mp4',
+            'title': 'Goodbye Mr.Black | EP.1-1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 998.509,
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        series_id, segment = re.match(self._VALID_URL, url).groups()
+        video_id = '%s_%s' % (series_id, segment)
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_params = self._parse_json(self._search_regex(
+            r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'),
+            video_id, transform_source=js_to_json)
+
+        video_info = self._download_json(
+            'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json',
+            video_id, query={
+                'videoId': player_params['videoId'],
+                'key': player_params['key'],
+            })
+
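+        # Every request must carry the signed __gda__ token: it is appended
+        # to the master manifest URL, to each variant URL below, and passed
+        # via extra_param_to_segment_url for the segment downloads.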
+        stream = video_info['streams'][0]
+        extra_query = '?__gda__=' + stream['key']['value']
+        formats = self._extract_m3u8_formats(
+            stream['source'] + extra_query, video_id, ext='mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+
+        for a_format in formats:
+            a_format['url'] += extra_query
+
+        duration = None
+        for video in video_info.get('videos', {}).get('list', []):
+            encoding_option = video.get('encodingOption', {})
+            abr = video['bitrate']['audio']
+            vbr = video['bitrate']['video']
+            tbr = abr + vbr
+            formats.append({
+                'url': video['source'],
+                'format_id': 'http-%d' % int(tbr),
+                'height': encoding_option.get('height'),
+                'width': encoding_option.get('width'),
+                'abr': abr,
+                'vbr': vbr,
+                'filesize': video.get('size'),
+            })
+            if video.get('duration') and duration is None:
+                duration = video['duration']
+
+        self._sort_formats(formats)
+
+        if not formats[0].get('width'):
+            formats[0]['vcodec'] = 'none'
+
+        title = self._og_search_title(webpage)
+
+        # like_count requires an additional API request https://tv.line.me/api/likeit/getCount
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'extra_param_to_segment_url': extra_query[1:],
+            'duration': duration,
+            'thumbnails': [{'url': thumbnail['source']}
+                           for thumbnail in video_info.get('thumbnails', {}).get('list', [])],
+            'view_count': video_info.get('meta', {}).get('count'),
+        }
diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py
new file mode 100644 (file)
index 0000000..26fc703
--- /dev/null
@@ -0,0 +1,182 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class LinkedInLearningBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'linkedin'
+    _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'
+
+    def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
+        query = {
+            'courseSlug': course_slug,
+            'fields': fields,
+            'q': 'slugs',
+        }
+        sub = ''
+        if video_slug:
+            query.update({
+                'videoSlug': video_slug,
+                'resolution': '_%s' % resolution,
+            })
+            sub = ' %dp' % resolution
+        api_url = 'https://www.linkedin.com/learning-api/detailedCourses'
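+        # The learning API requires the CSRF token mirrored from the
+        # JSESSIONID cookie on every request.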
+        return self._download_json(
+            api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={
+                'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value,
+            }, query=query)['elements'][0]
+
+    def _get_urn_id(self, video_data):
+        urn = video_data.get('urn')
+        if urn:
+            mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn)
+            if mobj:
+                return mobj.group(1)
+
+    def _get_video_id(self, video_data, course_slug, video_slug):
+        return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+        action_url = urljoin(self._LOGIN_URL, self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
+            default='https://www.linkedin.com/uas/login-submit', group='url'))
+        data = self._hidden_inputs(login_page)
+        data.update({
+            'session_key': email,
+            'session_password': password,
+        })
+        login_submit_page = self._download_webpage(
+            action_url, None, 'Logging in',
+            data=urlencode_postdata(data))
+        error = self._search_regex(
+            r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
+            login_submit_page, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+
+class LinkedInLearningIE(LinkedInLearningBaseIE):
+    IE_NAME = 'linkedin:learning'
+    _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true',
+        'md5': 'a1d74422ff0d5e66a792deb996693167',
+        'info_dict': {
+            'id': '90426',
+            'ext': 'mp4',
+            'title': 'Welcome',
+            'timestamp': 1430396150.82,
+            'upload_date': '20150430',
+        },
+    }
+
+    def _real_extract(self, url):
+        course_slug, video_slug = re.match(self._VALID_URL, url).groups()
+
+        video_data = None
+        formats = []
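+        # The API is queried once per known resolution; each response
+        # carries a progressive MP4 URL for that height, while the HLS
+        # stream (below) comes from the last response.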
+        for width, height in ((640, 360), (960, 540), (1280, 720)):
+            video_data = self._call_api(
+                course_slug, 'selectedVideo', video_slug, height)['selectedVideo']
+
+            video_url_data = video_data.get('url') or {}
+            progressive_url = video_url_data.get('progressiveUrl')
+            if progressive_url:
+                formats.append({
+                    'format_id': 'progressive-%dp' % height,
+                    'url': progressive_url,
+                    'height': height,
+                    'width': width,
+                    'source_preference': 1,
+                })
+
+        title = video_data['title']
+
+        audio_url = video_data.get('audio', {}).get('progressiveUrl')
+        if audio_url:
+            formats.append({
+                'abr': 64,
+                'ext': 'm4a',
+                'format_id': 'audio',
+                'url': audio_url,
+                'vcodec': 'none',
+            })
+
+        streaming_url = video_url_data.get('streamingUrl')
+        if streaming_url:
+            formats.extend(self._extract_m3u8_formats(
+                streaming_url, video_slug, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+
+        self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))
+
+        return {
+            'id': self._get_video_id(video_data, course_slug, video_slug),
+            'title': title,
+            'formats': formats,
+            'thumbnail': video_data.get('defaultThumbnail'),
+            'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
+            'duration': int_or_none(video_data.get('durationInSeconds')),
+        }
+
+
+class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
+    IE_NAME = 'linkedin:learning:course'
+    _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals',
+        'info_dict': {
+            'id': 'programming-foundations-fundamentals',
+            'title': 'Programming Foundations: Fundamentals',
+            'description': 'md5:76e580b017694eb89dc8e8923fff5c86',
+        },
+        'playlist_mincount': 61,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_slug = self._match_id(url)
+        course_data = self._call_api(course_slug, 'chapters,description,title')
+
+        entries = []
+        for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1):
+            chapter_title = chapter.get('title')
+            chapter_id = self._get_urn_id(chapter)
+            for video in chapter.get('videos', []):
+                video_slug = video.get('slug')
+                if not video_slug:
+                    continue
+                entries.append({
+                    '_type': 'url_transparent',
+                    'id': self._get_video_id(video, course_slug, video_slug),
+                    'title': video.get('title'),
+                    'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug),
+                    'chapter': chapter_title,
+                    'chapter_number': chapter_number,
+                    'chapter_id': chapter_id,
+                    'ie_key': LinkedInLearningIE.ie_key(),
+                })
+
+        return self.playlist_result(
+            entries, course_slug,
+            course_data.get('title'),
+            course_data.get('description'))
diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py
new file mode 100644 (file)
index 0000000..23ca965
--- /dev/null
@@ -0,0 +1,173 @@
+from __future__ import unicode_literals
+
+import json
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_HTTPError,
+)
+from ..utils import (
+    ExtractorError,
+    orderedSet,
+    unescapeHTML,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class LinuxAcademyIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?linuxacademy\.com/cp/
+                        (?:
+                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
+                            modules/view/id/(?P<course_id>\d+)
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154',
+        'info_dict': {
+            'id': '1498-2',
+            'ext': 'mp4',
+            'title': "Introduction to the Practitioner's Brief",
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires Linux Academy account credentials',
+    }, {
+        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
+        'only_matching': True,
+    }, {
+        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
+        'info_dict': {
+            'id': '154',
+            'title': 'AWS Certified Cloud Practitioner',
+            'description': 'md5:039db7e60e4aac9cf43630e0a75fa834',
+        },
+        'playlist_count': 41,
+        'skip': 'Requires Linux Academy account credentials',
+    }]
+
+    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
+    _ORIGIN_URL = 'https://linuxacademy.com'
+    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
+    _NETRC_MACHINE = 'linuxacademy'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
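+        # Linux Academy uses Auth0: load the /authorize page, replay its
+        # base64-encoded extraParams together with the credentials, then
+        # follow the login callback and pull the access token from the
+        # redirect URL.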
+        def random_string():
+            return ''.join([
+                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-._~')
+                for _ in range(32)])
+
+        webpage, urlh = self._download_webpage_handle(
+            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
+                'client_id': self._CLIENT_ID,
+                'response_type': 'token id_token',
+                'redirect_uri': self._ORIGIN_URL,
+                'scope': 'openid email user_impersonation profile',
+                'audience': self._ORIGIN_URL,
+                'state': random_string(),
+                'nonce': random_string(),
+            })
+
+        login_data = self._parse_json(
+            self._search_regex(
+                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+                'login info', group='value'), None,
+            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
+        )['extraParams']
+
+        login_data.update({
+            'client_id': self._CLIENT_ID,
+            'redirect_uri': self._ORIGIN_URL,
+            'tenant': 'lacausers',
+            'connection': 'Username-Password-Authentication',
+            'username': username,
+            'password': password,
+            'sso': 'true',
+        })
+
+        login_state_url = urlh.geturl()
+
+        try:
+            login_page = self._download_webpage(
+                'https://login.linuxacademy.com/usernamepassword/login', None,
+                'Downloading login page', data=json.dumps(login_data).encode(),
+                headers={
+                    'Content-Type': 'application/json',
+                    'Origin': 'https://login.linuxacademy.com',
+                    'Referer': login_state_url,
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                error = self._parse_json(e.cause.read(), None)
+                message = error.get('description') or error['code']
+                raise ExtractorError(
+                    '%s said: %s' % (self.IE_NAME, message), expected=True)
+            raise
+
+        callback_page, urlh = self._download_webpage_handle(
+            'https://login.linuxacademy.com/login/callback', None,
+            'Downloading callback page',
+            data=urlencode_postdata(self._hidden_inputs(login_page)),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Origin': 'https://login.linuxacademy.com',
+                'Referer': login_state_url,
+            })
+
+        access_token = self._search_regex(
+            r'access_token=([^=&]+)', urlh.geturl(),
+            'access token')
+
+        self._download_webpage(
+            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
+            % access_token, None, 'Downloading token validation page')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
+        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)
+
+        webpage = self._download_webpage(url, item_id)
+
+        # course path
+        if course_id:
+            entries = [
+                self.url_result(
+                    urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key())
+                for lesson_url in orderedSet(re.findall(
+                    r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)',
+                    webpage))]
+            title = unescapeHTML(self._html_search_regex(
+                (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)',
+                 r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'),
+                webpage, 'title', default=None, group='value'))
+            description = unescapeHTML(self._html_search_regex(
+                r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+                webpage, 'description', default=None, group='value'))
+            return self.playlist_result(entries, course_id, title, description)
+
+        # single video path
+        info = self._extract_jwplayer_data(
+            webpage, item_id, require_title=False, m3u8_id='hls',)
+        title = self._search_regex(
+            (r'>Lecture\s*:\s*(?P<value>[^<]+)',
+             r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+            'title', group='value')
+        info.update({
+            'id': item_id,
+            'title': title,
+        })
+        return info
diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py
new file mode 100644 (file)
index 0000000..337b1b1
--- /dev/null
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    smuggle_url,
+    unsmuggle_url,
+)
+
+
+class LiTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P<id>[^&]+)'
+
+    _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
+
+    _TESTS = [{
+        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+        'info_dict': {
+            'id': 'VOD00041606',
+            'title': '花千骨',
+        },
+        'playlist_count': 50,
+    }, {
+        'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+        'md5': '969e343d9244778cb29acec608e53640',
+        'info_dict': {
+            'id': 'VOD00041610',
+            'ext': 'mp4',
+            'title': '花千骨第1集',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f',
+            'episode_number': 1,
+        },
+        'params': {
+            'noplaylist': True,
+        },
+        'skip': 'Georestricted to Taiwan',
+    }, {
+        'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&',
+        'md5': '88322ea132f848d6e3e18b32a832b918',
+        'info_dict': {
+            'id': 'VOD00044841',
+            'ext': 'mp4',
+            'title': '芈月傳第1集 霸星芈月降世楚國',
+            'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。',
+        },
+        'skip': 'Georestricted to Taiwan',
+    }]
+
+    def _extract_playlist(self, season_list, video_id, program_info, prompt=True):
+        episode_title = program_info['title']
+        content_id = season_list['contentId']
+
+        if prompt:
+            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
+
+        all_episodes = [
+            self.url_result(smuggle_url(
+                self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']),
+                {'force_noplaylist': True}))  # To prevent infinite recursion
+            for episode in season_list['episode']]
+
+        return self.playlist_result(all_episodes, content_id, episode_title)
+
+    def _real_extract(self, url):
+        url, data = unsmuggle_url(url, {})
+
+        video_id = self._match_id(url)
+
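+        # Entries queued from _extract_playlist are smuggled with
+        # force_noplaylist so individual episodes don't recurse back into
+        # the season playlist.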
+        noplaylist = self._downloader.params.get('noplaylist')
+        noplaylist_prompt = True
+        if 'force_noplaylist' in data:
+            noplaylist = data['force_noplaylist']
+            noplaylist_prompt = False
+
+        webpage = self._download_webpage(url, video_id)
+
+        program_info = self._parse_json(self._search_regex(
+            r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
+            video_id)
+
+        season_list = list(program_info.get('seasonList', {}).values())
+        if season_list:
+            if not noplaylist:
+                return self._extract_playlist(
+                    season_list[0], video_id, program_info,
+                    prompt=noplaylist_prompt)
+
+            if noplaylist_prompt:
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+        # In browsers the `getMainUrl` request is always issued. Usually this
+        # endpoint gives the same result as the data embedded in the webpage.
+        # If georestricted, there is no embedded data, so an extra request is
+        # necessary to get the error code.
+        if 'assetId' not in program_info:
+            program_info = self._download_json(
+                'https://www.litv.tv/vod/ajax/getProgramInfo', video_id,
+                query={'contentId': video_id},
+                headers={'Accept': 'application/json'})
+        video_data = self._parse_json(self._search_regex(
+            r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
+            webpage, 'video data', default='{}'), video_id)
+        if not video_data:
+            payload = {
+                'assetId': program_info['assetId'],
+                'watchDevices': program_info['watchDevices'],
+                'contentType': program_info['contentType'],
+            }
+            video_data = self._download_json(
+                'https://www.litv.tv/vod/getMainUrl', video_id,
+                data=json.dumps(payload).encode('utf-8'),
+                headers={'Content-Type': 'application/json'})
+
+        if not video_data.get('fullpath'):
+            error_msg = video_data.get('errorMessage')
+            if error_msg == 'vod.error.outsideregionerror':
+                self.raise_geo_restricted('This video is available in Taiwan only')
+            if error_msg:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+            raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
+
+        formats = self._extract_m3u8_formats(
+            video_data['fullpath'], video_id, ext='mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        for a_format in formats:
+            # LiTV HLS segments don't like compression
+            a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
+
+        title = program_info['title'] + program_info.get('secondaryMark', '')
+        description = program_info.get('description')
+        thumbnail = program_info.get('imageFile')
+        categories = [item['name'] for item in program_info.get('category', [])]
+        episode = int_or_none(program_info.get('episode'))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'episode_number': episode,
+        }
diff --git a/youtube_dl/extractor/livejournal.py b/youtube_dl/extractor/livejournal.py
new file mode 100644 (file)
index 0000000..3a9f455
--- /dev/null
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import int_or_none
+
+
+class LiveJournalIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^.]+\.)?livejournal\.com/video/album/\d+.+?\bid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://andrei-bt.livejournal.com/video/album/407/?mode=view&id=51272',
+        'md5': 'adaf018388572ced8a6f301ace49d4b2',
+        'info_dict': {
+            'id': '1263729',
+            'ext': 'mp4',
+            'title': 'Истребители против БПЛА',
+            'upload_date': '20190624',
+            'timestamp': 1561406715,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        record = self._parse_json(self._search_regex(
+            r'Site\.page\s*=\s*({.+?});', webpage,
+            'page data'), video_id)['video']['record']
+        storage_id = compat_str(record['storageid'])
+        title = record.get('name')
+        if title:
+            # remove the filename extension (.mp4, .mov, etc.)
+            title = title.rsplit('.', 1)[0]
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'thumbnail': record.get('thumbnail'),
+            'timestamp': int_or_none(record.get('timecreate')),
+            'url': 'eagleplatform:vc.videos.livejournal.com:' + storage_id,
+            'ie_key': 'EaglePlatform',
+        }
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
new file mode 100644 (file)
index 0000000..4ac437c
--- /dev/null
@@ -0,0 +1,191 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class LiveLeakIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P<id>[\w_]+)'
+    _TESTS = [{
+        'url': 'http://www.liveleak.com/view?i=757_1364311680',
+        'md5': '0813c2430bea7a46bf13acf3406992f4',
+        'info_dict': {
+            'id': '757_1364311680',
+            'ext': 'mp4',
+            'description': 'extremely bad day for this guy..!',
+            'uploader': 'ljfriel2',
+            'title': 'Most unlucky car accident',
+            'thumbnail': r're:^https?://.*\.jpg$'
+        }
+    }, {
+        'url': 'http://www.liveleak.com/view?i=f93_1390833151',
+        'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
+        'info_dict': {
+            'id': 'f93_1390833151',
+            'ext': 'mp4',
+            'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
+            'uploader': 'ARD_Stinkt',
+            'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
+            'thumbnail': r're:^https?://.*\.jpg$'
+        }
+    }, {
+        # Prochan embed
+        'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
+        'md5': '42c6d97d54f1db107958760788c5f48f',
+        'info_dict': {
+            'id': '4f7_1392687779',
+            'ext': 'mp4',
+            'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing...  I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.",
+            'uploader': 'CapObveus',
+            'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
+            'age_limit': 18,
+        },
+        'skip': 'Video is dead',
+    }, {
+        # Covers https://github.com/ytdl-org/youtube-dl/pull/5983
+        # Multiple resolutions
+        'url': 'http://www.liveleak.com/view?i=801_1409392012',
+        'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
+        'info_dict': {
+            'id': '801_1409392012',
+            'ext': 'mp4',
+            'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
+            'uploader': 'bony333',
+            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia',
+            'thumbnail': r're:^https?://.*\.jpg$'
+        }
+    }, {
+        # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521
+        'url': 'http://m.liveleak.com/view?i=763_1473349649',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': '763_1473349649',
+            'ext': 'mp4',
+            'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty',
+            'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.',
+            'uploader': 'Ziz',
+            'upload_date': '20160908',
+            'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.liveleak.com/view?i=677_1439397581',
+        'info_dict': {
+            'id': '677_1439397581',
+            'title': 'Fuel Depot in China Explosion caught on video',
+        },
+        'playlist_count': 3,
+    }, {
+        'url': 'https://www.liveleak.com/view?t=HvHi_1523016227',
+        'only_matching': True,
+    }, {
+        # No original video
+        'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
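+        # Used by the generic extractor to discover LiveLeak embeds on
+        # third-party pages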
+        return re.findall(
+            r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
+        video_description = self._og_search_description(webpage)
+        video_uploader = self._html_search_regex(
+            r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
+        age_limit = int_or_none(self._search_regex(
+            r'you confirm that you are ([0-9]+) years and over.',
+            webpage, 'age limit', default=None))
+        video_thumbnail = self._og_search_thumbnail(webpage)
+
+        entries = self._parse_html5_media_entries(url, webpage, video_id)
+        if not entries:
+            # Maybe an embed?
+            embed_url = self._search_regex(
+                r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
+                webpage, 'embed URL')
+            return {
+                '_type': 'url_transparent',
+                'url': embed_url,
+                'id': video_id,
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+            }
+
+        for idx, info_dict in enumerate(entries):
+            formats = []
+            for a_format in info_dict['formats']:
+                if not a_format.get('height'):
+                    a_format['height'] = int_or_none(self._search_regex(
+                        r'([0-9]+)p\.mp4', a_format['url'], 'height label',
+                        default=None))
+                formats.append(a_format)
+
+                # Removing '.*.mp4' gives the raw video, which is essentially
+                # the same video without the LiveLeak logo at the top (see
+                # https://github.com/ytdl-org/youtube-dl/pull/4768)
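+                # (hypothetical example: '.../clip.mp4.h264_base.mp4' -> '.../clip.mp4')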
+                orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url'])
+                if a_format['url'] != orig_url:
+                    format_id = a_format.get('format_id')
+                    format_id = 'original' + ('-' + format_id if format_id else '')
+                    if self._is_valid_url(orig_url, video_id, format_id):
+                        formats.append({
+                            'format_id': format_id,
+                            'url': orig_url,
+                            'preference': 1,
+                        })
+            self._sort_formats(formats)
+            info_dict['formats'] = formats
+
+            # Don't append entry ID for one-video pages to keep backward compatibility
+            if len(entries) > 1:
+                info_dict['id'] = '%s_%s' % (video_id, idx + 1)
+            else:
+                info_dict['id'] = video_id
+
+            info_dict.update({
+                'title': video_title,
+                'description': video_description,
+                'uploader': video_uploader,
+                'age_limit': age_limit,
+                'thumbnail': video_thumbnail,
+            })
+
+        return self.playlist_result(entries, video_id, video_title)
+
+
+class LiveLeakEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)'
+
+    # See generic.py for actual test cases
+    _TESTS = [{
+        'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        kind, video_id = re.match(self._VALID_URL, url).groups()
+
+        if kind == 'f':
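+            # 'f' embeds don't carry the canonical view URL in the URL itself,
+            # so extract it from the player page markup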
+            webpage = self._download_webpage(url, video_id)
+            liveleak_url = self._search_regex(
+                r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
+                webpage, 'LiveLeak URL', group='url')
+        else:
+            liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id)
+
+        return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
new file mode 100644 (file)
index 0000000..e55b1a2
--- /dev/null
@@ -0,0 +1,366 @@
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    find_xpath_attr,
+    xpath_attr,
+    xpath_with_ns,
+    xpath_text,
+    orderedSet,
+    update_url_query,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    determine_ext,
+)
+
+
+class LivestreamIE(InfoExtractor):
+    IE_NAME = 'livestream'
+    _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?'
+    _TESTS = [{
+        'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
+        'md5': '53274c76ba7754fb0e8d072716f2292b',
+        'info_dict': {
+            'id': '4719370',
+            'ext': 'mp4',
+            'title': 'Live from Webster Hall NYC',
+            'timestamp': 1350008072,
+            'upload_date': '20121012',
+            'duration': 5968.0,
+            'like_count': int,
+            'view_count': int,
+            'thumbnail': r're:^http://.*\.jpg$'
+        }
+    }, {
+        'url': 'http://new.livestream.com/tedx/cityenglish',
+        'info_dict': {
+            'title': 'TEDCity2.0 (English)',
+            'id': '2245590',
+        },
+        'playlist_mincount': 4,
+    }, {
+        'url': 'http://new.livestream.com/chess24/tatasteelchess',
+        'info_dict': {
+            'title': 'Tata Steel Chess',
+            'id': '3705884',
+        },
+        'playlist_mincount': 60,
+    }, {
+        'url': 'https://new.livestream.com/accounts/362/events/3557232/videos/67864563/player?autoPlay=false&height=360&mute=false&width=640',
+        'only_matching': True,
+    }, {
+        'url': 'http://livestream.com/bsww/concacafbeachsoccercampeonato2015',
+        'only_matching': True,
+    }]
+    _API_URL_TEMPLATE = 'http://livestream.com/api/accounts/%s/events/%s'
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        base_ele = find_xpath_attr(
+            smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
+        base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
+
+        formats = []
+        video_nodes = smil.findall(self._xpath_ns('.//video', namespace))
+
+        for vn in video_nodes:
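+            # SMIL system-bitrate is given in bits/s; the scale argument of
+            # 1000 converts it to kbit/s for tbr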
+            tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)
+            furl = (
+                update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), {
+                    'v': '3.0.3',
+                    'fp': 'WIN% 14,0,0,145',
+                }))
+            if 'clipBegin' in vn.attrib:
+                furl += '&ssek=' + vn.attrib['clipBegin']
+            formats.append({
+                'url': furl,
+                'format_id': 'smil_%d' % tbr,
+                'ext': 'flv',
+                'tbr': tbr,
+                'preference': -1000,
+            })
+        return formats
+
+    def _extract_video_info(self, video_data):
+        video_id = compat_str(video_data['id'])
+
+        FORMAT_KEYS = (
+            ('sd', 'progressive_url'),
+            ('hd', 'progressive_url_hd'),
+        )
+
+        formats = []
+        for format_id, key in FORMAT_KEYS:
+            video_url = video_data.get(key)
+            if video_url:
+                ext = determine_ext(video_url)
+                if ext == 'm3u8':
+                    continue
+                bitrate = int_or_none(self._search_regex(
+                    r'(\d+)\.%s' % ext, video_url, 'bitrate', default=None))
+                formats.append({
+                    'url': video_url,
+                    'format_id': format_id,
+                    'tbr': bitrate,
+                    'ext': ext,
+                })
+
+        smil_url = video_data.get('smil_url')
+        if smil_url:
+            formats.extend(self._extract_smil_formats(smil_url, video_id, fatal=False))
+
+        m3u8_url = video_data.get('m3u8_url')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        f4m_url = video_data.get('f4m_url')
+        if f4m_url:
+            formats.extend(self._extract_f4m_formats(
+                f4m_url, video_id, f4m_id='hds', fatal=False))
+        self._sort_formats(formats)
+
+        comments = [{
+            'author_id': comment.get('author_id'),
+            'author': comment.get('author', {}).get('full_name'),
+            'id': comment.get('id'),
+            'text': comment['text'],
+            'timestamp': parse_iso8601(comment.get('created_at')),
+        } for comment in video_data.get('comments', {}).get('data', [])]
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_data['caption'],
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('thumbnail_url'),
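+            # duration appears to be reported in milliseconds (hence scale=1000)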
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'timestamp': parse_iso8601(video_data.get('publish_at')),
+            'like_count': video_data.get('likes', {}).get('total'),
+            'comment_count': video_data.get('comments', {}).get('total'),
+            'view_count': video_data.get('views'),
+            'comments': comments,
+        }
+
+    def _extract_stream_info(self, stream_info):
+        broadcast_id = compat_str(stream_info['broadcast_id'])
+        is_live = stream_info.get('is_live')
+
+        formats = []
+        smil_url = stream_info.get('play_url')
+        if smil_url:
+            formats.extend(self._extract_smil_formats(smil_url, broadcast_id))
+
+        m3u8_url = stream_info.get('m3u8_url')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, broadcast_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        rtsp_url = stream_info.get('rtsp_url')
+        if rtsp_url:
+            formats.append({
+                'url': rtsp_url,
+                'format_id': 'rtsp',
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': broadcast_id,
+            'formats': formats,
+            'title': self._live_title(stream_info['stream_title']) if is_live else stream_info['stream_title'],
+            'thumbnail': stream_info.get('thumbnail_url'),
+            'is_live': is_live,
+        }
+
+    def _extract_event(self, event_data):
+        event_id = compat_str(event_data['id'])
+        account_id = compat_str(event_data['owner_account_id'])
+        feed_root_url = self._API_URL_TEMPLATE % (account_id, event_id) + '/feed.json'
+
+        stream_info = event_data.get('stream_info')
+        if stream_info:
+            return self._extract_stream_info(stream_info)
+
+        last_video = None
+        entries = []
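+        # Page through the event feed, using the id of the last video seen as
+        # a cursor for the next request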
+        for i in itertools.count(1):
+            if last_video is None:
+                info_url = feed_root_url
+            else:
+                info_url = '{root}?&id={id}&newer=-1&type=video'.format(
+                    root=feed_root_url, id=last_video)
+            videos_info = self._download_json(
+                info_url, event_id, 'Downloading page {0}'.format(i))['data']
+            videos_info = [v['data'] for v in videos_info if v['type'] == 'video']
+            if not videos_info:
+                break
+            for v in videos_info:
+                v_id = compat_str(v['id'])
+                entries.append(self.url_result(
+                    'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id),
+                    'Livestream', v_id, v.get('caption')))
+            last_video = videos_info[-1]['id']
+        return self.playlist_result(entries, event_id, event_data['full_name'])
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        event = mobj.group('event_id') or mobj.group('event_name')
+        account = mobj.group('account_id') or mobj.group('account_name')
+        api_url = self._API_URL_TEMPLATE % (account, event)
+        if video_id:
+            video_data = self._download_json(
+                api_url + '/videos/%s' % video_id, video_id)
+            return self._extract_video_info(video_data)
+        else:
+            event_data = self._download_json(api_url, video_id)
+            return self._extract_event(event_data)
+
+
+# The original version of Livestream uses a different system
+class LivestreamOriginalIE(InfoExtractor):
+    IE_NAME = 'livestream:original'
+    _VALID_URL = r'''(?x)https?://original\.livestream\.com/
+        (?P<user>[^/\?#]+)(?:/(?P<type>video|folder)
+        (?:(?:\?.*?Id=|/)(?P<id>.*?)(&|$))?)?
+        '''
+    _TESTS = [{
+        'url': 'http://original.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+        'info_dict': {
+            'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+            'ext': 'mp4',
+            'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
+            'duration': 771.301,
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://original.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+        'info_dict': {
+            'id': 'a07bf706-d0e4-4e75-a747-b021d84f2fd3',
+        },
+        'playlist_mincount': 4,
+    }, {
+        # live stream
+        'url': 'http://original.livestream.com/znsbahamas',
+        'only_matching': True,
+    }]
+
+    def _extract_video_info(self, user, video_id):
+        api_url = 'http://x%sx.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id=%s' % (user, video_id)
+        info = self._download_xml(api_url, video_id)
+
+        item = info.find('channel').find('item')
+        title = xpath_text(item, 'title')
+        media_ns = {'media': 'http://search.yahoo.com/mrss'}
+        thumbnail_url = xpath_attr(
+            item, xpath_with_ns('media:thumbnail', media_ns), 'url')
+        duration = float_or_none(xpath_attr(
+            item, xpath_with_ns('media:content', media_ns), 'duration'))
+        ls_ns = {'ls': 'http://api.channel.livestream.com/2.0'}
+        view_count = int_or_none(xpath_text(
+            item, xpath_with_ns('ls:viewsCount', ls_ns)))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail_url,
+            'duration': duration,
+            'view_count': view_count,
+        }
+
+    def _extract_video_formats(self, video_data, video_id):
+        formats = []
+
+        progressive_url = video_data.get('progressiveUrl')
+        if progressive_url:
+            formats.append({
+                'url': progressive_url,
+                'format_id': 'http',
+            })
+
+        m3u8_url = video_data.get('httpUrl')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        rtsp_url = video_data.get('rtspUrl')
+        if rtsp_url:
+            formats.append({
+                'url': rtsp_url,
+                'format_id': 'rtsp',
+            })
+
+        self._sort_formats(formats)
+        return formats
+
+    def _extract_folder(self, url, folder_id):
+        webpage = self._download_webpage(url, folder_id)
+        paths = orderedSet(re.findall(
+            r'''(?x)(?:
+                <li\s+class="folder">\s*<a\s+href="|
+                <a\s+href="(?=https?://livestre\.am/)
+            )([^"]+)"''', webpage))
+
+        entries = [{
+            '_type': 'url',
+            'url': compat_urlparse.urljoin(url, p),
+        } for p in paths]
+
+        return self.playlist_result(entries, folder_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user = mobj.group('user')
+        url_type = mobj.group('type')
+        content_id = mobj.group('id')
+        if url_type == 'folder':
+            return self._extract_folder(url, content_id)
+        else:
+            # This URL is used by mobile devices
+            stream_url = 'http://x%sx.api.channel.livestream.com/3.0/getstream.json' % user
+            info = {}
+            if content_id:
+                stream_url += '?id=%s' % content_id
+                info = self._extract_video_info(user, content_id)
+            else:
+                content_id = user
+                webpage = self._download_webpage(url, content_id)
+                info = {
+                    'title': self._og_search_title(webpage),
+                    'description': self._og_search_description(webpage),
+                    'thumbnail': self._search_regex(r'channelLogo\.src\s*=\s*"([^"]+)"', webpage, 'thumbnail', None),
+                }
+            video_data = self._download_json(stream_url, content_id)
+            is_live = video_data.get('isLive')
+            info.update({
+                'id': content_id,
+                'title': self._live_title(info['title']) if is_live else info['title'],
+                'formats': self._extract_video_formats(video_data, content_id),
+                'is_live': is_live,
+            })
+            return info
+
+
+# The server doesn't support HEAD requests, so the generic extractor can't
+# detect the redirection
+class LivestreamShortenerIE(InfoExtractor):
+    IE_NAME = 'livestream:shortener'
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'https?://livestre\.am/(?P<id>.+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        return self.url_result(self._og_search_url(webpage))
diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py
new file mode 100644 (file)
index 0000000..3e71852
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    compat_str,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class LnkGoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?'
+    _TESTS = [{
+        'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai',
+        'info_dict': {
+            'id': '10809',
+            'ext': 'mp4',
+            'title': "Put'ka: Trys Klausimai",
+            'upload_date': '20161216',
+            'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.',
+            'age_limit': 18,
+            'duration': 117,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1481904000,
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }, {
+        'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2',
+        'info_dict': {
+            'id': '10467',
+            'ext': 'mp4',
+            'title': 'Nėrdas: Kompiuterio Valymas',
+            'upload_date': '20150113',
+            'description': 'md5:7352d113a242a808676ff17e69db6a69',
+            'age_limit': 18,
+            'duration': 346,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1421164800,
+        },
+        'params': {
+            'skip_download': True,  # HLS download
+        },
+    }, {
+        'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413',
+        'only_matching': True,
+    }]
+    _AGE_LIMITS = {
+        'N-7': 7,
+        'N-14': 14,
+        'S': 18,
+    }
+    _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s'
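+    # Filled with: stream prefix ('smil' or 'mp4'), the video URL path and an
+    # optional secure-token query string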
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        video_info = self._download_json(
+            'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'),
+            display_id)['videoConfig']['videoInfo']
+
+        video_id = compat_str(video_info['id'])
+        title = video_info['title']
+        prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4'
+        formats = self._extract_m3u8_formats(
+            self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''),
+            video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+
+        poster_image = video_info.get('posterImage')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None,
+            'duration': int_or_none(video_info.get('duration')),
+            'description': clean_html(video_info.get('htmlDescription')),
+            'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0),
+            'timestamp': parse_iso8601(video_info.get('airDate')),
+            'view_count': int_or_none(video_info.get('viewsCount')),
+        }
diff --git a/youtube_dl/extractor/localnews8.py b/youtube_dl/extractor/localnews8.py
new file mode 100644 (file)
index 0000000..aad3961
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class LocalNews8IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304',
+        'md5': 'be4d48aea61aa2bde7be2ee47691ad20',
+        'info_dict': {
+            'id': '35183304',
+            'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
+            'ext': 'mp4',
+            'title': 'Rexburg business turns carbon fiber scraps into wedding ring',
+            'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.',
+            'duration': 153,
+            'timestamp': 1441844822,
+            'upload_date': '20150910',
+            'uploader_id': 'api',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        partner_id = self._search_regex(
+            r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1',
+            webpage, 'partner id', group='id')
+        kaltura_id = self._search_regex(
+            r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
+            webpage, 'video id', group='id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
+            'ie_key': 'Kaltura',
+            'id': video_id,
+            'display_id': display_id,
+        }
diff --git a/youtube_dl/extractor/lovehomeporn.py b/youtube_dl/extractor/lovehomeporn.py
new file mode 100644 (file)
index 0000000..8f65a3c
--- /dev/null
@@ -0,0 +1,37 @@
+from __future__ import unicode_literals
+
+import re
+
+from .nuevo import NuevoBaseIE
+
+
+class LoveHomePornIE(NuevoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?lovehomeporn\.com/video/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _TEST = {
+        'url': 'http://lovehomeporn.com/video/48483/stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick#menu',
+        'info_dict': {
+            'id': '48483',
+            'display_id': 'stunning-busty-brunette-girlfriend-sucking-and-riding-a-big-dick',
+            'ext': 'mp4',
+            'title': 'Stunning busty brunette girlfriend sucking and riding a big dick',
+            'age_limit': 18,
+            'duration': 238.47,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        info = self._extract_nuevo(
+            'http://lovehomeporn.com/media/nuevo/config.php?key=%s' % video_id,
+            video_id)
+        info.update({
+            'display_id': display_id,
+            'age_limit': 18
+        })
+        return info
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
new file mode 100644 (file)
index 0000000..f5c997e
--- /dev/null
@@ -0,0 +1,94 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_duration,
+    remove_end,
+)
+
+
+class LRTIE(InfoExtractor):
+    IE_NAME = 'lrt.lt'
+    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # m3u8 download
+        'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
+        'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
+        'info_dict': {
+            'id': '54391',
+            'ext': 'mp4',
+            'title': 'Septynios Kauno dienos',
+            'description': 'md5:24d84534c7dc76581e59f5689462411a',
+            'duration': 1783,
+            'view_count': int,
+            'like_count': int,
+        },
+    }, {
+        # direct mp3 download
+        'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/',
+        'md5': '389da8ca3cad0f51d12bed0c844f6a0a',
+        'info_dict': {
+            'id': '1013074524',
+            'ext': 'mp3',
+            'title': 'Kita tema 2016-09-05 15:05',
+            'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5',
+            'duration': 3008,
+            'view_count': int,
+            'like_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = remove_end(self._og_search_title(webpage), ' - LRT')
+
+        formats = []
+        for _, file_url in re.findall(
+                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
+            ext = determine_ext(file_url)
+            if ext not in ('m3u8', 'mp3'):
+                continue
+            # mp3 served as m3u8 produces a stuttered media file
+            if ext == 'm3u8' and '.mp3' in file_url:
+                continue
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    file_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    fatal=False))
+            elif ext == 'mp3':
+                formats.append({
+                    'url': file_url,
+                    'vcodec': 'none',
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage)
+        duration = parse_duration(self._search_regex(
+            r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
+            webpage, 'duration', default=None, group='duration'))
+
+        view_count = int_or_none(self._html_search_regex(
+            r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
+            webpage, 'view count', fatal=False, group='count'))
+        like_count = int_or_none(self._search_regex(
+            r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
+            webpage, 'like count', fatal=False, group='count'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+        }
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
new file mode 100644 (file)
index 0000000..b3d8653
--- /dev/null
@@ -0,0 +1,341 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    urlencode_postdata,
+)
+
+
+class LyndaBaseIE(InfoExtractor):
+    _SIGNIN_URL = 'https://www.lynda.com/signin/lynda'
+    _PASSWORD_URL = 'https://www.lynda.com/signin/password'
+    _USER_URL = 'https://www.lynda.com/signin/user'
+    _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
+    _NETRC_MACHINE = 'lynda'
+
+    def _real_initialize(self):
+        self._login()
+
+    @staticmethod
+    def _check_error(json_string, key_or_keys):
+        keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
+        for key in keys:
+            error = json_string.get(key)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+    def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
+        action_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
+            'post url', default=fallback_action_url, group='url')
+
+        if not action_url.startswith('http'):
+            action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url)
+
+        form_data = self._hidden_inputs(form_html)
+        form_data.update(extra_form_data)
+
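+        # The signin endpoints may answer with HTTP 418/500 while still
+        # returning a JSON error body, hence expected_status below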
+        response = self._download_json(
+            action_url, None, note,
+            data=urlencode_postdata(form_data),
+            headers={
+                'Referer': referrer_url,
+                'X-Requested-With': 'XMLHttpRequest',
+            }, expected_status=(418, 500, ))
+
+        self._check_error(response, ('email', 'password', 'ErrorMessage'))
+
+        return response, action_url
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        # Step 1: download signin page
+        signin_page = self._download_webpage(
+            self._SIGNIN_URL, None, 'Downloading signin page')
+
+        # Already logged in
+        if any(re.search(p, signin_page) for p in (
+                r'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
+            return
+
+        # Step 2: submit email
+        signin_form = self._search_regex(
+            r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)',
+            signin_page, 'signin form')
+        signin_page, signin_url = self._login_step(
+            signin_form, self._PASSWORD_URL, {'email': username},
+            'Submitting email', self._SIGNIN_URL)
+
+        # Step 3: submit password
+        password_form = signin_page['body']
+        self._login_step(
+            password_form, self._USER_URL, {'email': username, 'password': password},
+            'Submitting password', signin_url)
+
+
+class LyndaIE(LyndaBaseIE):
+    IE_NAME = 'lynda'
+    IE_DESC = 'lynda.com videos'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?(?:lynda\.com|educourse\.ga)/
+                        (?:
+                            (?:[^/]+/){2,3}(?P<course_id>\d+)|
+                            player/embed
+                        )/
+                        (?P<id>\d+)
+                    '''
+
+    _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
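+    # matches transcript timecodes such as '[00:00:05.250]'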
+
+    _TESTS = [{
+        'url': 'https://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
+        # md5 is unstable
+        'info_dict': {
+            'id': '114408',
+            'ext': 'mp4',
+            'title': 'Using the exercise files',
+            'duration': 68
+        }
+    }, {
+        'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0',
+        'only_matching': True,
+    }, {
+        'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html',
+        'only_matching': True,
+    }, {
+        # Status="NotFound", Message="Transcript not found"
+        'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html',
+        'only_matching': True,
+    }]
+
+    def _raise_unavailable(self, video_id):
+        self.raise_login_required(
+            'Video %s is only available for members' % video_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        course_id = mobj.group('course_id')
+
+        query = {
+            'videoId': video_id,
+            'type': 'video',
+        }
+
+        video = self._download_json(
+            'https://www.lynda.com/ajax/player', video_id,
+            'Downloading video JSON', fatal=False, query=query)
+
+        # Fallback scenario
+        if not video:
+            query['courseId'] = course_id
+
+            play = self._download_json(
+                'https://www.lynda.com/ajax/course/%s/%s/play'
+                % (course_id, video_id), video_id, 'Downloading play JSON')
+
+            if not play:
+                self._raise_unavailable(video_id)
+
+            formats = []
+            for formats_dict in play:
+                urls = formats_dict.get('urls')
+                if not isinstance(urls, dict):
+                    continue
+                cdn = formats_dict.get('name')
+                for format_id, format_url in urls.items():
+                    if not format_url:
+                        continue
+                    formats.append({
+                        'url': format_url,
+                        'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id,
+                        'height': int_or_none(format_id),
+                    })
+            self._sort_formats(formats)
+
+            conviva = self._download_json(
+                'https://www.lynda.com/ajax/player/conviva', video_id,
+                'Downloading conviva JSON', query=query)
+
+            return {
+                'id': video_id,
+                'title': conviva['VideoTitle'],
+                'description': conviva.get('VideoDescription'),
+                'release_year': int_or_none(conviva.get('ReleaseYear')),
+                'duration': int_or_none(conviva.get('Duration')),
+                'creator': conviva.get('Author'),
+                'formats': formats,
+            }
+
+        if 'Status' in video:
+            raise ExtractorError(
+                'lynda returned error: %s' % video['Message'], expected=True)
+
+        if video.get('HasAccess') is False:
+            self._raise_unavailable(video_id)
+
+        video_id = compat_str(video.get('ID') or video_id)
+        duration = int_or_none(video.get('DurationInSeconds'))
+        title = video['Title']
+
+        formats = []
+
+        fmts = video.get('Formats')
+        if fmts:
+            formats.extend([{
+                'url': f['Url'],
+                'ext': f.get('Extension'),
+                'width': int_or_none(f.get('Width')),
+                'height': int_or_none(f.get('Height')),
+                'filesize': int_or_none(f.get('FileSize')),
+                'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None,
+            } for f in fmts if f.get('Url')])
+
+        prioritized_streams = video.get('PrioritizedStreams')
+        if prioritized_streams:
+            for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
+                formats.extend([{
+                    'url': video_url,
+                    'height': int_or_none(format_id),
+                    'format_id': '%s-%s' % (prioritized_stream_id, format_id),
+                } for format_id, video_url in prioritized_stream.items()])
+
+        self._check_formats(formats, video_id)
+        self._sort_formats(formats)
+
+        subtitles = self.extract_subtitles(video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'subtitles': subtitles,
+            'formats': formats
+        }
+
+    def _fix_subtitles(self, subs):
+        srt = ''
+        seq_counter = 0
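+        # Each transcript entry only carries its start timecode, so the next
+        # entry's timecode is used as the end time of the current SRT cue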
+        for pos in range(0, len(subs) - 1):
+            seq_current = subs[pos]
+            m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode'])
+            if m_current is None:
+                continue
+            seq_next = subs[pos + 1]
+            m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode'])
+            if m_next is None:
+                continue
+            appear_time = m_current.group('timecode')
+            disappear_time = m_next.group('timecode')
+            text = seq_current['Caption'].strip()
+            if text:
+                seq_counter += 1
+                srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (seq_counter, appear_time, disappear_time, text)
+        if srt:
+            return srt
+
+    def _get_subtitles(self, video_id):
+        url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
+        subs = self._download_webpage(
+            url, video_id, 'Downloading subtitles JSON', fatal=False)
+        if not subs or 'Status="NotFound"' in subs:
+            return {}
+        subs = self._parse_json(subs, video_id, fatal=False)
+        if not subs:
+            return {}
+        fixed_subs = self._fix_subtitles(subs)
+        if fixed_subs:
+            return {'en': [{'ext': 'srt', 'data': fixed_subs}]}
+        return {}
+
+
+class LyndaCourseIE(LyndaBaseIE):
+    IE_NAME = 'lynda:course'
+    IE_DESC = 'lynda.com online courses'
+
+    # A course link is identical to the welcome/introduction video link of the
+    # same course, so we recognize it as a course link
+    _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P<coursepath>(?:[^/]+/){2,3}(?P<courseid>\d+))-2\.html'
+
+    _TESTS = [{
+        'url': 'https://www.lynda.com/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Grundlagen-guten-Gestaltung/393570-2.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_path = mobj.group('coursepath')
+        course_id = mobj.group('courseid')
+
+        item_template = 'https://www.lynda.com/%s/%%s-4.html' % course_path
+
+        course = self._download_json(
+            'https://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
+            course_id, 'Downloading course JSON', fatal=False)
+
+        if not course:
+            webpage = self._download_webpage(url, course_id)
+            entries = [
+                self.url_result(
+                    item_template % video_id, ie=LyndaIE.ie_key(),
+                    video_id=video_id)
+                for video_id in re.findall(
+                    r'data-video-id=["\'](\d+)', webpage)]
+            return self.playlist_result(
+                entries, course_id,
+                self._og_search_title(webpage, fatal=False),
+                self._og_search_description(webpage))
+
+        if course.get('Status') == 'NotFound':
+            raise ExtractorError(
+                'Course %s does not exist' % course_id, expected=True)
+
+        unaccessible_videos = 0
+        entries = []
+
+        # We might want to extract videos right here from video['Formats'], as it
+        # seems 'Formats' is no longer provided by the single-video API
+
+        for chapter in course['Chapters']:
+            for video in chapter.get('Videos', []):
+                if video.get('HasAccess') is False:
+                    unaccessible_videos += 1
+                    continue
+                video_id = video.get('ID')
+                if video_id:
+                    entries.append({
+                        '_type': 'url_transparent',
+                        'url': item_template % video_id,
+                        'ie_key': LyndaIE.ie_key(),
+                        'chapter': chapter.get('Title'),
+                        'chapter_number': int_or_none(chapter.get('ChapterIndex')),
+                        'chapter_id': compat_str(chapter.get('ID')),
+                    })
+
+        if unaccessible_videos > 0:
+            self._downloader.report_warning(
+                '%s videos are only available for members (or paid members) and will not be downloaded. '
+                % unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
+
+        course_title = course.get('Title')
+        course_description = course.get('Description')
+
+        return self.playlist_result(entries, course_id, course_title, course_description)
diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py
new file mode 100644 (file)
index 0000000..9806875
--- /dev/null
@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class M6IE(InfoExtractor):
+    IE_NAME = 'm6'
+    _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
+
+    _TEST = {
+        'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
+        'md5': '242994a87de2c316891428e0176bcb77',
+        'info_dict': {
+            'id': '11323908',
+            'ext': 'mp4',
+            'title': 'Emeline est la Reine du Shopping sur le thème « Ma fête d’anniversaire ! »',
+            'description': 'md5:1212ae8fb4b7baa4dc3886c5676007c2',
+            'duration': 100,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result('6play:%s' % video_id, 'SixPlay', video_id)
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
new file mode 100644 (file)
index 0000000..65cc474
--- /dev/null
@@ -0,0 +1,329 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    remove_end,
+    try_get,
+)
+
+
+class MailRuIE(InfoExtractor):
+    IE_NAME = 'mailru'
+    IE_DESC = 'Видео@Mail.Ru'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:(?:www|m)\.)?my\.mail\.ru/+
+                        (?:
+                            video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|
+                            (?:(?P<idv2prefix>(?:[^/]+/+){2})video/(?P<idv2suffix>[^/]+/\d+))\.html|
+                            (?:video/embed|\+/video/meta)/(?P<metaid>\d+)
+                        )
+                    '''
+    _TESTS = [
+        {
+            'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
+            'md5': 'dea205f03120046894db4ebb6159879a',
+            'info_dict': {
+                'id': '46301138_76',
+                'ext': 'mp4',
+                'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
+                'timestamp': 1393235077,
+                'upload_date': '20140224',
+                'uploader': 'sonypicturesrus',
+                'uploader_id': 'sonypicturesrus@mail.ru',
+                'duration': 184,
+            },
+            'skip': 'Not accessible from Travis CI server',
+        },
+        {
+            'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
+            'md5': '00a91a58c3402204dcced523777b475f',
+            'info_dict': {
+                'id': '46843144_1263',
+                'ext': 'mp4',
+                'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
+                'timestamp': 1397039888,
+                'upload_date': '20140409',
+                'uploader': 'hitech',
+                'uploader_id': 'hitech@corp.mail.ru',
+                'duration': 245,
+            },
+            'skip': 'Not accessible from Travis CI server',
+        },
+        {
+            # only available via metaUrl API
+            'url': 'http://my.mail.ru/mail/720pizle/video/_myvideo/502.html',
+            'md5': '3b26d2491c6949d031a32b96bd97c096',
+            'info_dict': {
+                'id': '56664382_502',
+                'ext': 'mp4',
+                'title': ':8336',
+                'timestamp': 1449094163,
+                'upload_date': '20151202',
+                'uploader': '720pizle@mail.ru',
+                'uploader_id': '720pizle@mail.ru',
+                'duration': 6001,
+            },
+            'skip': 'Not accessible from Travis CI server',
+        },
+        {
+            'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://my.mail.ru/video/embed/7949340477499637815',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://my.mail.ru/+/video/meta/7949340477499637815',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://my.mail.ru//list/sinyutin10/video/_myvideo/4.html',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        meta_id = mobj.group('metaid')
+
+        video_id = None
+        if meta_id:
+            meta_url = 'https://my.mail.ru/+/video/meta/%s' % meta_id
+        else:
+            video_id = mobj.group('idv1')
+            if not video_id:
+                video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')
+            webpage = self._download_webpage(url, video_id)
+            page_config = self._parse_json(self._search_regex(
+                r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
+                webpage, 'page config', default='{}'), video_id, fatal=False)
+            if page_config:
+                meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl')
+            else:
+                meta_url = None
+
+        video_data = None
+        if meta_url:
+            video_data = self._download_json(
+                meta_url, video_id or meta_id, 'Downloading video meta JSON',
+                fatal=not video_id)
+
+        # Fall back to the old approach
+        if not video_data:
+            video_data = self._download_json(
+                'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
+                video_id, 'Downloading video JSON')
+
+        headers = {}
+
+        video_key = self._get_cookies('https://my.mail.ru').get('video_key')
+        if video_key:
+            headers['Cookie'] = 'video_key=%s' % video_key.value
+
+        formats = []
+        for f in video_data['videos']:
+            video_url = f.get('url')
+            if not video_url:
+                continue
+            format_id = f.get('key')
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None)) if format_id else None
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'height': height,
+                'http_headers': headers,
+            })
+        self._sort_formats(formats)
+
+        meta_data = video_data['meta']
+        title = remove_end(meta_data['title'], '.mp4')
+
+        author = video_data.get('author')
+        uploader = author.get('name')
+        uploader_id = author.get('id') or author.get('email')
+        view_count = int_or_none(video_data.get('viewsCount') or video_data.get('views_count'))
+
+        acc_id = meta_data.get('accId')
+        item_id = meta_data.get('itemId')
+        content_id = '%s_%s' % (acc_id, item_id) if acc_id and item_id else video_id
+
+        thumbnail = meta_data.get('poster')
+        duration = int_or_none(meta_data.get('duration'))
+        timestamp = int_or_none(meta_data.get('timestamp'))
+
+        return {
+            'id': content_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
+
+
+class MailRuMusicSearchBaseIE(InfoExtractor):
+    def _search(self, query, url, audio_id, limit=100, offset=0):
+        search = self._download_json(
+            'https://my.mail.ru/cgi-bin/my/ajax', audio_id,
+            'Downloading songs JSON page %d' % (offset // limit + 1),
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+            }, query={
+                'xemail': '',
+                'ajax_call': '1',
+                'func_name': 'music.search',
+                'mna': '',
+                'mnb': '',
+                'arg_query': query,
+                'arg_extended': '1',
+                'arg_search_params': json.dumps({
+                    'music': {
+                        'limit': limit,
+                        'offset': offset,
+                    },
+                }),
+                'arg_limit': limit,
+                'arg_offset': offset,
+            })
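+        # The ajax endpoint returns a JSON array mixing status strings with a
+        # payload object; the first dict element carries the search results.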
+        return next(e for e in search if isinstance(e, dict))
+
+    @staticmethod
+    def _extract_track(t, fatal=True):
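+        # With fatal=True a missing URL or File raises a KeyError; with
+        # fatal=False the track is skipped silently (used for search results).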
+        audio_url = t['URL'] if fatal else t.get('URL')
+        if not audio_url:
+            return
+
+        audio_id = t['File'] if fatal else t.get('File')
+        if not audio_id:
+            return
+
+        thumbnail = t.get('AlbumCoverURL') or t.get('FiledAlbumCover')
+        uploader = t.get('OwnerName') or t.get('OwnerName_Text_HTML')
+        uploader_id = t.get('UploaderID')
+        duration = int_or_none(t.get('DurationInSeconds')) or parse_duration(
+            t.get('Duration') or t.get('DurationStr'))
+        view_count = int_or_none(t.get('PlayCount') or t.get('PlayCount_hr'))
+
+        track = t.get('Name') or t.get('Name_Text_HTML')
+        artist = t.get('Author') or t.get('Author_Text_HTML')
+
+        if track:
+            title = '%s - %s' % (artist, track) if artist else track
+        else:
+            title = audio_id
+
+        return {
+            'extractor_key': MailRuMusicIE.ie_key(),
+            'id': audio_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': view_count,
+            'vcodec': 'none',
+            'abr': int_or_none(t.get('BitRate')),
+            'track': track,
+            'artist': artist,
+            'album': t.get('Album'),
+            'url': audio_url,
+        }
+
+
+class MailRuMusicIE(MailRuMusicSearchBaseIE):
+    IE_NAME = 'mailru:music'
+    IE_DESC = 'Музыка@Mail.Ru'
+    _VALID_URL = r'https?://my\.mail\.ru/+music/+songs/+[^/?#&]+-(?P<id>[\da-f]+)'
+    _TESTS = [{
+        'url': 'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893',
+        'md5': '0f8c22ef8c5d665b13ac709e63025610',
+        'info_dict': {
+            'id': '4e31f7125d0dfaef505d947642366893',
+            'ext': 'mp3',
+            'title': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017 - М8Л8ТХ',
+            'uploader': 'Игорь Мудрый',
+            'uploader_id': '1459196328',
+            'duration': 280,
+            'view_count': int,
+            'vcodec': 'none',
+            'abr': 320,
+            'track': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017',
+            'artist': 'М8Л8ТХ',
+        },
+    }]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, audio_id)
+
+        title = self._og_search_title(webpage)
+        music_data = self._search(title, url, audio_id)['MusicData']
+        t = next(t for t in music_data if t.get('File') == audio_id)
+
+        info = self._extract_track(t)
+        info['title'] = title
+        return info
+
+
+class MailRuMusicSearchIE(MailRuMusicSearchBaseIE):
+    IE_NAME = 'mailru:music:search'
+    IE_DESC = 'Музыка@Mail.Ru'
+    _VALID_URL = r'https?://my\.mail\.ru/+music/+search/+(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://my.mail.ru/music/search/black%20shadow',
+        'info_dict': {
+            'id': 'black shadow',
+        },
+        'playlist_mincount': 532,
+    }]
+
+    def _real_extract(self, url):
+        query = compat_urllib_parse_unquote(self._match_id(url))
+
+        entries = []
+
+        LIMIT = 100
+        offset = 0
+
+        for _ in itertools.count(1):
+            search = self._search(query, url, query, LIMIT, offset)
+
+            music_data = search.get('MusicData')
+            if not music_data or not isinstance(music_data, list):
+                break
+
+            for t in music_data:
+                track = self._extract_track(t, fatal=False)
+                if track:
+                    entries.append(track)
+
+            total = try_get(
+                search, lambda x: x['Results']['music']['Total'], int)
+
+            if total is not None and offset > total:
+                break
+
+            offset += LIMIT
+
+        return self.playlist_result(entries, query)
diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py
new file mode 100644 (file)
index 0000000..6f4fd92
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import merge_dicts
+
+
+class MallTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
+        'md5': '1c4a37f080e1f3023103a7b43458e518',
+        'info_dict': {
+            'id': 't0zzt0',
+            'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
+            'ext': 'mp4',
+            'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
+            'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
+            'duration': 216,
+            'timestamp': 1538870400,
+            'upload_date': '20181007',
+            'view_count': int,
+        }
+    }, {
+        'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
+        'only_matching': True,
+    }, {
+        'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, display_id, headers=self.geo_verification_headers())
+
+        SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
+        video_id = self._search_regex(
+            SOURCE_RE, webpage, 'video id', group='id')
+
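+        # The page embeds <source src=".../<id>/index"> elements; rewriting each
+        # src to ".../index.m3u8" lets _parse_html5_media_entries pick them up
+        # as HLS playlists.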
+        media = self._parse_html5_media_entries(
+            url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
+            m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
+
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        return merge_dicts(media, info, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': self._og_search_title(webpage, default=None) or display_id,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+        })
diff --git a/youtube_dl/extractor/mangomolo.py b/youtube_dl/extractor/mangomolo.py
new file mode 100644 (file)
index 0000000..acee370
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_unquote,
+)
+from ..utils import int_or_none
+
+
+class MangomoloBaseIE(InfoExtractor):
+    _BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)'
+
+    def _get_real_id(self, page_id):
+        return page_id
+
+    def _real_extract(self, url):
+        page_id = self._get_real_id(self._match_id(url))
+        webpage = self._download_webpage(
+            'https://player.mangomolo.com/v1/%s?%s' % (self._TYPE, url.split('?')[1]), page_id)
+        hidden_inputs = self._hidden_inputs(webpage)
+        m3u8_entry_protocol = 'm3u8' if self._IS_LIVE else 'm3u8_native'
+
+        format_url = self._html_search_regex(
+            [
+                r'(?:file|src)\s*:\s*"(https?://[^"]+?/playlist\.m3u8)',
+                r'<a[^>]+href="(rtsp://[^"]+)"'
+            ], webpage, 'format url')
+        formats = self._extract_wowza_formats(
+            format_url, page_id, m3u8_entry_protocol, ['smil'])
+        self._sort_formats(formats)
+
+        return {
+            'id': page_id,
+            'title': self._live_title(page_id) if self._IS_LIVE else page_id,
+            'uploader_id': hidden_inputs.get('userid'),
+            'duration': int_or_none(hidden_inputs.get('duration')),
+            'is_live': self._IS_LIVE,
+            'formats': formats,
+        }
+
+
+class MangomoloVideoIE(MangomoloBaseIE):
+    _TYPE = 'video'
+    IE_NAME = 'mangomolo:' + _TYPE
+    _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)'
+    _IS_LIVE = False
+
+
+class MangomoloLiveIE(MangomoloBaseIE):
+    _TYPE = 'live'
+    IE_NAME = 'mangomolo:' + _TYPE
+    _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)'
+    _IS_LIVE = True
+
+    def _get_real_id(self, page_id):
+        return compat_b64decode(compat_urllib_parse_unquote(page_id)).decode()
diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py
new file mode 100644 (file)
index 0000000..e8d7163
--- /dev/null
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    str_to_int,
+    urlencode_postdata,
+)
+
+
+class ManyVidsIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)'
+    _TESTS = [{
+        # preview video
+        'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/',
+        'md5': '03f11bb21c52dd12a05be21a5c7dcc97',
+        'info_dict': {
+            'id': '133957',
+            'ext': 'mp4',
+            'title': 'everthing about me (Preview)',
+            'view_count': int,
+            'like_count': int,
+        },
+    }, {
+        # full video
+        'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/',
+        'md5': 'f3e8f7086409e9b470e2643edb96bdcc',
+        'info_dict': {
+            'id': '935718',
+            'ext': 'mp4',
+            'title': 'MY FACE REVEAL',
+            'view_count': int,
+            'like_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'data-(?:video-filepath|meta-video)\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'video URL', group='url')
+
+        title = self._html_search_regex(
+            (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)',
+             r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'),
+            webpage, 'title', default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title', fatal=True)
+
+        if any(p in webpage for p in ('preview_videos', '_preview.mp4')):
+            title += ' (Preview)'
+
+        mv_token = self._search_regex(
+            r'data-mvtoken=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+            'mv token', default=None, group='value')
+
+        if mv_token:
+            # Sets some cookies
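+            # (the you_had_me_at_hello.php call registers the view and appears
+            # to set session cookies needed for the stream URL to resolve)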
+            self._download_webpage(
+                'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php',
+                video_id, fatal=False, data=urlencode_postdata({
+                    'mvtoken': mv_token,
+                    'vid': video_id,
+                }), headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest'
+                })
+
+        if determine_ext(video_url) == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        else:
+            formats = [{'url': video_url}]
+
+        like_count = int_or_none(self._search_regex(
+            r'data-likes=["\'](\d+)', webpage, 'like count', default=None))
+        view_count = str_to_int(self._html_search_regex(
+            r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage,
+            'view count', default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'view_count': view_count,
+            'like_count': like_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/markiza.py b/youtube_dl/extractor/markiza.py
new file mode 100644 (file)
index 0000000..def960a
--- /dev/null
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    orderedSet,
+    parse_duration,
+    try_get,
+)
+
+
+class MarkizaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P<id>\d+)(?:[_/]|$)'
+    _TESTS = [{
+        'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109',
+        'md5': 'ada4e9fad038abeed971843aa028c7b0',
+        'info_dict': {
+            'id': '139078',
+            'ext': 'mp4',
+            'title': 'Oteckovia 109',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2760,
+        },
+    }, {
+        'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny',
+        'info_dict': {
+            'id': '85430',
+            'title': 'Televízne noviny',
+        },
+        'playlist_count': 23,
+    }, {
+        'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723',
+        'only_matching': True,
+    }, {
+        'url': 'http://videoarchiv.markiza.sk/video/84723',
+        'only_matching': True,
+    }, {
+        'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak',
+        'only_matching': True,
+    }, {
+        'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky',
+        'only_matching': True,
+    }, {
+        'url': 'http://videoarchiv.markiza.sk/embed/85295',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data = self._download_json(
+            'http://videoarchiv.markiza.sk/json/video_jwplayer7.json',
+            video_id, query={'id': video_id})
+
+        info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash')
+
+        if info.get('_type') == 'playlist':
+            info.update({
+                'id': video_id,
+                'title': try_get(
+                    data, lambda x: x['details']['name'], compat_str),
+            })
+        else:
+            info['duration'] = parse_duration(
+                try_get(data, lambda x: x['details']['duration'], compat_str))
+        return info
+
+
+class MarkizaPageIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P<id>\d+)_'
+    _TESTS = [{
+        'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni',
+        'md5': 'ada4e9fad038abeed971843aa028c7b0',
+        'info_dict': {
+            'id': '139355',
+            'ext': 'mp4',
+            'title': 'Oteckovia 110',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2604,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas',
+        'only_matching': True,
+    }, {
+        'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach',
+        'only_matching': True,
+    }, {
+        'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili',
+        'only_matching': True,
+    }, {
+        'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
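+        # Direct videoarchiv.markiza.sk links are handled by MarkizaIE.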
+        return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            # Downloading for some hosts (e.g. dajto, doma) fails with 500
+            # although everything seems to be OK, so considering 500
+            # status code to be expected.
+            url, playlist_id, expected_status=500)
+
+        entries = [
+            self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id)
+            for video_id in orderedSet(re.findall(
+                r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)',
+                webpage))]
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/massengeschmacktv.py b/youtube_dl/extractor/massengeschmacktv.py
new file mode 100644 (file)
index 0000000..cfcc6b2
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    parse_filesize,
+)
+
+
+class MassengeschmackTVIE(InfoExtractor):
+    IE_NAME = 'massengeschmack.tv'
+    _VALID_URL = r'https?://(?:www\.)?massengeschmack\.tv/play/(?P<id>[^?&#]+)'
+
+    _TEST = {
+        'url': 'https://massengeschmack.tv/play/fktv202',
+        'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3',
+        'info_dict': {
+            'id': 'fktv202',
+            'ext': 'mp4',
+            'title': 'Fernsehkritik-TV - Folge 202',
+        },
+    }
+
+    def _real_extract(self, url):
+        episode = self._match_id(url)
+
+        webpage = self._download_webpage(url, episode)
+        title = clean_html(self._html_search_regex(
+            '<h3>([^<]+)</h3>', webpage, 'title'))
+        thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False)
+        sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json)
+
+        formats = []
+        for source in sources:
+            furl = source.get('src')
+            if not furl:
+                continue
+            furl = self._proto_relative_url(furl)
+            ext = determine_ext(furl) or mimetype2ext(source.get('type'))
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    furl, episode, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': furl,
+                    'format_id': determine_ext(furl),
+                })
+
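+        # Besides the MEDIA playlist, the page lists direct download links with
+        # resolution and file size; scrape those as additional formats.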
+        for (durl, format_id, width, height, filesize) in re.findall(r'''(?x)
+                                   <a[^>]+?href="(?P<url>(?:https:)?//[^"]+)".*?
+                                   <strong>(?P<format_id>.+?)</strong>.*?
+                                   <small>(?:(?P<width>\d+)x(?P<height>\d+))?\s+?\((?P<filesize>[\d,]+\s*[GM]iB)\)</small>
+                                ''', webpage):
+            formats.append({
+                'url': durl,
+                'format_id': format_id,
+                'width': int_or_none(width),
+                'height': int_or_none(height),
+                'filesize': parse_filesize(filesize),
+                'vcodec': 'none' if format_id.startswith('Audio') else None,
+            })
+
+        self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr'))
+
+        return {
+            'id': episode,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py
new file mode 100644 (file)
index 0000000..bc9933a
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+
+from .common import InfoExtractor
+from ..utils import xpath_text
+
+
+class MatchTVIE(InfoExtractor):
+    _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)'
+    _TESTS = [{
+        'url': 'http://matchtv.ru/#live-player',
+        'info_dict': {
+            'id': 'matchtv-live',
+            'ext': 'flv',
+            'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://matchtv.ru/on-air/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = 'matchtv-live'
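+        # The SMIL endpoint expects a session identifier but appears to accept
+        # any random value.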
+        video_url = self._download_json(
+            'http://player.matchtv.ntvplus.tv/player/smil', video_id,
+            query={
+                'ts': '',
+                'quality': 'SD',
+                'contentId': '561d2c0df7159b37178b4567',
+                'sign': '',
+                'includeHighlights': '0',
+                'userId': '',
+                'sessionId': random.randint(1, 1000000000),
+                'contentType': 'channel',
+                'timeShift': '0',
+                'platform': 'portal',
+            },
+            headers={
+                'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf',
+            })['data']['videoUrl']
+        f4m_url = xpath_text(self._download_xml(video_url, video_id), './to')
+        formats = self._extract_f4m_formats(f4m_url, video_id)
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': self._live_title('Матч ТВ - Прямой эфир'),
+            'is_live': True,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
new file mode 100644 (file)
index 0000000..322e5b4
--- /dev/null
@@ -0,0 +1,184 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    xpath_text,
+)
+
+
+class MDRIE(InfoExtractor):
+    IE_DESC = 'MDR.DE and KiKA'
+    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html'
+
+    _TESTS = [{
+        # MDR regularly deletes its videos
+        'url': 'http://www.mdr.de/fakt/video189002.html',
+        'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html',
+        'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa',
+        'info_dict': {
+            'id': '1312272',
+            'ext': 'mp3',
+            'title': 'Feuilleton vom 30. Oktober 2015',
+            'duration': 250,
+            'uploader': 'MITTELDEUTSCHER RUNDFUNK',
+        },
+        'skip': '404 not found',
+    }, {
+        'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
+        'md5': '4930515e36b06c111213e80d1e4aad0e',
+        'info_dict': {
+            'id': '19636',
+            'ext': 'mp4',
+            'title': 'Baumhaus vom 30. Oktober 2015',
+            'duration': 134,
+            'uploader': 'KIKA',
+        },
+        'skip': '404 not found',
+    }, {
+        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
+        'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
+        'info_dict': {
+            'id': '8182',
+            'ext': 'mp4',
+            'title': 'Beutolomäus und der geheime Weihnachtswunsch',
+            'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
+            'timestamp': 1482541200,
+            'upload_date': '20161224',
+            'duration': 4628,
+            'uploader': 'KIKA',
+        },
+    }, {
+        # audio with alternative playerURL pattern
+        'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html',
+        'info_dict': {
+            'id': '100',
+            'ext': 'mp4',
+            'title': 'Feature: Operation Mindfuck - Robert Anton Wilson',
+            'duration': 3239,
+            'uploader': 'MITTELDEUTSCHER RUNDFUNK',
+        },
+    }, {
+        'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        data_url = self._search_regex(
+            r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>.+?-avCustom\.xml)\1',
+            webpage, 'data url', group='url').replace(r'\/', '/')
+
+        doc = self._download_xml(
+            compat_urlparse.urljoin(url, data_url), video_id)
+
+        title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True)
+
+        formats = []
+        processed_urls = []
+        for asset in doc.findall('./assets/asset'):
+            for source in (
+                    'progressiveDownload',
+                    'dynamicHttpStreamingRedirector',
+                    'adaptiveHttpStreamingRedirector'):
+                url_el = asset.find('./%sUrl' % source)
+                if url_el is None:
+                    continue
+
+                video_url = url_el.text
+                if video_url in processed_urls:
+                    continue
+
+                processed_urls.append(video_url)
+
+                vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
+                abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
+
+                ext = determine_ext(url_el.text)
+                if ext == 'm3u8':
+                    url_formats = self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        preference=0, m3u8_id='HLS', fatal=False)
+                elif ext == 'f4m':
+                    url_formats = self._extract_f4m_formats(
+                        video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
+                        preference=0, f4m_id='HDS', fatal=False)
+                else:
+                    media_type = xpath_text(asset, './mediaType', 'media type', default='MP4')
+                    filesize = int_or_none(xpath_text(asset, './fileSize', 'file size'))
+
+                    f = {
+                        'url': video_url,
+                        'format_id': '%s-%d' % (media_type, vbr or abr),
+                        'filesize': filesize,
+                        'abr': abr,
+                        'preference': 1,
+                    }
+
+                    if vbr:
+                        width = int_or_none(xpath_text(asset, './frameWidth', 'width'))
+                        height = int_or_none(xpath_text(asset, './frameHeight', 'height'))
+                        f.update({
+                            'vbr': vbr,
+                            'width': width,
+                            'height': height,
+                        })
+
+                    url_formats = [f]
+
+                if not url_formats:
+                    continue
+
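+                # Assets without a video bitrate are audio-only: reuse the total
+                # bitrate as the audio bitrate and mark the video codec absent.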
+                if not vbr:
+                    for f in url_formats:
+                        abr = f.get('tbr') or abr
+                        if 'tbr' in f:
+                            del f['tbr']
+                        f.update({
+                            'abr': abr,
+                            'vcodec': 'none',
+                        })
+
+                formats.extend(url_formats)
+
+        self._sort_formats(formats)
+
+        description = xpath_text(doc, './broadcast/broadcastDescription', 'description')
+        timestamp = parse_iso8601(
+            xpath_text(
+                doc, [
+                    './broadcast/broadcastDate',
+                    './broadcast/broadcastStartDate',
+                    './broadcast/broadcastEndDate'],
+                'timestamp', default=None))
+        duration = parse_duration(xpath_text(doc, './duration', 'duration'))
+        uploader = xpath_text(doc, './rights', 'uploader')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'duration': duration,
+            'uploader': uploader,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py
new file mode 100644 (file)
index 0000000..50d5db8
--- /dev/null
@@ -0,0 +1,269 @@
+from __future__ import unicode_literals
+
+import re
+
+from .gigya import GigyaBaseIE
+
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    try_get,
+    unified_timestamp,
+)
+
+
+class MedialaanIE(GigyaBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.|nieuws\.)?
+                        (?:
+                            (?P<site_id>vtm|q2|vtmkzoom)\.be/
+                            (?:
+                                video(?:/[^/]+/id/|/?\?.*?\baid=)|
+                                (?:[^/]+/)*
+                            )
+                        )
+                        (?P<id>[^/?#&]+)
+                    '''
+    _NETRC_MACHINE = 'medialaan'
+    _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-'
+    _SITE_TO_APP_ID = {
+        'vtm': 'vtm_watch',
+        'q2': 'q2',
+        'vtmkzoom': 'vtmkzoom',
+    }
+    _TESTS = [{
+        # vod
+        'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch',
+        'info_dict': {
+            'id': 'vtm_20170219_VM0678361_vtmwatch',
+            'ext': 'mp4',
+            'title': 'Allemaal Chris afl. 6',
+            'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2',
+            'timestamp': 1487533280,
+            'upload_date': '20170219',
+            'duration': 2562,
+            'series': 'Allemaal Chris',
+            'season': 'Allemaal Chris',
+            'season_number': 1,
+            'season_id': '256936078124527',
+            'episode': 'Allemaal Chris afl. 6',
+            'episode_number': 6,
+            'episode_id': '256936078591527',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        # clip
+        'url': 'http://vtm.be/video?aid=168332',
+        'info_dict': {
+            'id': '168332',
+            'ext': 'mp4',
+            'title': '"Veronique liegt!"',
+            'description': 'md5:1385e2b743923afe54ba4adc38476155',
+            'timestamp': 1489002029,
+            'upload_date': '20170308',
+            'duration': 96,
+        },
+    }, {
+        # vod
+        'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000',
+        'only_matching': True,
+    }, {
+        # vod
+        'url': 'http://vtm.be/video?aid=163157',
+        'only_matching': True,
+    }, {
+        # vod
+        'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2',
+        'only_matching': True,
+    }, {
+        # clip
+        'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
+        'only_matching': True,
+    }, {
+        # http/s redirect
+        'url': 'https://vtmkzoom.be/video?aid=45724',
+        'info_dict': {
+            'id': '257136373657000',
+            'ext': 'mp4',
+            'title': 'K3 Dansstudio Ushuaia afl.6',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        # nieuws.vtm.be
+        'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
+        self._logged_in = False
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            self.raise_login_required()
+
+        auth_data = {
+            'APIKey': self._APIKEY,
+            'sdk': 'js_6.1',
+            'format': 'json',
+            'loginID': username,
+            'password': password,
+        }
+
+        auth_info = self._gigya_login(auth_data)
+
+        self._uid = auth_info['UID']
+        self._uid_signature = auth_info['UIDSignature']
+        self._signature_timestamp = auth_info['signatureTimestamp']
+
+        self._logged_in = True
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, site_id = mobj.group('id', 'site_id')
+
+        webpage = self._download_webpage(url, video_id)
+
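+        # videoJSConfig is embedded as an escaped JSON string literal; undo the
+        # backslash escaping before parsing it.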
+        config = self._parse_json(
+            self._search_regex(
+                r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);',
+                webpage, 'config', default='{}'), video_id,
+            transform_source=lambda s: s.replace(
+                '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'"))
+
+        vod_id = config.get('vodId') or self._search_regex(
+            (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"',
+             r'"vodId"\s*:\s*"(.+?)"',
+             r'<[^>]+id=["\']vod-(\d+)'),
+            webpage, 'video_id', default=None)
+
+        # clip, no authentication required
+        if not vod_id:
+            player = self._parse_json(
+                self._search_regex(
+                    r'vmmaplayer\(({.+?})\);', webpage, 'vmma player',
+                    default=''),
+                video_id, transform_source=lambda s: '[%s]' % s, fatal=False)
+            if player:
+                video = player[-1]
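+                # A bare scheme in videoUrl marks an http/s redirect entry whose
+                # real target is in video['url'] (see the vtmkzoom test above).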
+                if video['videoUrl'] in ('http', 'https'):
+                    return self.url_result(video['url'], MedialaanIE.ie_key())
+                info = {
+                    'id': video_id,
+                    'url': video['videoUrl'],
+                    'title': video['title'],
+                    'thumbnail': video.get('imageUrl'),
+                    'timestamp': int_or_none(video.get('createdDate')),
+                    'duration': int_or_none(video.get('duration')),
+                }
+            else:
+                info = self._parse_html5_media_entries(
+                    url, webpage, video_id, m3u8_id='hls')[0]
+                info.update({
+                    'id': video_id,
+                    'title': self._html_search_meta('description', webpage),
+                    'duration': parse_duration(self._html_search_meta('duration', webpage)),
+                })
+        # vod, authentication required
+        else:
+            if not self._logged_in:
+                self._login()
+
+            settings = self._parse_json(
+                self._search_regex(
+                    r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+                    webpage, 'drupal settings', default='{}'),
+                video_id)
+
+            def get(container, item):
+                return try_get(
+                    settings, lambda x: x[container][item],
+                    compat_str) or self._search_regex(
+                    r'"%s"\s*:\s*"([^"]+)' % item, webpage, item,
+                    default=None)
+
+            app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch')
+            sso = get('vod', 'gigyaDatabase') or 'vtm-sso'
+
+            data = self._download_json(
+                'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id,
+                video_id, query={
+                    'app_id': app_id,
+                    'user_network': sso,
+                    'UID': self._uid,
+                    'UIDSignature': self._uid_signature,
+                    'signatureTimestamp': self._signature_timestamp,
+                })
+
+            formats = self._extract_m3u8_formats(
+                data['response']['uri'], video_id, entry_protocol='m3u8_native',
+                ext='mp4', m3u8_id='hls')
+
+            self._sort_formats(formats)
+
+            info = {
+                'id': vod_id,
+                'formats': formats,
+            }
+
+            api_key = get('vod', 'apiKey')
+            channel = get('medialaanGigya', 'channel')
+
+            if api_key:
+                videos = self._download_json(
+                    'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False,
+                    query={
+                        'channels': channel,
+                        'ids': vod_id,
+                        'limit': 1,
+                        'apikey': api_key,
+                    })
+                if videos:
+                    video = try_get(
+                        videos, lambda x: x['response']['videos'][0], dict)
+                    if video:
+                        def get(container, item, expected_type=None):
+                            return try_get(
+                                video, lambda x: x[container][item], expected_type)
+
+                        def get_string(container, item):
+                            return get(container, item, compat_str)
+
+                        info.update({
+                            'series': get_string('program', 'title'),
+                            'season': get_string('season', 'title'),
+                            'season_number': int_or_none(get('season', 'number')),
+                            'season_id': get_string('season', 'id'),
+                            'episode': get_string('episode', 'title'),
+                            'episode_number': int_or_none(get('episode', 'number')),
+                            'episode_id': get_string('episode', 'id'),
+                            'duration': int_or_none(
+                                video.get('duration')) or int_or_none(
+                                video.get('durationMillis'), scale=1000),
+                            'title': get_string('episode', 'title'),
+                            'description': get_string('episode', 'text'),
+                            'timestamp': unified_timestamp(get_string(
+                                'publication', 'begin')),
+                        })
+
+            if not info.get('title'):
+                info['title'] = try_get(
+                    config, lambda x: x['videoConfig']['title'],
+                    compat_str) or self._html_search_regex(
+                    r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title',
+                    default=None) or self._og_search_title(webpage)
+
+        if not info.get('description'):
+            info['description'] = self._html_search_regex(
+                r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>',
+                webpage, 'description', default=None)
+
+        return info
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
new file mode 100644 (file)
index 0000000..933df14
--- /dev/null
@@ -0,0 +1,179 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .theplatform import ThePlatformBaseIE
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    update_url_query,
+)
+
+
+class MediasetIE(ThePlatformBaseIE):
+    _TP_TLD = 'eu'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        mediaset:|
+                        https?://
+                            (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
+                            (?:
+                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
+                                player/index\.html\?.*?\bprogramGuid=
+                            )
+                    )(?P<id>[0-9A-Z]{16,})
+                    '''
+    _TESTS = [{
+        # full episode
+        'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824',
+        'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
+        'info_dict': {
+            'id': 'FAFU000000661824',
+            'ext': 'mp4',
+            'title': 'Quarta puntata',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1414.26,
+            'upload_date': '20161107',
+            'series': 'Hello Goodbye',
+            'timestamp': 1478532900,
+            'uploader': 'Rete 4',
+            'uploader_id': 'R4',
+        },
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
+        'md5': '288532f0ad18307705b01e581304cd7b',
+        'info_dict': {
+            'id': 'F309013801000501',
+            'ext': 'mp4',
+            'title': 'Puntata del 25 maggio',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 6565.007,
+            'upload_date': '20180526',
+            'series': 'Matrix',
+            'timestamp': 1527326245,
+            'uploader': 'Canale 5',
+            'uploader_id': 'C5',
+        },
+    }, {
+        # clip
+        'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
+        'only_matching': True,
+    }, {
+        # iframe simple
+        'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
+        'only_matching': True,
+    }, {
+        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
+        'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
+        'only_matching': True,
+    }, {
+        'url': 'mediaset:FAFU000000665924',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(ie, webpage):
+        def _qs(url):
+            return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+
+        def _program_guid(qs):
+            return qs.get('programGuid', [None])[0]
+
+        entries = []
+        for mobj in re.finditer(
+                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1',
+                webpage):
+            embed_url = mobj.group('url')
+            embed_qs = _qs(embed_url)
+            program_guid = _program_guid(embed_qs)
+            if program_guid:
+                entries.append(embed_url)
+                continue
+            video_id = embed_qs.get('id', [None])[0]
+            if not video_id:
+                continue
+            urlh = ie._request_webpage(
+                embed_url, video_id, note='Following embed URL redirect')
+            embed_url = urlh.geturl()
+            program_guid = _program_guid(_qs(embed_url))
+            if program_guid:
+                entries.append(embed_url)
+        return entries
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
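+        # Rewrite vod05t hostnames to vod05 and drop the query string; the
+        # original URLs appear to point at DRM-protected variants of the MPD.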
+        for video in smil.findall(self._xpath_ns('.//video', namespace)):
+            video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?\.mpd)\?.+', r'\1\2', video.attrib['src'])
+        return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)
+
+    def _real_extract(self, url):
+        guid = self._match_id(url)
+        tp_path = 'PR1GhC/media/guid/2702976343/' + guid
+        info = self._extract_theplatform_metadata(tp_path, guid)
+
+        formats = []
+        subtitles = {}
+        first_e = None
+        for asset_type in ('SD', 'HD'):
+            # TODO: fixup ISM+none manifest URLs
+            for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
+                try:
+                    tp_formats, tp_subtitles = self._extract_theplatform_smil(
+                        update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
+                            'mbr': 'true',
+                            'formats': f,
+                            'assetTypes': asset_type,
+                        }), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type))
+                except ExtractorError as e:
+                    if not first_e:
+                        first_e = e
+                    break
+                for tp_f in tp_formats:
+                    tp_f['quality'] = 1 if asset_type == 'HD' else 0
+                formats.extend(tp_formats)
+                subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+        if first_e and not formats:
+            raise first_e
+        self._sort_formats(formats)
+
+        fields = []
+        for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
+            fields.extend(templ % repl for repl in repls)
+        feed_data = self._download_json(
+            'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid,
+            guid, fatal=False, query={'fields': ','.join(fields)})
+        if feed_data:
+            publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
+            info.update({
+                'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
+                'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
+                'series': feed_data.get('mediasetprogram$brandTitle'),
+                'uploader': publish_info.get('description'),
+                'uploader_id': publish_info.get('channel'),
+                'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
+            })
+
+        info.update({
+            'id': guid,
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+        return info
diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py
new file mode 100644 (file)
index 0000000..d6eb157
--- /dev/null
@@ -0,0 +1,366 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    mimetype2ext,
+    str_or_none,
+    try_get,
+    unescapeHTML,
+    unsmuggle_url,
+    url_or_none,
+    urljoin,
+)
+
+
+_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})'
+
+
+class MediasiteIE(InfoExtractor):
+    _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE
+    _TESTS = [
+        {
+            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
+            'info_dict': {
+                'id': '2db6c271681e4f199af3c60d1f82869b1d',
+                'ext': 'mp4',
+                'title': 'Lecture: Tuesday, September 20, 2016 - Sir Andrew Wiles',
+                'description': 'Sir Andrew Wiles: “Equations in arithmetic”\\n\\nI will describe some of the interactions between modern number theory and the problem of solving equations in rational numbers or integers\\u0027.',
+                'timestamp': 1474268400.0,
+                'upload_date': '20160919',
+            },
+        },
+        {
+            'url': 'http://mediasite.uib.no/Mediasite/Play/90bb363295d945d6b548c867d01181361d?catalog=a452b7df-9ae1-46b7-a3ba-aceeb285f3eb',
+            'info_dict': {
+                'id': '90bb363295d945d6b548c867d01181361d',
+                'ext': 'mp4',
+                'upload_date': '20150429',
+                'title': '5) IT-forum 2015-Dag 1  - Dungbeetle -  How and why Rain created a tiny bug tracker for Unity',
+                'timestamp': 1430311380.0,
+            },
+        },
+        {
+            'url': 'https://collegerama.tudelft.nl/Mediasite/Play/585a43626e544bdd97aeb71a0ec907a01d',
+            'md5': '481fda1c11f67588c0d9d8fbdced4e39',
+            'info_dict': {
+                'id': '585a43626e544bdd97aeb71a0ec907a01d',
+                'ext': 'mp4',
+                'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
+                'description': '',
+                'thumbnail': r're:^https?://.*\.jpg(?:\?.*)?$',
+                'duration': 7713.088,
+                'timestamp': 1413309600,
+                'upload_date': '20141014',
+            },
+        },
+        {
+            'url': 'https://collegerama.tudelft.nl/Mediasite/Play/86a9ea9f53e149079fbdb4202b521ed21d?catalog=fd32fd35-6c99-466c-89d4-cd3c431bc8a4',
+            'md5': 'ef1fdded95bdf19b12c5999949419c92',
+            'info_dict': {
+                'id': '86a9ea9f53e149079fbdb4202b521ed21d',
+                'ext': 'wmv',
+                'title': '64ste Vakantiecursus: Afvalwater',
+                'description': 'md5:7fd774865cc69d972f542b157c328305',
+                'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
+                'duration': 10853,
+                'timestamp': 1326446400,
+                'upload_date': '20120113',
+            },
+        },
+        {
+            'url': 'http://digitalops.sandia.gov/Mediasite/Play/24aace4429fc450fb5b38cdbf424a66e1d',
+            'md5': '9422edc9b9a60151727e4b6d8bef393d',
+            'info_dict': {
+                'id': '24aace4429fc450fb5b38cdbf424a66e1d',
+                'ext': 'mp4',
+                'title': 'Xyce Software Training - Section 1',
+                'description': r're:(?s)SAND Number: SAND 2013-7800.{200,}',
+                'upload_date': '20120409',
+                'timestamp': 1333983600,
+                'duration': 7794,
+            }
+        },
+        {
+            'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d',
+            'only_matching': True,
+        },
+        {
+            # dashed id
+            'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d',
+            'only_matching': True,
+        }
+    ]
+
+    # look in Mediasite.Core.js (Mediasite.ContentStreamType[*])
+    _STREAM_TYPES = {
+        0: 'video1',  # the main video
+        2: 'slide',
+        3: 'presentation',
+        4: 'video2',  # screencast?
+        5: 'video3',
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            unescapeHTML(mobj.group('url'))
+            for mobj in re.finditer(
+                r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE,
+                webpage)]
+
+    def _real_extract(self, url):
+        url, data = unsmuggle_url(url, {})
+        mobj = re.match(self._VALID_URL, url)
+        resource_id = mobj.group('id')
+        query = mobj.group('query')
+
+        webpage, urlh = self._download_webpage_handle(url, resource_id)  # XXX: add UrlReferrer?
+        redirect_url = urlh.geturl()
+
+        # XXX: might have also extracted UrlReferrer and QueryString from the html
+        service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
+            r'<div[^>]+\bid=["\']ServicePath[^>]+>(.+?)</div>', webpage, resource_id,
+            default='/Mediasite/PlayerService/PlayerService.svc/json'))
+
+        player_options = self._download_json(
+            '%s/GetPlayerOptions' % service_path, resource_id,
+            headers={
+                'Content-type': 'application/json; charset=utf-8',
+                'X-Requested-With': 'XMLHttpRequest',
+            },
+            data=json.dumps({
+                'getPlayerOptionsRequest': {
+                    'ResourceId': resource_id,
+                    'QueryString': query,
+                    'UrlReferrer': data.get('UrlReferrer', ''),
+                    'UseScreenReader': False,
+                }
+            }).encode('utf-8'))['d']
+
+        presentation = player_options['Presentation']
+        if presentation is None:
+            raise ExtractorError(
+                'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'],
+                expected=True)
+
+        title = presentation['Title']
+
+        thumbnails = []
+        formats = []
+        for snum, Stream in enumerate(presentation['Streams']):
+            stream_type = Stream.get('StreamType')
+            if stream_type is None:
+                continue
+
+            video_urls = Stream.get('VideoUrls')
+            if not isinstance(video_urls, list):
+                video_urls = []
+
+            stream_id = self._STREAM_TYPES.get(
+                stream_type, 'type%u' % stream_type)
+
+            stream_formats = []
+            for unum, VideoUrl in enumerate(video_urls):
+                video_url = url_or_none(VideoUrl.get('Location'))
+                if not video_url:
+                    continue
+                # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS
+
+                media_type = VideoUrl.get('MediaType')
+                if media_type == 'SS':
+                    stream_formats.extend(self._extract_ism_formats(
+                        video_url, resource_id,
+                        ism_id='%s-%u.%u' % (stream_id, snum, unum),
+                        fatal=False))
+                elif media_type == 'Dash':
+                    stream_formats.extend(self._extract_mpd_formats(
+                        video_url, resource_id,
+                        mpd_id='%s-%u.%u' % (stream_id, snum, unum),
+                        fatal=False))
+                else:
+                    stream_formats.append({
+                        'format_id': '%s-%u.%u' % (stream_id, snum, unum),
+                        'url': video_url,
+                        'ext': mimetype2ext(VideoUrl.get('MimeType')),
+                    })
+
+            # TODO: if Stream['HasSlideContent']:
+            # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum)
+            # from Stream['Slides']
+            # this will require writing a custom downloader...
+
+            # disprefer 'secondary' streams
+            if stream_type != 0:
+                for fmt in stream_formats:
+                    fmt['preference'] = -1
+
+            thumbnail_url = Stream.get('ThumbnailUrl')
+            if thumbnail_url:
+                thumbnails.append({
+                    'id': '%s-%u' % (stream_id, snum),
+                    'url': urljoin(redirect_url, thumbnail_url),
+                    'preference': -1 if stream_type != 0 else 0,
+                })
+            formats.extend(stream_formats)
+
+        self._sort_formats(formats)
+
+        # XXX: Presentation['Presenters']
+        # XXX: Presentation['Transcript']
+
+        return {
+            'id': resource_id,
+            'title': title,
+            'description': presentation.get('Description'),
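+            # Duration and UnixTime are reported in milliseconds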
+            'duration': float_or_none(presentation.get('Duration'), 1000),
+            'timestamp': float_or_none(presentation.get('UnixTime'), 1000),
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
+
+
+class MediasiteCatalogIE(InfoExtractor):
+    _VALID_URL = r'''(?xi)
+                        (?P<url>https?://[^/]+/Mediasite)
+                        /Catalog/Full/
+                        (?P<catalog_id>{0})
+                        (?:
+                            /(?P<current_folder_id>{0})
+                            /(?P<root_dynamic_folder_id>{0})
+                        )?
+                    '''.format(_ID_RE)
+    _TESTS = [{
+        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21',
+        'info_dict': {
+            'id': '631f9e48530d454381549f955d08c75e21',
+            'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically',
+        },
+        'playlist_count': 6,
+        'expected_warnings': ['is not a supported codec'],
+    }, {
+        # with CurrentFolderId and RootDynamicFolderId
+        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521',
+        'info_dict': {
+            'id': '9518c4a6c5cf4993b21cbd53e828a92521',
+            'title': 'IUSM Family and Friends Sessions',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21',
+        'only_matching': True,
+    }, {
+        # no AntiForgeryToken
+        'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21',
+        'only_matching': True,
+    }, {
+        'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521',
+        'only_matching': True,
+    }, {
+        # dashed id
+        'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        mediasite_url = mobj.group('url')
+        catalog_id = mobj.group('catalog_id')
+        current_folder_id = mobj.group('current_folder_id') or catalog_id
+        root_dynamic_folder_id = mobj.group('root_dynamic_folder_id')
+
+        webpage = self._download_webpage(url, catalog_id)
+
+        # AntiForgeryToken is optional (e.g. [1])
+        # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21
+        anti_forgery_token = self._search_regex(
+            r'AntiForgeryToken\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+            webpage, 'anti forgery token', default=None, group='value')
+        if anti_forgery_token:
+            anti_forgery_header = self._search_regex(
+                r'AntiForgeryHeaderName\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+                webpage, 'anti forgery header name',
+                default='X-SOFO-AntiForgeryHeader', group='value')
+
+        data = {
+            'IsViewPage': True,
+            'IsNewFolder': True,
+            'AuthTicket': None,
+            'CatalogId': catalog_id,
+            'CurrentFolderId': current_folder_id,
+            'RootDynamicFolderId': root_dynamic_folder_id,
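+            # fetch up to 1000 presentations at once so a whole folder fits in one page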
+            'ItemsPerPage': 1000,
+            'PageIndex': 0,
+            'PermissionMask': 'Execute',
+            'CatalogSearchType': 'SearchInFolder',
+            'SortBy': 'Date',
+            'SortDirection': 'Descending',
+            'StartDate': None,
+            'EndDate': None,
+            'StatusFilterList': None,
+            'PreviewKey': None,
+            'Tags': [],
+        }
+
+        headers = {
+            'Content-Type': 'application/json; charset=UTF-8',
+            'Referer': url,
+            'X-Requested-With': 'XMLHttpRequest',
+        }
+        if anti_forgery_token:
+            headers[anti_forgery_header] = anti_forgery_token
+
+        catalog = self._download_json(
+            '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url,
+            catalog_id, data=json.dumps(data).encode(), headers=headers)
+
+        entries = []
+        for video in catalog['PresentationDetailsList']:
+            if not isinstance(video, dict):
+                continue
+            video_id = str_or_none(video.get('Id'))
+            if not video_id:
+                continue
+            entries.append(self.url_result(
+                '%s/Play/%s' % (mediasite_url, video_id),
+                ie=MediasiteIE.ie_key(), video_id=video_id))
+
+        title = try_get(
+            catalog, lambda x: x['CurrentFolder']['Name'], compat_str)
+
+        return self.playlist_result(entries, catalog_id, title)
+
+
+class MediasiteNamedCatalogIE(InfoExtractor):
+    _VALID_URL = r'(?xi)(?P<url>https?://[^/]+/Mediasite)/Catalog/catalogs/(?P<catalog_name>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        mediasite_url = mobj.group('url')
+        catalog_name = mobj.group('catalog_name')
+
+        webpage = self._download_webpage(url, catalog_name)
+
+        catalog_id = self._search_regex(
+            r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id')
+
+        return self.url_result(
+            '%s/Catalog/Full/%s' % (mediasite_url, catalog_id),
+            ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id)
diff --git a/youtube_dl/extractor/medici.py b/youtube_dl/extractor/medici.py
new file mode 100644 (file)
index 0000000..cd91023
--- /dev/null
+++ b/youtube_dl/extractor/medici.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+    update_url_query,
+    urlencode_postdata,
+)
+
+
+class MediciIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)'
+    _TEST = {
+        'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp',
+        'md5': '004c21bb0a57248085b6ff3fec72719d',
+        'info_dict': {
+            'id': '3059',
+            'ext': 'flv',
+            'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson',
+            'description': 'md5:322a1e952bafb725174fd8c1a8212f58',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20170408',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Sets csrftoken cookie
+        self._download_webpage(url, video_id)
+
+        MEDICI_URL = 'http://www.medici.tv/'
+
+        data = self._download_json(
+            MEDICI_URL, video_id,
+            data=urlencode_postdata({
+                'json': 'true',
+                'page': '/%s' % video_id,
+                'timezone_offset': -420,
+            }), headers={
+                'X-CSRFToken': self._get_cookies(url)['csrftoken'].value,
+                'X-Requested-With': 'XMLHttpRequest',
+                'Referer': MEDICI_URL,
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
+
+        video = data['video']['videos']['video1']
+
+        title = video.get('nom') or data['title']
+
+        video_id = video.get('id') or video_id
+        formats = self._extract_f4m_formats(
+            update_url_query(video['url_akamai'], {
+                'hdcore': '3.1.0',
+                'plugin=aasp': '3.1.0.43.124',
+            }), video_id, f4m_id='hds')
+
+        description = data.get('meta_description')
+        thumbnail = video.get('url_thumbnail') or data.get('main_image')
+        upload_date = unified_strdate(data['video'].get('date'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py
new file mode 100644 (file)
index 0000000..5bafa6c
--- /dev/null
+++ b/youtube_dl/extractor/megaphone.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class MegaphoneIE(InfoExtractor):
+    IE_NAME = 'megaphone.fm'
+    IE_DESC = 'megaphone.fm embedded players'
+    _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)'
+    _TEST = {
+        'url': 'https://player.megaphone.fm/GLT9749789991?"',
+        'md5': '4816a0de523eb3e972dc0dda2c191f96',
+        'info_dict': {
+            'id': 'GLT9749789991',
+            'ext': 'mp3',
+            'title': '#97 What Kind Of Idiot Gets Phished?',
+            'thumbnail': r're:^https://.*\.png.*$',
+            'duration': 1776.26375,
+            'author': 'Reply All',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_property('audio:title', webpage)
+        author = self._og_search_property('audio:artist', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
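+        # the player page inlines an "episode" JS object holding the media URL and duration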
+        episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON')
+        episode_data = self._parse_json(episode_json, video_id, js_to_json)
+        video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:')
+
+        formats = [{
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'thumbnail': thumbnail,
+            'title': title,
+            'author': author,
+            'duration': episode_data['duration'],
+            'formats': formats,
+        }
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        return [m[0] for m in re.findall(
+            r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]
diff --git a/youtube_dl/extractor/meipai.py b/youtube_dl/extractor/meipai.py
new file mode 100644 (file)
index 0000000..2445b8b
--- /dev/null
+++ b/youtube_dl/extractor/meipai.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    unified_timestamp,
+)
+
+
+class MeipaiIE(InfoExtractor):
+    IE_DESC = '美拍'
+    _VALID_URL = r'https?://(?:www\.)?meipai\.com/media/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # regular uploaded video
+        'url': 'http://www.meipai.com/media/531697625',
+        'md5': 'e3e9600f9e55a302daecc90825854b4f',
+        'info_dict': {
+            'id': '531697625',
+            'ext': 'mp4',
+            'title': '#葉子##阿桑##余姿昀##超級女聲#',
+            'description': '#葉子##阿桑##余姿昀##超級女聲#',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 152,
+            'timestamp': 1465492420,
+            'upload_date': '20160609',
+            'view_count': 35511,
+            'creator': '她她-TATA',
+            'tags': ['葉子', '阿桑', '余姿昀', '超級女聲'],
+        }
+    }, {
+        # record of live streaming
+        'url': 'http://www.meipai.com/media/585526361',
+        'md5': 'ff7d6afdbc6143342408223d4f5fb99a',
+        'info_dict': {
+            'id': '585526361',
+            'ext': 'mp4',
+            'title': '姿昀和善願 練歌練琴啦😁😁😁',
+            'description': '姿昀和善願 練歌練琴啦😁😁😁',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 5975,
+            'timestamp': 1474311799,
+            'upload_date': '20160919',
+            'view_count': 1215,
+            'creator': '她她-TATA',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_regex(
+            r'<title[^>]*>([^<]+)</title>', webpage, 'title')
+
+        formats = []
+
+        # recorded playback of live streaming
+        m3u8_url = self._html_search_regex(
+            r'file:\s*encodeURIComponent\((["\'])(?P<url>(?:(?!\1).)+)\1\)',
+            webpage, 'm3u8 url', group='url', default=None)
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        if not formats:
+            # regular uploaded video
+            video_url = self._search_regex(
+                r'data-video=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'video url',
+                group='url', default=None)
+            if video_url:
+                formats.append({
+                    'url': video_url,
+                    'format_id': 'http',
+                })
+
+        timestamp = unified_timestamp(self._og_search_property(
+            'video:release_date', webpage, 'release date', fatal=False))
+
+        tags = [t for t in self._og_search_property(
+            'video:tag', webpage, 'tags', default='').split(',') if t]
+
+        view_count = int_or_none(self._html_search_meta(
+            'interactionCount', webpage, 'view count'))
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration'))
+        creator = self._og_search_property(
+            'video:director', webpage, 'creator', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'creator': creator,
+            'tags': tags,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/melonvod.py b/youtube_dl/extractor/melonvod.py
new file mode 100644 (file)
index 0000000..bd8cf13
--- /dev/null
+++ b/youtube_dl/extractor/melonvod.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    urljoin,
+)
+
+
+class MelonVODIE(InfoExtractor):
+    _VALID_URL = r'https?://vod\.melon\.com/video/detail2\.html?\?.*?mvId=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://vod.melon.com/video/detail2.htm?mvId=50158734',
+        'info_dict': {
+            'id': '50158734',
+            'ext': 'mp4',
+            'title': "Jessica 'Wonderland' MV Making Film",
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'artist': 'Jessica (제시카)',
+            'upload_date': '20161212',
+            'duration': 203,
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        play_info = self._download_json(
+            'http://vod.melon.com/video/playerInfo.json', video_id,
+            note='Downloading player info JSON', query={'mvId': video_id})
+
+        title = play_info['mvInfo']['MVTITLE']
+
+        info = self._download_json(
+            'http://vod.melon.com/delivery/streamingInfo.json', video_id,
+            note='Downloading streaming info JSON',
+            query={
+                'contsId': video_id,
+                'contsType': 'VIDEO',
+            })
+
+        stream_info = info['streamingInfo']
+
+        formats = self._extract_m3u8_formats(
+            stream_info['encUrl'], video_id, 'mp4', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        artist_list = play_info.get('artistList')
+        artist = None
+        if isinstance(artist_list, list):
+            artist = ', '.join(
+                [a['ARTISTNAMEWEBLIST']
+                 for a in artist_list if a.get('ARTISTNAMEWEBLIST')])
+
+        thumbnail = urljoin(info.get('staticDomain'), stream_info.get('imgPath'))
+
+        duration = int_or_none(stream_info.get('playTime'))
+        upload_date = stream_info.get('mvSvcOpenDt', '')[:8] or None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'artist': artist,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py
new file mode 100644 (file)
index 0000000..cdb46e1
--- /dev/null
+++ b/youtube_dl/extractor/meta.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .pladform import PladformIE
+from ..utils import (
+    unescapeHTML,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class METAIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://video.meta.ua/5502115.video',
+        'md5': '71b6f3ee274bef16f1ab410f7f56b476',
+        'info_dict': {
+            'id': '5502115',
+            'ext': 'mp4',
+            'title': 'Sony Xperia Z camera test [HQ]',
+            'description': 'Xperia Z shoots video in FullHD HDR.',
+            'uploader_id': 'nomobile',
+            'uploader': 'CHЁZA.TV',
+            'upload_date': '20130211',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://video.meta.ua/iframe/5502115',
+        'only_matching': True,
+    }, {
+        # pladform embed
+        'url': 'http://video.meta.ua/7121015.video',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        st_html5 = self._search_regex(
+            r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None)
+
+        if st_html5:
+            # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js
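+            # each 3-character chunk is the hex code of one character; rebuilding
+            # the string as HTML entities lets unescapeHTML decode the JSON below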
+            json_str = ''
+            for i in range(0, len(st_html5), 3):
+                json_str += '&#x0%s;' % st_html5[i:i + 3]
+            uppod_data = self._parse_json(unescapeHTML(json_str), video_id)
+            error = uppod_data.get('customnotfound')
+            if error:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+            video_url = uppod_data['file']
+            info = {
+                'id': video_id,
+                'url': video_url,
+                'title': uppod_data.get('comment') or self._og_search_title(webpage),
+                'description': self._og_search_description(webpage, default=None),
+                'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage),
+                'duration': int_or_none(self._og_search_property(
+                    'video:duration', webpage, default=None)),
+            }
+            if 'youtube.com/' in video_url:
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': 'Youtube',
+                })
+            return info
+
+        pladform_url = PladformIE._extract_url(webpage)
+        if pladform_url:
+            return self.url_result(pladform_url)
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
new file mode 100644 (file)
index 0000000..9e92416
--- /dev/null
+++ b/youtube_dl/extractor/metacafe.py
@@ -0,0 +1,287 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    get_element_by_attribute,
+    mimetype2ext,
+)
+
+
+class MetacafeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
+    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
+    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
+    IE_NAME = 'metacafe'
+    _TESTS = [
+        # Youtube video
+        {
+            'add_ie': ['Youtube'],
+            'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
+            'info_dict': {
+                'id': '_aUehQsCQtM',
+                'ext': 'mp4',
+                'upload_date': '20090102',
+                'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
+                'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
+                'uploader': 'PBS',
+                'uploader_id': 'PBS'
+            }
+        },
+        # Normal metacafe video
+        {
+            'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
+            'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
+            'info_dict': {
+                'id': '11121940',
+                'ext': 'mp4',
+                'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
+                'uploader': 'ign',
+                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+            },
+            'skip': 'Page is temporarily unavailable.',
+        },
+        # metacafe video with family filter
+        {
+            'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/',
+            'md5': 'b06082c5079bbdcde677a6291fbdf376',
+            'info_dict': {
+                'id': '2155630',
+                'ext': 'mp4',
+                'title': 'Adult Art By David Hart 156',
+                'uploader': '63346',
+                'description': 'md5:9afac8fc885252201ad14563694040fc',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # AnyClip video
+        {
+            'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
+            'info_dict': {
+                'id': 'an-dVVXnuY7Jh77J',
+                'ext': 'mp4',
+                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
+                'uploader': 'AnyClip',
+                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
+            },
+        },
+        # age-restricted video
+        {
+            'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
+            'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
+            'info_dict': {
+                'id': '5186653',
+                'ext': 'mp4',
+                'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
+                'uploader': 'Dwayne Pipe',
+                'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
+                'age_limit': 18,
+            },
+        },
+        # cbs video
+        {
+            'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
+            'info_dict': {
+                'id': '8VD4r_Zws8VP',
+                'ext': 'flv',
+                'title': 'Open: This is Face the Nation, February 9',
+                'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
+                'duration': 96,
+                'uploader': 'CBSI-NEW',
+                'upload_date': '20140209',
+                'timestamp': 1391959800,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        # Movieclips.com video
+        {
+            'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
+            'info_dict': {
+                'id': 'mv-Wy7ZU',
+                'ext': 'mp4',
+                'title': 'My Week with Marilyn - Do You Love Me?',
+                'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
+                'uploader': 'movie_trailers',
+                'duration': 176,
+            },
+            'params': {
+                'skip_download': 'requires rtmpdump',
+            }
+        }
+    ]
+
+    def report_disclaimer(self):
+        self.to_screen('Retrieving disclaimer')
+
+    def _real_extract(self, url):
+        # Extract id and simplified title from URL
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
+
+        # the video may come from an external site
+        m_external = re.match(r'^(\w{2})-(.*)$', video_id)
+        if m_external is not None:
+            prefix, ext_id = m_external.groups()
+            # Check if video comes from YouTube
+            if prefix == 'yt':
+                return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
+            # CBS videos use theplatform.com
+            if prefix == 'cb':
+                return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
+
+        headers = {
+            # Disable family filter
+            'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False}))
+        }
+
+        # AnyClip videos require the flashversion cookie so that we get the link
+        # to the mp4 file
+        if video_id.startswith('an-'):
+            headers['Cookie'] += 'flashVersion=0; '
+
+        # Retrieve video webpage to extract further information
+        webpage = self._download_webpage(url, video_id, headers=headers)
+
+        error = get_element_by_attribute(
+            'class', 'notfound-page-title', webpage)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        video_title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+        # Extract URL, uploader and title from webpage
+        self.report_extraction(video_id)
+        video_url = None
+        mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage)
+        if mobj is not None:
+            mediaURL = compat_urllib_parse_unquote(mobj.group(1))
+            video_ext = determine_ext(mediaURL)
+
+            # Extract gdaKey if available
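+            # (when present it is appended to the media URL as the __gda__ parameter)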
+            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
+            if mobj is None:
+                video_url = mediaURL
+            else:
+                gdaKey = mobj.group(1)
+                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
+        if video_url is None:
+            mobj = re.search(r'<video src="([^"]+)"', webpage)
+            if mobj:
+                video_url = mobj.group(1)
+                video_ext = 'mp4'
+        if video_url is None:
+            flashvars = self._search_regex(
+                r' name="flashvars" value="(.*?)"', webpage, 'flashvars',
+                default=None)
+            if flashvars:
+                vardict = compat_parse_qs(flashvars)
+                if 'mediaData' not in vardict:
+                    raise ExtractorError('Unable to extract media URL')
+                mobj = re.search(
+                    r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
+                if mobj is None:
+                    raise ExtractorError('Unable to extract media URL')
+                mediaURL = mobj.group('mediaURL').replace('\\/', '/')
+                video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
+                video_ext = determine_ext(video_url)
+        if video_url is None:
+            player_url = self._search_regex(
+                r"swfobject\.embedSWF\('([^']+)'",
+                webpage, 'config URL', default=None)
+            if player_url:
+                config_url = self._search_regex(
+                    r'config=(.+)$', player_url, 'config URL')
+                config_doc = self._download_xml(
+                    config_url, video_id,
+                    note='Downloading video config')
+                smil_url = config_doc.find('.//properties').attrib['smil_file']
+                smil_doc = self._download_xml(
+                    smil_url, video_id,
+                    note='Downloading SMIL document')
+                base_url = smil_doc.find('./head/meta').attrib['base']
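+                # the SMIL document lists one <video> node per bitrate variant; each
+                # combines the shared base URL with its own (RTMP-style) play path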
+                video_url = []
+                for vn in smil_doc.findall('.//video'):
+                    br = int(vn.attrib['system-bitrate'])
+                    play_path = vn.attrib['src']
+                    video_url.append({
+                        'format_id': 'smil-%d' % br,
+                        'url': base_url,
+                        'play_path': play_path,
+                        'page_url': url,
+                        'player_url': player_url,
+                        'ext': play_path.partition(':')[0],
+                    })
+        if video_url is None:
+            flashvars = self._parse_json(self._search_regex(
+                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
+                default=None), video_id, fatal=False)
+            if flashvars:
+                video_url = []
+                for source in flashvars.get('sources') or []:
+                    source_url = source.get('src')
+                    if not source_url:
+                        continue
+                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                    if ext == 'm3u8':
+                        video_url.extend(self._extract_m3u8_formats(
+                            source_url, video_id, 'mp4',
+                            'm3u8_native', m3u8_id='hls', fatal=False))
+                    else:
+                        video_url.append({
+                            'url': source_url,
+                            'ext': ext,
+                        })
+
+        if video_url is None:
+            raise ExtractorError('Unsupported video type')
+
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'],
+            webpage, 'description', fatal=False)
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'thumbnail', fatal=False)
+        video_uploader = self._html_search_regex(
+            r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
+            webpage, 'uploader nickname', fatal=False)
+        duration = int_or_none(
+            self._html_search_meta('video:duration', webpage, default=None))
+        age_limit = (
+            18
+            if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
+            else 0)
+
+        if isinstance(video_url, list):
+            formats = video_url
+        else:
+            formats = [{
+                'url': video_url,
+                'ext': video_ext,
+            }]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'description': description,
+            'uploader': video_uploader,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'age_limit': age_limit,
+            'formats': formats,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
new file mode 100644 (file)
index 0000000..7d468d7
--- /dev/null
+++ b/youtube_dl/extractor/metacritic.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    fix_xml_ampersands,
+)
+
+
+class MetacriticIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?metacritic\.com/.+?/trailers/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
+        'info_dict': {
+            'id': '3698222',
+            'ext': 'mp4',
+            'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
+            'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
+            'duration': 221,
+        },
+        'skip': 'Not providing trailers anymore',
+    }, {
+        'url': 'http://www.metacritic.com/game/playstation-4/tales-from-the-borderlands-a-telltale-game-series/trailers/5740315',
+        'info_dict': {
+            'id': '5740315',
+            'ext': 'mp4',
+            'title': 'Tales from the Borderlands - Finale: The Vault of the Traveler',
+            'description': 'In the final episode of the season, all hell breaks loose. Jack is now in control of Helios\' systems, and he\'s ready to reclaim his rightful place as king of Hyperion (with or without you).',
+            'duration': 114,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        # The XML is not well-formed: it contains raw '&' characters
+        info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id,
+                                  video_id, 'Downloading info xml', transform_source=fix_xml_ampersands)
+
+        clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id)
+        formats = []
+        for videoFile in clip.findall('httpURI/videoFile'):
+            rate_str = videoFile.find('rate').text
+            video_url = videoFile.find('filePath').text
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4',
+                'format_id': rate_str,
+                'tbr': int(rate_str),
+            })
+        self._sort_formats(formats)
+
+        description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
+                                              webpage, 'description', flags=re.DOTALL)
+
+        return {
+            'id': video_id,
+            'title': clip.find('title').text,
+            'formats': formats,
+            'description': description,
+            'duration': int(clip.find('duration').text),
+        }
diff --git a/youtube_dl/extractor/mgoon.py b/youtube_dl/extractor/mgoon.py
new file mode 100644 (file)
index 0000000..7bb4739
--- /dev/null
+++ b/youtube_dl/extractor/mgoon.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    qualities,
+    unified_strdate,
+)
+
+
+class MgoonIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://(?:www\.)?
+    (?:(?:m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)|
+        video\.mgoon\.com)/(?P<id>[0-9]+)'''
+    _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}'
+    _TESTS = [
+        {
+            'url': 'http://m.mgoon.com/ch/hi6618/v/5582148',
+            'md5': 'dd46bb66ab35cf6d51cc812fd82da79d',
+            'info_dict': {
+                'id': '5582148',
+                'uploader_id': 'hi6618',
+                'duration': 240.419,
+                'upload_date': '20131220',
+                'ext': 'mp4',
+                'title': 'md5:543aa4c27a4931d371c3f433e8cebebc',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            }
+        },
+        {
+            'url': 'http://www.mgoon.com/play/view/5582148',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://video.mgoon.com/5582148',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        data = self._download_json(self._API_URL.format(video_id), video_id)
+
+        if data.get('errorInfo', {}).get('code') != 'NONE':
+            raise ExtractorError('%s encountered an error: %s' % (
+                self.IE_NAME, data['errorInfo']['message']), expected=True)
+
+        v_info = data['videoInfo']
+        title = v_info.get('v_title')
+        thumbnail = v_info.get('v_thumbnail')
+        duration = v_info.get('v_duration')
+        upload_date = unified_strdate(v_info.get('v_reg_date'))
+        uploader_id = data.get('userInfo', {}).get('u_alias')
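+        # v_duration is given in milliseconds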
+        if duration:
+            duration /= 1000.0
+
+        age_limit = None
+        if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT':
+            age_limit = 18
+
+        formats = []
+        get_quality = qualities(['360p', '480p', '720p', '1080p'])
+        for fmt in data['videoFiles']:
+            formats.append({
+                'format_id': fmt['label'],
+                'quality': get_quality(fmt['label']),
+                'url': fmt['url'],
+                'ext': fmt['format'],
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader_id': uploader_id,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py
new file mode 100644 (file)
index 0000000..71fc3ec
--- /dev/null
+++ b/youtube_dl/extractor/mgtv.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import time
+import uuid
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class MGTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(?:v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
+    IE_DESC = '芒果TV'
+    _GEO_COUNTRIES = ['CN']
+
+    _TESTS = [{
+        'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
+        'info_dict': {
+            'id': '3116640',
+            'ext': 'mp4',
+            'title': '我是歌手 第四季',
+            'description': '我是歌手第四季双年巅峰会',
+            'duration': 7461,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.mgtv.com/b/301817/3826653.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        try:
+            api_data = self._download_json(
+                'https://pcweb.api.mgtv.com/player/video', video_id, query={
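+                    # tk2: reversed URL-safe base64 of the did/pno/ver/clit fields expected by the API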
+                    'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1],
+                    'video_id': video_id,
+                }, headers=self.geo_verification_headers())['data']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                error = self._parse_json(e.cause.read().decode(), None)
+                if error.get('code') == 40005:
+                    self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                raise ExtractorError(error['msg'], expected=True)
+            raise
+        info = api_data['info']
+        title = info['title'].strip()
+        stream_data = self._download_json(
+            'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
+                'pm2': api_data['atc']['pm2'],
+                'video_id': video_id,
+            }, headers=self.geo_verification_headers())['data']
+        stream_domain = stream_data['stream_domain'][0]
+
+        formats = []
+        for idx, stream in enumerate(stream_data['stream']):
+            stream_path = stream.get('url')
+            if not stream_path:
+                continue
+            format_data = self._download_json(
+                stream_domain + stream_path, video_id,
+                note='Download video info for format #%d' % idx)
+            format_url = format_data.get('info')
+            if not format_url:
+                continue
+            tbr = int_or_none(stream.get('filebitrate') or self._search_regex(
+                r'_(\d+)_mp4/', format_url, 'tbr', default=None))
+            formats.append({
+                'format_id': compat_str(tbr or idx),
+                'url': format_url,
+                'ext': 'mp4',
+                'tbr': tbr,
+                'protocol': 'm3u8_native',
+                'http_headers': {
+                    'Referer': url,
+                },
+                'format_note': stream.get('name'),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': info.get('desc'),
+            'duration': int_or_none(info.get('duration')),
+            'thumbnail': info.get('thumb'),
+        }
diff --git a/youtube_dl/extractor/miaopai.py b/youtube_dl/extractor/miaopai.py
new file mode 100644 (file)
index 0000000..f9e35ac
--- /dev/null
+++ b/youtube_dl/extractor/miaopai.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MiaoPaiIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P<id>[-A-Za-z0-9~_]+)'
+    _TEST = {
+        'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm',
+        'md5': '095ed3f1cd96b821add957bdc29f845b',
+        'info_dict': {
+            'id': 'n~0hO7sfV1nBEw4Y29-Hqg__',
+            'ext': 'mp4',
+            'title': '西游记音乐会的秒拍视频',
+            'thumbnail': 're:^https?://.*/n~0hO7sfV1nBEw4Y29-Hqg___m.jpg',
+        }
+    }
+
+    _USER_AGENT_IPAD = 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD})
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        thumbnail = self._html_search_regex(
+            r'<div[^>]+class=(?P<q1>[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P<q2>[\'"])(?P<url>[^\'"]+)(?P=q2)',
+            webpage, 'thumbnail', fatal=False, group='url')
+        videos = self._parse_html5_media_entries(url, webpage, video_id)
+        info = videos[0]
+
+        info.update({
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+        })
+        return info
diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py
new file mode 100644 (file)
index 0000000..8e0aee0
--- /dev/null
+++ b/youtube_dl/extractor/microsoftvirtualacademy.py
@@ -0,0 +1,195 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_xpath,
+)
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    smuggle_url,
+    unsmuggle_url,
+    xpath_text,
+)
+
+
+class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
+    def _extract_base_url(self, course_id, display_id):
+        return self._download_json(
+            'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
+            display_id, 'Downloading course base URL')
+
+    def _extract_chapter_and_title(self, title):
+        if not title:
+            return None, None
+        m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
+        return (int(m.group('chapter')), m.group('title')) if m else (None, title)
+
+
+class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
+    IE_NAME = 'mva'
+    IE_DESC = 'Microsoft Virtual Academy videos'
+    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
+
+    _TESTS = [{
+        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
+        'md5': '7826c44fc31678b12ad8db11f6b5abb9',
+        'info_dict': {
+            'id': 'gfVXISmEB_6804984382',
+            'ext': 'mp4',
+            'title': 'Course Introduction',
+            'formats': 'mincount:3',
+            'subtitles': {
+                'en': [{
+                    'ext': 'ttml',
+                }],
+            },
+        }
+    }, {
+        'url': 'mva:11788:gfVXISmEB_6804984382',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        mobj = re.match(self._VALID_URL, url)
+        course_id = mobj.group('course_id')
+        video_id = mobj.group('id')
+
+        base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)
+
+        settings = self._download_xml(
+            '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
+            video_id, 'Downloading video settings XML')
+
+        _, title = self._extract_chapter_and_title(xpath_text(
+            settings, './/Title', 'title', fatal=True))
+
+        formats = []
+
+        for sources in settings.findall(compat_xpath('.//MediaSources')):
+            sources_type = sources.get('videoType')
+            for source in sources.findall(compat_xpath('./MediaSource')):
+                video_url = source.text
+                if not video_url or not video_url.startswith('http'):
+                    continue
+                if sources_type == 'smoothstreaming':
+                    formats.extend(self._extract_ism_formats(
+                        video_url, video_id, 'mss', fatal=False))
+                    continue
+                video_mode = source.get('videoMode')
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
+                codec = source.get('codec')
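+                # the codec attribute is either 'audiocodec,videocodec' or a single video codec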
+                acodec, vcodec = [None] * 2
+                if codec:
+                    codecs = codec.split(',')
+                    if len(codecs) == 2:
+                        acodec, vcodec = codecs
+                    elif len(codecs) == 1:
+                        vcodec = codecs[0]
+                formats.append({
+                    'url': video_url,
+                    'format_id': video_mode,
+                    'height': height,
+                    'acodec': acodec,
+                    'vcodec': vcodec,
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
+            subtitle_url = source.text
+            if not subtitle_url:
+                continue
+            subtitles.setdefault('en', []).append({
+                'url': '%s/%s' % (base_url, subtitle_url),
+                'ext': source.get('type'),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'subtitles': subtitles,
+            'formats': formats
+        }
+
+
+class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
+    IE_NAME = 'mva:course'
+    IE_DESC = 'Microsoft Virtual Academy courses'
+    _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
+
+    _TESTS = [{
+        'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+        'info_dict': {
+            'id': '11788',
+            'title': 'Microsoft Azure Fundamentals: Virtual Machines',
+        },
+        'playlist_count': 36,
+    }, {
+        # with emphasized chapters
+        'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
+        'info_dict': {
+            'id': '16335',
+            'title': 'Developing Windows 10 Games with Construct 2',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+        'only_matching': True,
+    }, {
+        'url': 'mva:course:11788',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
+            MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        base_url = self._extract_base_url(course_id, display_id)
+
+        manifest = self._download_json(
+            '%s/imsmanifestlite.json' % base_url,
+            display_id, 'Downloading course manifest JSON')['manifest']
+
+        organization = manifest['organizations']['organization'][0]
+
+        entries = []
+        for chapter in organization['item']:
+            chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
+            chapter_id = chapter.get('@identifier')
+            for item in chapter.get('item', []):
+                item_id = item.get('@identifier')
+                if not item_id:
+                    continue
+                metadata = item.get('resource', {}).get('metadata') or {}
+                if metadata.get('learningresourcetype') != 'Video':
+                    continue
+                _, title = self._extract_chapter_and_title(item.get('title'))
+                duration = parse_duration(metadata.get('duration'))
+                description = metadata.get('description')
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': smuggle_url(
+                        'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'chapter': chapter_title,
+                    'chapter_number': chapter_number,
+                    'chapter_id': chapter_id,
+                })
+
+        title = organization.get('title') or manifest.get('metadata', {}).get('title')
+
+        return self.playlist_result(entries, course_id, title)
diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py
new file mode 100644 (file)
index 0000000..8ad9239
--- /dev/null
+++ b/youtube_dl/extractor/ministrygrid.py
@@ -0,0 +1,57 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    smuggle_url,
+)
+
+
+class MinistryGridIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ministrygrid\.com/([^/?#]*/)*(?P<id>[^/#?]+)/?(?:$|[?#])'
+
+    _TEST = {
+        'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers',
+        'md5': '844be0d2a1340422759c2a9101bab017',
+        'info_dict': {
+            'id': '3453494717001',
+            'ext': 'mp4',
+            'title': 'The Gospel by Numbers',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'upload_date': '20140410',
+            'description': 'Coming soon from T4G 2014!',
+            'uploader_id': '2034960640001',
+            'timestamp': 1397145591,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['TDSLifeway'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        portlets = self._parse_json(self._search_regex(
+            r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list'),
+            video_id)
+        pl_id = self._search_regex(
+            r'getPlid:function\(\){return"(\d+)"}', webpage, 'p_l_id')
+
+        for i, portlet in enumerate(portlets):
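+            # render each Liferay portlet individually and scan its markup for a video iframe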
+            portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet)
+            portlet_code = self._download_webpage(
+                portlet_url, video_id,
+                note='Looking in portlet %s (%d/%d)' % (portlet, i + 1, len(portlets)),
+                fatal=False)
+            video_iframe_url = self._search_regex(
+                r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe',
+                default=None)
+            if video_iframe_url:
+                return self.url_result(
+                    smuggle_url(video_iframe_url, {'force_videoid': video_id}),
+                    video_id=video_id)
+
+        raise ExtractorError('Could not find video iframe in any portlets')
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py
new file mode 100644 (file)
index 0000000..6367311
--- /dev/null
+++ b/youtube_dl/extractor/minoto.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_codecs,
+)
+
+
+class MinotoIE(InfoExtractor):
+    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        player_id = mobj.group('player_id') or '1'
+        video_id = mobj.group('id')
+        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+        video_metadata = video_data['video-metadata']
+        formats = []
+        for fmt in video_data['video-files']:
+            fmt_url = fmt.get('url')
+            if not fmt_url:
+                continue
+            container = fmt.get('container')
+            if container == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                fmt_profile = fmt.get('profile') or {}
+                formats.append({
+                    'format_id': fmt_profile.get('name-short'),
+                    'format_note': fmt_profile.get('name'),
+                    'url': fmt_url,
+                    'container': container,
+                    'tbr': int_or_none(fmt.get('bitrate')),
+                    'filesize': int_or_none(fmt.get('filesize')),
+                    'width': int_or_none(fmt.get('width')),
+                    'height': int_or_none(fmt.get('height')),
+                    'codecs': parse_codecs(fmt.get('codecs')),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_metadata['title'],
+            'description': video_metadata.get('description'),
+            'thumbnail': video_metadata.get('video-poster', {}).get('url'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
new file mode 100644 (file)
index 0000000..40f72d6
--- /dev/null
+++ b/youtube_dl/extractor/miomio.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    xpath_text,
+    int_or_none,
+    ExtractorError,
+    sanitized_Request,
+)
+
+
+class MioMioIE(InfoExtractor):
+    IE_NAME = 'miomio.tv'
+    _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)'
+    _TESTS = [{
+        # "type=video" in flashvars
+        'url': 'http://www.miomio.tv/watch/cc88912/',
+        'info_dict': {
+            'id': '88912',
+            'ext': 'flv',
+            'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕',
+            'duration': 5923,
+        },
+        'skip': 'Unable to load videos',
+    }, {
+        'url': 'http://www.miomio.tv/watch/cc184024/',
+        'info_dict': {
+            'id': '43729',
+            'title': '《动漫同人插画绘制》',
+        },
+        'playlist_mincount': 86,
+        'skip': 'Unable to load videos',
+    }, {
+        'url': 'http://www.miomio.tv/watch/cc173113/',
+        'info_dict': {
+            'id': '173113',
+            'title': 'The New Macbook 2015 上手试玩与简评'
+        },
+        'playlist_mincount': 2,
+        'skip': 'Unable to load videos',
+    }, {
+        # new 'h5' player
+        'url': 'http://www.miomio.tv/watch/cc273997/',
+        'md5': '0b27a4b4495055d826813f8c3a6b2070',
+        'info_dict': {
+            'id': '273997',
+            'ext': 'mp4',
+            'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31',
+        },
+        'skip': 'Unable to load videos',
+    }]
+
+    def _extract_mioplayer(self, webpage, video_id, title, http_headers):
+        xml_config = self._search_regex(
+            r'flashvars="type=(?:sina|video)&amp;(.+?)&amp;',
+            webpage, 'xml config')
+
+        # skipping the following page causes lags and eventually connection drop-outs
+        self._request_webpage(
+            'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (video_id, random.randint(100, 999)),
+            video_id)
+
+        vid_config_request = sanitized_Request(
+            'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config),
+            headers=http_headers)
+
+        # the following xml contains the actual configuration information on the video file(s)
+        vid_config = self._download_xml(vid_config_request, video_id)
+
+        if not int_or_none(xpath_text(vid_config, 'timelength')):
+            raise ExtractorError('Unable to load videos!', expected=True)
+
+        entries = []
+        for f in vid_config.findall('./durl'):
+            segment_url = xpath_text(f, 'url', 'video url')
+            if not segment_url:
+                continue
+            order = xpath_text(f, 'order', 'order')
+            segment_id = video_id
+            segment_title = title
+            if order:
+                segment_id += '-%s' % order
+                segment_title += ' part %s' % order
+            entries.append({
+                'id': segment_id,
+                'url': segment_url,
+                'title': segment_title,
+                'duration': int_or_none(xpath_text(f, 'length', 'duration'), 1000),
+                'http_headers': http_headers,
+            })
+
+        return entries
+
+    def _download_chinese_webpage(self, *args, **kwargs):
+        # Requests with English locales return garbage
+        headers = {
+            'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3',
+        }
+        kwargs.setdefault('headers', {}).update(headers)
+        return self._download_webpage(*args, **kwargs)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_chinese_webpage(
+            url, video_id)
+
+        title = self._html_search_meta(
+            'description', webpage, 'title', fatal=True)
+
+        mioplayer_path = self._search_regex(
+            r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path')
+
+        if '_h5' in mioplayer_path:
+            player_url = compat_urlparse.urljoin(url, mioplayer_path)
+            player_webpage = self._download_chinese_webpage(
+                player_url, video_id,
+                note='Downloading player webpage', headers={'Referer': url})
+            entries = self._parse_html5_media_entries(player_url, player_webpage, video_id)
+            http_headers = {'Referer': player_url}
+        else:
+            http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
+            entries = self._extract_mioplayer(webpage, video_id, title, http_headers)
+
+        if len(entries) == 1:
+            segment = entries[0]
+            segment['id'] = video_id
+            segment['title'] = title
+            segment['http_headers'] = http_headers
+            return segment
+
+        return {
+            '_type': 'multi_video',
+            'id': video_id,
+            'entries': entries,
+            'title': title,
+            'http_headers': http_headers,
+        }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
new file mode 100644 (file)
index 0000000..e1506a7
--- /dev/null
@@ -0,0 +1,132 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    get_element_by_id,
+)
+
+
+class TechTVMITIE(InfoExtractor):
+    IE_NAME = 'techtv.mit.edu'
+    _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+        'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
+        'info_dict': {
+            'id': '25418',
+            'ext': 'mp4',
+            'title': 'MIT DNA and Protein Sets',
+            'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        raw_page = self._download_webpage(
+            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
+        clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
+
+        base_url = self._proto_relative_url(self._search_regex(
+            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
+        formats_json = self._search_regex(
+            r'bitrates: (\[.+?\])', raw_page, 'video formats')
+        formats_mit = json.loads(formats_json)
+        formats = [
+            {
+                'format_id': f['label'],
+                'url': base_url + f['url'].partition(':')[2],
+                'ext': f['url'].partition(':')[0],
+                'format': f['label'],
+                'width': f['width'],
+                'vbr': f['bitrate'],
+            }
+            for f in formats_mit
+        ]
+
+        title = get_element_by_id('edit-title', clean_page)
+        description = clean_html(get_element_by_id('edit-description', clean_page))
+        thumbnail = self._search_regex(
+            r'playlist:.*?url: \'(.+?)\'',
+            raw_page, 'thumbnail', flags=re.DOTALL)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
+
+
+class OCWMITIE(InfoExtractor):
+    IE_NAME = 'ocw.mit.edu'
+    _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+    _BASE_URL = 'http://ocw.mit.edu/'
+
+    _TESTS = [
+        {
+            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+            'info_dict': {
+                'id': 'EObHWIEKGjA',
+                'ext': 'webm',
+                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
+                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+                'upload_date': '20121109',
+                'uploader_id': 'MIT',
+                'uploader': 'MIT OpenCourseWare',
+            }
+        },
+        {
+            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+            'info_dict': {
+                'id': '7K1sB05pE0A',
+                'ext': 'mp4',
+                'title': 'Session 1: Introduction to Derivatives',
+                'upload_date': '20090818',
+                'uploader_id': 'MIT',
+                'uploader': 'MIT OpenCourseWare',
+                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        topic = mobj.group('topic')
+
+        webpage = self._download_webpage(url, topic)
+        title = self._html_search_meta('WT.cg_s', webpage)
+        description = self._html_search_meta('Description', webpage)
+
+        # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+        embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
+        if embed_chapter_media:
+            metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
+            metadata = re.split(r', ?', metadata)
+            yt = metadata[1]
+        else:
+            # search for call to ocw_embed_media(container_id, media_url, provider, page_url, image_url, captions_file)
+            embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
+            if embed_media:
+                metadata = re.sub(r'[\'"]', '', embed_media.group(1))
+                metadata = re.split(r', ?', metadata)
+                yt = metadata[1]
+            else:
+                raise ExtractorError('Unable to find embedded YouTube video.')
+        video_id = YoutubeIE.extract_id(yt)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'url': yt,
+            'ie_key': 'Youtube',
+        }
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
new file mode 100644 (file)
index 0000000..ad9da96
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    smuggle_url,
+)
+
+
+class MiTeleIE(InfoExtractor):
+    IE_DESC = 'mitele.es'
+    _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
+
+    _TESTS = [{
+        'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player',
+        'info_dict': {
+            'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg',
+            'ext': 'mp4',
+            'title': 'Diario de La redacción Programa 144',
+            'description': 'md5:07c35a7b11abb05876a6a79185b58d27',
+            'series': 'Diario de',
+            'season': 'Season 14',
+            'season_number': 14,
+            'episode': 'Tor, la web invisible',
+            'episode_number': 3,
+            'thumbnail': r're:(?i)^https?://.*\.jpg$',
+            'duration': 2913,
+            'age_limit': 16,
+            'timestamp': 1471209401,
+            'upload_date': '20160814',
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        # no explicit title
+        'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
+        'info_dict': {
+            'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq',
+            'ext': 'mp4',
+            'title': 'Cuarto Milenio Temporada 6 Programa 226',
+            'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f',
+            'series': 'Cuarto Milenio',
+            'season': 'Season 6',
+            'season_number': 6,
+            'episode': 'Episode 24',
+            'episode_number': 24,
+            'thumbnail': r're:(?i)^https?://.*\.jpg$',
+            'duration': 7313,
+            'age_limit': 12,
+            'timestamp': 1471209021,
+            'upload_date': '20160814',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144-40_1006364575251/player/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        pre_player = self._parse_json(self._search_regex(
+            r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})',
+            webpage, 'Pre Player'), display_id)['prePlayer']
+        title = pre_player['title']
+        video = pre_player['video']
+        video_id = video['dataMediaId']
+        content = pre_player.get('content') or {}
+        info = content.get('info') or {}
+
+        return {
+            '_type': 'url_transparent',
+            # for some reason only HLS and DASH are supported
+            'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}),
+            'id': video_id,
+            'title': title,
+            'description': info.get('synopsis'),
+            'series': content.get('title'),
+            'season_number': int_or_none(info.get('season_number')),
+            'episode': content.get('subtitle'),
+            'episode_number': int_or_none(info.get('episode_number')),
+            'duration': int_or_none(info.get('duration')),
+            'thumbnail': video.get('dataPoster'),
+            'age_limit': int_or_none(info.get('rating')),
+            'timestamp': parse_iso8601(pre_player.get('publishedTime')),
+        }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
new file mode 100644 (file)
index 0000000..9759560
--- /dev/null
@@ -0,0 +1,351 @@
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_chr,
+    compat_ord,
+    compat_str,
+    compat_urllib_parse_unquote,
+    compat_zip
+)
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    strip_or_none,
+    try_get,
+)
+
+
+class MixcloudBaseIE(InfoExtractor):
+    def _call_api(self, object_type, object_fields, display_id, username, slug=None):
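+        # A hypothetical call _call_api('cloudcast', 'name', 'x', 'user', 'mix')
+        # requests the GraphQL document
+        #   { cloudcastLookup(lookup: {username: "user", slug: "mix"}) { name } }
+        # and returns the 'cloudcastLookup' node of the response's 'data'.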
+        lookup_key = object_type + 'Lookup'
+        return self._download_json(
+            'https://www.mixcloud.com/graphql', display_id, query={
+                'query': '''{
+  %s(lookup: {username: "%s"%s}) {
+    %s
+  }
+}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields)
+            })['data'][lookup_key]
+
+
+class MixcloudIE(MixcloudBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
+    IE_NAME = 'mixcloud'
+
+    _TESTS = [{
+        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
+        'info_dict': {
+            'id': 'dholbach_cryptkeeper',
+            'ext': 'm4a',
+            'title': 'Cryptkeeper',
+            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
+            'uploader': 'Daniel Holbach',
+            'uploader_id': 'dholbach',
+            'thumbnail': r're:https?://.*\.jpg',
+            'view_count': int,
+            'timestamp': 1321359578,
+            'upload_date': '20111115',
+        },
+    }, {
+        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
+        'info_dict': {
+            'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
+            'ext': 'mp3',
+            'title': 'Caribou 7 inch Vinyl Mix & Chat',
+            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
+            'uploader': 'Gilles Peterson Worldwide',
+            'uploader_id': 'gillespeterson',
+            'thumbnail': 're:https?://.*',
+            'view_count': int,
+            'timestamp': 1422987057,
+            'upload_date': '20150203',
+        },
+    }, {
+        'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
+        'only_matching': True,
+    }]
+    _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'
+
+    @staticmethod
+    def _decrypt_xor_cipher(key, ciphertext):
+        """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
+        return ''.join([
+            compat_chr(compat_ord(ch) ^ compat_ord(k))
+            for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
+
+    def _real_extract(self, url):
+        username, slug = re.match(self._VALID_URL, url).groups()
+        username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
+        track_id = '%s_%s' % (username, slug)
+
+        cloudcast = self._call_api('cloudcast', '''audioLength
+    comments(first: 100) {
+      edges {
+        node {
+          comment
+          created
+          user {
+            displayName
+            username
+          }
+        }
+      }
+      totalCount
+    }
+    description
+    favorites {
+      totalCount
+    }
+    featuringArtistList
+    isExclusive
+    name
+    owner {
+      displayName
+      url
+      username
+    }
+    picture(width: 1024, height: 1024) {
+        url
+    }
+    plays
+    publishDate
+    reposts {
+      totalCount
+    }
+    streamInfo {
+      dashUrl
+      hlsUrl
+      url
+    }
+    tags {
+      tag {
+        name
+      }
+    }''', track_id, username, slug)
+
+        title = cloudcast['name']
+
+        stream_info = cloudcast['streamInfo']
+        formats = []
+
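+        # Each streamInfo URL is the base64 encoding of an XOR-obfuscated URL;
+        # base64-decode it, then XOR-decrypt it with the static key above.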
+        for url_key in ('url', 'hlsUrl', 'dashUrl'):
+            format_url = stream_info.get(url_key)
+            if not format_url:
+                continue
+            decrypted = self._decrypt_xor_cipher(
+                self._DECRYPTION_KEY, compat_b64decode(format_url))
+            if url_key == 'hlsUrl':
+                formats.extend(self._extract_m3u8_formats(
+                    decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif url_key == 'dashUrl':
+                formats.extend(self._extract_mpd_formats(
+                    decrypted, track_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'format_id': 'http',
+                    'url': decrypted,
+                    'downloader_options': {
+                        # Mixcloud starts throttling at >~5M
+                        'http_chunk_size': 5242880,
+                    },
+                })
+
+        if not formats and cloudcast.get('isExclusive'):
+            self.raise_login_required()
+
+        self._sort_formats(formats)
+
+        comments = []
+        for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []):
+            node = edge.get('node') or {}
+            text = strip_or_none(node.get('comment'))
+            if not text:
+                continue
+            user = node.get('user') or {}
+            comments.append({
+                'author': user.get('displayName'),
+                'author_id': user.get('username'),
+                'text': text,
+                'timestamp': parse_iso8601(node.get('created')),
+            })
+
+        tags = []
+        for t in cloudcast.get('tags') or []:
+            tag = try_get(t, lambda x: x['tag']['name'], compat_str)
+            if tag:
+                tags.append(tag)
+
+        get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount']))
+
+        owner = cloudcast.get('owner') or {}
+
+        return {
+            'id': track_id,
+            'title': title,
+            'formats': formats,
+            'description': cloudcast.get('description'),
+            'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str),
+            'uploader': owner.get('displayName'),
+            'timestamp': parse_iso8601(cloudcast.get('publishDate')),
+            'uploader_id': owner.get('username'),
+            'uploader_url': owner.get('url'),
+            'duration': int_or_none(cloudcast.get('audioLength')),
+            'view_count': int_or_none(cloudcast.get('plays')),
+            'like_count': get_count('favorites'),
+            'repost_count': get_count('reposts'),
+            'comment_count': get_count('comments'),
+            'comments': comments,
+            'tags': tags,
+            'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None,
+        }
+
+
+class MixcloudPlaylistBaseIE(MixcloudBaseIE):
+    def _get_cloudcast(self, node):
+        return node
+
+    def _get_playlist_title(self, title, slug):
+        return title
+
+    def _real_extract(self, url):
+        username, slug = re.match(self._VALID_URL, url).groups()
+        username = compat_urllib_parse_unquote(username)
+        if not slug:
+            slug = 'uploads'
+        else:
+            slug = compat_urllib_parse_unquote(slug)
+        playlist_id = '%s_%s' % (username, slug)
+
+        is_playlist_type = self._ROOT_TYPE == 'playlist'
+        playlist_type = 'items' if is_playlist_type else slug
+        list_filter = ''
+
+        has_next_page = True
+        entries = []
+        while has_next_page:
+            playlist = self._call_api(
+                self._ROOT_TYPE, '''%s
+    %s
+    %s(first: 100%s) {
+      edges {
+        node {
+          %s
+        }
+      }
+      pageInfo {
+        endCursor
+        hasNextPage
+      }
+    }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE),
+                playlist_id, username, slug if is_playlist_type else None)
+
+            items = playlist.get(playlist_type) or {}
+            for edge in items.get('edges', []):
+                cloudcast = self._get_cloudcast(edge.get('node') or {})
+                cloudcast_url = cloudcast.get('url')
+                if not cloudcast_url:
+                    continue
+                entries.append(self.url_result(
+                    cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug')))
+
+            page_info = items['pageInfo']
+            has_next_page = page_info['hasNextPage']
+            list_filter = ', after: "%s"' % page_info['endCursor']
+
+        return self.playlist_result(
+            entries, playlist_id,
+            self._get_playlist_title(playlist[self._TITLE_KEY], slug),
+            playlist.get(self._DESCRIPTION_KEY))
+
+
+class MixcloudUserIE(MixcloudPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$'
+    IE_NAME = 'mixcloud:user'
+
+    _TESTS = [{
+        'url': 'http://www.mixcloud.com/dholbach/',
+        'info_dict': {
+            'id': 'dholbach_uploads',
+            'title': 'Daniel Holbach (uploads)',
+            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+        },
+        'playlist_mincount': 36,
+    }, {
+        'url': 'http://www.mixcloud.com/dholbach/uploads/',
+        'info_dict': {
+            'id': 'dholbach_uploads',
+            'title': 'Daniel Holbach (uploads)',
+            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+        },
+        'playlist_mincount': 36,
+    }, {
+        'url': 'http://www.mixcloud.com/dholbach/favorites/',
+        'info_dict': {
+            'id': 'dholbach_favorites',
+            'title': 'Daniel Holbach (favorites)',
+            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+        },
+        # 'params': {
+        #     'playlist_items': '1-100',
+        # },
+        'playlist_mincount': 396,
+    }, {
+        'url': 'http://www.mixcloud.com/dholbach/listens/',
+        'info_dict': {
+            'id': 'dholbach_listens',
+            'title': 'Daniel Holbach (listens)',
+            'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
+        },
+        # 'params': {
+        #     'playlist_items': '1-100',
+        # },
+        'playlist_mincount': 1623,
+        'skip': 'Large list',
+    }, {
+        'url': 'https://www.mixcloud.com/FirstEar/stream/',
+        'info_dict': {
+            'id': 'FirstEar_stream',
+            'title': 'First Ear (stream)',
+            'description': 'Curators of good music\r\n\r\nfirstearmusic.com',
+        },
+        'playlist_mincount': 271,
+    }]
+
+    _TITLE_KEY = 'displayName'
+    _DESCRIPTION_KEY = 'biog'
+    _ROOT_TYPE = 'user'
+    _NODE_TEMPLATE = '''slug
+          url'''
+
+    def _get_playlist_title(self, title, slug):
+        return '%s (%s)' % (title, slug)
+
+
+class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
+    IE_NAME = 'mixcloud:playlist'
+
+    _TESTS = [{
+        'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
+        'info_dict': {
+            'id': 'maxvibes_jazzcat-on-ness-radio',
+            'title': 'Ness Radio sessions',
+        },
+        'playlist_mincount': 59,
+    }]
+    _TITLE_KEY = 'name'
+    _DESCRIPTION_KEY = 'description'
+    _ROOT_TYPE = 'playlist'
+    _NODE_TEMPLATE = '''cloudcast {
+            slug
+            url
+          }'''
+
+    def _get_cloudcast(self, node):
+        return node.get('cloudcast') or {}
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
new file mode 100644 (file)
index 0000000..b907f6b
--- /dev/null
@@ -0,0 +1,120 @@
+from __future__ import unicode_literals
+
+from .nhl import NHLBaseIE
+
+
+class MLBIE(NHLBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[\da-z_-]+\.)*(?P<site>mlb)\.com/
+                        (?:
+                            (?:
+                                (?:[^/]+/)*c-|
+                                (?:
+                                    shared/video/embed/(?:embed|m-internal-embed)\.html|
+                                    (?:[^/]+/)+(?:play|index)\.jsp|
+                                )\?.*?\bcontent_id=
+                            )
+                            (?P<id>\d+)
+                        )
+                    '''
+    _CONTENT_DOMAIN = 'content.mlb.com'
+    _TESTS = [
+        {
+            'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933',
+            'md5': '632358dacfceec06bad823b83d21df2d',
+            'info_dict': {
+                'id': '34698933',
+                'ext': 'mp4',
+                'title': "Ackley's spectacular catch",
+                'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0',
+                'duration': 66,
+                'timestamp': 1405995000,
+                'upload_date': '20140722',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663',
+            'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f',
+            'info_dict': {
+                'id': '34496663',
+                'ext': 'mp4',
+                'title': 'Stanton prepares for Derby',
+                'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57',
+                'duration': 46,
+                'timestamp': 1405120200,
+                'upload_date': '20140711',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115',
+            'md5': '99bb9176531adc600b90880fb8be9328',
+            'info_dict': {
+                'id': '34578115',
+                'ext': 'mp4',
+                'title': 'Cespedes repeats as Derby champ',
+                'description': 'md5:08df253ce265d4cf6fb09f581fafad07',
+                'duration': 488,
+                'timestamp': 1405414336,
+                'upload_date': '20140715',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915',
+            'md5': 'da8b57a12b060e7663ee1eebd6f330ec',
+            'info_dict': {
+                'id': '34577915',
+                'ext': 'mp4',
+                'title': 'Bautista on Home Run Derby',
+                'description': 'md5:b80b34031143d0986dddc64a8839f0fb',
+                'duration': 52,
+                'timestamp': 1405405122,
+                'upload_date': '20140715',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098',
+            'md5': 'e09e37b552351fddbf4d9e699c924d68',
+            'info_dict': {
+                'id': '75609783',
+                'ext': 'mp4',
+                'title': 'Must C: Pillar climbs for catch',
+                'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
+                'timestamp': 1429139220,
+                'upload_date': '20150415',
+            }
+        },
+        {
+            'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://mlb.mlb.com/shared/video/embed/embed.html?content_id=36599553',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783',
+            'only_matching': True,
+        },
+        {
+            # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer
+            'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842',
+            'only_matching': True,
+        }
+    ]
diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py
new file mode 100644 (file)
index 0000000..0e26ca1
--- /dev/null
@@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class MnetIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.mnet.com/tv/vod/171008',
+        'info_dict': {
+            'id': '171008',
+            'title': 'SS_이해인@히든박스',
+            'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55',
+            'duration': 88,
+            'upload_date': '20151231',
+            'timestamp': 1451564040,
+            'age_limit': 0,
+            'thumbnails': 'mincount:5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'ext': 'flv',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://mnet.interest.me/tv/vod/172790',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # TODO: extract rtmp formats
+        # no stype -> rtmp url
+        # stype=H -> m3u8 url
+        # stype=M -> mpd url
+        info = self._download_json(
+            'http://content.api.mnet.com/player/vodConfig',
+            video_id, 'Downloading vod config JSON', query={
+                'id': video_id,
+                'ctype': 'CLIP',
+                'stype': 'H',
+            })['data']['info']
+
+        title = info['title']
+
+        cdn_data = self._download_json(
+            info['cdn'], video_id, 'Downloading vod cdn JSON')['data'][0]
+        m3u8_url = cdn_data['url']
+        token = cdn_data.get('token')
+        if token and token != '-':
+            m3u8_url += '?' + token
+        formats = self._extract_wowza_formats(
+            m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp', 'f4m'])
+        self._sort_formats(formats)
+
+        description = info.get('ment')
+        duration = parse_duration(info.get('time'))
+        timestamp = parse_iso8601(info.get('date'), delimiter=' ')
+        age_limit = info.get('adult')
+        if age_limit is not None:
+            age_limit = 0 if age_limit == 'N' else 18
+        thumbnails = [{
+            'id': thumb_format,
+            'url': thumb['url'],
+            'width': int_or_none(thumb.get('width')),
+            'height': int_or_none(thumb.get('height')),
+        } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py
new file mode 100644 (file)
index 0000000..eb9b4ce
--- /dev/null
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+)
+
+
+class MoeVideoIE(InfoExtractor):
+    IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net'
+    _VALID_URL = r'''(?x)
+        https?://(?P<host>(?:www\.)?
+        (?:(?:moevideo|playreplay|videochart)\.net|thesame\.tv))/
+        (?:video|framevideo|embed)/(?P<id>[0-9a-z]+\.[0-9A-Za-z]+)'''
+    _API_URL = 'http://api.letitbit.net/'
+    _API_KEY = 'tVL0gjqo5'
+    _TESTS = [
+        {
+            'url': 'http://moevideo.net/video/00297.0036103fe3d513ef27915216fd29',
+            'md5': '129f5ae1f6585d0e9bb4f38e774ffb3a',
+            'info_dict': {
+                'id': '00297.0036103fe3d513ef27915216fd29',
+                'ext': 'flv',
+                'title': 'Sink cut out machine',
+                'description': 'md5:f29ff97b663aefa760bf7ca63c8ca8a8',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'width': 540,
+                'height': 360,
+                'duration': 179,
+                'filesize': 17822500,
+            },
+            'skip': 'Video has been removed',
+        },
+        {
+            'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a',
+            'md5': '74f0a014d5b661f0f0e2361300d1620e',
+            'info_dict': {
+                'id': '77107.7f325710a627383d40540d8e991a',
+                'ext': 'flv',
+                'title': 'Operacion Condor.',
+                'description': 'md5:7e68cb2fcda66833d5081c542491a9a3',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'width': 480,
+                'height': 296,
+                'duration': 6027,
+                'filesize': 588257923,
+            },
+            'skip': 'Video has been removed',
+        },
+    ]
+
+    def _real_extract(self, url):
+        host, video_id = re.match(self._VALID_URL, url).groups()
+
+        webpage = self._download_webpage(
+            'http://%s/video/%s' % (host, video_id),
+            video_id, 'Downloading webpage')
+
+        title = self._og_search_title(webpage)
+
+        embed_webpage = self._download_webpage(
+            'http://%s/embed/%s' % (host, video_id),
+            video_id, 'Downloading embed webpage')
+        video = self._parse_json(self._search_regex(
+            r'mvplayer\("#player"\s*,\s*({.+})',
+            embed_webpage, 'mvplayer'), video_id)['video']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': video.get('poster') or self._og_search_thumbnail(webpage),
+            'description': clean_html(self._og_search_description(webpage)),
+            'duration': int_or_none(self._og_search_property('video:duration', webpage)),
+            'url': video['ourUrl'],
+        }
diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py
new file mode 100644 (file)
index 0000000..5234cac
--- /dev/null
@@ -0,0 +1,79 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    str_to_int,
+    unified_strdate,
+)
+from .keezmovies import KeezMoviesIE
+
+
+class MofosexIE(KeezMoviesIE):
+    _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html'
+    _TESTS = [{
+        'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html',
+        'md5': '558fcdafbb63a87c019218d6e49daf8a',
+        'info_dict': {
+            'id': '318131',
+            'display_id': 'amateur-teen-playing-and-masturbating-318131',
+            'ext': 'mp4',
+            'title': 'amateur teen playing and masturbating',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20121114',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        # This video is no longer available
+        'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        webpage, info = self._extract_info(url)
+
+        view_count = str_to_int(self._search_regex(
+            r'VIEWS:</span>\s*([\d,.]+)', webpage, 'view count', fatal=False))
+        like_count = int_or_none(self._search_regex(
+            r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage,
+            'like count', fatal=False))
+        dislike_count = int_or_none(self._search_regex(
+            r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage,
+            'dislike count', fatal=False))
+        upload_date = unified_strdate(self._html_search_regex(
+            r'Added:</span>([^<]+)', webpage, 'upload date', fatal=False))
+
+        info.update({
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'upload_date': upload_date,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        })
+
+        return info
+
+
+class MofosexEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            'http://www.mofosex.com/videos/{0}/{0}.html'.format(video_id),
+            ie=MofosexIE.ie_key(), video_id=video_id)
diff --git a/youtube_dl/extractor/mojvideo.py b/youtube_dl/extractor/mojvideo.py
new file mode 100644 (file)
index 0000000..165e658
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+)
+
+
+class MojvideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mojvideo\.com/video-(?P<display_id>[^/]+)/(?P<id>[a-f0-9]+)'
+    _TEST = {
+        'url': 'http://www.mojvideo.com/video-v-avtu-pred-mano-rdecelaska-alfi-nipic/3d1ed4497707730b2906',
+        'md5': 'f7fd662cc8ce2be107b0d4f2c0483ae7',
+        'info_dict': {
+            'id': '3d1ed4497707730b2906',
+            'display_id': 'v-avtu-pred-mano-rdecelaska-alfi-nipic',
+            'ext': 'mp4',
+            'title': 'V avtu pred mano rdečelaska - Alfi Nipič',
+            'thumbnail': r're:^http://.*\.jpg$',
+            'duration': 242,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        # the player API returns malformed XML, so fetch it as plain text and
+        # extract the fields with regexes instead of parsing it as XML
+        playerapi = self._download_webpage(
+            'http://www.mojvideo.com/playerapi.php?v=%s&t=1' % video_id, display_id)
+
+        if '<error>true</error>' in playerapi:
+            error_desc = self._html_search_regex(
+                r'<errordesc>([^<]*)</errordesc>', playerapi, 'error description', fatal=False)
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error_desc), expected=True)
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)</title>', playerapi, 'title')
+        video_url = self._html_search_regex(
+            r'<file>([^<]+)</file>', playerapi, 'video URL')
+        thumbnail = self._html_search_regex(
+            r'<preview>([^<]+)</preview>', playerapi, 'thumbnail', fatal=False)
+        duration = parse_duration(self._html_search_regex(
+            r'<duration>([^<]+)</duration>', playerapi, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py
new file mode 100644 (file)
index 0000000..0093bcd
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MorningstarIE(InfoExtractor):
+    IE_DESC = 'morningstar.com'
+    _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869',
+        'md5': '6c0acface7a787aadc8391e4bbf7b0f5',
+        'info_dict': {
+            'id': '615869',
+            'ext': 'mp4',
+            'title': 'Get Ahead of the Curve on 2013 Taxes',
+            'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.",
+            'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$'
+        }
+    }, {
+        'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(
+            r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title')
+        video_url = self._html_search_regex(
+            r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"',
+            webpage, 'video URL')
+        thumbnail = self._html_search_regex(
+            r'<input type="hidden" id="hidSnapshot" value="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+        description = self._html_search_regex(
+            r'<div id="mstarDeck".*?>(.*?)</div>',
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
new file mode 100644 (file)
index 0000000..b1615b4
--- /dev/null
@@ -0,0 +1,207 @@
+from __future__ import unicode_literals
+
+import datetime
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    ExtractorError,
+    InAdvancePagedList,
+    orderedSet,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class MotherlessIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
+    _TESTS = [{
+        'url': 'http://motherless.com/AC3FFE1',
+        'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+        'info_dict': {
+            'id': 'AC3FFE1',
+            'ext': 'mp4',
+            'title': 'Fucked in the ass while playing PS3',
+            'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
+            'upload_date': '20100913',
+            'uploader_id': 'famouslyfuckedup',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://motherless.com/532291B',
+        'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
+        'info_dict': {
+            'id': '532291B',
+            'ext': 'mp4',
+            'title': 'Amazing girl playing the omegle game, PERFECT!',
+            'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
+                           'game', 'hairy'],
+            'upload_date': '20140622',
+            'uploader_id': 'Sulivana7x',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'skip': '404',
+    }, {
+        'url': 'http://motherless.com/g/cosplay/633979F',
+        'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+        'info_dict': {
+            'id': '633979F',
+            'ext': 'mp4',
+            'title': 'Turtlette',
+            'categories': ['superheroine heroine  superher'],
+            'upload_date': '20140827',
+            'uploader_id': 'shade0230',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        }
+    }, {
+        # no keywords
+        'url': 'http://motherless.com/8B4BBC1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        if any(p in webpage for p in (
+                '<title>404 - MOTHERLESS.COM<',
+                ">The page you're looking for cannot be found.<")):
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        if '>The content you are trying to view is for friends only.' in webpage:
+            raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
+
+        title = self._html_search_regex(
+            (r'(?s)<div[^>]+\bclass=["\']media-meta-title[^>]+>(.+?)</div>',
+             r'id="view-upload-title">\s+([^<]+)<'), webpage, 'title')
+        video_url = (self._html_search_regex(
+            (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+             r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
+            webpage, 'video URL', default=None, group='url')
+            or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
+        age_limit = self._rta_search(webpage)
+        view_count = str_to_int(self._html_search_regex(
+            (r'>(\d+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
+            webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._html_search_regex(
+            (r'>(\d+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'),
+            webpage, 'like count', fatal=False))
+
+        upload_date = self._html_search_regex(
+            (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
+             r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
+        if 'Ago' in upload_date:
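+            # relative dates like "7 days Ago" become absolute YYYYMMDD dates
+            # by subtracting the day count from today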
+            days = int(re.search(r'([0-9]+)', upload_date).group(1))
+            upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
+        else:
+            upload_date = unified_strdate(upload_date)
+
+        comment_count = webpage.count('class="media-comment-contents"')
+        uploader_id = self._html_search_regex(
+            r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
+            webpage, 'uploader_id')
+
+        categories = self._html_search_meta('keywords', webpage, default=None)
+        if categories:
+            categories = [cat.strip() for cat in categories.split(',')]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'upload_date': upload_date,
+            'uploader_id': uploader_id,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'categories': categories,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'age_limit': age_limit,
+            'url': video_url,
+        }
+
+
+class MotherlessGroupIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+    _TESTS = [{
+        'url': 'http://motherless.com/g/movie_scenes',
+        'info_dict': {
+            'id': 'movie_scenes',
+            'title': 'Movie Scenes',
+            'description': 'Hot and sexy scenes from "regular" movies... '
+                           'Beautiful actresses fully nude... A looot of '
+                           'skin! :)Enjoy!',
+        },
+        'playlist_mincount': 662,
+    }, {
+        'url': 'http://motherless.com/gv/sex_must_be_funny',
+        'info_dict': {
+            'id': 'sex_must_be_funny',
+            'title': 'Sex must be funny',
+            'description': 'Sex can be funny. Wide smiles,laugh, games, fun of '
+                           'any kind!'
+        },
+        'playlist_mincount': 9,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if MotherlessIE.suitable(url)
+                else super(MotherlessGroupIE, cls).suitable(url))
+
+    def _extract_entries(self, webpage, base):
+        entries = []
+        for mobj in re.finditer(
+                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
+                webpage):
+            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
+            if not MotherlessIE.suitable(video_url):
+                continue
+            video_id = MotherlessIE._match_id(video_url)
+            title = mobj.group('title')
+            entries.append(self.url_result(
+                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
+                video_title=title))
+        # Alternative fallback
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(base, '/' + entry_id),
+                    ie=MotherlessIE.ie_key(), video_id=entry_id)
+                for entry_id in orderedSet(re.findall(
+                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
+        return entries
+
+    def _real_extract(self, url):
+        group_id = self._match_id(url)
+        page_url = compat_urlparse.urljoin(url, '/gv/%s' % group_id)
+        webpage = self._download_webpage(page_url, group_id)
+        title = self._search_regex(
+            r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False)
+        description = self._html_search_meta(
+            'description', webpage, fatal=False)
+        page_count = self._int(self._search_regex(
+            r'(\d+)</(?:a|span)><(?:a|span)[^>]+>\s*NEXT',
+            webpage, 'page_count'), 'page_count')
+        PAGE_SIZE = 80
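+        # InAdvancePagedList only fetches a page when its entries are actually
+        # requested; PAGE_SIZE = 80 assumes 80 video thumbnails per group page,
+        # so narrow --playlist-items selections avoid downloading every page.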
+
+        def _get_page(idx):
+            webpage = self._download_webpage(
+                page_url, group_id, query={'page': idx + 1},
+                note='Downloading page %d/%d' % (idx + 1, page_count)
+            )
+            for entry in self._extract_entries(webpage, url):
+                yield entry
+
+        playlist = InAdvancePagedList(_get_page, page_count, PAGE_SIZE)
+
+        return {
+            '_type': 'playlist',
+            'id': group_id,
+            'title': title,
+            'description': description,
+            'entries': playlist
+        }
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
new file mode 100644 (file)
index 0000000..c9d1ab6
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urlparse,
+)
+
+
+class MotorsportIE(InfoExtractor):
+    IE_DESC = 'motorsport.com'
+    _VALID_URL = r'https?://(?:www\.)?motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])'
+    _TEST = {
+        'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
+        'info_dict': {
+            'id': '2-T3WuR-KMM',
+            'ext': 'mp4',
+            'title': 'Red Bull Racing: 2014 Rules Explained',
+            'duration': 208,
+            'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.',
+            'uploader': 'mcomstaff',
+            'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ',
+            'upload_date': '20140903',
+            'thumbnail': r're:^https?://.+\.jpg$'
+        },
+        'add_ie': ['Youtube'],
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        iframe_path = self._html_search_regex(
+            r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage,
+            'iframe path')
+        iframe = self._download_webpage(
+            compat_urlparse.urljoin(url, iframe_path), display_id,
+            'Downloading iframe')
+        youtube_id = self._search_regex(
+            r'www.youtube.com/embed/(.{11})', iframe, 'youtube id')
+
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'url': 'https://youtube.com/watch?v=%s' % youtube_id,
+        }
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py
new file mode 100644 (file)
index 0000000..5453da1
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    smuggle_url,
+    float_or_none,
+    parse_iso8601,
+    update_url_query,
+)
+
+
+class MovieClipsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?movieclips\.com/videos/.+-(?P<id>\d+)(?:\?|$)'
+    _TEST = {
+        'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597',
+        'md5': '42b5a0352d4933a7bd54f2104f481244',
+        'info_dict': {
+            'id': 'pKIGmG83AqD9',
+            'ext': 'mp4',
+            'title': 'Warcraft Trailer 1',
+            'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1446843055,
+            'upload_date': '20151106',
+            'uploader': 'Movieclips',
+        },
+        'add_ie': ['ThePlatform'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video = next(v for v in self._parse_json(self._search_regex(
+            r'var\s+__REACT_ENGINE__\s*=\s*({.+});',
+            webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(update_url_query(
+                video['contentUrl'], {'mbr': 'true'}), {'force_smil_url': True}),
+            'title': self._og_search_title(webpage),
+            'description': self._html_search_meta('description', webpage),
+            'duration': float_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('dateCreated')),
+            'thumbnail': video.get('defaultImage'),
+            'uploader': video.get('provider'),
+        }
diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py
new file mode 100644 (file)
index 0000000..85cc6e2
--- /dev/null
@@ -0,0 +1,45 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MoviezineIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?moviezine\.se/video/(?P<id>[^?#]+)'
+
+    _TEST = {
+        'url': 'http://www.moviezine.se/video/205866',
+        'info_dict': {
+            'id': '205866',
+            'ext': 'mp4',
+            'title': 'Oculus - Trailer 1',
+            'description': 'md5:40cc6790fc81d931850ca9249b40e8a4',
+            'thumbnail': r're:http://.*\.jpg',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        jsplayer = self._download_webpage(
+            'http://www.moviezine.se/api/player.js?video=%s' % video_id,
+            video_id, 'Downloading JS API player')
+
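+        # the player.js endpoint returns a JS config object; its fields are
+        # scraped with regexes below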
+        formats = [{
+            'format_id': 'sd',
+            'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'),
+            'quality': 0,
+            'ext': 'mp4',
+        }]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'),
+            'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'),
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/movingimage.py b/youtube_dl/extractor/movingimage.py
new file mode 100644 (file)
index 0000000..4f62d62
--- /dev/null
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    unescapeHTML,
+    parse_duration,
+)
+
+
+class MovingImageIE(InfoExtractor):
+    _VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://movingimage.nls.uk/film/3561',
+        'md5': '4caa05c2b38453e6f862197571a7be2f',
+        'info_dict': {
+            'id': '3561',
+            'ext': 'mp4',
+            'title': 'SHETLAND WOOL',
+            'description': 'md5:c5afca6871ad59b4271e7704fe50ab04',
+            'duration': 900,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = self._extract_m3u8_formats(
+            self._html_search_regex(r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL'),
+            video_id, ext='mp4', entry_protocol='m3u8_native')
+
+        def search_field(field_name, fatal=False):
+            return self._search_regex(
+                r'<span\s+class="field_title">%s:</span>\s*<span\s+class="field_content">([^<]+)</span>' % field_name,
+                webpage, field_name, fatal=fatal)
+
+        title = unescapeHTML(search_field('Title', fatal=True)).strip('()[]')
+        description = unescapeHTML(search_field('Description'))
+        duration = parse_duration(search_field('Running time'))
+        thumbnail = self._search_regex(
+            r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py
new file mode 100644 (file)
index 0000000..e59b0b7
--- /dev/null
@@ -0,0 +1,171 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class MSNIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d',
+        'md5': '087548191d273c5c55d05028f8d2cbcd',
+        'info_dict': {
+            'id': 'BBPxU6d',
+            'display_id': '7-ways-to-get-rid-of-chest-congestion',
+            'ext': 'mp4',
+            'title': 'Seven ways to get rid of chest congestion',
+            'description': '7 Ways to Get Rid of Chest Congestion',
+            'duration': 88,
+            'uploader': 'Health',
+            'uploader_id': 'BBPrMqa',
+        },
+    }, {
+        # Article, multiple Dailymotion Embeds
+        'url': 'https://www.msn.com/en-in/money/sports/hottest-football-wags-greatest-footballers-turned-managers-and-more/ar-BBpc7Nl',
+        'info_dict': {
+            'id': 'BBpc7Nl',
+        },
+        'playlist_mincount': 4,
+    }, {
+        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH',
+        'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
+        'only_matching': True,
+    }, {
+        # Vidible(AOL) Embed
+        'url': 'https://www.msn.com/en-us/money/other/jupiter-is-about-to-come-so-close-you-can-see-its-moons-with-binoculars/vi-AACqsHR',
+        'only_matching': True,
+    }, {
+        # Dailymotion Embed
+        'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L',
+        'only_matching': True,
+    }, {
+        # YouTube Embed
+        'url': 'https://www.msn.com/en-in/money/news/meet-vikram-%E2%80%94-chandrayaan-2s-lander/vi-AAGUr0v',
+        'only_matching': True,
+    }, {
+        # NBCSports Embed
+        'url': 'https://www.msn.com/en-us/money/football_nfl/week-13-preview-redskins-vs-panthers/vi-BBXsCDb',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id, page_id = re.match(self._VALID_URL, url).groups()
+
+        webpage = self._download_webpage(url, display_id)
+
+        entries = []
+        for _, metadata in re.findall(r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', webpage):
+            video = self._parse_json(unescapeHTML(metadata), display_id)
+
+            provider_id = video.get('providerId')
+            player_name = video.get('playerName')
+            if player_name and provider_id:
+                entry = None
+                if player_name == 'AOL':
+                    if provider_id.startswith('http'):
+                        provider_id = self._search_regex(
+                            r'https?://delivery\.vidible\.tv/video/redirect/([0-9a-f]{24})',
+                            provider_id, 'vidible id')
+                    entry = self.url_result(
+                        'aol-video:' + provider_id, 'Aol', provider_id)
+                elif player_name == 'Dailymotion':
+                    entry = self.url_result(
+                        'https://www.dailymotion.com/video/' + provider_id,
+                        'Dailymotion', provider_id)
+                elif player_name == 'YouTube':
+                    entry = self.url_result(
+                        provider_id, 'Youtube', provider_id)
+                elif player_name == 'NBCSports':
+                    entry = self.url_result(
+                        'http://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/' + provider_id,
+                        'NBCSportsVPlayer', provider_id)
+                if entry:
+                    entries.append(entry)
+                    continue
+
+            video_id = video['uuid']
+            title = video['title']
+
+            formats = []
+            for file_ in video.get('videoFiles', []):
+                format_url = file_.get('url')
+                if not format_url:
+                    continue
+                if 'format=m3u8-aapl' in format_url:
+                    # m3u8_native should not be used here until
+                    # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, display_id, 'mp4',
+                        m3u8_id='hls', fatal=False))
+                elif 'format=mpd-time-csf' in format_url:
+                    formats.extend(self._extract_mpd_formats(
+                        format_url, display_id, 'dash', fatal=False))
+                elif '.ism' in format_url:
+                    if format_url.endswith('.ism'):
+                        format_url += '/manifest'
+                    formats.extend(self._extract_ism_formats(
+                        format_url, display_id, 'mss', fatal=False))
+                else:
+                    format_id = file_.get('formatCode')
+                    formats.append({
+                        'url': format_url,
+                        'ext': 'mp4',
+                        'format_id': format_id,
+                        'width': int_or_none(file_.get('width')),
+                        'height': int_or_none(file_.get('height')),
+                        'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)),
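+                        # '1001' appears to be the best-quality MP4 format code (assumption)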
+                        'preference': 1 if format_id == '1001' else None,
+                    })
+            self._sort_formats(formats)
+
+            subtitles = {}
+            for file_ in video.get('files', []):
+                format_url = file_.get('url')
+                format_code = file_.get('formatCode')
+                if not format_url or not format_code:
+                    continue
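+                # format code 3100 appears to denote subtitle/caption files (assumption)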
+                if compat_str(format_code) == '3100':
+                    subtitles.setdefault(file_.get('culture', 'en'), []).append({
+                        'ext': determine_ext(format_url, 'ttml'),
+                        'url': format_url,
+                    })
+
+            entries.append({
+                'id': video_id,
+                'display_id': display_id,
+                'title': title,
+                'description': video.get('description'),
+                'thumbnail': video.get('headlineImage', {}).get('url'),
+                'duration': int_or_none(video.get('durationSecs')),
+                'uploader': video.get('sourceFriendly'),
+                'uploader_id': video.get('providerId'),
+                'creator': video.get('creator'),
+                'subtitles': subtitles,
+                'formats': formats,
+            })
+
+        if not entries:
+            error = unescapeHTML(self._search_regex(
+                r'data-error=(["\'])(?P<error>.+?)\1',
+                webpage, 'error', group='error'))
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        return self.playlist_result(entries, page_id)
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
new file mode 100644 (file)
index 0000000..fedd5f4
--- /dev/null
@@ -0,0 +1,474 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_xpath,
+)
+from ..utils import (
+    ExtractorError,
+    find_xpath_attr,
+    fix_xml_ampersands,
+    float_or_none,
+    HEADRequest,
+    RegexNotFoundError,
+    sanitized_Request,
+    strip_or_none,
+    timeconvert,
+    try_get,
+    unescapeHTML,
+    update_url_query,
+    url_basename,
+    xpath_text,
+)
+
+
+def _media_xml_tag(tag):
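+    # e.g. _media_xml_tag('group') -> '{http://search.yahoo.com/mrss/}group'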
+    return '{http://search.yahoo.com/mrss/}%s' % tag
+
+
+class MTVServicesInfoExtractor(InfoExtractor):
+    _MOBILE_TEMPLATE = None
+    _LANG = None
+
+    @staticmethod
+    def _id_from_uri(uri):
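+        # e.g. 'mgid:uma:video:mtv.com:1043906' -> '1043906'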
+        return uri.split(':')[-1]
+
+    @staticmethod
+    def _remove_template_parameter(url):
+        # Remove the templates, like &device={device}
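+        # e.g. '...?uri=mgid:foo&device={device}&ep={ep}' -> '...?uri=mgid:foo'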
+        return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url)
+
+    def _get_feed_url(self, uri):
+        return self._FEED_URL
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+        thumb_node = itemdoc.find(search_path)
+        if thumb_node is None:
+            return None
+        return thumb_node.get('url') or thumb_node.text or None
+
+    def _extract_mobile_video_formats(self, mtvn_id):
+        webpage_url = self._MOBILE_TEMPLATE % mtvn_id
+        req = sanitized_Request(webpage_url)
+        # Otherwise we get a webpage that would execute some javascript
+        req.add_header('User-Agent', 'curl/7')
+        webpage = self._download_webpage(req, mtvn_id,
+                                         'Downloading mobile page')
+        metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
+        req = HEADRequest(metrics_url)
+        response = self._request_webpage(req, mtvn_id, 'Resolving url')
+        url = response.geturl()
+        # Transform the url to get the best quality:
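+        # (the fixed _pxn/_pxK prefix appears to request the highest bitrate; assumption)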
+        url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
+        return [{'url': url, 'ext': 'mp4'}]
+
+    def _extract_video_formats(self, mdoc, mtvn_id, video_id):
+        if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None:
+            if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
+                self.to_screen('The normal version is not available from your '
+                               'country, trying with the mobile version')
+                return self._extract_mobile_video_formats(mtvn_id)
+            raise ExtractorError('This video is not available from your country.',
+                                 expected=True)
+
+        formats = []
+        for rendition in mdoc.findall('.//rendition'):
+            if rendition.get('method') == 'hls':
+                hls_url = rendition.find('./src').text
+                formats.extend(self._extract_m3u8_formats(
+                    hls_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                # fms
+                try:
+                    _, _, ext = rendition.attrib['type'].partition('/')
+                    rtmp_video_url = rendition.find('./src').text
+                    if 'error_not_available.swf' in rtmp_video_url:
+                        raise ExtractorError(
+                            '%s said: video is not available' % self.IE_NAME,
+                            expected=True)
+                    if rtmp_video_url.endswith('siteunavail.png'):
+                        continue
+                    formats.append({
+                        'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext,
+                        'url': rtmp_video_url,
+                        'format_id': '-'.join(filter(None, [
+                            'rtmp' if rtmp_video_url.startswith('rtmp') else None,
+                            rendition.get('bitrate')])),
+                        'width': int(rendition.get('width')),
+                        'height': int(rendition.get('height')),
+                    })
+                except (KeyError, TypeError):
+                    raise ExtractorError('Invalid rendition field.')
+        if formats:
+            self._sort_formats(formats)
+        return formats
+
+    def _extract_subtitles(self, mdoc, mtvn_id):
+        subtitles = {}
+        for transcript in mdoc.findall('.//transcript'):
+            if transcript.get('kind') != 'captions':
+                continue
+            lang = transcript.get('srclang')
+            for typographic in transcript.findall('./typographic'):
+                sub_src = typographic.get('src')
+                if not sub_src:
+                    continue
+                ext = typographic.get('format')
+                if ext == 'cea-608':
+                    ext = 'scc'
+                subtitles.setdefault(lang, []).append({
+                    'url': compat_str(sub_src),
+                    'ext': ext
+                })
+        return subtitles
+
+    def _get_video_info(self, itemdoc, use_hls=True):
+        uri = itemdoc.find('guid').text
+        video_id = self._id_from_uri(uri)
+        self.report_extraction(video_id)
+        content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content')))
+        mediagen_url = self._remove_template_parameter(content_el.attrib['url'])
+        mediagen_url = mediagen_url.replace('device={device}', '')
+        if 'acceptMethods' not in mediagen_url:
+            mediagen_url += '&' if '?' in mediagen_url else '?'
+            mediagen_url += 'acceptMethods='
+            mediagen_url += 'hls' if use_hls else 'fms'
+
+        mediagen_doc = self._download_xml(
+            mediagen_url, video_id, 'Downloading video urls', fatal=False)
+
+        if mediagen_doc is False:
+            return None
+
+        item = mediagen_doc.find('./video/item')
+        if item is not None and item.get('type') == 'text':
+            message = '%s returned error: ' % self.IE_NAME
+            if item.get('code') is not None:
+                message += '%s - ' % item.get('code')
+            message += item.text
+            raise ExtractorError(message, expected=True)
+
+        description = strip_or_none(xpath_text(itemdoc, 'description'))
+
+        timestamp = timeconvert(xpath_text(itemdoc, 'pubDate'))
+
+        title_el = find_xpath_attr(
+            itemdoc, './/{http://search.yahoo.com/mrss/}category',
+            'scheme', 'urn:mtvn:video_title')
+        if title_el is None:
+            title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
+        if title_el is None:
+            title_el = itemdoc.find(compat_xpath('.//title'))
+
+        # guard against a missing element so we raise a clean error below
+        title = title_el.text if title_el is not None else None
+        if title is None:
+            raise ExtractorError('Could not find video title')
+        title = title.strip()
+
+        # This is a short id that's used in the webpage URLs
+        mtvn_id = None
+        mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
+                                       'scheme', 'urn:mtvn:id')
+        if mtvn_id_node is not None:
+            mtvn_id = mtvn_id_node.text
+
+        formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id)
+
+        # Some parts of complete video may be missing (e.g. missing Act 3 in
+        # http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
+        if not formats:
+            return None
+
+        self._sort_formats(formats)
+
+        return {
+            'title': title,
+            'formats': formats,
+            'subtitles': self._extract_subtitles(mediagen_doc, mtvn_id),
+            'id': video_id,
+            'thumbnail': self._get_thumbnail_url(uri, itemdoc),
+            'description': description,
+            'duration': float_or_none(content_el.attrib.get('duration')),
+            'timestamp': timestamp,
+        }
+
+    def _get_feed_query(self, uri):
+        data = {'uri': uri}
+        if self._LANG:
+            data['lang'] = self._LANG
+        return data
+
+    def _get_videos_info(self, uri, use_hls=True):
+        video_id = self._id_from_uri(uri)
+        feed_url = self._get_feed_url(uri)
+        info_url = update_url_query(feed_url, self._get_feed_query(uri))
+        return self._get_videos_info_from_url(info_url, video_id, use_hls)
+
+    def _get_videos_info_from_url(self, url, video_id, use_hls=True):
+        idoc = self._download_xml(
+            url, video_id,
+            'Downloading info', transform_source=fix_xml_ampersands)
+
+        title = xpath_text(idoc, './channel/title')
+        description = xpath_text(idoc, './channel/description')
+
+        entries = []
+        for item in idoc.findall('.//item'):
+            info = self._get_video_info(item, use_hls)
+            if info:
+                entries.append(info)
+
+        return self.playlist_result(
+            entries, playlist_title=title, playlist_description=description)
+
+    def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
+        triforce_feed = self._parse_json(self._search_regex(
+            r'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n', webpage,
+            'triforce feed', default='{}'), video_id, fatal=False)
+
+        data_zone = self._search_regex(
+            r'data-zone=(["\'])(?P<zone>.+?_lc_promo.*?)\1', webpage,
+            'data zone', default=data_zone, group='zone')
+
+        feed_url = try_get(
+            triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'],
+            compat_str)
+        if not feed_url:
+            return
+
+        feed = self._download_json(feed_url, video_id, fatal=False)
+        if not feed:
+            return
+
+        return try_get(feed, lambda x: x['result']['data']['id'], compat_str)
+
+    def _extract_mgid(self, webpage):
+        try:
+            # the url can be http://media.mtvnservices.com/fb/{mgid}.swf
+            # or http://media.mtvnservices.com/{mgid}
+            og_url = self._og_search_video_url(webpage)
+            mgid = url_basename(og_url)
+            if mgid.endswith('.swf'):
+                mgid = mgid[:-4]
+        except RegexNotFoundError:
+            mgid = None
+
+        if mgid is None or ':' not in mgid:
+            mgid = self._search_regex(
+                [r'data-mgid="(.*?)"', r'swfobject\.embedSWF\(".*?(mgid:.*?)"'],
+                webpage, 'mgid', default=None)
+
+        if not mgid:
+            sm4_embed = self._html_search_meta(
+                'sm4:video:embed', webpage, 'sm4 embed', default='')
+            mgid = self._search_regex(
+                r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None)
+
+        if not mgid:
+            mgid = self._extract_triforce_mgid(webpage)
+
+        return mgid
+
+    def _real_extract(self, url):
+        title = url_basename(url)
+        webpage = self._download_webpage(url, title)
+        mgid = self._extract_mgid(webpage)
+        return self._get_videos_info(mgid)
+
+
+class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
+    IE_NAME = 'mtvservices:embedded'
+    _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)'
+
+    _TEST = {
+        # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
+        'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906',
+        'md5': 'cb349b21a7897164cede95bd7bf3fbb9',
+        'info_dict': {
+            'id': '1043906',
+            'ext': 'mp4',
+            'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds',
+            'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.',
+            'timestamp': 1400126400,
+            'upload_date': '20140515',
+        },
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _get_feed_url(self, uri):
+        video_id = self._id_from_uri(uri)
+        config = self._download_json(
+            'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id)
+        return self._remove_template_parameter(config['feedWithQueryParams'])
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        mgid = mobj.group('mgid')
+        return self._get_videos_info(mgid)
+
+
+class MTVIE(MTVServicesInfoExtractor):
+    IE_NAME = 'mtv'
+    _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)'
+    _FEED_URL = 'http://www.mtv.com/feeds/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer',
+        'md5': '1edbcdf1e7628e414a8c5dcebca3d32b',
+        'info_dict': {
+            'id': '5e14040d-18a4-47c4-a582-43ff602de88e',
+            'ext': 'mp4',
+            'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer',
+            'description': '"Unlocking the Truth" premieres August 17th at 11/10c.',
+            'timestamp': 1468846800,
+            'upload_date': '20160718',
+        },
+    }, {
+        'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713',
+        'only_matching': True,
+    }]
+
+
+class MTVJapanIE(MTVServicesInfoExtractor):
+    IE_NAME = 'mtvjapan'
+    _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P<id>[0-9a-z]+)'
+
+    _TEST = {
+        'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade',
+        'info_dict': {
+            'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5',
+            'ext': 'mp4',
+            'title': '【Fresh Info】Cadillac ESCALADE Sport Edition',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+    _GEO_COUNTRIES = ['JP']
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
+
+    def _get_feed_query(self, uri):
+        return {
+            'arcEp': 'mtvjapan.com',
+            'mgid': uri,
+        }
+
+
+class MTVVideoIE(MTVServicesInfoExtractor):
+    IE_NAME = 'mtv:video'
+    _VALID_URL = r'''(?x)^https?://
+        (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|
+           m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''
+
+    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+
+    _TESTS = [
+        {
+            'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+            'md5': '850f3f143316b1e71fa56a4edfd6e0f8',
+            'info_dict': {
+                'id': '853555',
+                'ext': 'mp4',
+                'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
+                'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+                'timestamp': 1352610000,
+                'upload_date': '20121111',
+            },
+        },
+    ]
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        return 'http://mtv.mtvnimages.com/uri/' + uri
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        uri = mobj.groupdict().get('mgid')
+        if uri is None:
+            webpage = self._download_webpage(url, video_id)
+
+            # Some videos come from Vevo.com
+            m_vevo = re.search(
+                r'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";', webpage)
+            if m_vevo:
+                vevo_id = m_vevo.group(1)
+                self.to_screen('Vevo video detected: %s' % vevo_id)
+                return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+
+            uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri')
+        return self._get_videos_info(uri)
+
+
+class MTVDEIE(MTVServicesInfoExtractor):
+    IE_NAME = 'mtv.de'
+    _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)'
+    _TESTS = [{
+        'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum',
+        'info_dict': {
+            'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5',
+            'ext': 'mp4',
+            'title': 'Traum',
+            'description': 'Traum',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'skip': 'Blocked at Travis CI',
+    }, {
+        # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
+        'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1',
+        'info_dict': {
+            'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285',
+            'ext': 'mp4',
+            'title': 'Teen Mom 2',
+            'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'skip': 'Blocked at Travis CI',
+    }, {
+        'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3',
+        'info_dict': {
+            'id': 'local_playlist-4e760566473c4c8c5344',
+            'ext': 'mp4',
+            'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1',
+            'description': 'MTV Movies Supercut',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'skip': 'Das Video kann zur Zeit nicht abgespielt werden.',
+    }]
+    _GEO_COUNTRIES = ['DE']
+    _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
+
+    def _get_feed_query(self, uri):
+        return {
+            'arcEp': 'mtv.de',
+            'mgid': uri,
+        }
diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py
new file mode 100644 (file)
index 0000000..2cc2bf2
--- /dev/null
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+)
+
+
+class MuenchenTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?muenchen\.tv/livestream'
+    IE_DESC = 'münchen.tv'
+    _TEST = {
+        'url': 'http://www.muenchen.tv/livestream/',
+        'info_dict': {
+            'id': '5334',
+            'display_id': 'live',
+            'ext': 'mp4',
+            'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+            'thumbnail': r're:^https?://.*\.jpg$'
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = 'live'
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._live_title(self._og_search_title(webpage))
+
+        data_js = self._search_regex(
+            r'(?s)\nplaylist:\s*(\[.*?}\]),',
+            webpage, 'playlist configuration')
+        data_json = js_to_json(data_js)
+        data = json.loads(data_json)[0]
+
+        video_id = data['mediaid']
+        thumbnail = data.get('image')
+
+        formats = []
+        for format_num, s in enumerate(data['sources']):
+            ext = determine_ext(s['file'], None)
+            label_str = s.get('label')
+            if label_str is None:
+                label_str = '_%d' % format_num
+
+            if ext is None:
+                format_id = label_str
+            else:
+                format_id = '%s-%s' % (ext, label_str)
+
+            formats.append({
+                'url': s['file'],
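+                # 'label' appears to carry the bitrate in kbit/s (assumption)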
+                'tbr': int_or_none(s.get('label')),
+                'ext': 'mp4',
+                'format_id': format_id,
+                'preference': -100 if '.smil' in s['file'] else 0,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'is_live': True,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py
new file mode 100644 (file)
index 0000000..a672765
--- /dev/null
@@ -0,0 +1,90 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class MwaveIE(InfoExtractor):
+    _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)'
+    _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s'
+    _TESTS = [{
+        'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859',
+        # md5 is unstable
+        'info_dict': {
+            'id': '168859',
+            'ext': 'flv',
+            'title': '[M COUNTDOWN] SISTAR - SHAKE IT',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'M COUNTDOWN',
+            'duration': 206,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://mwave.interest.me/en/mnettv/videodetail.m?searchVideoDetailVO.clip_id=176199',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        vod_info = self._download_json(
+            'http://mwave.interest.me/onair/vod_info.m?vodtype=CL&sectorid=&endinfo=Y&id=%s' % video_id,
+            video_id, 'Downloading VOD JSON')
+
+        formats = []
+        for num, cdn_info in enumerate(vod_info['cdn']):
+            stream_url = cdn_info.get('url')
+            if not stream_url:
+                continue
+            stream_name = cdn_info.get('name') or compat_str(num)
+            f4m_stream = self._download_json(
+                stream_url, video_id,
+                'Downloading %s stream JSON' % stream_name)
+            f4m_url = f4m_stream.get('fileurl')
+            if not f4m_url:
+                continue
+            formats.extend(
+                self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': vod_info['title'],
+            'thumbnail': vod_info.get('cover'),
+            'uploader': vod_info.get('program_title'),
+            'duration': parse_duration(vod_info.get('time')),
+            'view_count': int_or_none(vod_info.get('hit')),
+            'formats': formats,
+        }
+
+
+class MwaveMeetGreetIE(InfoExtractor):
+    _VALID_URL = r'https?://mwave\.interest\.me/(?:[^/]+/)?meetgreet/view/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://mwave.interest.me/meetgreet/view/256',
+        'info_dict': {
+            'id': '173294',
+            'ext': 'flv',
+            'title': '[MEET&GREET] Park BoRam',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Mwave',
+            'duration': 3634,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://mwave.interest.me/en/meetgreet/view/256',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        clip_id = self._html_search_regex(
+            r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)',
+            webpage, 'clip ID')
+        clip_url = MwaveIE._URL_TEMPLATE % clip_id
+        return self.url_result(clip_url, 'Mwave', clip_id)
diff --git a/youtube_dl/extractor/mychannels.py b/youtube_dl/extractor/mychannels.py
new file mode 100644 (file)
index 0000000..b1ffe78
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MyChannelsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?mychannels\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://mychannels.com/missholland/miss-holland?production_id=3416',
+        'md5': 'b8993daad4262dd68d89d651c0c52c45',
+        'info_dict': {
+            'id': 'wUUDZZep6vQD',
+            'ext': 'mp4',
+            'title': 'Miss Holland joins VOTE LEAVE',
+            'description': 'Miss Holland | #13 Not a potato',
+            'uploader': 'Miss Holland',
+        }
+    }
+
+    def _real_extract(self, url):
+        id_type, url_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, url_id)
+        video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data')
+
+        def extract_data_val(attr, fatal=False):
+            return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal)
+        minoto_id = extract_data_val('minoto-id') or self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'minoto:%s' % minoto_id,
+            'id': url_id,
+            'title': extract_data_val('title', True),
+            'description': extract_data_val('description'),
+            'thumbnail': extract_data_val('image'),
+            'uploader': extract_data_val('channel'),
+        }
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
new file mode 100644 (file)
index 0000000..e164d59
--- /dev/null
@@ -0,0 +1,212 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class MySpaceIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        myspace\.com/[^/]+/
+                        (?P<mediatype>
+                            video/[^/]+/(?P<video_id>\d+)|
+                            music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$)
+                        )
+                    '''
+
+    _TESTS = [{
+        'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
+        'md5': '9c1483c106f4a695c47d2911feed50a7',
+        'info_dict': {
+            'id': '109594919',
+            'ext': 'mp4',
+            'title': 'Little Big Town',
+            'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
+            'uploader': 'Five Minutes to the Stage',
+            'uploader_id': 'fiveminutestothestage',
+            'timestamp': 1414108751,
+            'upload_date': '20141023',
+        },
+    }, {
+        # songs
+        'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
+        'md5': '1d7ee4604a3da226dd69a123f748b262',
+        'info_dict': {
+            'id': '93388656',
+            'ext': 'm4a',
+            'title': 'Of weakened soul...',
+            'uploader': 'Killsorrow',
+            'uploader_id': 'killsorrow',
+        },
+    }, {
+        'add_ie': ['Youtube'],
+        'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
+        'info_dict': {
+            'id': 'xqds0B_meys',
+            'ext': 'webm',
+            'title': 'Three Days Grace - Animal I Have Become',
+            'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
+            'uploader': 'ThreeDaysGraceVEVO',
+            'uploader_id': 'ThreeDaysGraceVEVO',
+            'upload_date': '20091002',
+        },
+    }, {
+        'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
+        'only_matching': True,
+    }, {
+        'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id') or mobj.group('song_id')
+        is_song = mobj.group('mediatype').startswith('music/song')
+        webpage = self._download_webpage(url, video_id)
+        player_url = self._search_regex(
+            r'videoSwf":"([^"?]*)', webpage, 'player URL', fatal=False)
+
+        def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width=None, height=None):
+            formats = []
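+            # songs are audio-only, so advertise no video codec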
+            vcodec = 'none' if is_song else None
+            if hls_stream_url:
+                formats.append({
+                    'format_id': 'hls',
+                    'url': hls_stream_url,
+                    'protocol': 'm3u8_native',
+                    'ext': 'm4a' if is_song else 'mp4',
+                    'vcodec': vcodec,
+                })
+            if stream_url and player_url:
+                rtmp_url, play_path = stream_url.split(';', 1)
+                formats.append({
+                    'format_id': 'rtmp',
+                    'url': rtmp_url,
+                    'play_path': play_path,
+                    'player_url': player_url,
+                    'protocol': 'rtmp',
+                    'ext': 'flv',
+                    'width': width,
+                    'height': height,
+                    'vcodec': vcodec,
+                })
+            if http_stream_url:
+                formats.append({
+                    'format_id': 'http',
+                    'url': http_stream_url,
+                    'width': width,
+                    'height': height,
+                    'vcodec': vcodec,
+                })
+            return formats
+
+        if is_song:
+            # songs don't store any useful info in the 'context' variable
+            song_data = self._search_regex(
+                r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id,
+                webpage, 'song_data', default=None, group=0)
+            if song_data is None:
+                # some songs in an album are not playable
+                self.report_warning(
+                    '%s: No downloadable song on this page' % video_id)
+                return
+
+            def search_data(name):
+                return self._search_regex(
+                    r'''data-%s=([\'"])(?P<data>.*?)\1''' % name,
+                    song_data, name, default='', group='data')
+            formats = formats_from_stream_urls(
+                search_data('stream-url'), search_data('hls-stream-url'),
+                search_data('http-stream-url'))
+            if not formats:
+                vevo_id = search_data('vevo-id')
+                youtube_id = search_data('youtube-id')
+                if vevo_id:
+                    self.to_screen('Vevo video detected: %s' % vevo_id)
+                    return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
+                elif youtube_id:
+                    self.to_screen('Youtube video detected: %s' % youtube_id)
+                    return self.url_result(youtube_id, ie='Youtube')
+                else:
+                    raise ExtractorError(
+                        'Found song but don\'t know how to download it')
+            self._sort_formats(formats)
+            return {
+                'id': video_id,
+                'title': self._og_search_title(webpage),
+                'uploader': search_data('artist-name'),
+                'uploader_id': search_data('artist-username'),
+                'thumbnail': self._og_search_thumbnail(webpage),
+                'duration': int_or_none(search_data('duration')),
+                'formats': formats,
+            }
+        else:
+            video = self._parse_json(self._search_regex(
+                r'context = ({.*?});', webpage, 'context'),
+                video_id)['video']
+            formats = formats_from_stream_urls(
+                video.get('streamUrl'), video.get('hlsStreamUrl'),
+                video.get('mp4StreamUrl'), int_or_none(video.get('width')),
+                int_or_none(video.get('height')))
+            self._sort_formats(formats)
+            return {
+                'id': video_id,
+                'title': video['title'],
+                'description': video.get('description'),
+                'thumbnail': video.get('imageUrl'),
+                'uploader': video.get('artistName'),
+                'uploader_id': video.get('artistUsername'),
+                'duration': int_or_none(video.get('duration')),
+                'timestamp': parse_iso8601(video.get('dateAdded')),
+                'formats': formats,
+            }
+
+
+class MySpaceAlbumIE(InfoExtractor):
+    IE_NAME = 'MySpace:album'
+    _VALID_URL = r'https?://myspace\.com/([^/]+)/music/album/(?P<title>.*-)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://myspace.com/starset2/music/album/transmissions-19455773',
+        'info_dict': {
+            'title': 'Transmissions',
+            'id': '19455773',
+        },
+        'playlist_count': 14,
+        'skip': 'this album is only available in some countries',
+    }, {
+        'url': 'https://myspace.com/killsorrow/music/album/the-demo-18596029',
+        'info_dict': {
+            'title': 'The Demo',
+            'id': '18596029',
+        },
+        'playlist_count': 5,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        display_id = mobj.group('title') + playlist_id
+        webpage = self._download_webpage(url, display_id)
+        tracks_paths = re.findall(r'"music:song" content="(.*?)"', webpage)
+        if not tracks_paths:
+            raise ExtractorError(
+                '%s: No songs found, try using proxy' % display_id,
+                expected=True)
+        entries = [
+            self.url_result(t_path, ie=MySpaceIE.ie_key())
+            for t_path in tracks_paths]
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'display_id': display_id,
+            'title': self._og_search_title(webpage),
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
new file mode 100644 (file)
index 0000000..db7ebc9
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    xpath_text,
+)
+
+
+class MySpassIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
+        'md5': '0b49f4844a068f8b33f4b7c88405862b',
+        'info_dict': {
+            'id': '11741',
+            'ext': 'mp4',
+            'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
+            'title': '17.02.2013 - Die Highlights, Teil 2',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        metadata = self._download_xml(
+            'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id,
+            video_id)
+
+        title = xpath_text(metadata, 'title', fatal=True)
+        video_url = xpath_text(metadata, 'url_flv', 'download url', True)
+        video_id_int = int(video_id)
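+        # path segments larger than the video id look like scaled bucket values;
+        # dividing by the id appears to recover the real segment (assumption)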
+        for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups():
+            group_int = int(group)
+            if group_int > video_id_int:
+                video_url = video_url.replace(
+                    group, compat_str(group_int // video_id_int))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': xpath_text(metadata, 'imagePreview'),
+            'description': xpath_text(metadata, 'description'),
+            'duration': parse_duration(xpath_text(metadata, 'duration')),
+            'series': xpath_text(metadata, 'format'),
+            'season_number': int_or_none(xpath_text(metadata, 'season')),
+            'season_id': xpath_text(metadata, 'season_id'),
+            'episode': title,
+            'episode_number': int_or_none(xpath_text(metadata, 'episode')),
+        }
diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py
new file mode 100644 (file)
index 0000000..75d2863
--- /dev/null
@@ -0,0 +1,111 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .vimple import SprutoBaseIE
+
+
+class MyviIE(SprutoBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            https?://
+                                (?:www\.)?
+                                myvi\.
+                                (?:
+                                    (?:ru/player|tv)/
+                                    (?:
+                                        (?:
+                                            embed/html|
+                                            flash|
+                                            api/Video/Get
+                                        )/|
+                                        content/preloader\.swf\?.*\bid=
+                                    )|
+                                    ru/watch/
+                                )|
+                            myvi:
+                        )
+                        (?P<id>[\da-zA-Z_-]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+        'md5': '571bbdfba9f9ed229dc6d34cc0f335bf',
+        'info_dict': {
+            'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43',
+            'ext': 'mp4',
+            'title': 'хозяин жизни',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 25,
+        },
+    }, {
+        'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0',
+        'only_matching': True,
+    }, {
+        'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.myvi.ru/watch/YwbqszQynUaHPn_s82sx0Q2',
+        'only_matching': True,
+    }, {
+        'url': 'myvi:YwbqszQynUaHPn_s82sx0Q2',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        spruto = self._download_json(
+            'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData']
+
+        return self._extract_spruto(spruto, video_id)
+
+
+class MyviEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?myvi\.tv/(?:[^?]+\?.*?\bv=|embed/)(?P<id>[\da-z]+)'
+    _TESTS = [{
+        'url': 'https://www.myvi.tv/embed/ccdqic3wgkqwpb36x9sxg43t4r',
+        'info_dict': {
+            'id': 'b3ea0663-3234-469d-873e-7fecf36b31d1',
+            'ext': 'mp4',
+            'title': 'Твоя (original song).mp4',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 277,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.myvi.tv/idmi6o?v=ccdqic3wgkqwpb36x9sxg43t4r#watch',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
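+        # defer to MyviIE for URLs that both extractors would match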
+        return False if MyviIE.suitable(url) else super(MyviEmbedIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://www.myvi.tv/embed/%s' % video_id, video_id)
+
+        myvi_id = self._search_regex(
+            r'CreatePlayer\s*\(\s*["\'].*?\bv=([\da-zA-Z_]+)',
+            webpage, 'video id')
+
+        return self.url_result('myvi:%s' % myvi_id, ie=MyviIE.ie_key())
diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py
new file mode 100644 (file)
index 0000000..2117d30
--- /dev/null
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MyVidsterIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/'
+
+    _TEST = {
+        'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making',
+        'md5': '95296d0231c1363222c3441af62dc4ca',
+        'info_dict': {
+            'id': '3685814',
+            'title': 'md5:7d8427d6d02c4fbcef50fe269980c749',
+            'upload_date': '20141027',
+            'uploader': 'utkualp',
+            'ext': 'mp4',
+            'age_limit': 18,
+        },
+        'add_ie': ['XHamster'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        return self.url_result(self._html_search_regex(
+            r'rel="videolink" href="(?P<real_url>.*)">',
+            webpage, 'real video url'))
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
new file mode 100644 (file)
index 0000000..ee12e2b
--- /dev/null
@@ -0,0 +1,82 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .fox import FOXIE
+from ..utils import (
+    smuggle_url,
+    url_basename,
+)
+
+
+class NationalGeographicVideoIE(InfoExtractor):
+    IE_NAME = 'natgeo:video'
+    _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?'
+
+    _TESTS = [
+        {
+            'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
+            'md5': '730855d559abbad6b42c2be1fa584917',
+            'info_dict': {
+                'id': '0000014b-70a1-dd8c-af7f-f7b559330001',
+                'ext': 'mp4',
+                'title': 'Mating Crabs Busted by Sharks',
+                'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+                'timestamp': 1423523799,
+                'upload_date': '20150209',
+                'uploader': 'NAGS',
+            },
+            'add_ie': ['ThePlatform'],
+        },
+        {
+            'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws',
+            'md5': '6a3105eb448c070503b3105fb9b320b5',
+            'info_dict': {
+                'id': 'ngc-I0IauNSWznb_UV008GxSbwY35BZvgi2e',
+                'ext': 'mp4',
+                'title': 'The Real Jaws',
+                'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6',
+                'timestamp': 1433772632,
+                'upload_date': '20150608',
+                'uploader': 'NAGS',
+            },
+            'add_ie': ['ThePlatform'],
+        },
+    ]
+
+    def _real_extract(self, url):
+        name = url_basename(url)
+
+        webpage = self._download_webpage(url, name)
+        guid = self._search_regex(
+            r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"',
+            webpage, 'guid')
+
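+        # Hand off to ThePlatform; force_smil_url makes it treat the link as
+        # a direct SMIL manifest URL instead of resolving the page first.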
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(
+                'http://link.theplatform.com/s/ngs/media/guid/2423130747/%s?mbr=true' % guid,
+                {'force_smil_url': True}),
+            'id': guid,
+        }
+
+
+class NationalGeographicTVIE(FOXIE):
+    _VALID_URL = r'https?://(?:www\.)?nationalgeographic\.com/tv/watch/(?P<id>[\da-fA-F]+)'
+    _TESTS = [{
+        'url': 'https://www.nationalgeographic.com/tv/watch/6a875e6e734b479beda26438c9f21138/',
+        'info_dict': {
+            'id': '6a875e6e734b479beda26438c9f21138',
+            'ext': 'mp4',
+            'title': 'Why Nat Geo? Valley of the Boom',
+            'description': 'The lives of prominent figures in the tech world, including their friendships, rivalries, victories and failures.',
+            'timestamp': 1542662458,
+            'upload_date': '20181119',
+            'age_limit': 14,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+    _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/'
+    _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd'
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
new file mode 100644 (file)
index 0000000..61fc591
--- /dev/null
+++ b/youtube_dl/extractor/naver.py
@@ -0,0 +1,166 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    dict_get,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    try_get,
+    update_url_query,
+)
+
+
+class NaverBaseIE(InfoExtractor):
+    _CAPTION_EXT_RE = r'\.(?:ttml|vtt)'
+
+    def _extract_video_info(self, video_id, vid, key):
+        video_data = self._download_json(
+            'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid,
+            video_id, query={
+                'key': key,
+            })
+        meta = video_data['meta']
+        title = meta['subject']
+        formats = []
+        get_list = lambda x: try_get(video_data, lambda y: y[x + 's']['list'], list) or []
+
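+        # Turn a list of raw stream entries into youtube-dl format dicts;
+        # each entry carries its own source URL, dimensions and bitrates.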
+        def extract_formats(streams, stream_type, query={}):
+            for stream in streams:
+                stream_url = stream.get('source')
+                if not stream_url:
+                    continue
+                stream_url = update_url_query(stream_url, query)
+                encoding_option = stream.get('encodingOption', {})
+                bitrate = stream.get('bitrate', {})
+                formats.append({
+                    'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))),
+                    'url': stream_url,
+                    'width': int_or_none(encoding_option.get('width')),
+                    'height': int_or_none(encoding_option.get('height')),
+                    'vbr': int_or_none(bitrate.get('video')),
+                    'abr': int_or_none(bitrate.get('audio')),
+                    'filesize': int_or_none(stream.get('size')),
+                    'protocol': 'm3u8_native' if stream_type == 'HLS' else None,
+                })
+
+        extract_formats(get_list('video'), 'H264')
+        for stream_set in video_data.get('streams', []):
+            query = {}
+            for param in stream_set.get('keys', []):
+                query[param['name']] = param['value']
+            stream_type = stream_set.get('type')
+            videos = stream_set.get('videos')
+            if videos:
+                extract_formats(videos, stream_type, query)
+            elif stream_type == 'HLS':
+                stream_url = stream_set.get('source')
+                if not stream_url:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    update_url_query(stream_url, query), video_id,
+                    'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False))
+        self._sort_formats(formats)
+
+        replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' + y, x)
+
+        def get_subs(caption_url):
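+            # Captions whose URL ends in .ttml or .vtt are available in both
+            # formats; return URLs for the two variants.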
+            if re.search(self._CAPTION_EXT_RE, caption_url):
+                return [{
+                    'url': replace_ext(caption_url, 'ttml'),
+                }, {
+                    'url': replace_ext(caption_url, 'vtt'),
+                }]
+            else:
+                return [{'url': caption_url}]
+
+        automatic_captions = {}
+        subtitles = {}
+        for caption in get_list('caption'):
+            caption_url = caption.get('source')
+            if not caption_url:
+                continue
+            sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles
+            sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url))
+
+        user = meta.get('user', {})
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'automatic_captions': automatic_captions,
+            'thumbnail': try_get(meta, lambda x: x['cover']['source']),
+            'view_count': int_or_none(meta.get('count')),
+            'uploader_id': user.get('id'),
+            'uploader': user.get('name'),
+            'uploader_url': user.get('url'),
+        }
+
+
+class NaverIE(NaverBaseIE):
+    _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P<id>\d+)'
+    _GEO_BYPASS = False
+    _TESTS = [{
+        'url': 'http://tv.naver.com/v/81652',
+        'info_dict': {
+            'id': '81652',
+            'ext': 'mp4',
+            'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
+            'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
+            'timestamp': 1378200754,
+            'upload_date': '20130903',
+            'uploader': '메가스터디, 합격불변의 법칙',
+            'uploader_id': 'megastudy',
+        },
+    }, {
+        'url': 'http://tv.naver.com/v/395837',
+        'md5': '8a38e35354d26a17f73f4e90094febd3',
+        'info_dict': {
+            'id': '395837',
+            'ext': 'mp4',
+            'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
+            'description': 'md5:eb6aca9d457b922e43860a2a2b1984d3',
+            'timestamp': 1432030253,
+            'upload_date': '20150519',
+            'uploader': '4가지쇼 시즌2',
+            'uploader_id': 'wrappinguser29',
+        },
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://tvcast.naver.com/v/81652',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        content = self._download_json(
+            'https://tv.naver.com/api/json/v/' + video_id,
+            video_id, headers=self.geo_verification_headers())
+        player_info_json = content.get('playerInfoJson') or {}
+        current_clip = player_info_json.get('currentClip') or {}
+
+        vid = current_clip.get('videoId')
+        in_key = current_clip.get('inKey')
+
+        if not vid or not in_key:
+            player_auth = try_get(player_info_json, lambda x: x['playerOption']['auth'])
+            if player_auth == 'notCountry':
+                self.raise_geo_restricted(countries=['KR'])
+            elif player_auth == 'notLogin':
+                self.raise_login_required()
+            raise ExtractorError('couldn\'t extract vid and key')
+        info = self._extract_video_info(video_id, vid, in_key)
+        info.update({
+            'description': clean_html(current_clip.get('description')),
+            'timestamp': int_or_none(current_clip.get('firstExposureTime'), 1000),
+            'duration': parse_duration(current_clip.get('displayPlayTime')),
+            'like_count': int_or_none(current_clip.get('recommendPoint')),
+            'age_limit': 19 if current_clip.get('adult') else None,
+        })
+        return info
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
new file mode 100644 (file)
index 0000000..be295a7
--- /dev/null
+++ b/youtube_dl/extractor/nba.py
@@ -0,0 +1,154 @@
+from __future__ import unicode_literals
+
+import functools
+import re
+
+from .turner import TurnerBaseIE
+from ..compat import (
+    compat_urllib_parse_urlencode,
+    compat_urlparse,
+)
+from ..utils import (
+    OnDemandPagedList,
+    remove_start,
+)
+
+
+class NBAIE(TurnerBaseIE):
+    _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
+    _TESTS = [{
+        'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
+        'md5': '9e7729d3010a9c71506fd1248f74e4f4',
+        'info_dict': {
+            'id': '0021200253-okc-bkn-recap',
+            'ext': 'mp4',
+            'title': 'Thunder vs. Nets',
+            'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
+            'duration': 181,
+            'timestamp': 1354638466,
+            'upload_date': '20121204',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
+        'only_matching': True,
+    }, {
+        'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+        'md5': 'b2b39b81cf28615ae0c3360a3f9668c4',
+        'info_dict': {
+            'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+            'ext': 'mp4',
+            'title': 'Hawks vs. Cavaliers Game 1',
+            'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d',
+            'duration': 228,
+            'timestamp': 1432134543,
+            'upload_date': '20150520',
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake',
+        'info_dict': {
+            'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324',
+            'ext': 'mp4',
+            'title': 'Practice: Doc Rivers - 2/16/16',
+            'description': 'Head Coach Doc Rivers addresses the media following practice.',
+            'upload_date': '20160216',
+            'timestamp': 1455672000,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }, {
+        'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+        'info_dict': {
+            'id': 'timberwolves',
+            'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+        },
+        'playlist_count': 30,
+        'params': {
+            # Downloading the whole playlist takes too long
+            'playlist_items': '1-30',
+        },
+    }, {
+        'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+        'info_dict': {
+            'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601',
+            'ext': 'mp4',
+            'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+            'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.',
+            'upload_date': '20141212',
+            'timestamp': 1418418600,
+        },
+        'params': {
+            'noplaylist': True,
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest'],
+    }]
+
+    _PAGE_SIZE = 30
+
+    def _fetch_page(self, team, video_id, page):
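+        # Page through the NBA search API: 'start' is the 1-based index of
+        # the first result and 'npp' is set one past the last result of the
+        # requested page.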
+        search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse_urlencode({
+            'type': 'teamvideo',
+            'start': page * self._PAGE_SIZE + 1,
+            'npp': (page + 1) * self._PAGE_SIZE + 1,
+            'sort': 'recent',
+            'output': 'json',
+            'site': team,
+        })
+        results = self._download_json(
+            search_url, video_id, note='Downloading page %d of playlist data' % page)['results'][0]
+        for item in results:
+            yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url']))
+
+    def _extract_playlist(self, orig_path, video_id, webpage):
+        team = orig_path.split('/')[0]
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen('Downloading just video because of --no-playlist')
+            video_path = self._search_regex(
+                r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path')
+            video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path)
+            return self.url_result(video_url)
+
+        self.to_screen('Downloading playlist - add --no-playlist to just download video')
+        playlist_title = self._og_search_title(webpage, fatal=False)
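+        # OnDemandPagedList fetches result pages lazily, so options like
+        # --playlist-items only download the pages that are actually needed.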
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, team, video_id),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(entries, team, playlist_title)
+
+    def _real_extract(self, url):
+        path, video_id = re.match(self._VALID_URL, url).groups()
+        orig_path = path
+        if path.startswith('nba/'):
+            path = path[3:]
+
+        if 'video/' not in path:
+            webpage = self._download_webpage(url, video_id)
+            path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/')
+
+            if path == '{{id}}':
+                return self._extract_playlist(orig_path, video_id, webpage)
+
+            # See prepareContentId() of pkgCvp.js
+            if path.startswith('video/teams'):
+                path = 'video/channels/proxy/' + path[6:]
+
+        return self._extract_cvp_info(
+            'http://www.nba.com/%s.xml' % path, video_id, {
+                'default': {
+                    'media_src': 'http://nba.cdn.turner.com/nba/big',
+                },
+                'm3u8': {
+                    'media_src': 'http://nbavod-f.akamaihd.net',
+                },
+            })
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
new file mode 100644 (file)
index 0000000..6f3cb30
--- /dev/null
+++ b/youtube_dl/extractor/nbc.py
@@ -0,0 +1,541 @@
+from __future__ import unicode_literals
+
+import base64
+import json
+import re
+
+from .common import InfoExtractor
+from .theplatform import ThePlatformIE
+from .adobepass import AdobePassIE
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    parse_duration,
+    smuggle_url,
+    try_get,
+    unified_timestamp,
+    update_url_query,
+)
+
+
+class NBCIE(AdobePassIE):
+    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))'
+
+    _TESTS = [
+        {
+            'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237',
+            'info_dict': {
+                'id': '2848237',
+                'ext': 'mp4',
+                'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
+                'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
+                'timestamp': 1424246400,
+                'upload_date': '20150218',
+                'uploader': 'NBCU-COM',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
+            'info_dict': {
+                'id': '2832821',
+                'ext': 'mp4',
+                'title': 'Star Wars Teaser',
+                'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
+                'timestamp': 1417852800,
+                'upload_date': '20141206',
+                'uploader': 'NBCU-COM',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'skip': 'Only works from US',
+        },
+        {
+            # HLS streams require the 'hdnea3' cookie
+            'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
+            'info_dict': {
+                'id': '101528f5a9e8127b107e98c5e6ce4638',
+                'ext': 'mp4',
+                'title': 'Goliath',
+                'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
+                'timestamp': 1237100400,
+                'upload_date': '20090315',
+                'uploader': 'NBCU-COM',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from US',
+        },
+        {
+            'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310',
+            'only_matching': True,
+        },
+        {
+            # Percent-escaped URL
+            'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        permalink, video_id = re.match(self._VALID_URL, url).groups()
+        permalink = 'http' + compat_urllib_parse_unquote(permalink)
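+        # Metadata comes from NBC's GraphQL endpoint; the bonanzaPage query
+        # is keyed by the normalized page permalink rather than the numeric
+        # video id.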
+        video_data = self._download_json(
+            'https://friendship.nbc.co/v2/graphql', video_id, query={
+                'query': '''query bonanzaPage(
+  $app: NBCUBrands! = nbc
+  $name: String!
+  $oneApp: Boolean
+  $platform: SupportedPlatforms! = web
+  $type: EntityPageType! = VIDEO
+  $userId: String!
+) {
+  bonanzaPage(
+    app: $app
+    name: $name
+    oneApp: $oneApp
+    platform: $platform
+    type: $type
+    userId: $userId
+  ) {
+    metadata {
+      ... on VideoPageData {
+        description
+        episodeNumber
+        keywords
+        locked
+        mpxAccountId
+        mpxGuid
+        rating
+        resourceId
+        seasonNumber
+        secondaryTitle
+        seriesShortTitle
+      }
+    }
+  }
+}''',
+                'variables': json.dumps({
+                    'name': permalink,
+                    'oneApp': True,
+                    'userId': '0',
+                }),
+            })['data']['bonanzaPage']['metadata']
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
+        video_id = video_data['mpxGuid']
+        title = video_data['secondaryTitle']
+        if video_data.get('locked'):
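+            # Entitled content: build the Adobe Pass resource string and
+            # attach an MVPD auth token to the ThePlatform query.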
+            resource = self._get_mvpd_resource(
+                video_data.get('resourceId') or 'nbcentertainment',
+                title, video_id, video_data.get('rating'))
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, 'nbcentertainment', resource)
+        theplatform_url = smuggle_url(update_url_query(
+            'http://link.theplatform.com/s/NnzsPC/media/guid/%s/%s' % (video_data.get('mpxAccountId') or '2410887629', video_id),
+            query), {'force_smil_url': True})
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'url': theplatform_url,
+            'description': video_data.get('description'),
+            'tags': video_data.get('keywords'),
+            'season_number': int_or_none(video_data.get('seasonNumber')),
+            'episode_number': int_or_none(video_data.get('episodeNumber')),
+            'episode': title,
+            'series': video_data.get('seriesShortTitle'),
+            'ie_key': 'ThePlatform',
+        }
+
+
+class NBCSportsVPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+
+    _TESTS = [{
+        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
+        'info_dict': {
+            'id': '9CsDKds0kvHI',
+            'ext': 'mp4',
+            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+            'timestamp': 1426270238,
+            'upload_date': '20150313',
+            'uploader': 'NBCU-SPORTS',
+        }
+    }, {
+        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        iframe_m = re.search(
+            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+        if iframe_m:
+            return iframe_m.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        theplatform_url = self._og_search_video_url(webpage).replace(
+            'vplayer.nbcsports.com', 'player.theplatform.com')
+        return self.url_result(theplatform_url, 'ThePlatform')
+
+
+class NBCSportsIE(InfoExtractor):
+    # Note: the site's https certificate is invalid; test URLs use plain http
+    _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+
+    _TEST = {
+        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
+        'info_dict': {
+            'id': 'PHJSaFWbrTY9',
+            'ext': 'flv',
+            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
+            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+            'uploader': 'NBCU-SPORTS',
+            'upload_date': '20150330',
+            'timestamp': 1427726529,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        return self.url_result(
+            NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
+
+
+class NBCSportsStreamIE(AdobePassIE):
+    _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559',
+        'info_dict': {
+            'id': '206559',
+            'ext': 'mp4',
+            'title': 'Amgen Tour of California Women\'s Recap',
+            'description': 'md5:66520066b3b5281ada7698d0ea2aa894',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Requires Adobe Pass Authentication',
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        live_source = self._download_json(
+            'http://stream.nbcsports.com/data/live_sources_%s.json' % video_id,
+            video_id)
+        video_source = live_source['videoSources'][0]
+        title = video_source['title']
+        source_url = None
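+        # Try the known source keys (and their 'Alt' variants) in order;
+        # the for/else falls back to 'ottStreamUrl' only when none of them
+        # yields a URL.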
+        for k in ('source', 'msl4source', 'iossource', 'hlsv4'):
+            sk = k + 'Url'
+            source_url = video_source.get(sk) or video_source.get(sk + 'Alt')
+            if source_url:
+                break
+        else:
+            source_url = video_source['ottStreamUrl']
+        is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live'
+        resource = self._get_mvpd_resource('nbcsports', title, video_id, '')
+        token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource)
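+        # Exchange the Adobe Pass token for a CDN-tokenized stream URL.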
+        tokenized_url = self._download_json(
+            'https://token.playmakerservices.com/cdn',
+            video_id, data=json.dumps({
+                'requestorId': 'nbcsports',
+                'pid': video_id,
+                'application': 'NBCSports',
+                'version': 'v1',
+                'platform': 'desktop',
+                'cdn': 'akamai',
+                'url': video_source['sourceUrl'],
+                'token': base64.b64encode(token.encode()).decode(),
+                'resourceId': base64.b64encode(resource.encode()).decode(),
+            }).encode())['tokenizedUrl']
+        formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4')
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': live_source.get('description'),
+            'formats': formats,
+            'is_live': is_live,
+        }
+
+
+class CSNNEIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P<id>[0-9a-z-]+)'
+
+    _TEST = {
+        'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter',
+        'info_dict': {
+            'id': 'yvBLLUgQ8WU0',
+            'ext': 'mp4',
+            'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.',
+            'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3',
+            'timestamp': 1459369979,
+            'upload_date': '20160330',
+            'uploader': 'NBCU-SPORTS',
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': self._html_search_meta('twitter:player:stream', webpage),
+            'display_id': display_id,
+        }
+
+
+class NBCNewsIE(ThePlatformIE):
+    _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
+            'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf',
+            'info_dict': {
+                'id': '269389891880',
+                'ext': 'mp4',
+                'title': 'How Twitter Reacted To The Snowden Interview',
+                'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+                'timestamp': 1401363060,
+                'upload_date': '20140529',
+            },
+        },
+        {
+            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
+            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
+            'info_dict': {
+                'id': '529953347624',
+                'ext': 'mp4',
+                'title': 'FULL EPISODE: Family Business',
+                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
+            },
+            'skip': 'This page is unavailable.',
+        },
+        {
+            'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
+            'md5': '8eb831eca25bfa7d25ddd83e85946548',
+            'info_dict': {
+                'id': '394064451844',
+                'ext': 'mp4',
+                'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
+                'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
+                'timestamp': 1423104900,
+                'upload_date': '20150205',
+            },
+        },
+        {
+            'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
+            'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0',
+            'info_dict': {
+                'id': 'n431456',
+                'ext': 'mp4',
+                'title': "Volkswagen U.S. Chief:  We 'Totally Screwed Up'",
+                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
+                'upload_date': '20150922',
+                'timestamp': 1442917800,
+            },
+        },
+        {
+            'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
+            'md5': '118d7ca3f0bea6534f119c68ef539f71',
+            'info_dict': {
+                'id': '669831235788',
+                'ext': 'mp4',
+                'title': 'See the aurora borealis from space in stunning new NASA video',
+                'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
+                'upload_date': '20160420',
+                'timestamp': 1461152093,
+            },
+        },
+        {
+            'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
+            'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
+            'info_dict': {
+                'id': '314487875924',
+                'ext': 'mp4',
+                'title': 'The chaotic GOP immigration vote',
+                'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'timestamp': 1406937606,
+                'upload_date': '20140802',
+            },
+        },
+        {
+            'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
+            'only_matching': True,
+        },
+        {
+            # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html
+            'url': 'http://www.nbcnews.com/widget/video-embed/701714499682',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
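+        # The page embeds its state as a JavaScript object in window.__data;
+        # parse it with js_to_json since it is not strict JSON.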
+        data = self._parse_json(self._search_regex(
+            r'window\.__data\s*=\s*({.+});', webpage,
+            'bootstrap json'), video_id, js_to_json)
+        video_data = try_get(data, lambda x: x['video']['current'], dict)
+        if not video_data:
+            video_data = data['article']['content'][0]['primaryMedia']['video']
+        title = video_data['headline']['primary']
+
+        formats = []
+        for va in video_data.get('videoAssets', []):
+            public_url = va.get('publicUrl')
+            if not public_url:
+                continue
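+            # Direct ThePlatform links need format=redirect to resolve to
+            # the actual media URL.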
+            if '://link.theplatform.com/' in public_url:
+                public_url = update_url_query(public_url, {'format': 'redirect'})
+            format_id = va.get('format')
+            if format_id == 'M3U':
+                formats.extend(self._extract_m3u8_formats(
+                    public_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id, fatal=False))
+                continue
+            tbr = int_or_none(va.get('bitrate'), 1000)
+            if tbr:
+                format_id += '-%d' % tbr
+            formats.append({
+                'format_id': format_id,
+                'url': public_url,
+                'width': int_or_none(va.get('width')),
+                'height': int_or_none(va.get('height')),
+                'tbr': tbr,
+                'ext': 'mp4',
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        closed_captioning = video_data.get('closedCaptioning')
+        if closed_captioning:
+            for cc_url in closed_captioning.values():
+                if not cc_url:
+                    continue
+                subtitles.setdefault('en', []).append({
+                    'url': cc_url,
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': try_get(video_data, lambda x: x['description']['primary']),
+            'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']),
+            'duration': parse_duration(video_data.get('duration')),
+            'timestamp': unified_timestamp(video_data.get('datePublished')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class NBCOlympicsIE(InfoExtractor):
+    IE_NAME = 'nbcolympics'
+    _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P<id>[a-z-]+)'
+
+    _TEST = {
+        # Geo-restricted to US
+        'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold',
+        'md5': '54fecf846d05429fbaa18af557ee523a',
+        'info_dict': {
+            'id': 'WjTBzDXx5AUq',
+            'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold',
+            'ext': 'mp4',
+            'title': 'Rose\'s son Leo was in tears after his dad won gold',
+            'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.',
+            'timestamp': 1471274964,
+            'upload_date': '20160815',
+            'uploader': 'NBCU-SPORTS',
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        drupal_settings = self._parse_json(self._search_regex(
+            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+            webpage, 'drupal settings'), display_id)
+
+        iframe_url = drupal_settings['vod']['iframe_url']
+        theplatform_url = iframe_url.replace(
+            'vplayer.nbcolympics.com', 'player.theplatform.com')
+
+        return {
+            '_type': 'url_transparent',
+            'url': theplatform_url,
+            'ie_key': ThePlatformIE.ie_key(),
+            'display_id': display_id,
+        }
+
+
+class NBCOlympicsStreamIE(AdobePassIE):
+    IE_NAME = 'nbcolympics:stream'
+    _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)'
+    _TEST = {
+        'url': 'http://stream.nbcolympics.com/2018-winter-olympics-nbcsn-evening-feb-8',
+        'info_dict': {
+            'id': '203493',
+            'ext': 'mp4',
+            'title': 're:Curling, Alpine, Luge [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+    _DATA_URL_TEMPLATE = 'http://stream.nbcolympics.com/data/%s_%s.json'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid')
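+        # The page assembles the Adobe Pass resource string via JavaScript
+        # concatenation; splice the pid into the "' + pid + '" placeholder.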
+        resource = self._search_regex(
+            r"resource\s*=\s*'(.+)';", webpage,
+            'resource').replace("' + pid + '", pid)
+        event_config = self._download_json(
+            self._DATA_URL_TEMPLATE % ('event_config', pid),
+            pid)['eventConfig']
+        title = self._live_title(event_config['eventTitle'])
+        source_url = self._download_json(
+            self._DATA_URL_TEMPLATE % ('live_sources', pid),
+            pid)['videoSources'][0]['sourceUrl']
+        media_token = self._extract_mvpd_auth(
+            url, pid, event_config.get('requestorId', 'NBCOlympics'), resource)
+        formats = self._extract_m3u8_formats(self._download_webpage(
+            'http://sp.auth.adobe.com/tvs/v1/sign', pid, query={
+                'cdn': 'akamai',
+                'mediaToken': base64.b64encode(media_token.encode()),
+                'resource': base64.b64encode(resource.encode()),
+                'url': source_url,
+            }), pid, 'mp4')
+        self._sort_formats(formats)
+
+        return {
+            'id': pid,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
new file mode 100644 (file)
index 0000000..2447c81
--- /dev/null
+++ b/youtube_dl/extractor/ndr.py
@@ -0,0 +1,402 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    merge_dicts,
+    parse_iso8601,
+    qualities,
+    try_get,
+    urljoin,
+)
+
+
+class NDRBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
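+        # Subclasses use different capture group layouts in _VALID_URL;
+        # take the first non-empty group as the display id.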
+        display_id = next(group for group in mobj.groups() if group)
+        webpage = self._download_webpage(url, display_id)
+        return self._extract_embed(webpage, display_id)
+
+
+class NDRIE(NDRBaseIE):
+    IE_NAME = 'ndr'
+    IE_DESC = 'NDR.de - Norddeutscher Rundfunk'
+    _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html'
+    _TESTS = [{
+        # httpVideo, same content id
+        'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html',
+        'md5': '6515bc255dc5c5f8c85bbc38e035a659',
+        'info_dict': {
+            'id': 'hafengeburtstag988',
+            'display_id': 'Party-Poette-und-Parade',
+            'ext': 'mp4',
+            'title': 'Party, Pötte und Parade',
+            'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c',
+            'uploader': 'ndrtv',
+            'timestamp': 1431108900,
+            'upload_date': '20150510',
+            'duration': 3498,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # httpVideo, different content id
+        'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html',
+        'md5': '1043ff203eab307f0c51702ec49e9a71',
+        'info_dict': {
+            'id': 'osna272',
+            'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch',
+            'ext': 'mp4',
+            'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights',
+            'description': 'md5:32e9b800b3d2d4008103752682d5dc01',
+            'uploader': 'ndrtv',
+            'timestamp': 1442059200,
+            'upload_date': '20150912',
+            'duration': 510,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # httpAudio, same content id
+        'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html',
+        'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+        'info_dict': {
+            'id': 'audio51535',
+            'display_id': 'La-Valette-entgeht-der-Hinrichtung',
+            'ext': 'mp3',
+            'title': 'La Valette entgeht der Hinrichtung',
+            'description': 'md5:22f9541913a40fe50091d5cdd7c9f536',
+            'uploader': 'ndrinfo',
+            'timestamp': 1290626100,
+            'upload_date': '20140729',
+            'duration': 884,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
+        'only_matching': True,
+    }]
+
+    def _extract_embed(self, webpage, display_id):
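+        # Prefer the embedURL <meta> tag, falling back to the embedUrl JS
+        # variable; the returned embed URL is then handled by ndr:embed.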
+        embed_url = self._html_search_meta(
+            'embedURL', webpage, 'embed URL',
+            default=None) or self._search_regex(
+            r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'embed URL', group='url')
+        description = self._search_regex(
+            r'<p[^>]+itemprop="description">([^<]+)</p>',
+            webpage, 'description', default=None) or self._og_search_description(webpage)
+        timestamp = parse_iso8601(
+            self._search_regex(
+                r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"',
+                webpage, 'upload date', default=None))
+        info = self._search_json_ld(webpage, display_id, default={})
+        return merge_dicts({
+            '_type': 'url_transparent',
+            'url': embed_url,
+            'display_id': display_id,
+            'description': description,
+            'timestamp': timestamp,
+        }, info)
+
+
+class NJoyIE(NDRBaseIE):
+    IE_NAME = 'njoy'
+    IE_DESC = 'N-JOY'
+    _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html'
+    _TESTS = [{
+        # httpVideo, same content id
+        'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html',
+        'md5': 'cb63be60cd6f9dd75218803146d8dc67',
+        'info_dict': {
+            'id': 'comedycontest2480',
+            'display_id': 'Benaissa-beim-NDR-Comedy-Contest',
+            'ext': 'mp4',
+            'title': 'Benaissa beim NDR Comedy Contest',
+            'description': 'md5:f057a6c4e1c728b10d33b5ffd36ddc39',
+            'uploader': 'ndrtv',
+            'upload_date': '20141129',
+            'duration': 654,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # httpVideo, different content id
+        'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html',
+        'md5': '417660fffa90e6df2fda19f1b40a64d8',
+        'info_dict': {
+            'id': 'dockville882',
+            'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-',
+            'ext': 'mp4',
+            'title': '"Ich hab noch nie" mit Felix Jaehn',
+            'description': 'md5:85dd312d53be1b99e1f998a16452a2f3',
+            'uploader': 'njoy',
+            'upload_date': '20150822',
+            'duration': 211,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.n-joy.de/radio/webradio/morningshow209.html',
+        'only_matching': True,
+    }]
+
+    def _extract_embed(self, webpage, display_id):
+        video_id = self._search_regex(
+            r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id')
+        description = self._search_regex(
+            r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>',
+            webpage, 'description', fatal=False)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'NDREmbedBase',
+            'url': 'ndr:%s' % video_id,
+            'display_id': display_id,
+            'description': description,
+        }
+
+
+class NDREmbedBaseIE(InfoExtractor):
+    IE_NAME = 'ndr:embed:base'
+    _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)'
+    _TESTS = [{
+        'url': 'ndr:soundcheck3366',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ndr.de/soundcheck3366-ppjson.json',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('id_s')
+
+        ppjson = self._download_json(
+            'http://www.ndr.de/%s-ppjson.json' % video_id, video_id)
+
+        playlist = ppjson['playlist']
+
+        formats = []
+        quality_key = qualities(('xs', 's', 'm', 'l', 'xl'))
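+        # NDR labels its progressive streams xs..xl; qualities() turns that
+        # ordering into a numeric preference for _sort_formats.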
+
+        for format_id, f in playlist.items():
+            src = f.get('src')
+            if not src:
+                continue
+            ext = determine_ext(src, None)
+            if ext == 'f4m':
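+                # Akamai HDS manifests need the hdcore/plugin query
+                # parameters appended in order to be downloadable.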
+                formats.extend(self._extract_f4m_formats(
+                    src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
+                    f4m_id='hds', fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', m3u8_id='hls',
+                    entry_protocol='m3u8_native', fatal=False))
+            else:
+                quality = f.get('quality')
+                ff = {
+                    'url': src,
+                    'format_id': quality or format_id,
+                    'quality': quality_key(quality),
+                }
+                type_ = f.get('type')
+                if type_ and type_.split('/')[0] == 'audio':
+                    ff['vcodec'] = 'none'
+                    ff['ext'] = ext or 'mp3'
+                formats.append(ff)
+        self._sort_formats(formats)
+
+        config = playlist['config']
+
+        live = config.get('streamType') in ('httpVideoLive', 'httpAudioLive')
+        title = config['title']
+        if live:
+            title = self._live_title(title)
+        uploader = ppjson.get('config', {}).get('branding')
+        upload_date = ppjson.get('config', {}).get('publicationDate')
+        duration = int_or_none(config.get('duration'))
+
+        thumbnails = []
+        poster = try_get(config, lambda x: x['poster'], dict) or {}
+        for thumbnail_id, thumbnail in poster.items():
+            thumbnail_url = urljoin(url, thumbnail.get('src'))
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'id': thumbnail.get('quality') or thumbnail_id,
+                'url': thumbnail_url,
+                'preference': quality_key(thumbnail.get('quality')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'is_live': live,
+            'uploader': uploader if uploader != '-' else None,
+            'upload_date': upload_date[0:8] if upload_date else None,
+            'duration': duration,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+
+class NDREmbedIE(NDREmbedBaseIE):
+    IE_NAME = 'ndr:embed'
+    _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html'
+    _TESTS = [{
+        'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html',
+        'md5': '8b9306142fe65bbdefb5ce24edb6b0a9',
+        'info_dict': {
+            'id': 'ndraktuell28488',
+            'ext': 'mp4',
+            'title': 'Norddeutschland begrüßt Flüchtlinge',
+            'is_live': False,
+            'uploader': 'ndrtv',
+            'upload_date': '20150907',
+            'duration': 132,
+        },
+    }, {
+        'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html',
+        'md5': '002085c44bae38802d94ae5802a36e78',
+        'info_dict': {
+            'id': 'soundcheck3366',
+            'ext': 'mp4',
+            'title': 'Ella Henderson braucht Vergleiche nicht zu scheuen',
+            'is_live': False,
+            'uploader': 'ndr2',
+            'upload_date': '20150912',
+            'duration': 3554,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.ndr.de/info/audio51535-player.html',
+        'md5': 'bb3cd38e24fbcc866d13b50ca59307b8',
+        'info_dict': {
+            'id': 'audio51535',
+            'ext': 'mp3',
+            'title': 'La Valette entgeht der Hinrichtung',
+            'is_live': False,
+            'uploader': 'ndrinfo',
+            'upload_date': '20140729',
+            'duration': 884,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.ndr.de/fernsehen/sendungen/visite/visite11010-externalPlayer.html',
+        'md5': 'ae57f80511c1e1f2fd0d0d3d31aeae7c',
+        'info_dict': {
+            'id': 'visite11010',
+            'ext': 'mp4',
+            'title': 'Visite - die ganze Sendung',
+            'is_live': False,
+            'uploader': 'ndrtv',
+            'upload_date': '20150902',
+            'duration': 3525,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # httpVideoLive
+        'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html',
+        'info_dict': {
+            'id': 'livestream217',
+            'ext': 'flv',
+            'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'is_live': True,
+            'upload_date': '20150910',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.ndr.de/ndrkultur/audio255020-player.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ndr.de/fernsehen/sendungen/nordtour/nordtour7124-player.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ndr.de/kultur/film/videos/videoimport10424-player.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ndr.de/fernsehen/sendungen/hamburg_journal/hamj43006-player.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ndr.de/fernsehen/sendungen/weltbilder/weltbilder4518-player.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ndr.de/fernsehen/doku952-player.html',
+        'only_matching': True,
+    }]
+
+
+class NJoyEmbedIE(NDREmbedBaseIE):
+    IE_NAME = 'njoy:embed'
+    _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html'
+    _TESTS = [{
+        # httpVideo
+        'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html',
+        'md5': '8483cbfe2320bd4d28a349d62d88bd74',
+        'info_dict': {
+            'id': 'doku948',
+            'ext': 'mp4',
+            'title': 'Zehn Jahre Reeperbahn Festival - die Doku',
+            'is_live': False,
+            'upload_date': '20150807',
+            'duration': 1011,
+        },
+    }, {
+        # httpAudio
+        'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html',
+        'md5': 'd989f80f28ac954430f7b8a48197188a',
+        'info_dict': {
+            'id': 'stefanrichter100',
+            'ext': 'mp3',
+            'title': 'Interview mit einem Augenzeugen',
+            'is_live': False,
+            'uploader': 'njoy',
+            'upload_date': '20150909',
+            'duration': 140,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # httpAudioLive, no explicit ext
+        'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html',
+        'info_dict': {
+            'id': 'webradioweltweit100',
+            'ext': 'mp3',
+            'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'is_live': True,
+            'uploader': 'njoy',
+            'upload_date': '20150810',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.n-joy.de/musik/dockville882-player_image-3905259e-0803-4764-ac72-8b7de077d80a_theme-n-joy.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.n-joy.de/radio/sendungen/morningshow/urlaubsfotos190-player_image-066a5df1-5c95-49ec-a323-941d848718db_theme-n-joy.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.n-joy.de/entertainment/comedy/krudetv290-player_image-ab261bfe-51bf-4bf3-87ba-c5122ee35b3d_theme-n-joy.html',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
new file mode 100644 (file)
index 0000000..bc3eb91
--- /dev/null
+++ b/youtube_dl/extractor/ndtv.py
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_unquote_plus
+)
+from ..utils import (
+    parse_duration,
+    remove_end,
+    unified_strdate,
+    urljoin
+)
+
+
+class NDTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?ndtv\.com/(?:[^/]+/)*videos?/?(?:[^/]+/)*[^/?^&]+-(?P<id>\d+)'
+
+    _TESTS = [
+        {
+            'url': 'https://khabar.ndtv.com/video/show/prime-time/prime-time-ill-system-and-poor-education-468818',
+            'md5': '78efcf3880ef3fd9b83d405ca94a38eb',
+            'info_dict': {
+                'id': '468818',
+                'ext': 'mp4',
+                'title': "प्राइम टाइम: सिस्टम बीमार, स्कूल बदहाल",
+                'description': 'md5:f410512f1b49672e5695dea16ef2731d',
+                'upload_date': '20170928',
+                'duration': 2218,
+                'thumbnail': r're:https?://.*\.jpg',
+            }
+        },
+        {
+            # '__filename' is a URL
+            'url': 'http://movies.ndtv.com/videos/cracker-free-diwali-wishes-from-karan-johar-kriti-sanon-other-stars-470304',
+            'md5': 'f1d709352305b44443515ac56b45aa46',
+            'info_dict': {
+                'id': '470304',
+                'ext': 'mp4',
+                'title': "Cracker-Free Diwali Wishes From Karan Johar, Kriti Sanon & Other Stars",
+                'description': 'md5:f115bba1adf2f6433fa7c1ade5feb465',
+                'upload_date': '20171019',
+                'duration': 137,
+                'thumbnail': r're:https?://.*\.jpg',
+            }
+        },
+        {
+            'url': 'https://www.ndtv.com/video/news/news/delhi-s-air-quality-status-report-after-diwali-is-very-poor-470372',
+            'only_matching': True
+        },
+        {
+            'url': 'https://auto.ndtv.com/videos/the-cnb-daily-october-13-2017-469935',
+            'only_matching': True
+        },
+        {
+            'url': 'https://sports.ndtv.com/cricket/videos/2nd-t20i-rock-thrown-at-australia-cricket-team-bus-after-win-over-india-469764',
+            'only_matching': True
+        },
+        {
+            'url': 'http://gadgets.ndtv.com/videos/uncharted-the-lost-legacy-review-465568',
+            'only_matching': True
+        },
+        {
+            'url': 'http://profit.ndtv.com/videos/news/video-indian-economy-on-very-solid-track-international-monetary-fund-chief-470040',
+            'only_matching': True
+        },
+        {
+            'url': 'http://food.ndtv.com/video-basil-seeds-coconut-porridge-419083',
+            'only_matching': True
+        },
+        {
+            'url': 'https://doctor.ndtv.com/videos/top-health-stories-of-the-week-467396',
+            'only_matching': True
+        },
+        {
+            'url': 'https://swirlster.ndtv.com/video/how-to-make-friends-at-work-469324',
+            'only_matching': True
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # '__title' does not contain extra words such as the sub-site name or "Video"
+        title = compat_urllib_parse_unquote_plus(
+            self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None)
+            or self._og_search_title(webpage))
+
+        filename = self._search_regex(
+            r"(?:__)?filename\s*[:=]\s*'([^']+)'", webpage, 'video filename')
+        # in "movies" sub-site pages, filename is URL
+        video_url = urljoin('https://ndtvod.bc-ssl.cdn.bitgravity.com/23372/ndtv/', filename.lstrip('/'))
+
+        # "doctor" sub-site has MM:SS format
+        duration = parse_duration(self._search_regex(
+            r"(?:__)?duration\s*[:=]\s*'([^']+)'", webpage, 'duration', fatal=False))
+
+        # "sports", "doctor", "swirlster" sub-sites don't have 'publish-date'
+        upload_date = unified_strdate(self._html_search_meta(
+            'publish-date', webpage, 'upload date', default=None) or self._html_search_meta(
+            'uploadDate', webpage, 'upload date', default=None) or self._search_regex(
+            r'datePublished"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False))
+
+        description = remove_end(self._og_search_description(webpage), ' (Read more)')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'duration': duration,
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/nerdcubed.py b/youtube_dl/extractor/nerdcubed.py
new file mode 100644 (file)
index 0000000..9feccc6
--- /dev/null
+++ b/youtube_dl/extractor/nerdcubed.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import datetime
+
+from .common import InfoExtractor
+
+
+class NerdCubedFeedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nerdcubed\.co\.uk/feed\.json'
+    _TEST = {
+        'url': 'http://www.nerdcubed.co.uk/feed.json',
+        'info_dict': {
+            'id': 'nerdcubed-feed',
+            'title': 'nerdcubed.co.uk feed',
+        },
+        'playlist_mincount': 1300,
+    }
+
+    def _real_extract(self, url):
+        feed = self._download_json(url, url, 'Downloading NerdCubed JSON feed')
+
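+        # The feed is a flat JSON array; map each entry to a YouTube URL
+        # result, converting the date from YYYY-MM-DD to YYYYMMDD.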
+        entries = [{
+            '_type': 'url',
+            'title': feed_entry['title'],
+            'uploader': feed_entry['source']['name'] if feed_entry['source'] else None,
+            'upload_date': datetime.datetime.strptime(feed_entry['date'], '%Y-%m-%d').strftime('%Y%m%d'),
+            'url': 'http://www.youtube.com/watch?v=' + feed_entry['youtube_id'],
+        } for feed_entry in feed]
+
+        return {
+            '_type': 'playlist',
+            'title': 'nerdcubed.co.uk feed',
+            'id': 'nerdcubed-feed',
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py
new file mode 100644 (file)
index 0000000..978a058
--- /dev/null
@@ -0,0 +1,485 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from hashlib import md5
+from base64 import b64encode
+from datetime import datetime
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlencode,
+    compat_str,
+    compat_itertools_count,
+)
+from ..utils import (
+    sanitized_Request,
+    float_or_none,
+)
+
+
+class NetEaseMusicBaseIE(InfoExtractor):
+    _FORMATS = ['bMusic', 'mMusic', 'hMusic']
+    _NETEASE_SALT = '3go8&$8*3*3h0k(2)2'
+    _API_BASE = 'http://music.163.com/api/'
+
+    @classmethod
+    def _encrypt(cls, dfsid):
+        salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8'))
+        string_bytes = bytearray(compat_str(dfsid).encode('ascii'))
+        salt_len = len(salt_bytes)
+        for i in range(len(string_bytes)):
+            string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len]
+        m = md5()
+        m.update(bytes(string_bytes))
+        result = b64encode(m.digest()).decode('ascii')
+        return result.replace('/', '_').replace('+', '-')
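+
+    # A rough standalone sketch of the scheme above (dfs_id hypothetical):
+    # each byte of the decimal dfsId is XORed with the repeating salt,
+    # the result is MD5-hashed, base64-encoded and made URL-safe:
+    #
+    #   digest = md5(bytes(b ^ salt[i % len(salt)]
+    #                      for i, b in enumerate(str(dfs_id).encode()))).digest()
+    #   path = b64encode(digest).decode().replace('/', '_').replace('+', '-')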
+
+    def extract_formats(self, info):
+        formats = []
+        for song_format in self._FORMATS:
+            details = info.get(song_format)
+            if not details:
+                continue
+            song_file_path = '/%s/%s.%s' % (
+                self._encrypt(details['dfsId']), details['dfsId'], details['extension'])
+
+            # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc. are reverse proxy-like
+            # endpoints from NetEase's CDN provider that can be used when
+            # m5.music.126.net does not work, especially for users outside Mainland China
+            # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880
+            for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net',
+                         'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'):
+                song_url = host + song_file_path
+                if self._is_valid_url(song_url, info['id'], 'song'):
+                    formats.append({
+                        'url': song_url,
+                        'ext': details.get('extension'),
+                        'abr': float_or_none(details.get('bitrate'), scale=1000),
+                        'format_id': song_format,
+                        'filesize': details.get('size'),
+                        'asr': details.get('sr')
+                    })
+                    break
+        return formats
+
+    @classmethod
+    def convert_milliseconds(cls, ms):
+        return int(round(ms / 1000.0))
+
+    def query_api(self, endpoint, video_id, note):
+        req = sanitized_Request('%s%s' % (self._API_BASE, endpoint))
+        req.add_header('Referer', self._API_BASE)
+        return self._download_json(req, video_id, note)
+
+
+class NetEaseMusicIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:song'
+    IE_DESC = '网易云音乐'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/song?id=32102397',
+        'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
+        'info_dict': {
+            'id': '32102397',
+            'ext': 'mp3',
+            'title': 'Bad Blood (feat. Kendrick Lamar)',
+            'creator': 'Taylor Swift / Kendrick Lamar',
+            'upload_date': '20150517',
+            'timestamp': 1431878400,
+            'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
+        },
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'No lyrics translation.',
+        'url': 'http://music.163.com/#/song?id=29822014',
+        'info_dict': {
+            'id': '29822014',
+            'ext': 'mp3',
+            'title': '听见下雨的声音',
+            'creator': '周杰伦',
+            'upload_date': '20141225',
+            'timestamp': 1419523200,
+            'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
+        },
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'No lyrics.',
+        'url': 'http://music.163.com/song?id=17241424',
+        'info_dict': {
+            'id': '17241424',
+            'ext': 'mp3',
+            'title': 'Opus 28',
+            'creator': 'Dustin O\'Halloran',
+            'upload_date': '20080211',
+            'timestamp': 1202745600,
+        },
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'Has translated name.',
+        'url': 'http://music.163.com/#/song?id=22735043',
+        'info_dict': {
+            'id': '22735043',
+            'ext': 'mp3',
+            'title': '소원을 말해봐 (Genie)',
+            'creator': '少女时代',
+            'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184',
+            'upload_date': '20100127',
+            'timestamp': 1264608000,
+            'alt_title': '说出愿望吧(Genie)',
+        },
+        'skip': 'Blocked outside Mainland China',
+    }]
+
+    def _process_lyrics(self, lyrics_info):
+        original = lyrics_info.get('lrc', {}).get('lyric')
+        translated = lyrics_info.get('tlyric', {}).get('lyric')
+
+        if not translated:
+            return original
+
+        lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)'
+        original_ts_texts = re.findall(lyrics_expr, original)
+        translation_ts_dict = dict(
+            (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated)
+        )
+        lyrics = '\n'.join([
+            '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, ''))
+            for time_stamp, text in original_ts_texts
+        ])
+        return lyrics
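+
+    # For example (timestamp and text hypothetical), an original line
+    #     [00:12.34]some original lyric
+    # whose timestamp also appears in the translation is merged into
+    #     [00:12.34]some original lyric / its translation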
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+
+        params = {
+            'id': song_id,
+            'ids': '[%s]' % song_id
+        }
+        info = self.query_api(
+            'song/detail?' + compat_urllib_parse_urlencode(params),
+            song_id, 'Downloading song info')['songs'][0]
+
+        formats = self.extract_formats(info)
+        self._sort_formats(formats)
+
+        lyrics_info = self.query_api(
+            'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
+            song_id, 'Downloading lyrics data')
+        lyrics = self._process_lyrics(lyrics_info)
+
+        alt_title = None
+        if info.get('transNames'):
+            alt_title = '/'.join(info.get('transNames'))
+
+        return {
+            'id': song_id,
+            'title': info['name'],
+            'alt_title': alt_title,
+            'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]),
+            'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')),
+            'thumbnail': info.get('album', {}).get('picUrl'),
+            'duration': self.convert_milliseconds(info.get('duration', 0)),
+            'description': lyrics,
+            'formats': formats,
+        }
+
+
+class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:album'
+    IE_DESC = '网易云音乐 - 专辑'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/album?id=220780',
+        'info_dict': {
+            'id': '220780',
+            'title': 'B\'day',
+        },
+        'playlist_count': 23,
+        'skip': 'Blocked outside Mainland China',
+    }
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+
+        info = self.query_api(
+            'album/%s?id=%s' % (album_id, album_id),
+            album_id, 'Downloading album data')['album']
+
+        name = info['name']
+        desc = info.get('description')
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+                            'NetEaseMusic', song['id'])
+            for song in info['songs']
+        ]
+        return self.playlist_result(entries, album_id, name, desc)
+
+
+class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:singer'
+    IE_DESC = '网易云音乐 - 歌手'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'note': 'Singer has aliases.',
+        'url': 'http://music.163.com/#/artist?id=10559',
+        'info_dict': {
+            'id': '10559',
+            'title': '张惠妹 - aMEI;阿密特',
+        },
+        'playlist_count': 50,
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'Singer has translated name.',
+        'url': 'http://music.163.com/#/artist?id=124098',
+        'info_dict': {
+            'id': '124098',
+            'title': '李昇基 - 이승기',
+        },
+        'playlist_count': 50,
+        'skip': 'Blocked outside Mainland China',
+    }]
+
+    def _real_extract(self, url):
+        singer_id = self._match_id(url)
+
+        info = self.query_api(
+            'artist/%s?id=%s' % (singer_id, singer_id),
+            singer_id, 'Downloading singer data')
+
+        name = info['artist']['name']
+        if info['artist']['trans']:
+            name = '%s - %s' % (name, info['artist']['trans'])
+        if info['artist']['alias']:
+            name = '%s - %s' % (name, ';'.join(info['artist']['alias']))
+
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+                            'NetEaseMusic', song['id'])
+            for song in info['hotSongs']
+        ]
+        return self.playlist_result(entries, singer_id, name)
+
+
+class NetEaseMusicListIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:playlist'
+    IE_DESC = '网易云音乐 - 歌单'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/playlist?id=79177352',
+        'info_dict': {
+            'id': '79177352',
+            'title': 'Billboard 2007 Top 100',
+            'description': 'md5:12fd0819cab2965b9583ace0f8b7b022'
+        },
+        'playlist_count': 99,
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'Toplist/Charts sample',
+        'url': 'http://music.163.com/#/discover/toplist?id=3733003',
+        'info_dict': {
+            'id': '3733003',
+            'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}',
+            'description': 'md5:73ec782a612711cadc7872d9c1e134fc',
+        },
+        'playlist_count': 50,
+        'skip': 'Blocked outside Mainland China',
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        info = self.query_api(
+            'playlist/detail?id=%s&lv=-1&tv=-1' % list_id,
+            list_id, 'Downloading playlist data')['result']
+
+        name = info['name']
+        desc = info.get('description')
+
+        if info.get('specialType') == 10:  # is a chart/toplist
+            datestamp = datetime.fromtimestamp(
+                self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d')
+            name = '%s %s' % (name, datestamp)
+
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song['id'],
+                            'NetEaseMusic', song['id'])
+            for song in info['tracks']
+        ]
+        return self.playlist_result(entries, list_id, name, desc)
+
+
+class NetEaseMusicMvIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:mv'
+    IE_DESC = '网易云音乐 - MV'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/mv?id=415350',
+        'info_dict': {
+            'id': '415350',
+            'ext': 'mp4',
+            'title': '이럴거면 그러지말지',
+            'description': '白雅言自作曲唱甜蜜爱情',
+            'creator': '白雅言',
+            'upload_date': '20150520',
+        },
+        'skip': 'Blocked outside Mainland China',
+    }
+
+    def _real_extract(self, url):
+        mv_id = self._match_id(url)
+
+        info = self.query_api(
+            'mv/detail?id=%s&type=mp4' % mv_id,
+            mv_id, 'Downloading mv info')['data']
+
+        formats = [
+            {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)}
+            for brs, mv_url in info['brs'].items()
+        ]
+        self._sort_formats(formats)
+
+        return {
+            'id': mv_id,
+            'title': info['name'],
+            'description': info.get('desc') or info.get('briefDesc'),
+            'creator': info['artistName'],
+            'upload_date': info['publishTime'].replace('-', ''),
+            'formats': formats,
+            'thumbnail': info.get('cover'),
+            'duration': self.convert_milliseconds(info.get('duration', 0)),
+        }
+
+
+class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:program'
+    IE_DESC = '网易云音乐 - 电台节目'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?program\?id=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://music.163.com/#/program?id=10109055',
+        'info_dict': {
+            'id': '10109055',
+            'ext': 'mp3',
+            'title': '不丹足球背后的故事',
+            'description': '喜马拉雅人的足球梦 ...',
+            'creator': '大话西藏',
+            'timestamp': 1434179342,
+            'upload_date': '20150613',
+            'duration': 900,
+        },
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'This program has accompanying songs.',
+        'url': 'http://music.163.com/#/program?id=10141022',
+        'info_dict': {
+            'id': '10141022',
+            'title': '25岁,你是自在如风的少年<27°C>',
+            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+        },
+        'playlist_count': 4,
+        'skip': 'Blocked outside Mainland China',
+    }, {
+        'note': 'This program has accompanying songs.',
+        'url': 'http://music.163.com/#/program?id=10141022',
+        'info_dict': {
+            'id': '10141022',
+            'ext': 'mp3',
+            'title': '25岁,你是自在如风的少年<27°C>',
+            'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
+            'timestamp': 1434450841,
+            'upload_date': '20150616',
+        },
+        'params': {
+            'noplaylist': True
+        },
+        'skip': 'Blocked outside Mainland China',
+    }]
+
+    def _real_extract(self, url):
+        program_id = self._match_id(url)
+
+        info = self.query_api(
+            'dj/program/detail?id=%s' % program_id,
+            program_id, 'Downloading program info')['program']
+
+        name = info['name']
+        description = info['description']
+
+        if not info['songs'] or self._downloader.params.get('noplaylist'):
+            if info['songs']:
+                self.to_screen(
+                    'Downloading just the main audio %s because of --no-playlist'
+                    % info['mainSong']['id'])
+
+            formats = self.extract_formats(info['mainSong'])
+            self._sort_formats(formats)
+
+            return {
+                'id': program_id,
+                'title': name,
+                'description': description,
+                'creator': info['dj']['brand'],
+                'timestamp': self.convert_milliseconds(info['createTime']),
+                'thumbnail': info['coverUrl'],
+                'duration': self.convert_milliseconds(info.get('duration', 0)),
+                'formats': formats,
+            }
+
+        self.to_screen(
+            'Downloading playlist %s - add --no-playlist to just download the main audio %s'
+            % (program_id, info['mainSong']['id']))
+
+        song_ids = [info['mainSong']['id']]
+        song_ids.extend([song['id'] for song in info['songs']])
+        entries = [
+            self.url_result('http://music.163.com/#/song?id=%s' % song_id,
+                            'NetEaseMusic', song_id)
+            for song_id in song_ids
+        ]
+        return self.playlist_result(entries, program_id, name, description)
+
+
+class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
+    IE_NAME = 'netease:djradio'
+    IE_DESC = '网易云音乐 - 电台'
+    _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://music.163.com/#/djradio?id=42',
+        'info_dict': {
+            'id': '42',
+            'title': '声音蔓延',
+            'description': 'md5:766220985cbd16fdd552f64c578a6b15'
+        },
+        'playlist_mincount': 40,
+        'skip': 'Blocked outside Mainland China',
+    }
+    _PAGE_SIZE = 1000
+
+    def _real_extract(self, url):
+        dj_id = self._match_id(url)
+
+        name = None
+        desc = None
+        entries = []
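+        # page through the radio's programs _PAGE_SIZE at a time until the
+        # API reports there are no more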
+        for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE):
+            info = self.query_api(
+                'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d'
+                % (self._PAGE_SIZE, dj_id, offset),
+                dj_id, 'Downloading dj programs - %d' % offset)
+
+            entries.extend([
+                self.url_result(
+                    'http://music.163.com/#/program?id=%s' % program['id'],
+                    'NetEaseMusicProgram', program['id'])
+                for program in info['programs']
+            ])
+
+            if name is None:
+                radio = info['programs'][0]['radio']
+                name = radio['name']
+                desc = radio['desc']
+
+            if not info['more']:
+                break
+
+        return self.playlist_result(entries, dj_id, name, desc)
diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py
new file mode 100644 (file)
index 0000000..aec3026
--- /dev/null
@@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+)
+
+
+class NetzkinoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+        'md5': '92a3f8b76f8d7220acce5377ea5d4873',
+        'info_dict': {
+            'id': 'rakete-zum-mond',
+            'ext': 'mp4',
+            'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
+            'comments': 'mincount:3',
+            'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+            'upload_date': '20120813',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'timestamp': 1344858571,
+            'age_limit': 12,
+        },
+        'params': {
+            'skip_download': 'Download only works from Germany',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category_id = mobj.group('category')
+        video_id = mobj.group('id')
+
+        api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
+        api_info = self._download_json(api_url, video_id)
+        info = next(
+            p for p in api_info['posts'] if p['slug'] == video_id)
+        custom_fields = info['custom_fields']
+
+        production_js = self._download_webpage(
+            'http://www.netzkino.de/beta/dist/production.min.js', video_id,
+            note='Downloading player code')
+        avo_js = self._search_regex(
+            r'var urlTemplate=(\{.*?"\})',
+            production_js, 'URL templates')
+        templates = self._parse_json(
+            avo_js, video_id, transform_source=js_to_json)
+
+        suffix = {
+            'hds': '.mp4/manifest.f4m',
+            'hls': '.mp4/master.m3u8',
+            'pmd': '.mp4',
+        }
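+        # each urlTemplate entry contains a '{}' placeholder that is replaced
+        # with the film filename and completed with the matching suffix above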
+        film_fn = custom_fields['Streaming'][0]
+        formats = [{
+            'format_id': key,
+            'ext': 'mp4',
+            'url': tpl.replace('{}', film_fn) + suffix[key],
+        } for key, tpl in templates.items()]
+        self._sort_formats(formats)
+
+        comments = [{
+            'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
+            'id': c['id'],
+            'author': c['name'],
+            'html': c['content'],
+            'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
+        } for c in info.get('comments', [])]
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'comments': comments,
+            'title': info['title'],
+            'age_limit': int_or_none((custom_fields.get('FSK') or [None])[0]),
+            'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
+            'description': clean_html(info.get('content')),
+            'thumbnail': info.get('thumbnail'),
+            'playlist_title': api_info.get('title'),
+            'playlist_id': category_id,
+        }
diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py
new file mode 100644 (file)
index 0000000..82e7cf5
--- /dev/null
@@ -0,0 +1,168 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    parse_duration,
+    parse_filesize,
+    unified_timestamp,
+)
+
+
+class NewgroundsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.newgrounds.com/audio/listen/549479',
+        'md5': 'fe6033d297591288fa1c1f780386f07a',
+        'info_dict': {
+            'id': '549479',
+            'ext': 'mp3',
+            'title': 'B7 - BusMode',
+            'uploader': 'Burn7',
+            'timestamp': 1378878540,
+            'upload_date': '20130911',
+            'duration': 143,
+        },
+    }, {
+        'url': 'https://www.newgrounds.com/portal/view/673111',
+        'md5': '3394735822aab2478c31b1004fe5e5bc',
+        'info_dict': {
+            'id': '673111',
+            'ext': 'mp4',
+            'title': 'Dancin',
+            'uploader': 'Squirrelman82',
+            'timestamp': 1460256780,
+            'upload_date': '20160410',
+        },
+    }, {
+        # source format unavailable, additional mp4 formats
+        'url': 'http://www.newgrounds.com/portal/view/689400',
+        'info_dict': {
+            'id': '689400',
+            'ext': 'mp4',
+            'title': 'ZTV News Episode 8',
+            'uploader': 'BennettTheSage',
+            'timestamp': 1487965140,
+            'upload_date': '20170224',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, media_id)
+
+        title = self._html_search_regex(
+            r'<title>([^>]+)</title>', webpage, 'title')
+
+        media_url = self._parse_json(self._search_regex(
+            r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
+
+        formats = [{
+            'url': media_url,
+            'format_id': 'source',
+            'quality': 1,
+        }]
+
+        max_resolution = int_or_none(self._search_regex(
+            r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
+            default=None))
+        if max_resolution:
+            url_base = media_url.rpartition('.')[0]
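+            # derived MP4 renditions live next to the source file, e.g.
+            # (path hypothetical) .../video.mp4 -> .../video.720p.mp4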
+            for resolution in (360, 720, 1080):
+                if resolution > max_resolution:
+                    break
+                formats.append({
+                    'url': '%s.%dp.mp4' % (url_base, resolution),
+                    'format_id': '%dp' % resolution,
+                    'height': resolution,
+                })
+
+        self._check_formats(formats, media_id)
+        self._sort_formats(formats)
+
+        uploader = self._html_search_regex(
+            (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>',
+             r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
+            fatal=False)
+
+        timestamp = unified_timestamp(self._html_search_regex(
+            (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
+             r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp',
+            default=None))
+        duration = parse_duration(self._search_regex(
+            r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage,
+            'duration', default=None))
+
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize',
+            default=None))
+        if len(formats) == 1:
+            formats[0]['filesize_approx'] = filesize_approx
+
+        if '<dd>Song' in webpage:
+            formats[0]['vcodec'] = 'none'
+
+        return {
+            'id': media_id,
+            'title': title,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class NewgroundsPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.newgrounds.com/collection/cats',
+        'info_dict': {
+            'id': 'cats',
+            'title': 'Cats',
+        },
+        'playlist_mincount': 46,
+    }, {
+        'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
+        'info_dict': {
+            'id': 'ZONE-SAMA',
+            'title': 'Portal Search: ZONE-SAMA',
+        },
+        'playlist_mincount': 47,
+    }, {
+        'url': 'http://www.newgrounds.com/audio/search/title/cats',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        title = self._search_regex(
+            r'<title>([^>]+)</title>', webpage, 'title', default=None)
+
+        # cut off the left menu so its links are not picked up as entries
+        webpage = self._search_regex(
+            r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
+            webpage, 'wide column', default=webpage)
+
+        entries = []
+        for a, path, media_id in re.findall(
+                r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
+                webpage):
+            a_class = extract_attributes(a).get('class')
+            if a_class not in ('item-portalsubmission', 'item-audiosubmission'):
+                continue
+            entries.append(
+                self.url_result(
+                    'https://www.newgrounds.com/%s' % path,
+                    ie=NewgroundsIE.ie_key(), video_id=media_id))
+
+        return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
new file mode 100644 (file)
index 0000000..dab4aec
--- /dev/null
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import hashlib
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..utils import (
+    bytes_to_intlist,
+    int_or_none,
+    intlist_to_bytes,
+    parse_codecs,
+    parse_duration,
+)
+
+
+class NewstubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'
+    _TEST = {
+        'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym',
+        'md5': '9d10320ad473444352f72f746ccb8b8c',
+        'info_dict': {
+            'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6',
+            'ext': 'mp4',
+            'title': 'Телеканал CNN переместил город Славянск в Крым',
+            'description': 'md5:419a8c9f03442bc0b0a794d689360335',
+            'duration': 31.05,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        page = self._download_webpage(url, video_id)
+        title = self._html_search_meta(['og:title', 'twitter:title'], page, fatal=True)
+
+        video_guid = self._html_search_regex(
+            r'<meta\s+property="og:video(?::(?:(?:secure_)?url|iframe))?"\s+content="https?://(?:www\.)?newstube\.ru/embed/(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            page, 'video GUID')
+
+        enc_data = base64.b64decode(self._download_webpage(
+            'https://www.newstube.ru/embed/api/player/getsources2',
+            video_guid, query={
+                'guid': video_guid,
+                'ff': 3,
+            }))
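+        # response layout: bytes [0:16] are the PBKDF2 salt, [16:32] the
+        # AES-CBC IV and [32:] the ciphertext; the last plaintext byte gives
+        # the PKCS#7 padding length stripped below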
+        key = hashlib.pbkdf2_hmac(
+            'sha1', video_guid.replace('-', '').encode(), enc_data[:16], 1)[:16]
+        dec_data = aes_cbc_decrypt(
+            bytes_to_intlist(enc_data[32:]), bytes_to_intlist(key),
+            bytes_to_intlist(enc_data[16:32]))
+        sources = self._parse_json(intlist_to_bytes(dec_data[:-dec_data[-1]]), video_guid)
+
+        formats = []
+        for source in sources:
+            source_url = source.get('Src')
+            if not source_url:
+                continue
+            height = int_or_none(source.get('Height'))
+            f = {
+                'format_id': 'http' + ('-%dp' % height if height else ''),
+                'url': source_url,
+                'width': int_or_none(source.get('Width')),
+                'height': height,
+            }
+            source_type = source.get('Type')
+            if source_type:
+                f.update(parse_codecs(self._search_regex(
+                    r'codecs="([^"]+)"', source_type, 'codecs', fatal=False)))
+            formats.append(f)
+
+        self._check_formats(formats, video_guid)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_guid,
+            'title': title,
+            'description': self._html_search_meta(['description', 'og:description'], page),
+            'thumbnail': self._html_search_meta(['og:image:secure_url', 'og:image', 'twitter:image'], page),
+            'duration': parse_duration(self._html_search_meta('duration', page)),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
new file mode 100644 (file)
index 0000000..7bd1290
--- /dev/null
@@ -0,0 +1,238 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    int_or_none,
+    parse_iso8601,
+    remove_start,
+    unified_timestamp,
+)
+
+
+class NextMediaIE(InfoExtractor):
+    IE_DESC = '蘋果日報'
+    _VALID_URL = r'https?://hk\.apple\.nextmedia\.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
+        'md5': 'dff9fad7009311c421176d1ac90bfe4f',
+        'info_dict': {
+            'id': '53109199',
+            'ext': 'mp4',
+            'title': '【佔領金鐘】50外國領事議員撐場 讚學生勇敢香港有希望',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:28222b9912b6665a21011b034c70fcc7',
+            'timestamp': 1415456273,
+            'upload_date': '20141108',
+        }
+    }]
+
+    _URL_PATTERN = r'\{ url: \'(.+)\' \}'
+
+    def _real_extract(self, url):
+        news_id = self._match_id(url)
+        page = self._download_webpage(url, news_id)
+        return self._extract_from_nextmedia_page(news_id, url, page)
+
+    def _extract_from_nextmedia_page(self, news_id, url, page):
+        redirection_url = self._search_regex(
+            r'window\.location\.href\s*=\s*([\'"])(?P<url>(?!\1).+)\1',
+            page, 'redirection URL', default=None, group='url')
+        if redirection_url:
+            return self.url_result(compat_urlparse.urljoin(url, redirection_url))
+
+        title = self._fetch_title(page)
+        video_url = self._search_regex(self._URL_PATTERN, page, 'video url')
+
+        attrs = {
+            'id': news_id,
+            'title': title,
+            'url': video_url,  # ext can be inferred from url
+            'thumbnail': self._fetch_thumbnail(page),
+            'description': self._fetch_description(page),
+        }
+
+        timestamp = self._fetch_timestamp(page)
+        if timestamp:
+            attrs['timestamp'] = timestamp
+        else:
+            attrs['upload_date'] = self._fetch_upload_date(url)
+
+        return attrs
+
+    def _fetch_title(self, page):
+        return self._og_search_title(page)
+
+    def _fetch_thumbnail(self, page):
+        return self._og_search_thumbnail(page)
+
+    def _fetch_timestamp(self, page):
+        dateCreated = self._search_regex('"dateCreated":"([^"]+)"', page, 'created time')
+        return parse_iso8601(dateCreated)
+
+    def _fetch_upload_date(self, url):
+        return self._search_regex(self._VALID_URL, url, 'upload date', group='date')
+
+    def _fetch_description(self, page):
+        return self._og_search_property('description', page)
+
+
+class NextMediaActionNewsIE(NextMediaIE):
+    IE_DESC = '蘋果日報 - 動新聞'
+    _VALID_URL = r'https?://hk\.dv\.nextmedia\.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
+    _TESTS = [{
+        'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
+        'md5': '05fce8ffeed7a5e00665d4b7cf0f9201',
+        'info_dict': {
+            'id': '19009428',
+            'ext': 'mp4',
+            'title': '【壹週刊】細10年男友偷食 50歲邵美琪再失戀',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659',
+            'timestamp': 1421791200,
+            'upload_date': '20150120',
+        }
+    }]
+
+    def _real_extract(self, url):
+        news_id = self._match_id(url)
+        actionnews_page = self._download_webpage(url, news_id)
+        article_url = self._og_search_url(actionnews_page)
+        article_page = self._download_webpage(article_url, news_id)
+        return self._extract_from_nextmedia_page(news_id, url, article_page)
+
+
+class AppleDailyIE(NextMediaIE):
+    IE_DESC = '臺灣蘋果日報'
+    _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/[^/]+/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+    _TESTS = [{
+        'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
+        'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
+        'info_dict': {
+            'id': '36354694',
+            'ext': 'mp4',
+            'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
+            'upload_date': '20150128',
+        }
+    }, {
+        'url': 'http://www.appledaily.com.tw/realtimenews/article/strange/20150128/550549/%E4%B8%8D%E6%BB%BF%E8%A2%AB%E8%B8%A9%E8%85%B3%E3%80%80%E5%B1%B1%E6%9D%B1%E5%85%A9%E5%A4%A7%E5%AA%BD%E4%B8%80%E8%B7%AF%E6%89%93%E4%B8%8B%E8%BB%8A',
+        'md5': '86b4e9132d158279c7883822d94ccc49',
+        'info_dict': {
+            'id': '550549',
+            'ext': 'mp4',
+            'title': '不滿被踩腳 山東兩大媽一路打下車',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
+            'upload_date': '20150128',
+        }
+    }, {
+        'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
+        'md5': '03df296d95dedc2d5886debbb80cb43f',
+        'info_dict': {
+            'id': '5003671',
+            'ext': 'mp4',
+            'title': '20正妹熱舞 《刀龍傳說Online》火辣上市',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd',
+            'upload_date': '20150128',
+        },
+        'skip': 'redirect to http://www.appledaily.com.tw/animation/',
+    }, {
+        # No thumbnail
+        'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/',
+        'md5': 'b06182cd386ea7bc6115ec7ff0f72aeb',
+        'info_dict': {
+            'id': '5003673',
+            'ext': 'mp4',
+            'title': '半夜尿尿 好像會看到___',
+            'description': 'md5:61d2da7fe117fede148706cdb85ac066',
+            'upload_date': '20150128',
+        },
+        'expected_warnings': [
+            'video thumbnail',
+        ],
+        'skip': 'redirect to http://www.appledaily.com.tw/animation/',
+    }, {
+        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+        'md5': 'eaa20e6b9df418c912d7f5dec2ba734d',
+        'info_dict': {
+            'id': '35770334',
+            'ext': 'mp4',
+            'title': '咖啡占卜測 XU裝熟指數',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748',
+            'upload_date': '20140417',
+        },
+    }, {
+        'url': 'http://www.appledaily.com.tw/actionnews/appledaily/7/20161003/960588/',
+        'only_matching': True,
+    }, {
+        # Redirected from http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694
+        'url': 'http://ent.appledaily.com.tw/section/article/headline/20150128/36354694',
+        'only_matching': True,
+    }]
+
+    _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
+    def _fetch_title(self, page):
+        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None)
+                or self._html_search_meta('description', page, 'news title'))
+
+    def _fetch_thumbnail(self, page):
+        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+    def _fetch_timestamp(self, page):
+        return None
+
+    def _fetch_description(self, page):
+        return self._html_search_meta('description', page, 'news description')
+
+
+class NextTVIE(InfoExtractor):
+    IE_DESC = '壹電視'
+    _VALID_URL = r'https?://(?:www\.)?nexttv\.com\.tw/(?:[^/]+/)+(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.nexttv.com.tw/news/realtime/politics/11779671',
+        'info_dict': {
+            'id': '11779671',
+            'ext': 'mp4',
+            'title': '「超收稅」近4千億! 藍議員籲發消費券',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1484825400,
+            'upload_date': '20170119',
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h1[^>]*>([^<]+)</h1>', webpage, 'title')
+
+        data = self._hidden_inputs(webpage)
+
+        video_url = data['ntt-vod-src-detailview']
+
+        date_str = get_element_by_class('date', webpage)
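+        # dates on the page are Taiwan local time, hence the +0800 offset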
+        timestamp = unified_timestamp(date_str + '+0800') if date_str else None
+
+        view_count = int_or_none(remove_start(
+            clean_html(get_element_by_class('click', webpage)), '點閱:'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': data.get('ntt-vod-img-src'),
+            'timestamp': timestamp,
+            'view_count': view_count,
+        }
diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py
new file mode 100644 (file)
index 0000000..586c1b7
--- /dev/null
@@ -0,0 +1,453 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class NexxIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/|
+                            nexx:(?:(?P<domain_id_s>\d+):)?|
+                            https?://arc\.nexx\.cloud/api/video/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        # movie
+        'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
+        'md5': '31899fd683de49ad46f4ee67e53e83fe',
+        'info_dict': {
+            'id': '128907',
+            'ext': 'mp4',
+            'title': 'Stiftung Warentest',
+            'alt_title': 'Wie ein Test abläuft',
+            'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2509,
+            'timestamp': 1384264416,
+            'upload_date': '20131112',
+        },
+    }, {
+        # episode
+        'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
+        'info_dict': {
+            'id': '247858',
+            'ext': 'mp4',
+            'title': 'Return of the Golden Child (OV)',
+            'description': 'md5:5d969537509a92b733de21bae249dc63',
+            'release_year': 2017,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1397,
+            'timestamp': 1495033267,
+            'upload_date': '20170517',
+            'episode_number': 2,
+            'season_number': 2,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'HTTP Error 404: Not Found',
+    }, {
+        # does not work via arc
+        'url': 'nexx:741:1269984',
+        'md5': 'c714b5b238b2958dc8d5642addba6886',
+        'info_dict': {
+            'id': '1269984',
+            'ext': 'mp4',
+            'title': '1 TAG ohne KLO... wortwörtlich! 😑',
+            'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 607,
+            'timestamp': 1518614955,
+            'upload_date': '20180214',
+        },
+    }, {
+        # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html
+        'url': 'nexx:747:1533779',
+        'md5': '6bf6883912b82b7069fb86c2297e9893',
+        'info_dict': {
+            'id': '1533779',
+            'ext': 'mp4',
+            'title': 'Aufregung um ausgebrochene Raubtiere',
+            'alt_title': 'Eifel-Zoo',
+            'description': 'md5:f21375c91c74ad741dcb164c427999d2',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 111,
+            'timestamp': 1527874460,
+            'upload_date': '20180601',
+        },
+    }, {
+        'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
+        'only_matching': True,
+    }, {
+        'url': 'nexx:748:128907',
+        'only_matching': True,
+    }, {
+        'url': 'nexx:128907',
+        'only_matching': True,
+    }, {
+        'url': 'https://arc.nexx.cloud/api/video/128907.json',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_domain_id(webpage):
+        mobj = re.search(
+            r'<script\b[^>]+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P<id>\d+)',
+            webpage)
+        return mobj.group('id') if mobj else None
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        entries = []
+
+        # JavaScript Integration
+        domain_id = NexxIE._extract_domain_id(webpage)
+        if domain_id:
+            for video_id in re.findall(
+                    r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)',
+                    webpage):
+                entries.append(
+                    'https://api.nexx.cloud/v3/%s/videos/byid/%s'
+                    % (domain_id, video_id))
+
+        # TODO: support more embed formats
+
+        return entries
+
+    @staticmethod
+    def _extract_url(webpage):
+        return NexxIE._extract_urls(webpage)[0]
+
+    def _handle_error(self, response):
+        status = int_or_none(try_get(
+            response, lambda x: x['metadata']['status']) or 200)
+        if 200 <= status < 300:
+            return
+        raise ExtractorError(
+            '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']),
+            expected=True)
+
+    def _call_api(self, domain_id, path, video_id, data=None, headers={}):
+        headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
+        result = self._download_json(
+            'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id,
+            'Downloading %s JSON' % path, data=urlencode_postdata(data),
+            headers=headers)
+        self._handle_error(result)
+        return result['result']
+
+    def _extract_free_formats(self, video, video_id):
+        stream_data = video['streamdata']
+        cdn = stream_data['cdnType']
+        assert cdn == 'free'
+
+        hash = video['general']['hash']
+
+        ps = compat_str(stream_data['originalDomain'])
+        if stream_data['applyFolderHierarchy'] == 1:
+            s = ('%04d' % int(video_id))[::-1]
+            ps += '/%s/%s' % (s[0:2], s[2:4])
+        ps += '/%s/%s_' % (video_id, hash)
+
+        t = 'http://%s' + ps
+        fd = stream_data['azureFileDistribution'].split(',')
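+        # each entry is '<bitrate>:<WIDTHxHEIGHT>', e.g. '1337:1280x720'
+        # (example value hypothetical)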
+        cdn_provider = stream_data['cdnProvider']
+
+        def p0(p):
+            return '_%s' % p if stream_data['applyAzureStructure'] == 1 else ''
+
+        formats = []
+        if cdn_provider == 'ak':
+            t += ','
+            for i in fd:
+                p = i.split(':')
+                t += p[1] + p0(int(p[0])) + ','
+            t += '.mp4.csmil/master.%s'
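+            # this yields an Akamai-style multi-bitrate csmil URL, roughly
+            #   http://<cdnPath><path>/<hash>_<variant1>,<variant2>,....mp4.csmil/master.m3u8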
+        elif cdn_provider == 'ce':
+            k = t.split('/')
+            h = k.pop()
+            http_base = t = '/'.join(k)
+            http_base = http_base % stream_data['cdnPathHTTP']
+            t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream='
+            for i in fd:
+                p = i.split(':')
+                tbr = int(p[0])
+                filename = '%s%s%s.mp4' % (h, p[1], p0(tbr))
+                f = {
+                    'url': http_base + '/' + filename,
+                    'format_id': '%s-http-%d' % (cdn, tbr),
+                    'tbr': tbr,
+                }
+                width_height = p[1].split('x')
+                if len(width_height) == 2:
+                    f.update({
+                        'width': int_or_none(width_height[0]),
+                        'height': int_or_none(width_height[1]),
+                    })
+                formats.append(f)
+                a = filename + ':%s' % (tbr * 1000)
+                t += a + ','
+            t = t[:-1] + '&audiostream=' + a.split(':')[0]
+        else:
+            assert False, 'Unsupported cdnProvider: %s' % cdn_provider
+
+        if cdn_provider == 'ce':
+            formats.extend(self._extract_mpd_formats(
+                t % (stream_data['cdnPathDASH'], 'mpd'), video_id,
+                mpd_id='%s-dash' % cdn, fatal=False))
+        formats.extend(self._extract_m3u8_formats(
+            t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4',
+            entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False))
+
+        return formats
+
+    def _extract_azure_formats(self, video, video_id):
+        stream_data = video['streamdata']
+        cdn = stream_data['cdnType']
+        assert cdn == 'azure'
+
+        azure_locator = stream_data['azureLocator']
+
+        def get_cdn_shield_base(shield_type='', static=False):
+            for secure in ('', 's'):
+                cdn_shield = stream_data.get('cdnShield%sHTTP%s' % (shield_type, secure.upper()))
+                if cdn_shield:
+                    return 'http%s://%s' % (secure, cdn_shield)
+            else:
+                if 'fb' in stream_data['azureAccount']:
+                    prefix = 'df' if static else 'f'
+                else:
+                    prefix = 'd' if static else 'p'
+                account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', ''))
+                return 'http://nx-%s%02d.akamaized.net/' % (prefix, account)
+
+        language = video['general'].get('language_raw') or ''
+
+        azure_stream_base = get_cdn_shield_base()
+        is_ml = ',' in language
+        azure_manifest_url = '%s%s/%s_src%s.ism/Manifest' % (
+            azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s'
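+        # the trailing '%s' is filled in per protocol below, e.g.
+        # .../Manifest(format=m3u8-aapl) for HLS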
+
+        protection_token = try_get(
+            video, lambda x: x['protectiondata']['token'], compat_str)
+        if protection_token:
+            azure_manifest_url += '?hdnts=%s' % protection_token
+
+        formats = self._extract_m3u8_formats(
+            azure_manifest_url % '(format=m3u8-aapl)',
+            video_id, 'mp4', 'm3u8_native',
+            m3u8_id='%s-hls' % cdn, fatal=False)
+        formats.extend(self._extract_mpd_formats(
+            azure_manifest_url % '(format=mpd-time-csf)',
+            video_id, mpd_id='%s-dash' % cdn, fatal=False))
+        formats.extend(self._extract_ism_formats(
+            azure_manifest_url % '', video_id, ism_id='%s-mss' % cdn, fatal=False))
+
+        azure_progressive_base = get_cdn_shield_base('Prog', True)
+        azure_file_distribution = stream_data.get('azureFileDistribution')
+        if azure_file_distribution:
+            fds = azure_file_distribution.split(',')
+            if fds:
+                for fd in fds:
+                    ss = fd.split(':')
+                    if len(ss) == 2:
+                        tbr = int_or_none(ss[0])
+                        if tbr:
+                            f = {
+                                'url': '%s%s/%s_src_%s_%d.mp4' % (
+                                    azure_progressive_base, azure_locator, video_id, ss[1], tbr),
+                                'format_id': '%s-http-%d' % (cdn, tbr),
+                                'tbr': tbr,
+                            }
+                            width_height = ss[1].split('x')
+                            if len(width_height) == 2:
+                                f.update({
+                                    'width': int_or_none(width_height[0]),
+                                    'height': int_or_none(width_height[1]),
+                                })
+                            formats.append(f)
+
+        return formats
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        domain_id = mobj.group('domain_id') or mobj.group('domain_id_s')
+        video_id = mobj.group('id')
+
+        video = None
+
+        def find_video(result):
+            if isinstance(result, dict):
+                return result
+            elif isinstance(result, list):
+                vid = int(video_id)
+                for v in result:
+                    if try_get(v, lambda x: x['general']['ID'], int) == vid:
+                        return v
+            return None
+
+        response = self._download_json(
+            'https://arc.nexx.cloud/api/video/%s.json' % video_id,
+            video_id, fatal=False)
+        if response and isinstance(response, dict):
+            result = response.get('result')
+            if result:
+                video = find_video(result)
+
+        # not all videos work via arc, e.g. nexx:741:1269984
+        if not video:
+            # Reverse engineered from JS code (see getDeviceID function)
+            device_id = '%d:%d:%d%d' % (
+                random.randint(1, 4), int(time.time()),
+                random.randint(10000, 99999), random.randint(1, 9))
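+            # e.g. '3:1600000000:123454' (hypothetical values)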
+
+            result = self._call_api(domain_id, 'session/init', video_id, data={
+                'nxp_devh': device_id,
+                'nxp_userh': '',
+                'precid': '0',
+                'playlicense': '0',
+                'screenx': '1920',
+                'screeny': '1080',
+                'playerversion': '6.0.00',
+                'gateway': 'html5',
+                'adGateway': '',
+                'explicitlanguage': 'en-US',
+                'addTextTemplates': '1',
+                'addDomainData': '1',
+                'addAdModel': '1',
+            }, headers={
+                'X-Request-Enable-Auth-Fallback': '1',
+            })
+
+            cid = result['general']['cid']
+
+            # As described in [1], the X-Request-Token is generated as:
+            #   md5( operation + domain_id + domain_secret )
+            # where domain_secret is supposedly a static value handed out by
+            # nexx.tv. The lines below show how this "secret" is actually
+            # derived (reverse engineered from the _play.api.init function,
+            # search for clienttoken), so it is neither static nor much of a
+            # secret.
+            # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
+            secret = result['device']['clienttoken'][int(device_id[0]):]
+            secret = secret[0:len(secret) - int(device_id[-1])]
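+            # e.g. with a (hypothetical) device_id '3:1600000000:123454', the
+            # leading '3' drops the first 3 characters of clienttoken and the
+            # trailing '4' drops its last 4 characters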
+
+            op = 'byid'
+
+            # Reversed from JS code for _play.api.call function (search for
+            # X-Request-Token)
+            request_token = hashlib.md5(
+                ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()
+
+            result = self._call_api(
+                domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
+                    'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
+                    'addInteractionOptions': '1',
+                    'addStatusDetails': '1',
+                    'addStreamDetails': '1',
+                    'addCaptions': '1',
+                    'addScenes': '1',
+                    'addHotSpots': '1',
+                    'addBumpers': '1',
+                    'captionFormat': 'data',
+                }, headers={
+                    'X-Request-CID': cid,
+                    'X-Request-Token': request_token,
+                })
+            video = find_video(result)
+
+        general = video['general']
+        title = general['title']
+
+        cdn = video['streamdata']['cdnType']
+
+        if cdn == 'azure':
+            formats = self._extract_azure_formats(video, video_id)
+        elif cdn == 'free':
+            formats = self._extract_free_formats(video, video_id)
+        else:
+            # TODO: reverse more cdns
+            assert False
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'alt_title': general.get('subtitle'),
+            'description': general.get('description'),
+            'release_year': int_or_none(general.get('year')),
+            'creator': general.get('studio') or general.get('studio_adref'),
+            'thumbnail': try_get(
+                video, lambda x: x['imagedata']['thumb'], compat_str),
+            'duration': parse_duration(general.get('runtime')),
+            'timestamp': int_or_none(general.get('uploaded')),
+            'episode_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['episode'])),
+            'season_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['season'])),
+            'formats': formats,
+        }
+
+
+class NexxEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
+        'md5': '16746bfc28c42049492385c989b26c4a',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        # iFrame Embed Integration
+        return [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        embed_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, embed_id)
+
+        return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key())
diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py
new file mode 100644 (file)
index 0000000..460deb1
--- /dev/null
@@ -0,0 +1,231 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    remove_end,
+)
+
+
+class NFLIE(InfoExtractor):
+    IE_NAME = 'nfl.com'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?P<host>
+                            (?:www\.)?
+                            (?:
+                                (?:
+                                    nfl|
+                                    buffalobills|
+                                    miamidolphins|
+                                    patriots|
+                                    newyorkjets|
+                                    baltimoreravens|
+                                    bengals|
+                                    clevelandbrowns|
+                                    steelers|
+                                    houstontexans|
+                                    colts|
+                                    jaguars|
+                                    titansonline|
+                                    denverbroncos|
+                                    kcchiefs|
+                                    raiders|
+                                    chargers|
+                                    dallascowboys|
+                                    giants|
+                                    philadelphiaeagles|
+                                    redskins|
+                                    chicagobears|
+                                    detroitlions|
+                                    packers|
+                                    vikings|
+                                    atlantafalcons|
+                                    panthers|
+                                    neworleanssaints|
+                                    buccaneers|
+                                    azcardinals|
+                                    stlouisrams|
+                                    49ers|
+                                    seahawks
+                                )\.com|
+                                .+?\.clubs\.nfl\.com
+                            )
+                        )/
+                        (?:.+?/)*
+                        (?P<id>[^/#?&]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights',
+        'md5': '394ef771ddcd1354f665b471d78ec4c6',
+        'info_dict': {
+            'id': '0ap3000000398478',
+            'ext': 'mp4',
+            'title': 'Week 3: Redskins vs. Eagles highlights',
+            'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478',
+            'upload_date': '20140921',
+            'timestamp': 1411337580,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266',
+        'md5': 'cf85bdb4bc49f6e9d3816d130c78279c',
+        'info_dict': {
+            'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266',
+            'ext': 'mp4',
+            'title': 'LIVE: Post Game vs. Browns',
+            'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8',
+            'upload_date': '20131229',
+            'timestamp': 1388354455,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish',
+        'info_dict': {
+            'id': '0ap3000000467607',
+            'ext': 'mp4',
+            'title': 'Frustrations flare on the field',
+            'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.',
+            'timestamp': 1422850320,
+            'upload_date': '20150202',
+        },
+    }, {
+        'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette',
+        'md5': '4c319e2f625ffd0b481b4382c6fc124c',
+        'info_dict': {
+            'id': 'n-238346',
+            'ext': 'mp4',
+            'title': '10 Days at Gillette',
+            'description': 'md5:8cd9cd48fac16de596eadc0b24add951',
+            'timestamp': 1442618809,
+            'upload_date': '20150918',
+        },
+    }, {
+        # lowercase data-contentid
+        'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7',
+        'info_dict': {
+            'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2',
+            'ext': 'mp4',
+            'title': 'Tomlin looks ahead to Ravens on a short week',
+            'description': 'md5:32f3f7b139f43913181d5cbb24ecad75',
+            'timestamp': 1443459651,
+            'upload_date': '20150928',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def prepend_host(host, url):
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = '/%s' % url
+            url = 'http://{0:}{1:}'.format(host, url)
+        return url
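+    # e.g. prepend_host('www.nfl.com', 'static/content/static/config/video/config.json')
+    # -> 'http://www.nfl.com/static/content/static/config/video/config.json'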
+
+    @staticmethod
+    def format_from_stream(stream, protocol, host, path_prefix='',
+                           preference=0, note=None):
+        url = '{protocol:}://{host:}/{prefix:}{path:}'.format(
+            protocol=protocol,
+            host=host,
+            prefix=path_prefix,
+            path=stream.get('path'),
+        )
+        return {
+            'url': url,
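+            # 'rate' is presumably reported in bit/s; int_or_none's scale
+            # argument of 1000 converts it to kbit/s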
+            'vbr': int_or_none(stream.get('rate', 0), 1000),
+            'preference': preference,
+            'format_note': note,
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, host = mobj.group('id'), mobj.group('host')
+
+        webpage = self._download_webpage(url, video_id)
+
+        config_url = NFLIE.prepend_host(host, self._search_regex(
+            r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1',
+            webpage, 'config URL', default='static/content/static/config/video/config.json',
+            group='config'))
+        # For articles, the id in the url is not the video id
+        video_id = self._search_regex(
+            r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>(?:(?!\1).)+)\1',
+            webpage, 'video id', default=video_id, group='id')
+        config = self._download_json(config_url, video_id, 'Downloading player config')
+        url_template = NFLIE.prepend_host(
+            host, '{contentURLTemplate:}'.format(**config))
+        video_data = self._download_json(
+            url_template.format(id=video_id), video_id)
+
+        formats = []
+        cdn_data = video_data.get('cdnData', {})
+        streams = cdn_data.get('bitrateInfo', [])
+        if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM':
+            parts = compat_urllib_parse_urlparse(cdn_data.get('uri'))
+            protocol, host = parts.scheme, parts.netloc
+            for stream in streams:
+                formats.append(
+                    NFLIE.format_from_stream(stream, protocol, host))
+        else:
+            cdns = config.get('cdns')
+            if not cdns:
+                raise ExtractorError('Failed to get CDN data', expected=True)
+
+            for name, cdn in cdns.items():
+                # LimeLight streams don't seem to work
+                if cdn.get('name') == 'LIMELIGHT':
+                    continue
+
+                protocol = cdn.get('protocol')
+                host = remove_end(cdn.get('host', ''), '/')
+                if not (protocol and host):
+                    continue
+
+                prefix = cdn.get('pathprefix', '')
+                if prefix and not prefix.endswith('/'):
+                    prefix = '%s/' % prefix
+
+                preference = 0
+                if protocol == 'rtmp':
+                    preference = -2
+                elif 'prog' in name.lower():
+                    preference = 1
+
+                for stream in streams:
+                    formats.append(
+                        NFLIE.format_from_stream(stream, protocol, host,
+                                                 prefix, preference, name))
+
+        self._sort_formats(formats)
+
+        thumbnail = None
+        for q in ('xl', 'l', 'm', 's', 'xs'):
+            thumbnail = video_data.get('imagePaths', {}).get(q)
+            if thumbnail:
+                break
+
+        return {
+            'id': video_id,
+            'title': video_data.get('headline'),
+            'formats': formats,
+            'description': video_data.get('caption'),
+            'duration': video_data.get('duration'),
+            'thumbnail': thumbnail,
+            'timestamp': int_or_none(video_data.get('posted'), 1000),
+        }
diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py
new file mode 100644 (file)
index 0000000..de6a707
--- /dev/null
@@ -0,0 +1,93 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class NhkVodIE(InfoExtractor):
+    _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)'
+    # Content available only for a limited period of time. Visit
+    # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
+    _TESTS = [{
+        # clip
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
+        'md5': '256a1be14f48d960a7e61e2532d95ec3',
+        'info_dict': {
+            'id': 'a95j5iza',
+            'ext': 'mp4',
+            'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU",
+            'description': 'md5:5aee4a9f9d81c26281862382103b0ea5',
+            'timestamp': 1565965194,
+            'upload_date': '20190816',
+        },
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
+        'only_matching': True,
+    }]
+    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json'
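+    # e.g. a video clip requested in English expands to
+    # https://api.nhk.or.jp/nhkworld/vodcliplist/v7a/episode/9999-011/en/all/all.json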
+
+    def _real_extract(self, url):
+        lang, m_type, episode_id = re.match(self._VALID_URL, url).groups()
+        if episode_id.isdigit():
+            episode_id = episode_id[:4] + '-' + episode_id[4:]
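+            # e.g. '9999011' -> '9999-011'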
+
+        is_video = m_type == 'video'
+        episode = self._download_json(
+            self._API_URL_TEMPLATE % (
+                'v' if is_video else 'r',
+                'clip' if episode_id[:4] == '9999' else 'esd',
+                episode_id, lang, '/all' if is_video else ''),
+            episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0]
+        title = episode.get('sub_title_clean') or episode['sub_title']
+
+        def get_clean_field(key):
+            return episode.get(key + '_clean') or episode.get(key)
+
+        series = get_clean_field('title')
+
+        thumbnails = []
+        for s, w, h in [('', 640, 360), ('_l', 1280, 720)]:
+            img_path = episode.get('image' + s)
+            if not img_path:
+                continue
+            thumbnails.append({
+                'id': '%dp' % h,
+                'height': h,
+                'width': w,
+                'url': 'https://www3.nhk.or.jp' + img_path,
+            })
+
+        info = {
+            'id': episode_id + '-' + lang,
+            'title': '%s - %s' % (series, title) if series and title else title,
+            'description': get_clean_field('description'),
+            'thumbnails': thumbnails,
+            'series': series,
+            'episode': title,
+        }
+        if is_video:
+            info.update({
+                '_type': 'url_transparent',
+                'ie_key': 'Piksel',
+                'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'],
+            })
+        else:
+            audio = episode['audio']
+            audio_path = audio['audio']
+            info['formats'] = self._extract_m3u8_formats(
+                'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
+                episode_id, 'm4a', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False)
+            for f in info['formats']:
+                f['language'] = lang
+        return info
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
new file mode 100644 (file)
index 0000000..eddfe1f
--- /dev/null
@@ -0,0 +1,128 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+)
+
+
+class NHLBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        site, tmp_id = re.match(self._VALID_URL, url).groups()
+        video_data = self._download_json(
+            'https://%s/%s/%sid/v1/%s/details/web-v1.json'
+            % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id)
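+        # e.g. for nhl.com content this requests
+        # https://nhl.bamcontent.com/nhl/id/v1/43663503/details/web-v1.json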
+        if video_data.get('type') != 'video':
+            video_data = video_data['media']
+            video = video_data.get('video')
+            if video:
+                video_data = video
+            else:
+                videos = video_data.get('videos')
+                if videos:
+                    video_data = videos[0]
+
+        video_id = compat_str(video_data['id'])
+        title = video_data['title']
+
+        formats = []
+        for playback in video_data.get('playbacks', []):
+            playback_url = playback.get('url')
+            if not playback_url:
+                continue
+            ext = determine_ext(playback_url)
+            if ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    playback_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=playback.get('name', 'hls'), fatal=False)
+                self._check_formats(m3u8_formats, video_id)
+                formats.extend(m3u8_formats)
+            else:
+                height = int_or_none(playback.get('height'))
+                formats.append({
+                    'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')),
+                    'url': playback_url,
+                    'width': int_or_none(playback.get('width')),
+                    'height': height,
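+                    # the bitrate is often embedded in the URL, e.g. a
+                    # playback URL containing '_4500K' yields tbr 4500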
+                    'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)),
+                })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        cuts = video_data.get('image', {}).get('cuts') or []
+        if isinstance(cuts, dict):
+            cuts = cuts.values()
+        for thumbnail_data in cuts:
+            thumbnail_url = thumbnail_data.get('src')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int_or_none(thumbnail_data.get('width')),
+                'height': int_or_none(thumbnail_data.get('height')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'timestamp': parse_iso8601(video_data.get('date')),
+            'duration': parse_duration(video_data.get('duration')),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+
+class NHLIE(NHLBaseIE):
+    IE_NAME = 'nhl.com'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>nhl|wch2016)\.com/(?:[^/]+/)*c-(?P<id>\d+)'
+    _CONTENT_DOMAIN = 'nhl.bamcontent.com'
+    _TESTS = [{
+        # type=video
+        'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503',
+        'md5': '0f7b9a8f986fb4b4eeeece9a56416eaf',
+        'info_dict': {
+            'id': '43663503',
+            'ext': 'mp4',
+            'title': 'Anisimov cleans up mess',
+            'description': 'md5:a02354acdfe900e940ce40706939ca63',
+            'timestamp': 1461288600,
+            'upload_date': '20160422',
+        },
+    }, {
+        # type=article
+        'url': 'https://www.nhl.com/news/dennis-wideman-suspended/c-278258934',
+        'md5': '1f39f4ea74c1394dea110699a25b366c',
+        'info_dict': {
+            'id': '40784403',
+            'ext': 'mp4',
+            'title': 'Wideman suspended by NHL',
+            'description': 'Flames defenseman Dennis Wideman was banned 20 games for violation of Rule 40 (Physical Abuse of Officials)',
+            'upload_date': '20160204',
+            'timestamp': 1454544904,
+        },
+    }, {
+        # Some m3u8 URLs are invalid (https://github.com/ytdl-org/youtube-dl/issues/10713)
+        'url': 'https://www.nhl.com/predators/video/poile-laviolette-on-subban-trade/t-277437416/c-44315003',
+        'md5': '50b2bb47f405121484dda3ccbea25459',
+        'info_dict': {
+            'id': '44315003',
+            'ext': 'mp4',
+            'title': 'Poile, Laviolette on Subban trade',
+            'description': 'General manager David Poile and head coach Peter Laviolette share their thoughts on acquiring P.K. Subban from Montreal (06/29/16)',
+            'timestamp': 1467242866,
+            'upload_date': '20160629',
+        },
+    }, {
+        'url': 'https://www.wch2016.com/video/caneur-best-of-game-2-micd-up/t-281230378/c-44983703',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
new file mode 100644 (file)
index 0000000..2e8b302
--- /dev/null
@@ -0,0 +1,249 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .mtv import MTVServicesInfoExtractor
+from ..utils import update_url_query
+
+
+class NickIE(MTVServicesInfoExtractor):
+    # It appears that none of the videos on the website are still available
+    IE_NAME = 'nick.com'
+    _VALID_URL = r'https?://(?P<domain>(?:(?:www|beta)\.)?nick(?:jr)?\.com)/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
+    _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
+    _GEO_COUNTRIES = ['US']
+    _TESTS = [{
+        'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
+        'playlist': [
+            {
+                'md5': '6e5adc1e28253bbb1b28ab05403dd4d4',
+                'info_dict': {
+                    'id': 'be6a17b0-412d-11e5-8ff7-0026b9414f30',
+                    'ext': 'mp4',
+                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S1',
+                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+                }
+            },
+            {
+                'md5': 'd7be441fc53a1d4882fa9508a1e5b3ce',
+                'info_dict': {
+                    'id': 'be6b8f96-412d-11e5-8ff7-0026b9414f30',
+                    'ext': 'mp4',
+                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S2',
+                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+                }
+            },
+            {
+                'md5': 'efffe1728a234b2b0d2f2b343dd1946f',
+                'info_dict': {
+                    'id': 'be6cf7e6-412d-11e5-8ff7-0026b9414f30',
+                    'ext': 'mp4',
+                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S3',
+                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+                }
+            },
+            {
+                'md5': '1ec6690733ab9f41709e274a1d5c7556',
+                'info_dict': {
+                    'id': 'be6e3354-412d-11e5-8ff7-0026b9414f30',
+                    'ext': 'mp4',
+                    'title': 'ALVINNN!!! and The Chipmunks: "Mojo Missing/Who\'s The Animal" S4',
+                    'description': 'Alvin is convinced his mojo was in a cap he gave to a fan, and must find a way to get his hat back before the Chipmunks’ big concert.\nDuring a costume visit to the zoo, Alvin finds himself mistaken for the real Tasmanian devil.',
+                }
+            },
+        ],
+    }, {
+        'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/',
+        'only_matching': True,
+    }, {
+        'url': 'http://beta.nick.com/nicky-ricky-dicky-and-dawn/videos/nicky-ricky-dicky-dawn-301-full-episode/',
+        'only_matching': True,
+    }]
+
+    def _get_feed_query(self, uri):
+        return {
+            'feed': 'nick_arc_player_prime',
+            'mgid': uri,
+        }
+
+    def _real_extract(self, url):
+        domain, display_id = re.match(self._VALID_URL, url).groups()
+        video_data = self._download_json(
+            'http://%s/data/video.endLevel.json' % domain,
+            display_id, query={
+                'urlKey': display_id,
+            })
+        return self._get_videos_info(video_data['player'] + video_data['id'])
+
+
+class NickBrIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nickelodeon:br'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?P<domain>(?:www\.)?nickjr|mundonick\.uol)\.com\.br|
+                            (?:www\.)?nickjr\.[a-z]{2}|
+                            (?:www\.)?nickelodeonjunior\.fr
+                        )
+                        /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P<id>[^/?\#.]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.nickjr.com.br/patrulha-canina/videos/210-labirinto-de-pipoca/',
+        'only_matching': True,
+    }, {
+        'url': 'http://mundonick.uol.com.br/programas/the-loud-house/videos/muitas-irmas/7ljo9j',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickjr.nl/paw-patrol/videos/311-ge-wol-dig-om-terug-te-zijn/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeonjunior.fr/paw-patrol-la-pat-patrouille/videos/episode-401-entier-paw-patrol/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        domain, display_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, display_id)
+        uri = self._search_regex(
+            r'data-(?:contenturi|mgid)="([^"]+)', webpage, 'mgid')
+        video_id = self._id_from_uri(uri)
+        config = self._download_json(
+            'http://media.mtvnservices.com/pmt/e1/access/index.html',
+            video_id, query={
+                'uri': uri,
+                'configtype': 'edge',
+            }, headers={
+                'Referer': url,
+            })
+        info_url = self._remove_template_parameter(config['feedWithQueryParams'])
+        if info_url == 'None':
+            if domain.startswith('www.'):
+                domain = domain[4:]
+            content_domain = {
+                'mundonick.uol': 'mundonick.com.br',
+                'nickjr': 'br.nickelodeonjunior.tv',
+            }[domain]
+            query = {
+                'mgid': uri,
+                'imageEp': content_domain,
+                'arcEp': content_domain,
+            }
+            if domain == 'nickjr.com.br':
+                query['ep'] = 'c4b16088'
+            info_url = update_url_query(
+                'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed', query)
+        return self._get_videos_info_from_url(info_url, video_id)
+
+
+class NickDeIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nick.de'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl|ch)|nickelodeon\.(?:nl|be|at|dk|no|se))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nick.de/shows/342-icarly',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.nl/shows/474-spongebob/videos/17403-een-kijkje-in-de-keuken-met-sandy-van-binnenuit',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.no/program/2626-bulderhuset/videoer/90947-femteklasse-veronica-vs-vanzilla',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.dk/serier/2626-hojs-hus/videoer/761-tissepause',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.se/serier/2626-lugn-i-stormen/videos/998-',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nick.ch/shows/2304-adventure-time-abenteuerzeit-mit-finn-und-jake',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.be/afspeellijst/4530-top-videos/videos/episode/73917-inval-broodschapper-lariekoek-arie',
+        'only_matching': True,
+    }]
+
+    def _extract_mrss_url(self, webpage, host):
+        return update_url_query(self._search_regex(
+            r'data-mrss=(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url'),
+            {'siteKey': host})
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+
+        webpage = self._download_webpage(url, video_id)
+
+        mrss_url = self._extract_mrss_url(webpage, host)
+
+        return self._get_videos_info_from_url(mrss_url, video_id)
+
+
+class NickNightIE(NickDeIE):
+    IE_NAME = 'nicknight'
+    _VALID_URL = r'https?://(?:www\.)?(?P<host>nicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nicknight.at/shows/977-awkward/videos/85987-nimmer-beste-freunde',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nicknight.at/shows/977-awkward',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nicknight.at/shows/1900-faking-it',
+        'only_matching': True,
+    }]
+
+    def _extract_mrss_url(self, webpage, *args):
+        return self._search_regex(
+            r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage,
+            'mrss url', group='url')
+
+
+class NickRuIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nickelodeonru'
+    _VALID_URL = r'https?://(?:www\.)?nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.fr/programmes/bob-l-eponge/videos/le-marathon-de-booh-kini-bottom-mardi-31-octobre/nfn7z0',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.es/videos/nickelodeon-consejos-tortitas/f7w7xy',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.pt/series/spongebob-squarepants/videos/a-bolha-de-tinta-gigante/xutq1b',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.ro/emisiuni/shimmer-si-shine/video/nahal-din-bomboane/uw5u2k',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.hu/musorok/spongyabob-kockanadrag/videok/episodes/buborekfujas-az-elszakadt-nadrag/q57iob#playlist/k6te4y',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nickelodeon.com.tr/programlar/sunger-bob/videolar/kayip-yatak/mgqbjy',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        mgid = self._extract_mgid(webpage)
+        return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
new file mode 100644 (file)
index 0000000..eb07ca7
--- /dev/null
@@ -0,0 +1,470 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import datetime
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    dict_get,
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    parse_duration,
+    parse_iso8601,
+    remove_start,
+    try_get,
+    unified_timestamp,
+    urlencode_postdata,
+    xpath_text,
+)
+
+
+class NiconicoIE(InfoExtractor):
+    IE_NAME = 'niconico'
+    IE_DESC = 'ニコニコ動画'
+
+    _TESTS = [{
+        'url': 'http://www.nicovideo.jp/watch/sm22312215',
+        'md5': 'd1a75c0823e2f629128c43e1212760f9',
+        'info_dict': {
+            'id': 'sm22312215',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'thumbnail': r're:https?://.*',
+            'uploader': 'takuya0301',
+            'uploader_id': '2698420',
+            'upload_date': '20131123',
+            'timestamp': int,  # timestamp is unstable
+            'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+            'duration': 33,
+            'view_count': int,
+            'comment_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        # Files downloaded with and without credentials differ, so omit
+        # the md5 field
+        'url': 'http://www.nicovideo.jp/watch/nm14296458',
+        'info_dict': {
+            'id': 'nm14296458',
+            'ext': 'swf',
+            'title': '【鏡音リン】Dance on media【オリジナル】take2!',
+            'description': 'md5:689f066d74610b3b22e0f1739add0f58',
+            'thumbnail': r're:https?://.*',
+            'uploader': 'りょうた',
+            'uploader_id': '18822557',
+            'upload_date': '20110429',
+            'timestamp': 1304065916,
+            'duration': 209,
+        },
+        'skip': 'Requires an account',
+    }, {
+        # Video exists but is marked as "deleted"
+        # md5 is unstable
+        'url': 'http://www.nicovideo.jp/watch/sm10000',
+        'info_dict': {
+            'id': 'sm10000',
+            'ext': 'unknown_video',
+            'description': 'deleted',
+            'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
+            'thumbnail': r're:https?://.*',
+            'upload_date': '20071224',
+            'timestamp': int,  # timestamp field has different value if logged in
+            'duration': 304,
+            'view_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        'url': 'http://www.nicovideo.jp/watch/so22543406',
+        'info_dict': {
+            'id': '1388129933',
+            'ext': 'mp4',
+            'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
+            'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
+            'thumbnail': r're:https?://.*',
+            'timestamp': 1388851200,
+            'upload_date': '20140104',
+            'uploader': 'アニメロチャンネル',
+            'uploader_id': '312',
+        },
+        'skip': 'The viewing period of the video you were searching for has expired.',
+    }, {
+        # video not available via `getflv`; "old" HTML5 video
+        'url': 'http://www.nicovideo.jp/watch/sm1151009',
+        'md5': '8fa81c364eb619d4085354eab075598a',
+        'info_dict': {
+            'id': 'sm1151009',
+            'ext': 'mp4',
+            'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
+            'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
+            'thumbnail': r're:https?://.*',
+            'duration': 184,
+            'timestamp': 1190868283,
+            'upload_date': '20070927',
+            'uploader': 'denden2',
+            'uploader_id': '1392194',
+            'view_count': int,
+            'comment_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        # "New" HTML5 video
+        # md5 is unstable
+        'url': 'http://www.nicovideo.jp/watch/sm31464864',
+        'info_dict': {
+            'id': 'sm31464864',
+            'ext': 'mp4',
+            'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
+            'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
+            'timestamp': 1498514060,
+            'upload_date': '20170626',
+            'uploader': 'ゲスト',
+            'uploader_id': '40826363',
+            'thumbnail': r're:https?://.*',
+            'duration': 198,
+            'view_count': int,
+            'comment_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        # Video without owner
+        'url': 'http://www.nicovideo.jp/watch/sm18238488',
+        'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e',
+        'info_dict': {
+            'id': 'sm18238488',
+            'ext': 'mp4',
+            'title': '【実写版】ミュータントタートルズ',
+            'description': 'md5:15df8988e47a86f9e978af2064bf6d8e',
+            'timestamp': 1341160408,
+            'upload_date': '20120701',
+            'uploader': None,
+            'uploader_id': None,
+            'thumbnail': r're:https?://.*',
+            'duration': 5271,
+            'view_count': int,
+            'comment_count': int,
+        },
+        'skip': 'Requires an account',
+    }, {
+        'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
+        'only_matching': True,
+    }]
+
+    _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
+    _NETRC_MACHINE = 'niconico'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        # No authentication to be performed
+        if not username:
+            return True
+
+        # Log in
+        login_ok = True
+        login_form_strs = {
+            'mail_tel': username,
+            'password': password,
+        }
+        urlh = self._request_webpage(
+            'https://account.nicovideo.jp/api/v1/login', None,
+            note='Logging in', errnote='Unable to log in',
+            data=urlencode_postdata(login_form_strs))
+        if urlh is False:
+            login_ok = False
+        else:
+            parts = compat_urlparse.urlparse(urlh.geturl())
+            if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
+                login_ok = False
+        if not login_ok:
+            self._downloader.report_warning('unable to log in: bad username or password')
+        return login_ok
+
+    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+        def yesno(boolean):
+            return 'yes' if boolean else 'no'
+
+        session_api_data = api_data['video']['dmcInfo']['session_api']
+        session_api_endpoint = session_api_data['urls'][0]
+
+        format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
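+        # e.g. (hypothetical quality ids) video 'archive_h264_360p' and audio
+        # 'archive_aac_64kbps' combine to format_id 'h264_360p-aac_64kbps'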
+
+        session_response = self._download_json(
+            session_api_endpoint['url'], video_id,
+            query={'_format': 'json'},
+            headers={'Content-Type': 'application/json'},
+            note='Downloading JSON metadata for %s' % format_id,
+            data=json.dumps({
+                'session': {
+                    'client_info': {
+                        'player_id': session_api_data['player_id'],
+                    },
+                    'content_auth': {
+                        'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
+                        'content_key_timeout': session_api_data['content_key_timeout'],
+                        'service_id': 'nicovideo',
+                        'service_user_id': session_api_data['service_user_id']
+                    },
+                    'content_id': session_api_data['content_id'],
+                    'content_src_id_sets': [{
+                        'content_src_ids': [{
+                            'src_id_to_mux': {
+                                'audio_src_ids': [audio_quality['id']],
+                                'video_src_ids': [video_quality['id']],
+                            }
+                        }]
+                    }],
+                    'content_type': 'movie',
+                    'content_uri': '',
+                    'keep_method': {
+                        'heartbeat': {
+                            'lifetime': session_api_data['heartbeat_lifetime']
+                        }
+                    },
+                    'priority': session_api_data['priority'],
+                    'protocol': {
+                        'name': 'http',
+                        'parameters': {
+                            'http_parameters': {
+                                'parameters': {
+                                    'http_output_download_parameters': {
+                                        'use_ssl': yesno(session_api_endpoint['is_ssl']),
+                                        'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
+                                    }
+                                }
+                            }
+                        }
+                    },
+                    'recipe_id': session_api_data['recipe_id'],
+                    'session_operation_auth': {
+                        'session_operation_auth_by_signature': {
+                            'signature': session_api_data['signature'],
+                            'token': session_api_data['token'],
+                        }
+                    },
+                    'timing_constraint': 'unlimited'
+                }
+            }).encode())
+
+        resolution = video_quality.get('resolution', {})
+
+        return {
+            'url': session_response['data']['session']['content_uri'],
+            'format_id': format_id,
+            'ext': 'mp4',  # the Session API is used by the HTML5 player, which always serves mp4
+            'abr': float_or_none(audio_quality.get('bitrate'), 1000),
+            'vbr': float_or_none(video_quality.get('bitrate'), 1000),
+            'height': resolution.get('height'),
+            'width': resolution.get('width'),
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Get video webpage. We are not actually interested in it for normal
+        # cases, but need the cookies in order to be able to download the
+        # info webpage
+        webpage, handle = self._download_webpage_handle(
+            'http://www.nicovideo.jp/watch/' + video_id, video_id)
+        if video_id.startswith('so'):
+            video_id = self._match_id(handle.geturl())
+
+        api_data = self._parse_json(self._html_search_regex(
+            'data-api-data="([^"]+)"', webpage,
+            'API data', default='{}'), video_id)
+
+        def _format_id_from_url(video_url):
+            return 'economy' if video_url.endswith('low') else 'normal'
+
+        try:
+            video_real_url = api_data['video']['smileInfo']['url']
+        except KeyError:  # Flash videos
+            # Get flv info
+            flv_info_webpage = self._download_webpage(
+                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
+                video_id, 'Downloading flv info')
+
+            flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+            if 'url' not in flv_info:
+                if 'deleted' in flv_info:
+                    raise ExtractorError('The video has been deleted.',
+                                         expected=True)
+                elif 'closed' in flv_info:
+                    raise ExtractorError('Niconico videos now require logging in',
+                                         expected=True)
+                elif 'error' in flv_info:
+                    raise ExtractorError('%s reports error: %s' % (
+                        self.IE_NAME, flv_info['error'][0]), expected=True)
+                else:
+                    raise ExtractorError('Unable to find video URL')
+
+            video_info_xml = self._download_xml(
+                'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+                video_id, note='Downloading video info page')
+
+            def get_video_info(items):
+                if not isinstance(items, list):
+                    items = [items]
+                for item in items:
+                    ret = xpath_text(video_info_xml, './/' + item)
+                    if ret:
+                        return ret
+
+            video_real_url = flv_info['url'][0]
+
+            extension = get_video_info('movie_type')
+            if not extension:
+                extension = determine_ext(video_real_url)
+
+            formats = [{
+                'url': video_real_url,
+                'ext': extension,
+                'format_id': _format_id_from_url(video_real_url),
+            }]
+        else:
+            formats = []
+
+            dmc_info = api_data['video'].get('dmcInfo')
+            if dmc_info:  # "New" HTML5 videos
+                quality_info = dmc_info['quality']
+                for audio_quality in quality_info['audios']:
+                    for video_quality in quality_info['videos']:
+                        if not audio_quality['available'] or not video_quality['available']:
+                            continue
+                        formats.append(self._extract_format_for_quality(
+                            api_data, video_id, audio_quality, video_quality))
+
+                self._sort_formats(formats)
+            else:  # "Old" HTML5 videos
+                formats = [{
+                    'url': video_real_url,
+                    'ext': 'mp4',
+                    'format_id': _format_id_from_url(video_real_url),
+                }]
+
+            def get_video_info(items):
+                return dict_get(api_data['video'], items)
+
+        # Start extracting information
+        title = get_video_info('title')
+        if not title:
+            title = self._og_search_title(webpage, default=None)
+        if not title:
+            title = self._html_search_regex(
+                r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
+                webpage, 'video title')
+
+        watch_api_data_string = self._html_search_regex(
+            r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
+            webpage, 'watch api data', default=None)
+        watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
+        video_detail = watch_api_data.get('videoDetail', {})
+
+        thumbnail = (
+            get_video_info(['thumbnail_url', 'thumbnailURL'])
+            or self._html_search_meta('image', webpage, 'thumbnail', default=None)
+            or video_detail.get('thumbnail'))
+
+        description = get_video_info('description')
+
+        timestamp = (parse_iso8601(get_video_info('first_retrieve'))
+                     or unified_timestamp(get_video_info('postedDateTime')))
+        if not timestamp:
+            match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
+            if match:
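+                # datePublished apparently lacks seconds (e.g.
+                # '2017-06-26T17:14+09:00'); splice in ':00' before the
+                # timezone offset so parse_iso8601 accepts it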
+                timestamp = parse_iso8601(match.replace('+', ':00+'))
+        if not timestamp and video_detail.get('postedAt'):
+            timestamp = parse_iso8601(
+                video_detail['postedAt'].replace('/', '-'),
+                delimiter=' ', timezone=datetime.timedelta(hours=9))
+
+        view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
+        if not view_count:
+            match = self._html_search_regex(
+                r'>Views: <strong[^>]*>([^<]+)</strong>',
+                webpage, 'view count', default=None)
+            if match:
+                view_count = int_or_none(match.replace(',', ''))
+        view_count = view_count or video_detail.get('viewCount')
+
+        comment_count = (int_or_none(get_video_info('comment_num'))
+                         or video_detail.get('commentCount')
+                         or try_get(api_data, lambda x: x['thread']['commentCount']))
+        if not comment_count:
+            match = self._html_search_regex(
+                r'>Comments: <strong[^>]*>([^<]+)</strong>',
+                webpage, 'comment count', default=None)
+            if match:
+                comment_count = int_or_none(match.replace(',', ''))
+
+        duration = (parse_duration(
+            get_video_info('length')
+            or self._html_search_meta(
+                'video:duration', webpage, 'video duration', default=None))
+            or video_detail.get('length')
+            or get_video_info('duration'))
+
+        webpage_url = get_video_info('watch_url') or url
+
+        # Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
+        # in the JSON, which will cause None to be returned instead of {}.
+        owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
+        uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
+        uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'duration': duration,
+            'webpage_url': webpage_url,
+        }
+
+
+class NiconicoPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.nicovideo.jp/mylist/27411728',
+        'info_dict': {
+            'id': '27411728',
+            'title': 'AKB48のオールナイトニッポン',
+        },
+        'playlist_mincount': 225,
+    }
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        webpage = self._download_webpage(url, list_id)
+
+        entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);',
+                                          webpage, 'entries')
+        entries = json.loads(entries_json)
+        entries = [{
+            '_type': 'url',
+            'ie_key': NiconicoIE.ie_key(),
+            'url': ('http://www.nicovideo.jp/watch/%s' %
+                    entry['item_data']['video_id']),
+        } for entry in entries]
+
+        return {
+            '_type': 'playlist',
+            'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),
+            'id': list_id,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
new file mode 100644 (file)
index 0000000..65754c5
--- /dev/null
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    float_or_none,
+    ExtractorError,
+    int_or_none,
+)
+
+
+class NineCNineMediaIE(InfoExtractor):
+    IE_NAME = '9c9media'
+    _GEO_COUNTRIES = ['CA']
+    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'
+    _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/'
+
+    def _real_extract(self, url):
+        destination_code, content_id = re.match(self._VALID_URL, url).groups()
+        api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id)
+        content = self._download_json(api_base_url, content_id, query={
+            '$include': '[Media,Season,ContentPackages]',
+        })
+        title = content['Name']
+        if len(content['ContentPackages']) > 1:
+            raise ExtractorError('multiple content packages')
+        content_package = content['ContentPackages'][0]
+        package_id = content_package['Id']
+        content_package_url = api_base_url + 'contentpackages/%s/' % package_id
+        content_package = self._download_json(
+            content_package_url, content_id, query={
+                '$include': '[HasClosedCaptions]',
+            })
+
+        if content_package.get('Constraints', {}).get('Security', {}).get('Type'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        manifest_base_url = content_package_url + 'manifest.'
+        formats = []
+        formats.extend(self._extract_m3u8_formats(
+            manifest_base_url + 'm3u8', content_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False))
+        formats.extend(self._extract_f4m_formats(
+            manifest_base_url + 'f4m', content_id,
+            f4m_id='hds', fatal=False))
+        formats.extend(self._extract_mpd_formats(
+            manifest_base_url + 'mpd', content_id,
+            mpd_id='dash', fatal=False))
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for image in content.get('Images', []):
+            image_url = image.get('Url')
+            if not image_url:
+                continue
+            thumbnails.append({
+                'url': image_url,
+                'width': int_or_none(image.get('Width')),
+                'height': int_or_none(image.get('Height')),
+            })
+
+        tags, categories = [], []
+        for source_name, container in (('Tags', tags), ('Genres', categories)):
+            for e in content.get(source_name, []):
+                e_name = e.get('Name')
+                if not e_name:
+                    continue
+                container.append(e_name)
+
+        season = content.get('Season', {})
+
+        info = {
+            'id': content_id,
+            'title': title,
+            'description': content.get('Desc') or content.get('ShortDesc'),
+            'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
+            'episode_number': int_or_none(content.get('Episode')),
+            'season': season.get('Name'),
+            'season_number': season.get('Number'),
+            'season_id': season.get('Id'),
+            'series': content.get('Media', {}).get('Name'),
+            'tags': tags,
+            'categories': categories,
+            'duration': float_or_none(content_package.get('Duration')),
+            'formats': formats,
+        }
+
+        if content_package.get('HasClosedCaptions'):
+            info['subtitles'] = {
+                'en': [{
+                    'url': manifest_base_url + 'vtt',
+                    'ext': 'vtt',
+                }, {
+                    'url': manifest_base_url + 'srt',
+                    'ext': 'srt',
+                }]
+            }
+
+        return info
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
new file mode 100644 (file)
index 0000000..dc6a27d
--- /dev/null
@@ -0,0 +1,104 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import str_to_int
+
+
+class NineGagIE(InfoExtractor):
+    IE_NAME = '9gag'
+    _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'
+
+    _TESTS = [{
+        'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome',
+        'info_dict': {
+            'id': 'kXzwOKyGlSA',
+            'ext': 'mp4',
+            'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)',
+            'title': '\"People Are Awesome 2013\" Is Absolutely Awesome',
+            'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
+            'uploader': 'CompilationChannel',
+            'upload_date': '20131110',
+            'view_count': int,
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://9gag.com/tv/p/aKolP3',
+        'info_dict': {
+            'id': 'aKolP3',
+            'ext': 'mp4',
+            'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video',
+            'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!",
+            'uploader_id': 'rickmereki',
+            'uploader': 'Rick Mereki',
+            'upload_date': '20110803',
+            'view_count': int,
+        },
+        'add_ie': ['Vimeo'],
+    }, {
+        'url': 'http://9gag.com/tv/p/KklwM',
+        'only_matching': True,
+    }, {
+        'url': 'http://9gag.tv/p/Kk2X5',
+        'only_matching': True,
+    }, {
+        'url': 'http://9gag.com/tv/embed/a5Dmvl',
+        'only_matching': True,
+    }]
+
+    _EXTERNAL_VIDEO_PROVIDER = {
+        '1': {
+            'url': '%s',
+            'ie_key': 'Youtube',
+        },
+        '2': {
+            'url': 'http://player.vimeo.com/video/%s',
+            'ie_key': 'Vimeo',
+        },
+        '3': {
+            'url': 'http://instagram.com/p/%s',
+            'ie_key': 'Instagram',
+        },
+        '4': {
+            'url': 'http://vine.co/v/%s',
+            'ie_key': 'Vine',
+        },
+    }
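+    # e.g. provider '2' with a hypothetical external id '123' resolves to
+    # http://player.vimeo.com/video/123 and is delegated to the Vimeo extractor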
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        post_view = self._parse_json(
+            self._search_regex(
+                r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost',
+                webpage, 'post view'),
+            display_id)
+
+        ie_key = None
+        source_url = post_view.get('sourceUrl')
+        if not source_url:
+            external_video_id = post_view['videoExternalId']
+            external_video_provider = post_view['videoExternalProvider']
+            source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id
+            ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key']
+        title = post_view['title']
+        description = post_view.get('description')
+        view_count = str_to_int(post_view.get('externalView'))
+        thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
+
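+        # 'url_transparent' hands extraction over to the external provider's
+        # extractor while keeping the metadata gathered from the 9gag page.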
+        return {
+            '_type': 'url_transparent',
+            'url': source_url,
+            'ie_key': ie_key,
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'view_count': view_count,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py
new file mode 100644
index 0000000..6157dc7
--- /dev/null
+++ b/youtube_dl/extractor/ninenow.py
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    smuggle_url,
+)
+
+
+class NineNowIE(InfoExtractor):
+    IE_NAME = '9now.com.au'
+    _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)'
+    _GEO_COUNTRIES = ['AU']
+    _TESTS = [{
+        # clip
+        'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc',
+        'md5': '17cf47d63ec9323e562c9957a968b565',
+        'info_dict': {
+            'id': '16801',
+            'ext': 'mp4',
+            'title': 'St. Kilda\'s Joey Montagna on the potential for a player\'s strike',
+            'description': 'Is a boycott of the NAB Cup "on the table"?',
+            'uploader_id': '4460760524001',
+            'upload_date': '20160713',
+            'timestamp': 1468421266,
+        },
+        'skip': 'Only available in Australia',
+    }, {
+        # episode
+        'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19',
+        'only_matching': True,
+    }, {
+        # DRM protected
+        'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        page_data = self._parse_json(self._search_regex(
+            r'window\.__data\s*=\s*({.*?});', webpage,
+            'page data', default='{}'), display_id, fatal=False)
+        if not page_data:
+            page_data = self._parse_json(self._parse_json(self._search_regex(
+                r'window\.__data\s*=\s*JSON\.parse\s*\(\s*(".+?")\s*\)\s*;',
+                webpage, 'page data'), display_id), display_id)
+
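+        # for/else: the else branch only runs when no break occurred, i.e.
+        # when neither an episode nor a clip entry was found in the page data.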
+        for kind in ('episode', 'clip'):
+            current_key = page_data.get(kind, {}).get(
+                'current%sKey' % kind.capitalize())
+            if not current_key:
+                continue
+            cache = page_data.get(kind, {}).get('%sCache' % kind, {})
+            if not cache:
+                continue
+            common_data = (cache.get(current_key) or list(cache.values())[0])[kind]
+            break
+        else:
+            raise ExtractorError('Unable to find video data')
+
+        video_data = common_data['video']
+
+        if video_data.get('drm'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId']
+        video_id = compat_str(video_data.get('id') or brightcove_id)
+        title = common_data['name']
+
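+        # Thumbnail size keys appear to be of the form 'w<width>', so the
+        # leading letter is stripped to recover the width.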
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+            'width': int_or_none(thumbnail_id[1:])
+        } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()]
+
+        return {
+            '_type': 'url_transparent',
+            'url': smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+                {'geo_countries': self._GEO_COUNTRIES}),
+            'id': video_id,
+            'title': title,
+            'description': common_data.get('description'),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'thumbnails': thumbnails,
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/nintendo.py b/youtube_dl/extractor/nintendo.py
new file mode 100644
index 0000000..ff8f70b
--- /dev/null
+++ b/youtube_dl/extractor/nintendo.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class NintendoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/',
+        'info_dict': {
+            'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
+            'ext': 'flv',
+            'title': 'Duck Hunt Wii U VC NES - Trailer',
+            'duration': 60.326,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u',
+        'info_dict': {
+            'id': 'tokyo-mirage-sessions-fe-wii-u',
+            'title': 'Tokyo Mirage Sessions ♯FE',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/',
+        'info_dict': {
+            'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V',
+            'ext': 'mp4',
+            'title': 'Switch_ROS_ND0904-H264.mov',
+            'duration': 2324.758,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+
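+        # The (["\'])...(?:(?!\1).)+\1 pattern matches an attribute value in
+        # either quote style: group 1 captures the opening quote and the
+        # backreference \1 requires the same quote to close it.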
+        entries = [
+            OoyalaIE._build_url_result(m.group('code'))
+            for m in re.finditer(
+                r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)]
+
+        title = self._html_search_regex(
+            r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>',
+            webpage, 'title', fatal=False)
+
+        return self.playlist_result(
+            entries, page_id, title)
diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py
new file mode 100644
index 0000000..025c5d2
--- /dev/null
+++ b/youtube_dl/extractor/njpwworld.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    extract_attributes,
+    get_element_by_class,
+    urlencode_postdata,
+)
+
+
+class NJPWWorldIE(InfoExtractor):
+    _VALID_URL = r'https?://njpwworld\.com/p/(?P<id>[a-z0-9_]+)'
+    IE_DESC = '新日本プロレスワールド'
+    _NETRC_MACHINE = 'njpwworld'
+
+    _TEST = {
+        'url': 'http://njpwworld.com/p/s_series_00155_1_9/',
+        'info_dict': {
+            'id': 's_series_00155_1_9',
+            'ext': 'mp4',
+            'title': '第9試合 ランディ・サベージ vs リック・スタイナー',
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,  # AES-encrypted m3u8
+        },
+        'skip': 'Requires login',
+    }
+
+    _LOGIN_URL = 'https://front.njpwworld.com/auth/login'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        # No authentication to be performed
+        if not username:
+            return True
+
+        # Setup session (will set necessary cookies)
+        self._request_webpage(
+            'https://njpwworld.com/', None, note='Setting up session')
+
+        webpage, urlh = self._download_webpage_handle(
+            self._LOGIN_URL, None,
+            note='Logging in', errnote='Unable to login',
+            data=urlencode_postdata({'login_id': username, 'pw': password}),
+            headers={'Referer': 'https://front.njpwworld.com/auth'})
+        # /auth/login will return 302 for successful logins
+        if urlh.geturl() == self._LOGIN_URL:
+            self.report_warning('unable to login')
+            return False
+
+        return True
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = []
+        for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage):
+            player = extract_attributes(mobj.group(0))
+            player_path = player.get('href')
+            if not player_path:
+                continue
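+            # Player links carry a class ending in 'low' or 'high'; map that
+            # to a numeric quality below so the high variant sorts first.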
+            kind = self._search_regex(
+                r'(low|high)$', player.get('class') or '', 'kind',
+                default='low')
+            player_url = compat_urlparse.urljoin(url, player_path)
+            player_page = self._download_webpage(
+                player_url, video_id, note='Downloading player page')
+            entries = self._parse_html5_media_entries(
+                player_url, player_page, video_id, m3u8_id='hls-%s' % kind,
+                m3u8_entry_protocol='m3u8_native')
+            kind_formats = entries[0]['formats']
+            for f in kind_formats:
+                f['quality'] = 2 if kind == 'high' else 1
+            formats.extend(kind_formats)
+
+        self._sort_formats(formats)
+
+        post_content = get_element_by_class('post-content', webpage)
+        tags = re.findall(
+            r'<li[^>]+class="tag-[^"]+"><a[^>]*>([^<]+)</a></li>', post_content
+        ) if post_content else None
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'formats': formats,
+            'tags': tags,
+        }
diff --git a/youtube_dl/extractor/nobelprize.py b/youtube_dl/extractor/nobelprize.py
new file mode 100644
index 0000000..4dfdb09
--- /dev/null
+++ b/youtube_dl/extractor/nobelprize.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    mimetype2ext,
+    determine_ext,
+    update_url_query,
+    get_element_by_attribute,
+    int_or_none,
+)
+
+
+class NobelPrizeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nobelprize\.org/mediaplayer.*?\bid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.nobelprize.org/mediaplayer/?id=2636',
+        'md5': '04c81e5714bb36cc4e2232fee1d8157f',
+        'info_dict': {
+            'id': '2636',
+            'ext': 'mp4',
+            'title': 'Announcement of the 2016 Nobel Prize in Physics',
+            'description': 'md5:05beba57f4f5a4bbd4cf2ef28fcff739',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        media = self._parse_json(self._search_regex(
+            r'(?s)var\s*config\s*=\s*({.+?});', webpage,
+            'config'), video_id, js_to_json)['media']
+        title = media['title']
+
+        formats = []
+        for source in media.get('source', []):
+            source_src = source.get('src')
+            if not source_src:
+                continue
+            ext = mimetype2ext(source.get('type')) or determine_ext(source_src)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_src, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    update_url_query(source_src, {'hdcore': '3.7.0'}),
+                    video_id, f4m_id='hds', fatal=False))
+            else:
+                formats.append({
+                    'url': source_src,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': get_element_by_attribute('itemprop', 'description', webpage),
+            'duration': int_or_none(media.get('duration')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
new file mode 100644
index 0000000..30df905
--- /dev/null
+++ b/youtube_dl/extractor/noco.py
@@ -0,0 +1,235 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+import hashlib
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    sanitized_Request,
+    urlencode_postdata,
+)
+
+
+class NocoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
+    _LOGIN_URL = 'https://noco.tv/do.php'
+    _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s'
+    _SUB_LANG_TEMPLATE = '&sub_lang=%s'
+    _NETRC_MACHINE = 'noco'
+
+    _TESTS = [
+        {
+            'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+            'md5': '0a993f0058ddbcd902630b2047ef710e',
+            'info_dict': {
+                'id': '11538',
+                'ext': 'mp4',
+                'title': 'Ami Ami Idol - Hello! France',
+                'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+                'upload_date': '20140412',
+                'uploader': 'Nolife',
+                'uploader_id': 'NOL',
+                'duration': 2851.2,
+            },
+            'skip': 'Requires noco account',
+        },
+        {
+            'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call',
+            'md5': 'c190f1f48e313c55838f1f412225934d',
+            'info_dict': {
+                'id': '12610',
+                'ext': 'mp4',
+                'title': 'The Guild #1 - Wake-Up Call',
+                'timestamp': 1403863200,
+                'upload_date': '20140627',
+                'uploader': 'LBL42',
+                'uploader_id': 'LBL',
+                'duration': 233.023,
+            },
+            'skip': 'Requires noco account',
+        }
+    ]
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login = self._download_json(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata({
+                'a': 'login',
+                'cookie': '1',
+                'username': username,
+                'password': password,
+            }),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+            })
+
+        if 'erreur' in login:
+            raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True)
+
+    @staticmethod
+    def _ts():
+        return int(time.time() * 1000)
+
+    def _call_api(self, path, video_id, note, sub_lang=None):
+        ts = compat_str(self._ts() + self._ts_offset)
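+        # Requests are signed with tk = md5(md5(ts) + secret), where ts is the
+        # local epoch in milliseconds shifted by the server offset measured in
+        # _real_extract.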
+        tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest()
+        url = self._API_URL_TEMPLATE % (path, ts, tk)
+        if sub_lang:
+            url += self._SUB_LANG_TEMPLATE % sub_lang
+
+        request = sanitized_Request(url)
+        request.add_header('Referer', self._referer)
+
+        resp = self._download_json(request, video_id, note)
+
+        if isinstance(resp, dict) and resp.get('error'):
+            self._raise_error(resp['error'], resp['description'])
+
+        return resp
+
+    def _raise_error(self, error, description):
+        raise ExtractorError(
+            '%s returned error: %s - %s' % (self.IE_NAME, error, description),
+            expected=True)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # The offset between server time and local time must be measured so
+        # that all API requests carry timestamps as close as possible to the
+        # server's (see https://github.com/ytdl-org/youtube-dl/issues/7864)
+        webpage = self._download_webpage(url, video_id)
+
+        player_url = self._search_regex(
+            r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1',
+            webpage, 'noco player', group='player',
+            default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf')
+
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(player_url).query)
+        ts = int_or_none(qs.get('ts', [None])[0])
+        self._ts_offset = ts - self._ts() if ts else 0
+        self._referer = player_url
+
+        medias = self._call_api(
+            'shows/%s/medias' % video_id,
+            video_id, 'Downloading video JSON')
+
+        show = self._call_api(
+            'shows/by_id/%s' % video_id,
+            video_id, 'Downloading show JSON')[0]
+
+        options = self._call_api(
+            'users/init', video_id,
+            'Downloading user options JSON')['options']
+        audio_lang_pref = options.get('audio_language') or options.get('language', 'fr')
+
+        if audio_lang_pref == 'original':
+            audio_lang_pref = show['original_lang']
+        if len(medias) == 1:
+            audio_lang_pref = list(medias.keys())[0]
+        elif audio_lang_pref not in medias:
+            audio_lang_pref = 'fr'
+
+        qualities = self._call_api(
+            'qualities',
+            video_id, 'Downloading qualities JSON')
+
+        formats = []
+
+        for audio_lang, audio_lang_dict in medias.items():
+            preference = 1 if audio_lang == audio_lang_pref else 0
+            for sub_lang, lang_dict in audio_lang_dict['video_list'].items():
+                for format_id, fmt in lang_dict['quality_list'].items():
+                    format_id_extended = 'audio-%s_sub-%s_%s' % (audio_lang, sub_lang, format_id)
+
+                    video = self._call_api(
+                        'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang),
+                        video_id, 'Downloading %s video JSON' % format_id_extended,
+                        sub_lang if sub_lang != 'none' else None)
+
+                    file_url = video['file']
+                    if not file_url:
+                        continue
+
+                    if file_url in ['forbidden', 'not found']:
+                        popmessage = video['popmessage']
+                        self._raise_error(popmessage['title'], popmessage['message'])
+
+                    formats.append({
+                        'url': file_url,
+                        'format_id': format_id_extended,
+                        'width': int_or_none(fmt.get('res_width')),
+                        'height': int_or_none(fmt.get('res_lines')),
+                        'abr': int_or_none(fmt.get('audiobitrate'), 1000),
+                        'vbr': int_or_none(fmt.get('videobitrate'), 1000),
+                        'filesize': int_or_none(fmt.get('filesize')),
+                        'format_note': qualities[format_id].get('quality_name'),
+                        'quality': qualities[format_id].get('priority'),
+                        'preference': preference,
+                    })
+
+        self._sort_formats(formats)
+
+        timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ')
+
+        if timestamp is not None and timestamp < 0:
+            timestamp = None
+
+        uploader = show.get('partner_name')
+        uploader_id = show.get('partner_key')
+        duration = float_or_none(show.get('duration_ms'), 1000)
+
+        thumbnails = []
+        for thumbnail_key, thumbnail_url in show.items():
+            m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key)
+            if not m:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+
+        episode = show.get('show_TT') or show.get('show_OT')
+        family = show.get('family_TT') or show.get('family_OT')
+        episode_number = show.get('episode_number')
+
+        title = ''
+        if family:
+            title += family
+        if episode_number:
+            title += ' #' + compat_str(episode_number)
+        if episode:
+            title += ' - ' + compat_str(episode)
+
+        description = show.get('show_resume') or show.get('family_resume')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'formats': formats,
+        }
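
The request signing used by NocoIE above is simple enough to exercise on its own; a minimal sketch of the same derivation (the secret suffix is copied from _call_api, and the server offset is assumed to be zero):

    import hashlib
    import time

    def noco_token(ts_offset=0, secret='#8S?uCraTedap6a'):
        # local epoch in milliseconds, shifted toward server time
        ts = str(int(time.time() * 1000) + ts_offset)
        inner = hashlib.md5(ts.encode('ascii')).hexdigest()
        return ts, hashlib.md5((inner + secret).encode('ascii')).hexdigest()

    ts, tk = noco_token()
    print('https://api.noco.tv/1.1/qualities?ts=%s&tk=%s' % (ts, tk))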
diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py
new file mode 100644
index 0000000..ca1424e
--- /dev/null
+++ b/youtube_dl/extractor/nonktube.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+from .nuevo import NuevoBaseIE
+
+
+class NonkTubeIE(NuevoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized',
+        'info_dict': {
+            'id': '118636',
+            'ext': 'mp4',
+            'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized',
+            'age_limit': 18,
+            'duration': 1150.98,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.nonktube.com/embed/118636',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+
+        info.update({
+            'id': video_id,
+            'title': title,
+            'age_limit': 18,
+        })
+        return info
diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py
new file mode 100644
index 0000000..b40770d
--- /dev/null
+++ b/youtube_dl/extractor/noovo.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    smuggle_url,
+    try_get,
+)
+
+
+class NoovoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
+    _TESTS = [{
+        # clip
+        'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
+        'info_dict': {
+            'id': '5386045029001',
+            'ext': 'mp4',
+            'title': 'Chrysler Imperial',
+            'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
+            'timestamp': 1491399228,
+            'upload_date': '20170405',
+            'uploader_id': '618566855001',
+            'series': 'RPM+',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # episode
+        'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
+        'info_dict': {
+            'id': '5395865725001',
+            'title': 'Épisode 13 : Les retrouvailles',
+            'description': 'md5:888c3330f0c1b4476c5bc99a1c040473',
+            'ext': 'mp4',
+            'timestamp': 1492019320,
+            'upload_date': '20170412',
+            'uploader_id': '618566855001',
+            'series': "L'amour est dans le pré",
+            'season_number': 5,
+            'episode': 'Épisode 13',
+            'episode_number': 13,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        brightcove_id = self._search_regex(
+            r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
+
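+        # The page pushes its metadata into dataLayer as a JavaScript object
+        # literal, so it has to go through js_to_json before JSON parsing.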
+        data = self._parse_json(
+            self._search_regex(
+                r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+
+        title = try_get(
+            data, lambda x: x['video']['nom'],
+            compat_str) or self._html_search_meta(
+            'dcterms.Title', webpage, 'title', fatal=True)
+
+        description = self._html_search_meta(
+            ('dcterms.Description', 'description'), webpage, 'description')
+
+        series = try_get(
+            data, lambda x: x['emission']['nom']) or self._search_regex(
+            r'<div[^>]+class="banner-card__subtitle h4"[^>]*>([^<]+)',
+            webpage, 'series', default=None)
+
+        season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {}
+        season = try_get(season_el, lambda x: x['nom'], compat_str)
+        season_number = int_or_none(try_get(season_el, lambda x: x['numero']))
+
+        episode_el = try_get(season_el, lambda x: x['episode'], dict) or {}
+        episode = try_get(episode_el, lambda x: x['nom'], compat_str)
+        episode_number = int_or_none(try_get(episode_el, lambda x: x['numero']))
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': BrightcoveNewIE.ie_key(),
+            'url': smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+                {'geo_countries': ['CA']}),
+            'id': brightcove_id,
+            'title': title,
+            'description': description,
+            'series': series,
+            'season': season,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+        }
diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py
new file mode 100644
index 0000000..61fe571
--- /dev/null
+++ b/youtube_dl/extractor/normalboots.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .jwplatform import JWPlatformIE
+
+from ..utils import (
+    unified_strdate,
+)
+
+
+class NormalbootsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$'
+    _TEST = {
+        'url': 'http://normalboots.com/video/home-alone-games-jontron/',
+        'info_dict': {
+            'id': 'home-alone-games-jontron',
+            'ext': 'mp4',
+            'title': 'Home Alone Games - JonTron - NormalBoots',
+            'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
+            'uploader': 'JonTron',
+            'upload_date': '20140125',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['JWPlatform'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_uploader = self._html_search_regex(
+            r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
+            webpage, 'uploader', fatal=False)
+        video_upload_date = unified_strdate(self._html_search_regex(
+            r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
+            webpage, 'date', fatal=False))
+
+        jwplatform_url = JWPlatformIE._extract_url(webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': jwplatform_url,
+            'ie_key': JWPlatformIE.ie_key(),
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+        }
diff --git a/youtube_dl/extractor/nosvideo.py b/youtube_dl/extractor/nosvideo.py
new file mode 100644
index 0000000..53c500c
--- /dev/null
+++ b/youtube_dl/extractor/nosvideo.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    sanitized_Request,
+    urlencode_postdata,
+    xpath_text,
+    xpath_with_ns,
+)
+
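+# Shorthand for qualifying XPath expressions with the XSPF namespace.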
+_x = lambda p: xpath_with_ns(p, {'xspf': 'http://xspf.org/ns/0/'})
+
+
+class NosVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nosvideo\.com/' + \
+                 r'(?:embed/|\?v=)(?P<id>[A-Za-z0-9]{12})/?'
+    _PLAYLIST_URL = 'http://nosvideo.com/xml/{xml_id:s}.xml'
+    _FILE_DELETED_REGEX = r'<b>File Not Found</b>'
+    _TEST = {
+        'url': 'http://nosvideo.com/?v=mu8fle7g7rpq',
+        'md5': '6124ed47130d8be3eacae635b071e6b6',
+        'info_dict': {
+            'id': 'mu8fle7g7rpq',
+            'ext': 'mp4',
+            'title': 'big_buck_bunny_480p_surround-fix.avi.mp4',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        fields = {
+            'id': video_id,
+            'op': 'download1',
+            'method_free': 'Continue to Video',
+        }
+        req = sanitized_Request(url, urlencode_postdata(fields))
+        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+        webpage = self._download_webpage(req, video_id,
+                                         'Downloading download page')
+        if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
+            raise ExtractorError('Video %s does not exist' % video_id,
+                                 expected=True)
+
+        xml_id = self._search_regex(r'php\|([^\|]+)\|', webpage, 'XML ID')
+        playlist_url = self._PLAYLIST_URL.format(xml_id=xml_id)
+        playlist = self._download_xml(playlist_url, video_id)
+
+        track = playlist.find(_x('.//xspf:track'))
+        if track is None:
+            raise ExtractorError(
+                'XML playlist is missing the \'track\' element',
+                expected=True)
+        title = xpath_text(track, _x('./xspf:title'), 'title')
+        video_url = xpath_text(track, _x('./xspf:file'), 'URL', fatal=True)
+        thumbnail = xpath_text(track, _x('./xspf:image'), 'thumbnail')
+        if title is not None:
+            title = title.strip()
+
+        formats = [{
+            'format_id': 'sd',
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
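
The namespace-qualified XPath lookups in NosVideoIE can be tried in isolation; a minimal sketch against an invented XSPF snippet (the real playlist is fetched from nosvideo.com/xml/):

    import xml.etree.ElementTree as ET

    NS = {'xspf': 'http://xspf.org/ns/0/'}
    doc = ET.fromstring(
        '<playlist xmlns="http://xspf.org/ns/0/"><trackList><track>'
        '<title>demo</title><file>http://example.com/v.mp4</file>'
        '</track></trackList></playlist>')
    track = doc.find('.//xspf:track', NS)
    print(track.find('./xspf:title', NS).text)  # demo
    print(track.find('./xspf:file', NS).text)   # http://example.com/v.mp4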
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
new file mode 100644
index 0000000..47b9748
--- /dev/null
+++ b/youtube_dl/extractor/nova.py
@@ -0,0 +1,305 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    qualities,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class NovaEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1',
+        'md5': 'ee009bafcc794541570edd44b71cbea3',
+        'info_dict': {
+            'id': '8o0n0r',
+            'ext': 'mp4',
+            'title': '2180. díl',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2578,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        duration = None
+        formats = []
+
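+        # Newer embeds initialise the page via Player.init(...) with a JSON
+        # options object whose 'tracks' map lists the streams; when that is
+        # absent, fall back to the legacy 'bitrates' variable further below.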
+        player = self._parse_json(
+            self._search_regex(
+                r'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;',
+                webpage, 'player', default='{}'), video_id, fatal=False)
+        if player:
+            for format_id, format_list in player['tracks'].items():
+                if not isinstance(format_list, list):
+                    format_list = [format_list]
+                for format_dict in format_list:
+                    if not isinstance(format_dict, dict):
+                        continue
+                    format_url = url_or_none(format_dict.get('src'))
+                    if not format_url:
+                        continue
+                    format_type = format_dict.get('type')
+                    ext = determine_ext(format_url)
+                    if (format_type == 'application/x-mpegURL'
+                            or format_id == 'HLS' or ext == 'm3u8'):
+                        formats.extend(self._extract_m3u8_formats(
+                            format_url, video_id, 'mp4',
+                            entry_protocol='m3u8_native', m3u8_id='hls',
+                            fatal=False))
+                    elif (format_type == 'application/dash+xml'
+                          or format_id == 'DASH' or ext == 'mpd'):
+                        formats.extend(self._extract_mpd_formats(
+                            format_url, video_id, mpd_id='dash', fatal=False))
+                    else:
+                        formats.append({
+                            'url': format_url,
+                        })
+            duration = int_or_none(player.get('duration'))
+        else:
+            # Legacy extraction path, no longer used by the site as of 08.04.2020
+            bitrates = self._parse_json(
+                self._search_regex(
+                    r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'),
+                video_id, transform_source=js_to_json)
+
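+            # qualities() returns a ranking function over the listed names
+            # (lq lowest, hd highest), used below to order the MP4 variants.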
+            QUALITIES = ('lq', 'mq', 'hq', 'hd')
+            quality_key = qualities(QUALITIES)
+
+            for format_id, format_list in bitrates.items():
+                if not isinstance(format_list, list):
+                    format_list = [format_list]
+                for format_url in format_list:
+                    format_url = url_or_none(format_url)
+                    if not format_url:
+                        continue
+                    if format_id == 'hls':
+                        formats.extend(self._extract_m3u8_formats(
+                            format_url, video_id, ext='mp4',
+                            entry_protocol='m3u8_native', m3u8_id='hls',
+                            fatal=False))
+                        continue
+                    f = {
+                        'url': format_url,
+                    }
+                    f_id = format_id
+                    for quality in QUALITIES:
+                        if '%s.mp4' % quality in format_url:
+                            f_id += '-%s' % quality
+                            f.update({
+                                'quality': quality_key(quality),
+                                'format_note': quality.upper(),
+                            })
+                            break
+                    f['format_id'] = f_id
+                    formats.append(f)
+
+        self._sort_formats(formats)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._search_regex(
+            (r'<value>(?P<title>[^<]+)',
+             r'videoTitle\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+            'title', group='value')
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._search_regex(
+            r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+            'thumbnail', fatal=False, group='value')
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration',
+            default=duration))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class NovaIE(InfoExtractor):
+    IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
+    _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+    _TESTS = [{
+        'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
+        'md5': '249baab7d0104e186e78b0899c7d5f28',
+        'info_dict': {
+            'id': '1757139',
+            'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
+            'ext': 'mp4',
+            'title': 'Podzemní nemocnice v pražské Krči',
+            'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
+            'thumbnail': r're:^https?://.*\.(?:jpg)',
+        }
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'info_dict': {
+            'id': '1753621',
+            'ext': 'mp4',
+            'title': 'Zaklínač 3: Divoký hon',
+            'description': 're:.*Pokud se stejně jako my nemůžete.*',
+            'thumbnail': r're:https?://.*\.jpg(\?.*)?',
+            'upload_date': '20150521',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'skip': 'gone',
+    }, {
+        # media.cms.nova.cz embed
+        'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil',
+        'info_dict': {
+            'id': '8o0n0r',
+            'ext': 'mp4',
+            'title': '2180. díl',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2578,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [NovaEmbedIE.ie_key()],
+        'skip': 'CHYBA 404: STRÁNKA NENALEZENA',
+    }, {
+        'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        site = mobj.group('site')
+
+        webpage = self._download_webpage(url, display_id)
+
+        description = clean_html(self._og_search_description(webpage, default=None))
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
+        # novaplus
+        embed_id = self._search_regex(
+            r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)',
+            webpage, 'embed url', default=None)
+        if embed_id:
+            return {
+                '_type': 'url_transparent',
+                'url': 'https://media.cms.nova.cz/embed/%s' % embed_id,
+                'ie_key': NovaEmbedIE.ie_key(),
+                'id': embed_id,
+                'description': description,
+                'upload_date': upload_date
+            }
+
+        video_id = self._search_regex(
+            [r"(?:media|video_id)\s*:\s*'(\d+)'",
+             r'media=(\d+)',
+             r'id="article_video_(\d+)"',
+             r'id="player_(\d+)"'],
+            webpage, 'video id')
+
+        config_url = self._search_regex(
+            r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+            webpage, 'config url', default=None)
+        config_params = {}
+
+        if not config_url:
+            player = self._parse_json(
+                self._search_regex(
+                    r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage,
+                    'player', default='{}'),
+                video_id, transform_source=js_to_json, fatal=False)
+            if player:
+                config_url = url_or_none(player.get('configUrl'))
+                params = player.get('configParams')
+                if isinstance(params, dict):
+                    config_params = params
+
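+        # Last resort: synthesise the videojs config request ourselves from a
+        # numeric per-site id.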
+        if not config_url:
+            DEFAULT_SITE_ID = '23000'
+            SITES = {
+                'tvnoviny': DEFAULT_SITE_ID,
+                'novaplus': DEFAULT_SITE_ID,
+                'vymena': DEFAULT_SITE_ID,
+                'krasna': DEFAULT_SITE_ID,
+                'fanda': '30',
+                'tn': '30',
+                'doma': '30',
+            }
+
+            site_id = self._search_regex(
+                r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(
+                site, DEFAULT_SITE_ID)
+
+            config_url = 'https://api.nova.cz/bin/player/videojs/config.php'
+            config_params = {
+                'site': site_id,
+                'media': video_id,
+                'quality': 3,
+                'version': 1,
+            }
+
+        config = self._download_json(
+            config_url, display_id,
+            'Downloading config JSON', query=config_params,
+            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        mediafile = config['mediafile']
+        video_url = mediafile['src']
+
+        m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url)
+        if m:
+            formats = [{
+                'url': m.group('url'),
+                'app': m.group('app'),
+                'play_path': m.group('playpath'),
+                'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf',
+                'ext': 'flv',
+            }]
+        else:
+            formats = [{
+                'url': video_url,
+            }]
+        self._sort_formats(formats)
+
+        title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
+        thumbnail = config.get('poster')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
new file mode 100644
index 0000000..c136bc8
--- /dev/null
+++ b/youtube_dl/extractor/nowness.py
@@ -0,0 +1,147 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import (
+    BrightcoveLegacyIE,
+    BrightcoveNewIE,
+)
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    sanitized_Request,
+)
+
+
+class NownessBaseIE(InfoExtractor):
+    def _extract_url_result(self, post):
+        if post['type'] == 'video':
+            for media in post['media']:
+                if media['type'] == 'video':
+                    video_id = media['content']
+                    source = media['source']
+                    if source == 'brightcove':
+                        player_code = self._download_webpage(
+                            'http://www.nowness.com/iframe?id=%s' % video_id, video_id,
+                            note='Downloading player JavaScript',
+                            errnote='Unable to download player JavaScript')
+                        bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)
+                        if bc_url:
+                            return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
+                        bc_url = BrightcoveNewIE._extract_url(self, player_code)
+                        if bc_url:
+                            return self.url_result(bc_url, BrightcoveNewIE.ie_key())
+                        raise ExtractorError('Could not find player definition')
+                    elif source == 'vimeo':
+                        return self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
+                    elif source == 'youtube':
+                        return self.url_result(video_id, 'Youtube')
+                    elif source == 'cinematique':
+                        # youtube-dlc currently doesn't support cinematique
+                        # return self.url_result('http://cinematique.com/embed/%s' % video_id, 'Cinematique')
+                        pass
+
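+    # The API localises its responses through the X-Nowness-Language header:
+    # zh-cn for the Chinese site, en-us otherwise.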
+    def _api_request(self, url, request_path):
+        display_id = self._match_id(url)
+        request = sanitized_Request(
+            'http://api.nowness.com/api/' + request_path % display_id,
+            headers={
+                'X-Nowness-Language': 'zh-cn' if 'cn.nowness.com' in url else 'en-us',
+            })
+        return display_id, self._download_json(request, display_id)
+
+
+class NownessIE(NownessBaseIE):
+    IE_NAME = 'nowness'
+    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/(?:story|(?:series|category)/[^/]+)/(?P<id>[^/]+?)(?:$|[?#])'
+    _TESTS = [{
+        'url': 'https://www.nowness.com/story/candor-the-art-of-gesticulation',
+        'md5': '068bc0202558c2e391924cb8cc470676',
+        'info_dict': {
+            'id': '2520295746001',
+            'ext': 'mp4',
+            'title': 'Candor: The Art of Gesticulation',
+            'description': 'Candor: The Art of Gesticulation',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1446745676,
+            'upload_date': '20151105',
+            'uploader_id': '2385340575001',
+        },
+        'add_ie': ['BrightcoveNew'],
+    }, {
+        'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr',
+        'md5': 'e79cf125e387216f86b2e0a5b5c63aa3',
+        'info_dict': {
+            'id': '3716354522001',
+            'ext': 'mp4',
+            'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
+            'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1407315371,
+            'upload_date': '20140806',
+            'uploader_id': '2385340575001',
+        },
+        'add_ie': ['BrightcoveNew'],
+    }, {
+        # vimeo
+        'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut',
+        'md5': '9a5a6a8edf806407e411296ab6bc2a49',
+        'info_dict': {
+            'id': '130020913',
+            'ext': 'mp4',
+            'title': 'Bleu, Blanc, Rouge - A Godard Supercut',
+            'description': 'md5:f0ea5f1857dffca02dbd37875d742cec',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'upload_date': '20150607',
+            'uploader': 'Cinema Sem Lei',
+            'uploader_id': 'cinemasemlei',
+        },
+        'add_ie': ['Vimeo'],
+    }]
+
+    def _real_extract(self, url):
+        _, post = self._api_request(url, 'post/getBySlug/%s')
+        return self._extract_url_result(post)
+
+
+class NownessPlaylistIE(NownessBaseIE):
+    IE_NAME = 'nowness:playlist'
+    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/playlist/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.nowness.com/playlist/3286/i-guess-thats-why-they-call-it-the-blues',
+        'info_dict': {
+            'id': '3286',
+        },
+        'playlist_mincount': 8,
+    }
+
+    def _real_extract(self, url):
+        playlist_id, playlist = self._api_request(url, 'post?PlaylistId=%s')
+        entries = [self._extract_url_result(item) for item in playlist['items']]
+        return self.playlist_result(entries, playlist_id)
+
+
+class NownessSeriesIE(NownessBaseIE):
+    IE_NAME = 'nowness:series'
+    _VALID_URL = r'https?://(?:(?:www|cn)\.)?nowness\.com/series/(?P<id>[^/]+?)(?:$|[?#])'
+    _TEST = {
+        'url': 'https://www.nowness.com/series/60-seconds',
+        'info_dict': {
+            'id': '60',
+            'title': '60 Seconds',
+            'description': 'One-minute wisdom in a new NOWNESS series',
+        },
+        'playlist_mincount': 4,
+    }
+
+    def _real_extract(self, url):
+        display_id, series = self._api_request(url, 'series/getBySlug/%s')
+        entries = [self._extract_url_result(post) for post in series['posts']]
+        series_title = None
+        series_description = None
+        translations = series.get('translations', [])
+        if translations:
+            series_title = translations[0].get('title') or translations[0]['seoTitle']
+            series_description = translations[0].get('seoDescription')
+        return self.playlist_result(
+            entries, compat_str(series['id']), series_title, series_description)
diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py
new file mode 100644
index 0000000..ccafd77
--- /dev/null
+++ b/youtube_dl/extractor/noz.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_xpath,
+)
+from ..utils import (
+    int_or_none,
+    find_xpath_attr,
+    xpath_text,
+    update_url_query,
+)
+
+
+class NozIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?noz\.de/video/(?P<id>[0-9]+)/'
+    _TESTS = [{
+        'url': 'http://www.noz.de/video/25151/32-Deutschland-gewinnt-Badminton-Lnderspiel-in-Melle',
+        'info_dict': {
+            'id': '25151',
+            'ext': 'mp4',
+            'duration': 215,
+            'title': '3:2 - Deutschland gewinnt Badminton-Länderspiel in Melle',
+            'description': 'Vor rund 370 Zuschauern gewinnt die deutsche Badminton-Nationalmannschaft am Donnerstag ein EM-Vorbereitungsspiel gegen Frankreich in Melle. Video Moritz Frankenberg.',
+            'thumbnail': r're:^http://.*\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        description = self._og_search_description(webpage)
+
+        edge_url = self._html_search_regex(
+            r'<script\s+(?:type="text/javascript"\s+)?src="(.*?/videojs_.*?)"',
+            webpage, 'edge URL')
+        edge_content = self._download_webpage(edge_url, 'meta configuration')
+
+        config_url_encoded = self._search_regex(
+            r'so\.addVariable\("config_url","[^,]*,(.*?)"',
+            edge_content, 'config URL'
+        )
+        config_url = compat_urllib_parse_unquote(config_url_encoded)
+
+        doc = self._download_xml(config_url, 'video configuration')
+        title = xpath_text(doc, './/title')
+        thumbnail = xpath_text(doc, './/article/thumbnail/url')
+        duration = int_or_none(xpath_text(
+            doc, './/article/movie/file/duration'))
+        formats = []
+        for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')):
+            http_url_ele = find_xpath_attr(
+                qnode, './html_urls/video_url', 'format', 'video/mp4')
+            http_url = http_url_ele.text if http_url_ele is not None else None
+            if http_url:
+                formats.append({
+                    'url': http_url,
+                    'format_name': xpath_text(qnode, './name'),
+                    'format_id': '%s-%s' % ('http', xpath_text(qnode, './id')),
+                    'height': int_or_none(xpath_text(qnode, './height')),
+                    'width': int_or_none(xpath_text(qnode, './width')),
+                    'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000),
+                })
+            else:
+                f4m_url = xpath_text(qnode, 'url_hd2')
+                if f4m_url:
+                    formats.extend(self._extract_f4m_formats(
+                        update_url_query(f4m_url, {'hdcore': '3.4.0'}),
+                        video_id, f4m_id='hds', fatal=False))
+                m3u8_url_ele = find_xpath_attr(
+                    qnode, './html_urls/video_url',
+                    'format', 'application/vnd.apple.mpegurl')
+                m3u8_url = m3u8_url_ele.text if m3u8_url_ele is not None else None
+                if m3u8_url:
+                    formats.extend(self._extract_m3u8_formats(
+                        m3u8_url, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'duration': duration,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
new file mode 100644
index 0000000..e525ad9
--- /dev/null
+++ b/youtube_dl/extractor/npo.py
@@ -0,0 +1,767 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    fix_xml_ampersands,
+    int_or_none,
+    merge_dicts,
+    orderedSet,
+    parse_duration,
+    qualities,
+    str_or_none,
+    strip_jsonp,
+    unified_strdate,
+    unified_timestamp,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class NPOBaseIE(InfoExtractor):
+    def _get_token(self, video_id):
+        return self._download_json(
+            'http://ida.omroep.nl/app.php/auth', video_id,
+            note='Downloading token')['token']
+
+
+class NPOIE(NPOBaseIE):
+    IE_NAME = 'npo'
+    IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        npo:|
+                        https?://
+                            (?:www\.)?
+                            (?:
+                                npo\.nl/(?:[^/]+/)*|
+                                (?:ntr|npostart)\.nl/(?:[^/]+/){2,}|
+                                omroepwnl\.nl/video/fragment/[^/]+__|
+                                (?:zapp|npo3)\.nl/(?:[^/]+/){2,}
+                            )
+                        )
+                        (?P<id>[^/?#]+)
+                '''
+
+    _TESTS = [{
+        'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719',
+        'md5': '4b3f9c429157ec4775f2c9cb7b911016',
+        'info_dict': {
+            'id': 'VPWON_1220719',
+            'ext': 'm4v',
+            'title': 'Nieuwsuur',
+            'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.',
+            'upload_date': '20140622',
+        },
+    }, {
+        'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800',
+        'md5': 'da50a5787dbfc1603c4ad80f31c5120b',
+        'info_dict': {
+            'id': 'VARA_101191800',
+            'ext': 'm4v',
+            'title': 'De Mega Mike & Mega Thomas show: The best of.',
+            'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4',
+            'upload_date': '20090227',
+            'duration': 2400,
+        },
+    }, {
+        'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289',
+        'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
+        'info_dict': {
+            'id': 'VPWON_1169289',
+            'ext': 'm4v',
+            'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika',
+            'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
+            'upload_date': '20130225',
+            'duration': 3000,
+        },
+    }, {
+        'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706',
+        'info_dict': {
+            'id': 'WO_VPRO_043706',
+            'ext': 'm4v',
+            'title': 'De nieuwe mens - Deel 1',
+            'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b',
+            'duration': 4680,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # non-ASF entry in streams
+        'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771',
+        'info_dict': {
+            'id': 'WO_NOS_762771',
+            'ext': 'mp4',
+            'title': 'Hoe gaat Europa verder na Parijs?',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content',
+        'info_dict': {
+            'id': 'VPWON_1233944',
+            'ext': 'm4v',
+            'title': 'Aap, poot, pies',
+            'description': 'md5:c9c8005d1869ae65b858e82c01a91fde',
+            'upload_date': '20150508',
+            'duration': 599,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698',
+        'info_dict': {
+            'id': 'POW_00996502',
+            'ext': 'm4v',
+            'title': '''"Dit is wel een 'landslide'..."''',
+            'description': 'md5:f8d66d537dfb641380226e31ca57b8e8',
+            'upload_date': '20150508',
+            'duration': 462,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # audio
+        'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437',
+        'info_dict': {
+            'id': 'RBX_FUNX_6683215',
+            'ext': 'mp3',
+            'title': 'Jouw Stad Rotterdam',
+            'description': 'md5:db251505244f097717ec59fabc372d9f',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
+        'only_matching': True,
+    }, {
+        # live stream
+        'url': 'npo:LI_NL1_4188102',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.npostart.nl/broodje-gezond-ei/28-05-2018/KN_1698996',
+        'only_matching': True,
+    }, {
+        'url': 'https://npo.nl/KN_1698996',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
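+        # npo.nl live and radio pages are handled by the more specific
+        # extractors defined below, so defer to them when they match.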
+        return (False if any(ie.suitable(url)
+                for ie in (NPOLiveIE, NPORadioIE, NPORadioFragmentIE))
+                else super(NPOIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._get_info(url, video_id) or self._get_old_info(video_id)
+
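+    # _get_info drives the newer npostart.nl player API (request token,
+    # player JSON, then per-profile stream probing); when it yields nothing,
+    # _real_extract falls back to _get_old_info and the legacy
+    # e.omroep.nl metadata API.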
+    def _get_info(self, url, video_id):
+        token = self._download_json(
+            'https://www.npostart.nl/api/token', video_id,
+            'Downloading token', headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+            })['token']
+
+        player = self._download_json(
+            'https://www.npostart.nl/player/%s' % video_id, video_id,
+            'Downloading player JSON', data=urlencode_postdata({
+                'autoplay': 0,
+                'share': 1,
+                'pageUrl': url,
+                'hasAdConsent': 0,
+                '_token': token,
+            }))
+
+        player_token = player['token']
+
+        drm = False
+        format_urls = set()
+        formats = []
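+        # Probe each delivery profile separately. Judging by the accessors
+        # below, the streams endpoint answers with roughly:
+        #   {"stream": {"src": "...", "type": "application/dash+xml",
+        #               "protection": null, "keySystemOptions": null}}
+        # (illustrative shape only); entries with protection or
+        # keySystemOptions set are DRM-protected and skipped.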
+        for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
+            streams = self._download_json(
+                'https://start-player.npo.nl/video/%s/streams' % video_id,
+                video_id, 'Downloading %s profile JSON' % profile, fatal=False,
+                query={
+                    'profile': profile,
+                    'quality': 'npo',
+                    'tokenId': player_token,
+                    'streamType': 'broadcast',
+                })
+            if not streams:
+                continue
+            stream = streams.get('stream')
+            if not isinstance(stream, dict):
+                continue
+            stream_url = url_or_none(stream.get('src'))
+            if not stream_url or stream_url in format_urls:
+                continue
+            format_urls.add(stream_url)
+            if stream.get('protection') is not None or stream.get('keySystemOptions') is not None:
+                drm = True
+                continue
+            stream_type = stream.get('type')
+            stream_ext = determine_ext(stream_url)
+            if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    stream_url, video_id, mpd_id='dash', fatal=False))
+            elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, video_id, ext='mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            elif re.search(r'\.isml?/Manifest', stream_url):
+                formats.extend(self._extract_ism_formats(
+                    stream_url, video_id, ism_id='mss', fatal=False))
+            else:
+                formats.append({
+                    'url': stream_url,
+                })
+
+        if not formats:
+            if drm:
+                raise ExtractorError('This video is DRM protected.', expected=True)
+            return
+
+        self._sort_formats(formats)
+
+        info = {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+        }
+
+        embed_url = url_or_none(player.get('embedUrl'))
+        if embed_url:
+            webpage = self._download_webpage(
+                embed_url, video_id, 'Downloading embed page', fatal=False)
+            if webpage:
+                video = self._parse_json(
+                    self._search_regex(
+                        r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
+                        default='{}'), video_id)
+                if video:
+                    title = video.get('episodeTitle')
+                    subtitles = {}
+                    subtitles_list = video.get('subtitles')
+                    if isinstance(subtitles_list, list):
+                        for cc in subtitles_list:
+                            cc_url = url_or_none(cc.get('src'))
+                            if not cc_url:
+                                continue
+                            lang = str_or_none(cc.get('language')) or 'nl'
+                            subtitles.setdefault(lang, []).append({
+                                'url': cc_url,
+                            })
+                    return merge_dicts({
+                        'title': title,
+                        'description': video.get('description'),
+                        'thumbnail': url_or_none(
+                            video.get('still_image_url') or video.get('orig_image_url')),
+                        'duration': int_or_none(video.get('duration')),
+                        'timestamp': unified_timestamp(video.get('broadcastDate')),
+                        'creator': video.get('channel'),
+                        'series': video.get('title'),
+                        'episode': title,
+                        'episode_number': int_or_none(video.get('episodeNumber')),
+                        'subtitles': subtitles,
+                    }, info)
+
+        return info
+
+    def _get_old_info(self, video_id):
+        metadata = self._download_json(
+            'http://e.omroep.nl/metadata/%s' % video_id,
+            video_id,
+            # We have to remove the JavaScript callback
+            transform_source=strip_jsonp,
+        )
+
+        error = metadata.get('error')
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        # For some videos actual video id (prid) is different (e.g. for
+        # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698
+        # video id is POMS_WNL_853698 but prid is POW_00996502)
+        video_id = metadata.get('prid') or video_id
+
+        # titel is too generic in some cases so utilize aflevering_titel as well
+        # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html)
+        title = metadata['titel']
+        sub_title = metadata.get('aflevering_titel')
+        if sub_title and sub_title != title:
+            title += ': %s' % sub_title
+
+        token = self._get_token(video_id)
+
+        formats = []
+        urls = set()
+
+        def is_legal_url(format_url):
+            return format_url and format_url not in urls and re.match(
+                r'^(?:https?:)?//', format_url)
+
+        QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog')
+        QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std')
+
+        quality_from_label = qualities(QUALITY_LABELS)
+        quality_from_format_id = qualities(QUALITY_FORMATS)
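+        # qualities() returns a ranking function: the value's index in the
+        # tuple (higher sorts better), or -1 for unknown values, e.g.
+        # quality_from_label('Hoog') -> 2.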
+        items = self._download_json(
+            'http://ida.omroep.nl/app.php/%s' % video_id, video_id,
+            'Downloading formats JSON', query={
+                'adaptive': 'yes',
+                'token': token,
+            })['items'][0]
+        for num, item in enumerate(items):
+            item_url = item.get('url')
+            if not is_legal_url(item_url):
+                continue
+            urls.add(item_url)
+            format_id = self._search_regex(
+                r'video/ida/([^/]+)', item_url, 'format id',
+                default=None)
+
+            item_label = item.get('label')
+
+            def add_format_url(format_url):
+                width = int_or_none(self._search_regex(
+                    r'(\d+)[xX]\d+', format_url, 'width', default=None))
+                height = int_or_none(self._search_regex(
+                    r'\d+[xX](\d+)', format_url, 'height', default=None))
+                if item_label in QUALITY_LABELS:
+                    quality = quality_from_label(item_label)
+                    f_id = item_label
+                elif item_label in QUALITY_FORMATS:
+                    quality = quality_from_format_id(format_id)
+                    f_id = format_id
+                else:
+                    quality, f_id = [None] * 2
+                formats.append({
+                    'url': format_url,
+                    'format_id': f_id,
+                    'width': width,
+                    'height': height,
+                    'quality': quality,
+                })
+
+            # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
+            if item.get('contentType') in ('url', 'audio'):
+                add_format_url(item_url)
+                continue
+
+            try:
+                stream_info = self._download_json(
+                    item_url + '&type=json', video_id,
+                    'Downloading %s stream JSON'
+                    % (item_label or item.get('format') or format_id or num))
+            except ExtractorError as ee:
+                if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+                    error = (self._parse_json(
+                        ee.cause.read().decode(), video_id,
+                        fatal=False) or {}).get('errorstring')
+                    if error:
+                        raise ExtractorError(error, expected=True)
+                raise
+            # Stream URL instead of JSON, example: npo:LI_NL1_4188102
+            if isinstance(stream_info, compat_str):
+                if not stream_info.startswith('http'):
+                    continue
+                video_url = stream_info
+            # JSON
+            else:
+                video_url = stream_info.get('url')
+            if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
+                continue
+            urls.add(video_url)
+            if determine_ext(video_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, ext='mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            else:
+                add_format_url(video_url)
+
+        is_live = metadata.get('medium') == 'live'
+
+        if not is_live:
+            for num, stream in enumerate(metadata.get('streams', [])):
+                stream_url = stream.get('url')
+                if not is_legal_url(stream_url):
+                    continue
+                urls.add(stream_url)
+                # smooth streaming is not supported
+                stream_type = stream.get('type', '').lower()
+                if stream_type in ['ss', 'ms']:
+                    continue
+                if stream_type == 'hds':
+                    f4m_formats = self._extract_f4m_formats(
+                        stream_url, video_id, fatal=False)
+                    # the f4m downloader downloads only a piece of the live stream
+                    for f4m_format in f4m_formats:
+                        f4m_format['preference'] = -1
+                    formats.extend(f4m_formats)
+                elif stream_type == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        stream_url, video_id, ext='mp4', fatal=False))
+                # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706
+                elif '.asf' in stream_url:
+                    asx = self._download_xml(
+                        stream_url, video_id,
+                        'Downloading stream %d ASX playlist' % num,
+                        transform_source=fix_xml_ampersands, fatal=False)
+                    if not asx:
+                        continue
+                    ref = asx.find('./ENTRY/Ref')
+                    if ref is None:
+                        continue
+                    video_url = ref.get('href')
+                    if not video_url or video_url in urls:
+                        continue
+                    urls.add(video_url)
+                    formats.append({
+                        'url': video_url,
+                        'ext': stream.get('formaat', 'asf'),
+                        'quality': stream.get('kwaliteit'),
+                        'preference': -10,
+                    })
+                else:
+                    formats.append({
+                        'url': stream_url,
+                        'quality': stream.get('kwaliteit'),
+                    })
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        if metadata.get('tt888') == 'ja':
+            subtitles['nl'] = [{
+                'ext': 'vtt',
+                'url': 'http://tt888.omroep.nl/tt888/%s' % video_id,
+            }]
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': metadata.get('info'),
+            'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],
+            'upload_date': unified_strdate(metadata.get('gidsdatum')),
+            'duration': parse_duration(metadata.get('tijdsduur')),
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': is_live,
+        }
+
+
+class NPOLiveIE(NPOBaseIE):
+    IE_NAME = 'npo.nl:live'
+    _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?'
+
+    _TESTS = [{
+        'url': 'http://www.npo.nl/live/npo-1',
+        'info_dict': {
+            'id': 'LI_NL1_4188102',
+            'display_id': 'npo-1',
+            'ext': 'mp4',
+            'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.npo.nl/live',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.npostart.nl/live/npo-1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url) or 'npo-1'
+
+        webpage = self._download_webpage(url, display_id)
+
+        live_id = self._search_regex(
+            [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id')
+
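+        # Hand off to NPOIE via the npo: scheme; url_transparent lets the
+        # display_id resolved here survive into the final result.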
+        return {
+            '_type': 'url_transparent',
+            'url': 'npo:%s' % live_id,
+            'ie_key': NPOIE.ie_key(),
+            'id': live_id,
+            'display_id': display_id,
+        }
+
+
+class NPORadioIE(InfoExtractor):
+    IE_NAME = 'npo.nl:radio'
+    _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'http://www.npo.nl/radio/radio-1',
+        'info_dict': {
+            'id': 'radio-1',
+            'ext': 'mp3',
+            'title': 're:^NPO Radio 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if NPORadioFragmentIE.suitable(url) else super(NPORadioIE, cls).suitable(url)
+
+    @staticmethod
+    def _html_get_attribute_regex(attribute):
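+        # Builds e.g. r"data-channel\s*=\s*'([^']+)'" to pull a single-quoted
+        # attribute value out of the page markup.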
+        return r'{0}\s*=\s*\'([^\']+)\''.format(attribute)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            self._html_get_attribute_regex('data-channel'), webpage, 'title')
+
+        stream = self._parse_json(
+            self._html_search_regex(self._html_get_attribute_regex('data-streams'), webpage, 'data-streams'),
+            video_id)
+
+        codec = stream.get('codec')
+
+        return {
+            'id': video_id,
+            'url': stream['url'],
+            'title': self._live_title(title),
+            'acodec': codec,
+            'ext': codec,
+            'is_live': True,
+        }
+
+
+class NPORadioFragmentIE(InfoExtractor):
+    IE_NAME = 'npo.nl:radio:fragment'
+    _VALID_URL = r'https?://(?:www\.)?npo\.nl/radio/[^/]+/fragment/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.npo.nl/radio/radio-5/fragment/174356',
+        'md5': 'dd8cc470dad764d0fdc70a9a1e2d18c2',
+        'info_dict': {
+            'id': '174356',
+            'ext': 'mp3',
+            'title': 'Jubileumconcert Willeke Alberti',
+        },
+    }
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, audio_id)
+
+        title = self._html_search_regex(
+            r'href="/radio/[^/]+/fragment/%s" title="([^"]+)"' % audio_id,
+            webpage, 'title')
+
+        audio_url = self._search_regex(
+            r"data-streams='([^']+)'", webpage, 'audio url')
+
+        return {
+            'id': audio_id,
+            'url': audio_url,
+            'title': title,
+        }
+
+
+class NPODataMidEmbedIE(InfoExtractor):
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
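+        # data-mid may use single or double quotes; the backreference \1
+        # requires the closing quote to match the opening one.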
+        video_id = self._search_regex(
+            r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video_id', group='id')
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'NPO',
+            'url': 'npo:%s' % video_id,
+            'display_id': display_id
+        }
+
+
+class SchoolTVIE(NPODataMidEmbedIE):
+    IE_NAME = 'schooltv'
+    _VALID_URL = r'https?://(?:www\.)?schooltv\.nl/video/(?P<id>[^/?#&]+)'
+
+    _TEST = {
+        'url': 'http://www.schooltv.nl/video/ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam/',
+        'info_dict': {
+            'id': 'WO_NTR_429477',
+            'display_id': 'ademhaling-de-hele-dag-haal-je-adem-maar-wat-gebeurt-er-dan-eigenlijk-in-je-lichaam',
+            'title': 'Ademhaling: De hele dag haal je adem. Maar wat gebeurt er dan eigenlijk in je lichaam?',
+            'ext': 'mp4',
+            'description': 'md5:abfa0ff690adb73fd0297fd033aaa631'
+        },
+        'params': {
+            # Skip because of m3u8 download
+            'skip_download': True
+        }
+    }
+
+
+class HetKlokhuisIE(NPODataMidEmbedIE):
+    IE_NAME = 'hetklokhuis'
+    _VALID_URL = r'https?://(?:www\.)?hetklokhuis\.nl/[^/]+/\d+/(?P<id>[^/?#&]+)'
+
+    _TEST = {
+        'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven',
+        'info_dict': {
+            'id': 'VPWON_1260528',
+            'display_id': 'Zwaartekrachtsgolven',
+            'ext': 'm4v',
+            'title': 'Het Klokhuis: Zwaartekrachtsgolven',
+            'description': 'md5:c94f31fb930d76c2efa4a4a71651dd48',
+            'upload_date': '20170223',
+        },
+        'params': {
+            'skip_download': True
+        }
+    }
+
+
+class NPOPlaylistBaseIE(NPOIE):
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id)
+            for video_id in orderedSet(re.findall(self._PLAYLIST_ENTRY_RE, webpage))
+        ]
+
+        playlist_title = self._html_search_regex(
+            self._PLAYLIST_TITLE_RE, webpage, 'playlist title',
+            default=None) or self._og_search_title(webpage)
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
+
+
+class VPROIE(NPOPlaylistBaseIE):
+    IE_NAME = 'vpro'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:tegenlicht\.)?vpro|2doc)\.nl/(?:[^/]+/)*(?P<id>[^/]+)\.html'
+    _PLAYLIST_TITLE_RE = (r'<h1[^>]+class=["\'].*?\bmedia-platform-title\b.*?["\'][^>]*>([^<]+)',
+                          r'<h5[^>]+class=["\'].*?\bmedia-platform-subtitle\b.*?["\'][^>]*>([^<]+)')
+    _PLAYLIST_ENTRY_RE = r'data-media-id="([^"]+)"'
+
+    _TESTS = [
+        {
+            'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html',
+            'md5': 'f8065e4e5a7824068ed3c7e783178f2c',
+            'info_dict': {
+                'id': 'VPWON_1169289',
+                'ext': 'm4v',
+                'title': 'De toekomst komt uit Afrika',
+                'description': 'md5:52cf4eefbc96fffcbdc06d024147abea',
+                'upload_date': '20130225',
+            },
+            'skip': 'Video gone',
+        },
+        {
+            'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html',
+            'info_dict': {
+                'id': 'sergio-herman',
+                'title': 'sergio herman: fucking perfect',
+            },
+            'playlist_count': 2,
+        },
+        {
+            # playlist with youtube embed
+            'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html',
+            'info_dict': {
+                'id': 'education-education',
+                'title': 'education education',
+            },
+            'playlist_count': 2,
+        },
+        {
+            'url': 'http://www.2doc.nl/documentaires/series/2doc/2015/oktober/de-tegenprestatie.html',
+            'info_dict': {
+                'id': 'de-tegenprestatie',
+                'title': 'De Tegenprestatie',
+            },
+            'playlist_count': 2,
+        }, {
+            'url': 'http://www.2doc.nl/speel~VARA_101375237~mh17-het-verdriet-van-nederland~.html',
+            'info_dict': {
+                'id': 'VARA_101375237',
+                'ext': 'm4v',
+                'title': 'MH17: Het verdriet van Nederland',
+                'description': 'md5:09e1a37c1fdb144621e22479691a9f18',
+                'upload_date': '20150716',
+            },
+            'params': {
+                # Skip because of m3u8 download
+                'skip_download': True
+            },
+        }
+    ]
+
+
+class WNLIE(NPOPlaylistBaseIE):
+    IE_NAME = 'wnl'
+    _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+'
+    _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>'
+    _PLAYLIST_ENTRY_RE = r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>Deel \d+'
+
+    _TESTS = [{
+        'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515',
+        'info_dict': {
+            'id': 'vandaag-de-dag-6-mei',
+            'title': 'Vandaag de Dag 6 mei',
+        },
+        'playlist_count': 4,
+    }]
+
+
+class AndereTijdenIE(NPOPlaylistBaseIE):
+    IE_NAME = 'anderetijden'
+    _VALID_URL = r'https?://(?:www\.)?anderetijden\.nl/programma/(?:[^/]+/)+(?P<id>[^/?#&]+)'
+    _PLAYLIST_TITLE_RE = r'(?s)<h1[^>]+class=["\'].*?\bpage-title\b.*?["\'][^>]*>(.+?)</h1>'
+    _PLAYLIST_ENTRY_RE = r'<figure[^>]+class=["\']episode-container episode-page["\'][^>]+data-prid=["\'](.+?)["\']'
+
+    _TESTS = [{
+        'url': 'http://anderetijden.nl/programma/1/Andere-Tijden/aflevering/676/Duitse-soldaten-over-de-Slag-bij-Arnhem',
+        'info_dict': {
+            'id': 'Duitse-soldaten-over-de-Slag-bij-Arnhem',
+            'title': 'Duitse soldaten over de Slag bij Arnhem',
+        },
+        'playlist_count': 3,
+    }]
diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py
new file mode 100644 (file)
index 0000000..53acc6e
--- /dev/null
@@ -0,0 +1,124 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    qualities,
+    url_or_none,
+)
+
+
+class NprIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?npr\.org/(?:sections/[^/]+/)?\d{4}/\d{2}/\d{2}/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.npr.org/sections/allsongs/2015/10/21/449974205/new-music-from-beach-house-chairlift-cmj-discoveries-and-more',
+        'info_dict': {
+            'id': '449974205',
+            'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More'
+        },
+        'playlist_count': 7,
+    }, {
+        'url': 'https://www.npr.org/sections/deceptivecadence/2015/10/09/446928052/music-from-the-shadows-ancient-armenian-hymns-and-piano-jazz',
+        'info_dict': {
+            'id': '446928052',
+            'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'"
+        },
+        'playlist': [{
+            'md5': '12fa60cb2d3ed932f53609d4aeceabf1',
+            'info_dict': {
+                'id': '446929930',
+                'ext': 'mp3',
+                'title': 'Your Mercy is Boundless (Bazum en Qo gtutyunqd)',
+                'duration': 402,
+            },
+        }],
+    }, {
+        # multimedia, not media title
+        'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert',
+        'info_dict': {
+            'id': '533198237',
+            'title': 'Tigers Jaw: Tiny Desk Concert',
+        },
+        'playlist': [{
+            'md5': '12fa60cb2d3ed932f53609d4aeceabf1',
+            'info_dict': {
+                'id': '533201718',
+                'ext': 'mp4',
+                'title': 'Tigers Jaw: Tiny Desk Concert',
+                'duration': 402,
+            },
+        }],
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        # multimedia, no formats, stream
+        'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        story = self._download_json(
+            'http://api.npr.org/query', playlist_id, query={
+                'id': playlist_id,
+                'fields': 'audio,multimedia,title',
+                'format': 'json',
+                'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010',
+            })['list']['story'][0]
+        playlist_title = story.get('title', {}).get('$text')
+
+        KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3')
+        quality = qualities(KNOWN_FORMATS)
+
+        entries = []
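+        # A story may carry several audio/multimedia items; build one playlist
+        # entry per item, collecting both its enumerated formats and any
+        # hlsUrl stream it exposes.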
+        for media in story.get('audio', []) + story.get('multimedia', []):
+            media_id = media['id']
+
+            formats = []
+            for format_id, formats_entry in media.get('format', {}).items():
+                if not formats_entry:
+                    continue
+                if isinstance(formats_entry, list):
+                    formats_entry = formats_entry[0]
+                format_url = formats_entry.get('$text')
+                if not format_url:
+                    continue
+                if format_id in KNOWN_FORMATS:
+                    if format_id == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            format_url, media_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    elif format_id == 'smil':
+                        smil_formats = self._extract_smil_formats(
+                            format_url, media_id, transform_source=lambda s: s.replace(
+                                'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/'))
+                        self._check_formats(smil_formats, media_id)
+                        formats.extend(smil_formats)
+                    else:
+                        formats.append({
+                            'url': format_url,
+                            'format_id': format_id,
+                            'quality': quality(format_id),
+                        })
+            for stream_id, stream_entry in media.get('stream', {}).items():
+                if not isinstance(stream_entry, dict):
+                    continue
+                if stream_id != 'hlsUrl':
+                    continue
+                stream_url = url_or_none(stream_entry.get('$text'))
+                if not stream_url:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, stream_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': media_id,
+                'title': media.get('title', {}).get('$text') or playlist_title,
+                'thumbnail': media.get('altImageUrl', {}).get('$text'),
+                'duration': int_or_none(media.get('duration', {}).get('$text')),
+                'formats': formats,
+            })
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
new file mode 100644 (file)
index 0000000..9411553
--- /dev/null
@@ -0,0 +1,717 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    JSON_LD_RE,
+    js_to_json,
+    NO_DEFAULT,
+    parse_age_limit,
+    parse_duration,
+    try_get,
+)
+
+
+class NRKBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['NO']
+
+    _api_host = None
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
+
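+        # Try each PS API host in turn; only the last attempt is fatal, and
+        # the first host that answers is remembered for later calls.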
+        for api_host in api_hosts:
+            data = self._download_json(
+                'http://%s/mediaelement/%s' % (api_host, video_id),
+                video_id, 'Downloading mediaelement JSON',
+                fatal=api_host == api_hosts[-1])
+            if not data:
+                continue
+            self._api_host = api_host
+            break
+
+        title = data.get('fullTitle') or data.get('mainTitle') or data['title']
+        video_id = data.get('id') or video_id
+
+        entries = []
+
+        conviva = data.get('convivaStatistics') or {}
+        live = (data.get('mediaElementType') == 'Live'
+                or data.get('isLive') is True or conviva.get('isLive'))
+
+        def make_title(t):
+            return self._live_title(t) if live else t
+
+        media_assets = data.get('mediaAssets')
+        if media_assets and isinstance(media_assets, list):
+            def video_id_and_title(idx):
+                return ((video_id, title) if len(media_assets) == 1
+                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
+            for num, asset in enumerate(media_assets, 1):
+                asset_url = asset.get('url')
+                if not asset_url:
+                    continue
+                formats = self._extract_akamai_formats(asset_url, video_id)
+                if not formats:
+                    continue
+                self._sort_formats(formats)
+
+                # Some f4m streams may not work with hdcore in fragments' URLs
+                for f in formats:
+                    extra_param = f.get('extra_param_to_segment_url')
+                    if extra_param and 'hdcore' in extra_param:
+                        del f['extra_param_to_segment_url']
+
+                entry_id, entry_title = video_id_and_title(num)
+                duration = parse_duration(asset.get('duration'))
+                subtitles = {}
+                for subtitle in ('webVtt', 'timedText'):
+                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
+                    if subtitle_url:
+                        subtitles.setdefault('no', []).append({
+                            'url': compat_urllib_parse_unquote(subtitle_url)
+                        })
+                entries.append({
+                    'id': asset.get('carrierId') or entry_id,
+                    'title': make_title(entry_title),
+                    'duration': duration,
+                    'subtitles': subtitles,
+                    'formats': formats,
+                })
+
+        if not entries:
+            media_url = data.get('mediaUrl')
+            if media_url:
+                formats = self._extract_akamai_formats(media_url, video_id)
+                self._sort_formats(formats)
+                duration = parse_duration(data.get('duration'))
+                entries = [{
+                    'id': video_id,
+                    'title': make_title(title),
+                    'duration': duration,
+                    'formats': formats,
+                }]
+
+        if not entries:
+            MESSAGES = {
+                'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
+                'ProgramRightsHasExpired': 'Programmet har gått ut',
+                'NoProgramRights': 'Ikke tilgjengelig',
+                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
+            }
+            message_type = data.get('messageType', '')
+            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
+            if 'IsGeoBlocked' in message_type:
+                self.raise_geo_restricted(
+                    msg=MESSAGES.get('ProgramIsGeoBlocked'),
+                    countries=self._GEO_COUNTRIES)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, MESSAGES.get(
+                    message_type, message_type)),
+                expected=True)
+
+        series = conviva.get('seriesName') or data.get('seriesTitle')
+        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
+
+        season_number = None
+        episode_number = None
+        if data.get('mediaElementType') == 'Episode':
+            _season_episode = (data.get('scoresStatistics', {}).get('springStreamStream')
+                               or data.get('relativeOriginUrl', ''))
+            EPISODENUM_RE = [
+                r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.',
+                r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})',
+            ]
+            season_number = int_or_none(self._search_regex(
+                EPISODENUM_RE, _season_episode, 'season number',
+                default=None, group='season'))
+            episode_number = int_or_none(self._search_regex(
+                EPISODENUM_RE, _season_episode, 'episode number',
+                default=None, group='episode'))
+
+        thumbnails = None
+        images = data.get('images')
+        if images and isinstance(images, dict):
+            web_images = images.get('webImages')
+            if isinstance(web_images, list):
+                thumbnails = [{
+                    'url': image['imageUrl'],
+                    'width': int_or_none(image.get('width')),
+                    'height': int_or_none(image.get('height')),
+                } for image in web_images if image.get('imageUrl')]
+
+        description = data.get('description')
+        category = data.get('mediaAnalytics', {}).get('category')
+
+        common_info = {
+            'description': description,
+            'series': series,
+            'episode': episode,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'categories': [category] if category else None,
+            'age_limit': parse_age_limit(data.get('legalAge')),
+            'thumbnails': thumbnails,
+        }
+
+        vcodec = 'none' if data.get('mediaType') == 'Audio' else None
+
+        for entry in entries:
+            entry.update(common_info)
+            for f in entry['formats']:
+                f['vcodec'] = vcodec
+
+        points = data.get('shortIndexPoints')
+        if isinstance(points, list):
+            chapters = []
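+            # Each index point opens a chapter that ends where the next one
+            # starts; the final chapter ends at the programme's total duration.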
+            for next_num, point in enumerate(points, start=1):
+                if not isinstance(point, dict):
+                    continue
+                start_time = parse_duration(point.get('startPoint'))
+                if start_time is None:
+                    continue
+                end_time = parse_duration(
+                    data.get('duration')
+                    if next_num == len(points)
+                    else points[next_num].get('startPoint'))
+                if end_time is None:
+                    continue
+                chapters.append({
+                    'start_time': start_time,
+                    'end_time': end_time,
+                    'title': point.get('title'),
+                })
+            if chapters and len(entries) == 1:
+                entries[0]['chapters'] = chapters
+
+        return self.playlist_result(entries, video_id, title, description)
+
+
+class NRKIE(NRKBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            nrk:|
+                            https?://
+                                (?:
+                                    (?:www\.)?nrk\.no/video/PS\*|
+                                    v8[-.]psapi\.nrk\.no/mediaelement/
+                                )
+                            )
+                            (?P<id>[^?#&]+)
+                        '''
+    _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no')
+    _TESTS = [{
+        # video
+        'url': 'http://www.nrk.no/video/PS*150533',
+        'md5': '706f34cdf1322577589e369e522b50ef',
+        'info_dict': {
+            'id': '150533',
+            'ext': 'mp4',
+            'title': 'Dompap og andre fugler i Piip-Show',
+            'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+            'duration': 262,
+        }
+    }, {
+        # audio
+        'url': 'http://www.nrk.no/video/PS*154915',
+        # MD5 is unstable
+        'info_dict': {
+            'id': '154915',
+            'ext': 'flv',
+            'title': 'Slik høres internett ut når du er blind',
+            'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+            'duration': 20,
+        }
+    }, {
+        'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9',
+        'only_matching': True,
+    }, {
+        'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70',
+        'only_matching': True,
+    }, {
+        'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9',
+        'only_matching': True,
+    }]
+
+
+class NRKTVIE(NRKBaseIE):
+    IE_DESC = 'NRK TV and NRK Radio'
+    _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:tv|radio)\.nrk(?:super)?\.no/
+                            (?:serie(?:/[^/]+){1,2}|program)/
+                            (?![Ee]pisodes)%s
+                            (?:/\d{2}-\d{2}-\d{4})?
+                            (?:\#del=(?P<part_id>\d+))?
+                    ''' % _EPISODE_RE
+    _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/program/MDDP12000117',
+        'md5': '8270824df46ec629b66aeaa5796b36fb',
+        'info_dict': {
+            'id': 'MDDP12000117AA',
+            'ext': 'mp4',
+            'title': 'Alarm Trolltunga',
+            'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce',
+            'duration': 2223,
+            'age_limit': 6,
+        },
+    }, {
+        'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+        'md5': '9a167e54d04671eb6317a37b7bc8a280',
+        'info_dict': {
+            'id': 'MUHH48000314AA',
+            'ext': 'mp4',
+            'title': '20 spørsmål 23.05.2014',
+            'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+            'duration': 1741,
+            'series': '20 spørsmål',
+            'episode': '23.05.2014',
+        },
+        'skip': 'NoProgramRights',
+    }, {
+        'url': 'https://tv.nrk.no/program/mdfp15000514',
+        'info_dict': {
+            'id': 'MDFP15000514CA',
+            'ext': 'mp4',
+            'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014',
+            'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db',
+            'duration': 4605,
+            'series': 'Kunnskapskanalen',
+            'episode': '24.05.2014',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # single playlist video
+        'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+        'info_dict': {
+            'id': 'MSPO40010515-part2',
+            'ext': 'flv',
+            'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+            'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Video is geo restricted'],
+        'skip': 'particular part is not supported currently',
+    }, {
+        'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+        'playlist': [{
+            'info_dict': {
+                'id': 'MSPO40010515AH',
+                'ext': 'mp4',
+                'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)',
+                'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
+                'duration': 772,
+                'series': 'Tour de Ski',
+                'episode': '06.01.2015',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        }, {
+            'info_dict': {
+                'id': 'MSPO40010515BH',
+                'ext': 'mp4',
+                'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)',
+                'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
+                'duration': 6175,
+                'series': 'Tour de Ski',
+                'episode': '06.01.2015',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        }],
+        'info_dict': {
+            'id': 'MSPO40010515',
+            'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
+            'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
+        },
+        'expected_warnings': ['Video is geo restricted'],
+    }, {
+        'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13',
+        'info_dict': {
+            'id': 'KMTE50001317AA',
+            'ext': 'mp4',
+            'title': 'Anno 13:30',
+            'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa',
+            'duration': 2340,
+            'series': 'Anno',
+            'episode': '13:30',
+            'season_number': 3,
+            'episode_number': 13,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017',
+        'info_dict': {
+            'id': 'MUHH46000317AA',
+            'ext': 'mp4',
+            'title': 'Nytt på Nytt 27.01.2017',
+            'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b',
+            'duration': 1796,
+            'series': 'Nytt på nytt',
+            'episode': '27.01.2017',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller',
+        'only_matching': True,
+    }]
+
+
+class NRKTVEpisodeIE(InfoExtractor):
+    _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2',
+        'info_dict': {
+            'id': 'MUHH36005220BA',
+            'ext': 'mp4',
+            'title': 'Kro, krig og kjærlighet 2:6',
+            'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350',
+            'duration': 1563,
+            'series': 'Hellums kro',
+            'season_number': 1,
+            'episode_number': 2,
+            'episode': '2:6',
+            'age_limit': 6,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
+        'info_dict': {
+            'id': 'MSUI14000816AA',
+            'ext': 'mp4',
+            'title': 'Backstage 8:30',
+            'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
+            'duration': 1320,
+            'series': 'Backstage',
+            'season_number': 1,
+            'episode_number': 8,
+            'episode': '8:30',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'ProgramRightsHasExpired',
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        nrk_id = self._parse_json(
+            self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'),
+            display_id)['@id']
+
+        assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
+        return self.url_result(
+            'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)
+
+
+class NRKTVSerieBaseIE(InfoExtractor):
+    def _extract_series(self, webpage, display_id, fatal=True):
+        config = self._parse_json(
+            self._search_regex(
+                (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;',
+                 r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'),
+                webpage, 'config', default='{}' if not fatal else NO_DEFAULT),
+            display_id, fatal=False, transform_source=js_to_json)
+        if not config:
+            return
+        return try_get(
+            config,
+            (lambda x: x['initialState']['series'], lambda x: x['series']),
+            dict)
+
+    def _extract_seasons(self, seasons):
+        if not isinstance(seasons, list):
+            return []
+        entries = []
+        for season in seasons:
+            entries.extend(self._extract_episodes(season))
+        return entries
+
+    def _extract_episodes(self, season):
+        if not isinstance(season, dict):
+            return []
+        return self._extract_entries(season.get('episodes'))
+
+    def _extract_entries(self, entry_list):
+        if not isinstance(entry_list, list):
+            return []
+        entries = []
+        for episode in entry_list:
+            nrk_id = episode.get('prfId')
+            if not nrk_id or not isinstance(nrk_id, compat_str):
+                continue
+            entries.append(self.url_result(
+                'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
+        return entries
+
+
+class NRKTVSeasonIE(NRKTVSerieBaseIE):
+    _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
+        'info_dict': {
+            'id': '1',
+            'title': 'Sesong 1',
+        },
+        'playlist_mincount': 30,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
+                else super(NRKTVSeasonIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        series = self._extract_series(webpage, display_id)
+
+        season = next(
+            s for s in series['seasons']
+            if int(display_id) == s.get('seasonNumber'))
+
+        title = try_get(season, lambda x: x['titles']['title'], compat_str)
+        return self.playlist_result(
+            self._extract_episodes(season), display_id, title)
+
+
+class NRKTVSeriesIE(NRKTVSerieBaseIE):
+    _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
+    _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/serie/blank',
+        'info_dict': {
+            'id': 'blank',
+            'title': 'Blank',
+            'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e',
+        },
+        'playlist_mincount': 30,
+    }, {
+        # new layout, seasons
+        'url': 'https://tv.nrk.no/serie/backstage',
+        'info_dict': {
+            'id': 'backstage',
+            'title': 'Backstage',
+            'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
+        },
+        'playlist_mincount': 60,
+    }, {
+        # new layout, instalments
+        'url': 'https://tv.nrk.no/serie/groenn-glede',
+        'info_dict': {
+            'id': 'groenn-glede',
+            'title': 'Grønn glede',
+            'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
+        },
+        'playlist_mincount': 10,
+    }, {
+        # old layout
+        'url': 'https://tv.nrksuper.no/serie/labyrint',
+        'info_dict': {
+            'id': 'labyrint',
+            'title': 'Labyrint',
+            'description': 'md5:318b597330fdac5959247c9b69fdb1ec',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/saving-the-human-race',
+        'only_matching': True,
+    }, {
+        'url': 'https://tv.nrk.no/serie/postmann-pat',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (
+            False if any(ie.suitable(url)
+                         for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
+            else super(NRKTVSeriesIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, series_id)
+
+        # New layout (e.g. https://tv.nrk.no/serie/backstage)
+        series = self._extract_series(webpage, series_id, fatal=False)
+        if series:
+            title = try_get(series, lambda x: x['titles']['title'], compat_str)
+            description = try_get(
+                series, lambda x: x['titles']['subtitle'], compat_str)
+            entries = []
+            entries.extend(self._extract_seasons(series.get('seasons')))
+            entries.extend(self._extract_entries(series.get('instalments')))
+            entries.extend(self._extract_episodes(series.get('extraMaterial')))
+            return self.playlist_result(entries, series_id, title, description)
+
+        # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint)
+        entries = [
+            self.url_result(
+                'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
+                    series=series_id, season=season_id))
+            for season_id in re.findall(self._ITEM_RE, webpage)
+        ]
+
+        title = self._html_search_meta(
+            'seriestitle', webpage,
+            'title', default=None) or self._og_search_title(
+            webpage, fatal=False)
+        if title:
+            title = self._search_regex(
+                r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title)
+
+        description = self._html_search_meta(
+            'series_description', webpage,
+            'description', default=None) or self._og_search_description(webpage)
+
+        return self.playlist_result(entries, series_id, title, description)
+
+
+class NRKTVDirekteIE(NRKTVIE):
+    IE_DESC = 'NRK TV Direkte and NRK Radio Direkte'
+    _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/direkte/nrk1',
+        'only_matching': True,
+    }, {
+        'url': 'https://radio.nrk.no/direkte/p1_oslo_akershus',
+        'only_matching': True,
+    }]
+
+
+class NRKPlaylistBaseIE(InfoExtractor):
+    def _extract_description(self, webpage):
+        pass
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result('nrk:%s' % video_id, NRKIE.ie_key())
+            for video_id in re.findall(self._ITEM_RE, webpage)
+        ]
+
+        playlist_title = self._extract_title(webpage)
+        playlist_description = self._extract_description(webpage)
+
+        return self.playlist_result(
+            entries, playlist_id, playlist_title, playlist_description)
+
+
+class NRKPlaylistIE(NRKPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
+    _ITEM_RE = r'class="[^"]*\brich\b[^"]*"[^>]+data-video-id="([^"]+)"'
+    _TESTS = [{
+        'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
+        'info_dict': {
+            'id': 'gjenopplev-den-historiske-solformorkelsen-1.12270763',
+            'title': 'Gjenopplev den historiske solformørkelsen',
+            'description': 'md5:c2df8ea3bac5654a26fc2834a542feed',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.nrk.no/kultur/bok/rivertonprisen-til-karin-fossum-1.12266449',
+        'info_dict': {
+            'id': 'rivertonprisen-til-karin-fossum-1.12266449',
+            'title': 'Rivertonprisen til Karin Fossum',
+            'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
+        },
+        'playlist_count': 2,
+    }]
+
+    def _extract_title(self, webpage):
+        return self._og_search_title(webpage, fatal=False)
+
+    def _extract_description(self, webpage):
+        return self._og_search_description(webpage)
+
+
+class NRKTVEpisodesIE(NRKPlaylistBaseIE):
+    _VALID_URL = r'https?://tv\.nrk\.no/program/[Ee]pisodes/[^/]+/(?P<id>\d+)'
+    _ITEM_RE = r'data-episode=["\']%s' % NRKTVIE._EPISODE_RE
+    _TESTS = [{
+        'url': 'https://tv.nrk.no/program/episodes/nytt-paa-nytt/69031',
+        'info_dict': {
+            'id': '69031',
+            'title': 'Nytt på nytt, sesong: 201210',
+        },
+        'playlist_count': 4,
+    }]
+
+    def _extract_title(self, webpage):
+        return self._html_search_regex(
+            r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
+
+
+class NRKSkoleIE(InfoExtractor):
+    IE_DESC = 'NRK Skole'
+    _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099',
+        'md5': '18c12c3d071953c3bf8d54ef6b2587b7',
+        'info_dict': {
+            'id': '6021',
+            'ext': 'mp4',
+            'title': 'Genetikk og eneggede tvillinger',
+            'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
+            'duration': 399,
+        },
+    }, {
+        'url': 'https://www.nrk.no/skole/?page=objectives&subject=naturfag&objective=K15114&mediaId=19355',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id,
+            video_id)
+
+        nrk_id = self._parse_json(
+            self._search_regex(
+                r'<script[^>]+type=["\']application/json["\'][^>]*>({.+?})</script>',
+                webpage, 'application json'),
+            video_id)['activeMedia']['psId']
+
+        return self.url_result('nrk:%s' % nrk_id)
diff --git a/youtube_dl/extractor/nrl.py b/youtube_dl/extractor/nrl.py
new file mode 100644 (file)
index 0000000..22a2df8
--- /dev/null
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class NRLTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nrl\.com/tv(?:/[^/]+)*/(?P<id>[^/?&#]+)'
+    _TEST = {
+        'url': 'https://www.nrl.com/tv/news/match-highlights-titans-v-knights-862805/',
+        'info_dict': {
+            'id': 'YyNnFuaDE6kPJqlDhG4CGQ_w89mKTau4',
+            'ext': 'mp4',
+            'title': 'Match Highlights: Titans v Knights',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+            'format': 'bestvideo',
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        q_data = self._parse_json(self._html_search_regex(
+            r'(?s)q-data="({.+?})"', webpage, 'player data'), display_id)
+        ooyala_id = q_data['videoId']
+        return self.url_result(
+            'ooyala:' + ooyala_id, 'Ooyala', ooyala_id, q_data.get('title'))
diff --git a/youtube_dl/extractor/ntvcojp.py b/youtube_dl/extractor/ntvcojp.py
new file mode 100644 (file)
index 0000000..0c8221b
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    smuggle_url,
+)
+
+
+class NTVCoJpCUIE(InfoExtractor):
+    IE_NAME = 'cu.ntv.co.jp'
+    IE_DESC = 'Nippon Television Network'
+    _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program)(?P<id>[^/?&#]+)'
+    _TEST = {
+        'url': 'https://cu.ntv.co.jp/televiva-chill-gohan_181031/',
+        'info_dict': {
+            'id': '5978891207001',
+            'ext': 'mp4',
+            'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸',
+            'upload_date': '20181213',
+            'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9',
+            'uploader_id': '3855502814001',
+            'timestamp': 1544669941,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        player_config = self._parse_json(self._search_regex(
+            r'(?s)PLAYER_CONFIG\s*=\s*({.+?})',
+            webpage, 'player config'), display_id, js_to_json)
+        video_id = player_config['videoId']
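+        # Fall back to what appears to be NTV's default Brightcove account id
+        # (it matches the uploader_id in the test above)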
+        account_id = player_config.get('account') or '3855502814001'
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'display_id': display_id,
+            'title': self._search_regex(r'<h1[^>]+class="title"[^>]*>([^<]+)', webpage, 'title').strip(),
+            'description': self._html_search_meta(['description', 'og:description'], webpage),
+            'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}),
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py
new file mode 100644 (file)
index 0000000..101a537
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    parse_duration,
+)
+
+
+class NTVDeIE(InfoExtractor):
+    IE_NAME = 'n-tv.de'
+    _VALID_URL = r'https?://(?:www\.)?n-tv\.de/mediathek/videos/[^/?#]+/[^/?#]+-article(?P<id>.+)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.n-tv.de/mediathek/videos/panorama/Schnee-und-Glaette-fuehren-zu-zahlreichen-Unfaellen-und-Staus-article14438086.html',
+        'md5': '6ef2514d4b1e8e03ca24b49e2f167153',
+        'info_dict': {
+            'id': '14438086',
+            'ext': 'mp4',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'title': 'Schnee und Glätte führen zu zahlreichen Unfällen und Staus',
+            'alt_title': 'Winterchaos auf deutschen Straßen',
+            'description': 'Schnee und Glätte sorgen deutschlandweit für einen chaotischen Start in die Woche: Auf den Straßen kommt es zu kilometerlangen Staus und Dutzenden Glätteunfällen. In Düsseldorf und München wirbelt der Schnee zudem den Flugplan durcheinander. Dutzende Flüge landen zu spät, einige fallen ganz aus.',
+            'duration': 4020,
+            'timestamp': 1422892797,
+            'upload_date': '20150202',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        info = self._parse_json(self._search_regex(
+            r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'),
+            video_id, transform_source=js_to_json)
+        timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp'))
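+        # The advertising block is stripped first, as it presumably contains
+        # JS constructs that js_to_json cannot handle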
+        vdata = self._parse_json(self._search_regex(
+            r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);',
+            webpage, 'player data'), video_id,
+            transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s)))
+        duration = parse_duration(vdata.get('duration'))
+
+        formats = []
+        if vdata.get('video'):
+            formats.append({
+                'format_id': 'flash',
+                'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'],
+            })
+        if vdata.get('videoMp4'):
+            formats.append({
+                'format_id': 'mobile',
+                'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']),
+                'tbr': 400,  # estimation
+            })
+        if vdata.get('videoM3u8'):
+            m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8'])
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+                preference=0, m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': info['headline'],
+            'description': info.get('intro'),
+            'alt_title': info.get('kicker'),
+            'timestamp': timestamp,
+            'thumbnail': vdata.get('html5VideoPoster'),
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py
new file mode 100644 (file)
index 0000000..c47d1df
--- /dev/null
@@ -0,0 +1,131 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    strip_or_none,
+    unescapeHTML,
+    xpath_text,
+)
+
+
+class NTVRuIE(InfoExtractor):
+    IE_NAME = 'ntv.ru'
+    _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.ntv.ru/novosti/863142/',
+        'md5': 'ba7ea172a91cb83eb734cad18c10e723',
+        'info_dict': {
+            'id': '746000',
+            'ext': 'mp4',
+            'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+            'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+            'thumbnail': r're:^http://.*\.jpg',
+            'duration': 136,
+        },
+    }, {
+        'url': 'http://www.ntv.ru/video/novosti/750370/',
+        'md5': 'adecff79691b4d71e25220a191477124',
+        'info_dict': {
+            'id': '750370',
+            'ext': 'mp4',
+            'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+            'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+            'thumbnail': r're:^http://.*\.jpg',
+            'duration': 172,
+        },
+    }, {
+        'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
+        'md5': '82dbd49b38e3af1d00df16acbeab260c',
+        'info_dict': {
+            'id': '747480',
+            'ext': 'mp4',
+            'title': '«Сегодня». 21 марта 2014 года. 16:00',
+            'description': '«Сегодня». 21 марта 2014 года. 16:00',
+            'thumbnail': r're:^http://.*\.jpg',
+            'duration': 1496,
+        },
+    }, {
+        'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/',
+        'md5': 'e9c7cde24d9d3eaed545911a04e6d4f4',
+        'info_dict': {
+            'id': '1126480',
+            'ext': 'mp4',
+            'title': 'Остросюжетный фильм «Кома»',
+            'description': 'Остросюжетный фильм «Кома»',
+            'thumbnail': r're:^http://.*\.jpg',
+            'duration': 5592,
+        },
+    }, {
+        'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
+        'md5': '9320cd0e23f3ea59c330dc744e06ff3b',
+        'info_dict': {
+            'id': '751482',
+            'ext': 'mp4',
+            'title': '«Дело врачей»: «Деревце жизни»',
+            'description': '«Дело врачей»: «Деревце жизни»',
+            'thumbnail': r're:^http://.*\.jpg',
+            'duration': 2590,
+        },
+    }, {
+        # Schemeless file URL
+        'url': 'https://www.ntv.ru/video/1797442',
+        'only_matching': True,
+    }]
+
+    _VIDEO_ID_REGEXES = [
+        r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
+        r'<video embed=[^>]+><id>(\d+)</id>',
+        r'<video restriction[^>]+><key>(\d+)</key>',
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._og_search_property(
+            ('video', 'video:iframe'), webpage, default=None)
+        if video_url:
+            video_id = self._search_regex(
+                r'https?://(?:www\.)?ntv\.ru/video/(?:embed/)?(\d+)',
+                video_url, 'video id', default=None)
+
+        if not video_id:
+            video_id = self._html_search_regex(
+                self._VIDEO_ID_REGEXES, webpage, 'video id')
+
+        player = self._download_xml(
+            'http://www.ntv.ru/vi%s/' % video_id,
+            video_id, 'Downloading video XML')
+
+        title = strip_or_none(unescapeHTML(xpath_text(player, './data/title', 'title', fatal=True)))
+
+        video = player.find('./data/video')
+
+        formats = []
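+        # The player XML exposes <file>, <hifile> and <webmfile> elements,
+        # so '', 'hi' and 'webm' below are element-name prefixes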
+        for format_id in ['', 'hi', 'webm']:
+            file_ = xpath_text(video, './%sfile' % format_id)
+            if not file_:
+                continue
+            if file_.startswith('//'):
+                file_ = self._proto_relative_url(file_)
+            elif not file_.startswith('http'):
+                file_ = 'http://media.ntv.ru/vod/' + file_
+            formats.append({
+                'url': file_,
+                'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': xpath_text(video, './id'),
+            'title': title,
+            'description': strip_or_none(unescapeHTML(xpath_text(player, './data/description'))),
+            'thumbnail': xpath_text(video, './splash'),
+            'duration': int_or_none(xpath_text(video, './totaltime')),
+            'view_count': int_or_none(xpath_text(video, './views')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py
new file mode 100644 (file)
index 0000000..be1e09d
--- /dev/null
@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    float_or_none,
+    xpath_text
+)
+
+
+class NuevoBaseIE(InfoExtractor):
+    def _extract_nuevo(self, config_url, video_id, headers=None):
+        # Avoid a mutable default argument; fall back to an empty dict here
+        config = self._download_xml(
+            config_url, video_id, transform_source=lambda s: s.strip(),
+            headers=headers or {})
+
+        title = xpath_text(config, './title', 'title', fatal=True).strip()
+        video_id = xpath_text(config, './mediaid', default=video_id)
+        thumbnail = xpath_text(config, ['./image', './thumb'])
+        duration = float_or_none(xpath_text(config, './duration'))
+
+        formats = []
+        for element_name, format_id in (('file', 'sd'), ('filehd', 'hd')):
+            video_url = xpath_text(config, element_name)
+            if video_url:
+                formats.append({
+                    'url': video_url,
+                    'format_id': format_id,
+                })
+        self._check_formats(formats, video_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
new file mode 100644 (file)
index 0000000..ab6bfcd
--- /dev/null
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+)
+
+
+class NuvidIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://m.nuvid.com/video/1310741/',
+        'md5': 'eab207b7ac4fccfb4e23c86201f11277',
+        'info_dict': {
+            'id': '1310741',
+            'ext': 'mp4',
+            'title': 'Horny babes show their awesome bodeis and',
+            'duration': 129,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        page_url = 'http://m.nuvid.com/video/%s' % video_id
+        webpage = self._download_webpage(
+            page_url, video_id, 'Downloading video page')
+        # When dwnld_speed exists and its value is larger than the MP4 file's
+        # bitrate, Nuvid returns the MP4 URL.
+        # Its unit is 100 bytes/millisecond; see mobile-nuvid-min.js for the algorithm.
+        self._set_cookie('nuvid.com', 'dwnld_speed', '10.0')
+        mp4_webpage = self._download_webpage(
+            page_url, video_id, 'Downloading video page for MP4 format')
+
+        html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']'
+        video_url = self._html_search_regex(html5_video_re, webpage, video_id)
+        mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id)
+        formats = [{
+            'url': video_url,
+        }]
+        if mp4_video_url != video_url:
+            formats.append({
+                'url': mp4_video_url,
+            })
+
+        title = self._html_search_regex(
+            [r'<span title="([^"]+)">',
+             r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>',
+             r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip()
+        thumbnails = [
+            {
+                'url': thumb_url,
+            } for thumb_url in re.findall(r'<img src="([^"]+)" alt="" />', webpage)
+        ]
+        thumbnail = thumbnails[0]['url'] if thumbnails else None
+        duration = parse_duration(self._html_search_regex(
+            [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})',
+             r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnails': thumbnails,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': 18,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
new file mode 100644 (file)
index 0000000..fc78ca5
--- /dev/null
@@ -0,0 +1,223 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hmac
+import hashlib
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    js_to_json,
+    mimetype2ext,
+    parse_iso8601,
+    remove_start,
+)
+
+
+class NYTimesBaseIE(InfoExtractor):
+    _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'
+
+    def _extract_video_from_id(self, video_id):
+        # Authorization generation algorithm is reverse engineered from `signer` in
+        # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
+        path = '/svc/video/api/v3/video/' + video_id
+        hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
+        video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
+            'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
+            'X-NYTV': 'vhs',
+        }, fatal=False)
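+        # Fall back to the unauthenticated v2 API if the signed v3 request fails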
+        if not video_data:
+            video_data = self._download_json(
+                'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
+                video_id, 'Downloading video JSON')
+
+        title = video_data['headline']
+
+        def get_file_size(file_size):
+            if isinstance(file_size, int):
+                return file_size
+            elif isinstance(file_size, dict):
+                return int(file_size.get('value', 0))
+            else:
+                return None
+
+        urls = []
+        formats = []
+        for video in video_data.get('renditions', []):
+            video_url = video.get('url')
+            format_id = video.get('type')
+            if not video_url or format_id == 'thumbs' or video_url in urls:
+                continue
+            urls.append(video_url)
+            ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id or 'hls', fatal=False))
+            elif ext == 'mpd':
+                # DASH manifests are skipped here; the disabled code below
+                # would extract them
+                continue
+            #     formats.extend(self._extract_mpd_formats(
+            #         video_url, video_id, format_id or 'dash', fatal=False))
+            else:
+                formats.append({
+                    'url': video_url,
+                    'format_id': format_id,
+                    'vcodec': video.get('videoencoding') or video.get('video_codec'),
+                    'width': int_or_none(video.get('width')),
+                    'height': int_or_none(video.get('height')),
+                    'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
+                    'tbr': int_or_none(video.get('bitrate'), 1000) or None,
+                    'ext': ext,
+                })
+        self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id'))
+
+        thumbnails = []
+        for image in video_data.get('images', []):
+            image_url = image.get('url')
+            if not image_url:
+                continue
+            thumbnails.append({
+                'url': 'http://www.nytimes.com/' + image_url,
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
+            })
+
+        publication_date = video_data.get('publication_date')
+        timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('summary'),
+            'timestamp': timestamp,
+            'uploader': video_data.get('byline'),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
+
+
+class NYTimesIE(NYTimesBaseIE):
+    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
+        'md5': 'd665342765db043f7e225cff19df0f2d',
+        'info_dict': {
+            'id': '100000002847155',
+            'ext': 'mov',
+            'title': 'Verbatim: What Is a Photocopier?',
+            'description': 'md5:93603dada88ddbda9395632fdc5da260',
+            'timestamp': 1398631707,
+            'upload_date': '20140427',
+            'uploader': 'Brett Weiner',
+            'duration': 419,
+        }
+    }, {
+        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        return self._extract_video_from_id(video_id)
+
+
+class NYTimesArticleIE(NYTimesBaseIE):
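+    # (.(?<!video))*? consumes the path while asserting the string 'video'
+    # never appears, leaving /video/ URLs to NYTimesIE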
+    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'
+    _TESTS = [{
+        'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
+        'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
+        'info_dict': {
+            'id': '100000003628438',
+            'ext': 'mov',
+            'title': 'New Minimum Wage: $70,000 a Year',
+            'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
+            'timestamp': 1429033037,
+            'upload_date': '20150414',
+            'uploader': 'Matthew Williams',
+        }
+    }, {
+        'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
+        'md5': 'e0d52040cafb07662acf3c9132db3575',
+        'info_dict': {
+            'id': '100000004709062',
+            'title': 'The Run-Up: ‘He Was Like an Octopus’',
+            'ext': 'mp3',
+            'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
+            'series': 'The Run-Up',
+            'episode': '‘He Was Like an Octopus’',
+            'episode_number': 20,
+            'duration': 2130,
+        }
+    }, {
+        'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
+        'info_dict': {
+            'id': '100000004709479',
+            'title': 'The Rise of Hitler',
+            'ext': 'mp3',
+            'description': 'md5:bce877fd9e3444990cb141875fab0028',
+            'creator': 'Pamela Paul',
+            'duration': 3475,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
+        'only_matching': True,
+    }]
+
+    def _extract_podcast_from_json(self, json, page_id, webpage):
+        podcast_audio = self._parse_json(
+            json, page_id, transform_source=js_to_json)
+
+        audio_data = podcast_audio['data']
+        track = audio_data['track']
+
+        episode_title = track['title']
+        video_url = track['source']
+
+        description = track.get('description') or self._html_search_meta(
+            ['og:description', 'twitter:description'], webpage)
+
+        podcast_title = audio_data.get('podcast', {}).get('title')
+        title = ('%s: %s' % (podcast_title, episode_title)
+                 if podcast_title else episode_title)
+
+        episode = audio_data.get('podcast', {}).get('episode') or ''
+        episode_number = int_or_none(self._search_regex(
+            r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))
+
+        return {
+            'id': remove_start(podcast_audio.get('target'), 'FT') or page_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'creator': track.get('credit'),
+            'series': podcast_title,
+            'episode': episode_title,
+            'episode_number': episode_number,
+            'duration': int_or_none(track.get('duration')),
+        }
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+
+        video_id = self._search_regex(
+            r'data-videoid=["\'](\d+)', webpage, 'video id',
+            default=None, fatal=False)
+        if video_id is not None:
+            return self._extract_video_from_id(video_id)
+
+        podcast_data = self._search_regex(
+            (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
+             r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
+            webpage, 'podcast data')
+        return self._extract_podcast_from_json(podcast_data, page_id, webpage)
diff --git a/youtube_dl/extractor/nzz.py b/youtube_dl/extractor/nzz.py
new file mode 100644 (file)
index 0000000..61ee77a
--- /dev/null
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+)
+
+
+class NZZIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nzz\.ch/(?:[^/]+/)*[^/?#]+-ld\.(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153',
+        'info_dict': {
+            'id': '9153',
+        },
+        'playlist_mincount': 6,
+    }, {
+        'url': 'https://www.nzz.ch/video/nzz-standpunkte/cvp-auf-der-suche-nach-dem-mass-der-mitte-ld.1368112',
+        'info_dict': {
+            'id': '1368112',
+        },
+        'playlist_count': 1,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        entries = []
+        for player_element in re.findall(
+                r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage):
+            player_params = extract_attributes(player_element)
+            if player_params.get('data-type') not in ('kaltura_singleArticle',):
+                self.report_warning('Unsupported player type')
+                continue
+            entry_id = player_params['data-id']
+            entries.append(self.url_result(
+                'kaltura:1750922:' + entry_id, 'Kaltura', entry_id))
+
+        return self.playlist_result(entries, page_id)
diff --git a/youtube_dl/extractor/odatv.py b/youtube_dl/extractor/odatv.py
new file mode 100644 (file)
index 0000000..314527f
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    NO_DEFAULT,
+    remove_start
+)
+
+
+class OdaTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?odatv\.com/(?:mob|vid)_video\.php\?.*\bid=(?P<id>[^&]+)'
+    _TESTS = [{
+        'url': 'http://odatv.com/vid_video.php?id=8E388',
+        'md5': 'dc61d052f205c9bf2da3545691485154',
+        'info_dict': {
+            'id': '8E388',
+            'ext': 'mp4',
+            'title': 'Artık Davutoğlu ile devam edemeyiz'
+        }
+    }, {
+        # mobile URL
+        'url': 'http://odatv.com/mob_video.php?id=8E388',
+        'only_matching': True,
+    }, {
+        # no video
+        'url': 'http://odatv.com/mob_video.php?id=8E900',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        no_video = 'NO VIDEO!' in webpage
+
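+        # If the page reports no video, let the regex fail quietly so that a
+        # clearer error can be raised below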
+        video_url = self._search_regex(
+            r'mp4\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, 'video url',
+            default=None if no_video else NO_DEFAULT, group='url')
+
+        if no_video:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': remove_start(self._og_search_title(webpage), 'Video: '),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
new file mode 100644 (file)
index 0000000..7ed9fac
--- /dev/null
@@ -0,0 +1,268 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_etree_fromstring,
+    compat_parse_qs,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+    int_or_none,
+    qualities,
+    unescapeHTML,
+    urlencode_postdata,
+)
+
+
+class OdnoklassnikiIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                https?://
+                    (?:(?:www|m|mobile)\.)?
+                    (?:odnoklassniki|ok)\.ru/
+                    (?:
+                        video(?:embed)?/|
+                        web-api/video/moviePlayer/|
+                        live/|
+                        dk\?.*?st\.mvId=
+                    )
+                    (?P<id>[\d-]+)
+                '''
+    _TESTS = [{
+        # metadata in JSON
+        'url': 'http://ok.ru/video/20079905452',
+        'md5': '0b62089b479e06681abaaca9d204f152',
+        'info_dict': {
+            'id': '20079905452',
+            'ext': 'mp4',
+            'title': 'Культура меняет нас (прекрасный ролик!))',
+            'duration': 100,
+            'upload_date': '20141207',
+            'uploader_id': '330537914540',
+            'uploader': 'Виталий Добровольский',
+            'like_count': int,
+            'age_limit': 0,
+        },
+    }, {
+        # metadataUrl
+        'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
+        'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',
+        'info_dict': {
+            'id': '63567059965189-0',
+            'ext': 'mp4',
+            'title': 'Девушка без комплексов ...',
+            'duration': 191,
+            'upload_date': '20150518',
+            'uploader_id': '534380003155',
+            'uploader': '☭ Андрей Мещанинов ☭',
+            'like_count': int,
+            'age_limit': 0,
+            'start_time': 5,
+        },
+    }, {
+        # YouTube embed (metadataUrl, provider == USER_YOUTUBE)
+        'url': 'http://ok.ru/video/64211978996595-1',
+        'md5': '2f206894ffb5dbfcce2c5a14b909eea5',
+        'info_dict': {
+            'id': 'V_VztHT5BzY',
+            'ext': 'mp4',
+            'title': 'Космическая среда от 26 августа 2015',
+            'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
+            'duration': 440,
+            'upload_date': '20150826',
+            'uploader_id': 'tvroscosmos',
+            'uploader': 'Телестудия Роскосмоса',
+            'age_limit': 0,
+        },
+    }, {
+        # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
+        'url': 'http://ok.ru/video/62036049272859-0',
+        'info_dict': {
+            'id': '62036049272859-0',
+            'ext': 'mp4',
+            'title': 'МУЗЫКА     ДОЖДЯ .',
+            'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
+            'upload_date': '20120106',
+            'uploader_id': '473534735899',
+            'uploader': 'МARINA D',
+            'age_limit': 0,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Video has not been found',
+    }, {
+        'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ok.ru/video/20648036891',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ok.ru/videoembed/20648036891',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.ok.ru/video/20079905452',
+        'only_matching': True,
+    }, {
+        'url': 'http://mobile.ok.ru/video/20079905452',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ok.ru/live/484531969818',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#',
+        'only_matching': True,
+    }, {
+        # Paid video
+        'url': 'https://ok.ru/video/954886983203',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        start_time = int_or_none(compat_parse_qs(
+            compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
+
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://ok.ru/video/%s' % video_id, video_id)
+
+        error = self._search_regex(
+            r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<',
+            webpage, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        player = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id,
+                webpage, 'player', group='player')),
+            video_id)
+
+        flashvars = player['flashvars']
+
+        metadata = flashvars.get('metadata')
+        if metadata:
+            metadata = self._parse_json(metadata, video_id)
+        else:
+            data = {}
+            st_location = flashvars.get('location')
+            if st_location:
+                data['st.location'] = st_location
+            metadata = self._download_json(
+                compat_urllib_parse_unquote(flashvars['metadataUrl']),
+                video_id, 'Downloading metadata JSON',
+                data=urlencode_postdata(data))
+
+        movie = metadata['movie']
+
+        # Some embedded videos may not contain a title in the movie dict (e.g.
+        # http://ok.ru/video/62036049272859-0), so we allow a missing title here;
+        # it will be extracted later by the extractor that processes the actual
+        # embed.
+        provider = metadata.get('provider')
+        title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')
+
+        thumbnail = movie.get('poster')
+        duration = int_or_none(movie.get('duration'))
+
+        author = metadata.get('author', {})
+        uploader_id = author.get('id')
+        uploader = author.get('name')
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'ya:ovs:upload_date', webpage, 'upload date', default=None))
+
+        age_limit = None
+        adult = self._html_search_meta(
+            'ya:ovs:adult', webpage, 'age limit', default=None)
+        if adult:
+            age_limit = 18 if adult == 'true' else 0
+
+        like_count = int_or_none(metadata.get('likeCount'))
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'start_time': start_time,
+        }
+
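+        # For YouTube embeds, contentId holds the YouTube video reference;
+        # defer to the YouTube extractor via url_transparent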
+        if provider == 'USER_YOUTUBE':
+            info.update({
+                '_type': 'url_transparent',
+                'url': movie['contentId'],
+            })
+            return info
+
+        assert title
+        if provider == 'LIVE_TV_APP':
+            info['title'] = self._live_title(title)
+
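+        # Numeric type ids (from type=N in the format URLs), ordered worst to best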
+        quality = qualities(('4', '0', '1', '2', '3', '5'))
+
+        formats = [{
+            'url': f['url'],
+            'ext': 'mp4',
+            'format_id': f['name'],
+        } for f in metadata['videos']]
+
+        m3u8_url = metadata.get('hlsManifestUrl')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        dash_manifest = metadata.get('metadataEmbedded')
+        if dash_manifest:
+            formats.extend(self._parse_mpd_formats(
+                compat_etree_fromstring(dash_manifest), 'mpd'))
+
+        for fmt in formats:
+            fmt_type = self._search_regex(
+                r'\btype[/=](\d)', fmt['url'],
+                'format type', default=None)
+            if fmt_type:
+                fmt['quality'] = quality(fmt_type)
+
+        # Live formats
+        m3u8_url = metadata.get('hlsMasterPlaylistUrl')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', entry_protocol='m3u8',
+                m3u8_id='hls', fatal=False))
+        rtmp_url = metadata.get('rtmpUrl')
+        if rtmp_url:
+            formats.append({
+                'url': rtmp_url,
+                'format_id': 'rtmp',
+                'ext': 'flv',
+            })
+
+        if not formats:
+            payment_info = metadata.get('paymentInfo')
+            if payment_info:
+                raise ExtractorError('This video is paid, subscribe to download it', expected=True)
+
+        self._sort_formats(formats)
+
+        info['formats'] = formats
+        return info
diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py
new file mode 100644 (file)
index 0000000..a914068
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class OktoberfestTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)'
+
+    _TEST = {
+        'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt',
+        'info_dict': {
+            'id': 'hb-zelt',
+            'ext': 'mp4',
+            'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._live_title(self._html_search_regex(
+            r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title'))
+
+        clip = self._search_regex(
+            r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip')
+        ncurl = self._search_regex(
+            r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base')
+        video_url = ncurl + clip
+        thumbnail = self._search_regex(
+            r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage,
+            'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'ext': 'mp4',
+            'is_live': True,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py
new file mode 100644 (file)
index 0000000..3e44b78
--- /dev/null
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class OnceIE(InfoExtractor):
+    _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/(?:ads/vmap/)?[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)'
+    ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8'
+    PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4'
+
+    def _extract_once_formats(self, url, http_formats_preference=None):
+        domain_id, application_id, media_item_id = re.match(
+            OnceIE._VALID_URL, url).groups()
+        formats = self._extract_m3u8_formats(
+            self.ADAPTIVE_URL_TEMPLATE % (
+                domain_id, application_id, media_item_id),
+            media_item_id, 'mp4', m3u8_id='hls', fatal=False)
+        progressive_formats = []
+        for adaptive_format in formats:
+            # Prevent advertisements from being embedded into the m3u8 playlist (see
+            # https://github.com/ytdl-org/youtube-dl/issues/8893#issuecomment-199912684)
+            adaptive_format['url'] = re.sub(
+                r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url'])
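+            # Derive the rendition id from the HLS URL so that a matching
+            # progressive MP4 URL can be built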
+            rendition_id = self._search_regex(
+                r'/now/media/playlist/[^/]+/[^/]+/([^/]+)',
+                adaptive_format['url'], 'rendition id', default=None)
+            if rendition_id:
+                progressive_format = adaptive_format.copy()
+                progressive_format.update({
+                    'url': self.PROGRESSIVE_URL_TEMPLATE % (
+                        domain_id, application_id, rendition_id, media_item_id),
+                    'format_id': adaptive_format['format_id'].replace(
+                        'hls', 'http'),
+                    'protocol': 'http',
+                    'preference': http_formats_preference,
+                })
+                progressive_formats.append(progressive_format)
+        self._check_formats(progressive_formats, media_item_id)
+        formats.extend(progressive_formats)
+        return formats
diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py
new file mode 100644 (file)
index 0000000..df1ce3c
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    js_to_json,
+)
+
+
+class OnDemandKoreaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html'
+    _GEO_COUNTRIES = ['US', 'CA']
+    _TEST = {
+        'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html',
+        'info_dict': {
+            'id': 'ask-us-anything-e43',
+            'ext': 'mp4',
+            'title': 'Ask Us Anything : E43',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': 'm3u8 download'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id, fatal=False)
+
+        if not webpage:
+            # The site sometimes returns a captcha page with HTTP 403
+            raise ExtractorError(
+                'Unable to access page. You may have been blocked.',
+                expected=True)
+
+        if 'msg_block_01.png' in webpage:
+            self.raise_geo_restricted(
+                msg='This content is not available in your region',
+                countries=self._GEO_COUNTRIES)
+
+        if 'This video is only available to ODK PLUS members.' in webpage:
+            raise ExtractorError(
+                'This video is only available to ODK PLUS members.',
+                expected=True)
+
+        title = self._og_search_title(webpage)
+
+        jw_config = self._parse_json(
+            self._search_regex(
+                r'(?s)jwplayer\(([\'"])(?:(?!\1).)+\1\)\.setup\s*\((?P<options>.+?)\);',
+                webpage, 'jw config', group='options'),
+            video_id, transform_source=js_to_json)
+        info = self._parse_jwplayer_data(
+            jw_config, video_id, require_title=False, m3u8_id='hls',
+            base_url=url)
+
+        info.update({
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        })
+        return info
diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py
new file mode 100644 (file)
index 0000000..e55b2ac
--- /dev/null
@@ -0,0 +1,268 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    NO_DEFAULT,
+    parse_iso8601,
+    remove_start,
+    strip_or_none,
+    url_basename,
+)
+
+
+class OnetBaseIE(InfoExtractor):
+    _URL_BASE_RE = r'https?://(?:(?:www\.)?onet\.tv|onet100\.vod\.pl)/[a-z]/'
+
+    def _search_mvp_id(self, webpage):
+        return self._search_regex(
+            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+    def _extract_from_id(self, video_id, webpage=None):
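+        # The qi.ckm.onetapi.pl endpoint takes a JSON-RPC 2.0 style call
+        # encoded as query parameters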
+        response = self._download_json(
+            'http://qi.ckm.onetapi.pl/', video_id,
+            query={
+                'body[id]': video_id,
+                'body[jsonrpc]': '2.0',
+                'body[method]': 'get_asset_detail',
+                'body[params][ID_Publikacji]': video_id,
+                'body[params][Service]': 'www.onet.pl',
+                'content-type': 'application/jsonp',
+                'x-onet-app': 'player.front.onetapi.pl',
+            })
+
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+        video = response['result'].get('0')
+
+        formats = []
+        for format_type, formats_dict in video['formats'].items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_list in formats_dict.items():
+                if not isinstance(format_list, list):
+                    continue
+                for f in format_list:
+                    video_url = f.get('url')
+                    if not video_url:
+                        continue
+                    ext = determine_ext(video_url)
+                    if format_id.startswith('ism'):
+                        formats.extend(self._extract_ism_formats(
+                            video_url, video_id, 'mss', fatal=False))
+                    elif ext == 'mpd':
+                        formats.extend(self._extract_mpd_formats(
+                            video_url, video_id, mpd_id='dash', fatal=False))
+                    elif format_id.startswith('hls'):
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    else:
+                        http_f = {
+                            'url': video_url,
+                            'format_id': format_id,
+                            'abr': float_or_none(f.get('audio_bitrate')),
+                        }
+                        if format_type == 'audio':
+                            http_f['vcodec'] = 'none'
+                        else:
+                            http_f.update({
+                                'height': int_or_none(f.get('vertical_resolution')),
+                                'width': int_or_none(f.get('horizontal_resolution')),
+                                'vbr': float_or_none(f.get('video_bitrate')),
+                            })
+                        formats.append(http_f)
+        self._sort_formats(formats)
+
+        meta = video.get('meta', {})
+
+        title = (self._og_search_title(
+            webpage, default=None) if webpage else None) or meta['title']
+        description = (self._og_search_description(
+            webpage, default=None) if webpage else None) or meta.get('description')
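+        # 'lenght' [sic] mirrors a misspelled field in the API response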
+        duration = meta.get('length') or meta.get('lenght')
+        timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class OnetMVPIE(OnetBaseIE):
+    _VALID_URL = r'onetmvp:(?P<id>\d+\.\d+)'
+
+    _TEST = {
+        'url': 'onetmvp:381027.1509591944',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        return self._extract_from_id(self._match_id(url))
+
+
+class OnetIE(OnetBaseIE):
+    _VALID_URL = OnetBaseIE._URL_BASE_RE + r'[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+    IE_NAME = 'onet.tv'
+
+    _TESTS = [{
+        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+        'md5': '436102770fb095c75b8bb0392d3da9ff',
+        'info_dict': {
+            'id': 'qbpyqc',
+            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
+            'ext': 'mp4',
+            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
+            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
+            'upload_date': '20160705',
+            'timestamp': 1467721580,
+        },
+    }, {
+        'url': 'https://onet100.vod.pl/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, video_id = mobj.group('display_id', 'id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        mvp_id = self._search_mvp_id(webpage)
+
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+
+        return info_dict
+
+
+class OnetChannelIE(OnetBaseIE):
+    _VALID_URL = OnetBaseIE._URL_BASE_RE + r'(?P<id>[a-z]+)(?:[?#]|$)'
+    IE_NAME = 'onet.tv:channel'
+
+    _TESTS = [{
+        'url': 'http://onet.tv/k/openerfestival',
+        'info_dict': {
+            'id': 'openerfestival',
+            'title': "Open'er Festival",
+            'description': "Tak było na Open'er Festival 2016! Oglądaj nasze reportaże i wywiady z artystami.",
+        },
+        'playlist_mincount': 35,
+    }, {
+        'url': 'https://onet100.vod.pl/k/openerfestival',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, channel_id)
+
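+        # Join JS string concatenations ('a' + 'b') before js_to_json parses the object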
+        current_clip_info = self._parse_json(self._search_regex(
+            r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id,
+            transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s)))
+        video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
+        video_name = url_basename(current_clip_info['url'])
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen(
+                'Downloading just video %s because of --no-playlist' % video_name)
+            return self._extract_from_id(video_id, webpage)
+
+        self.to_screen(
+            'Downloading channel %s - add --no-playlist to just download video %s' % (
+                channel_id, video_name))
+        matches = re.findall(
+            r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE,
+            webpage)
+        entries = [
+            self.url_result(video_link, OnetIE.ie_key())
+            for video_link in matches]
+
+        channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
+        channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
+        return self.playlist_result(entries, channel_id, channel_title, channel_description)
+
+
+class OnetPlIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?(?:onet|businessinsider\.com|plejada)\.pl/(?:[^/]+/)+(?P<id>[0-9a-z]+)'
+    IE_NAME = 'onet.pl'
+
+    _TESTS = [{
+        'url': 'http://eurosport.onet.pl/zimowe/skoki-narciarskie/ziobro-wygral-kwalifikacje-w-pjongczangu/9ckrly',
+        'md5': 'b94021eb56214c3969380388b6e73cb0',
+        'info_dict': {
+            'id': '1561707.1685479',
+            'ext': 'mp4',
+            'title': 'Ziobro wygrał kwalifikacje w Pjongczangu',
+            'description': 'md5:61fb0740084d2d702ea96512a03585b4',
+            'upload_date': '20170214',
+            'timestamp': 1487078046,
+        },
+    }, {
+        # embedded via pulsembed
+        'url': 'http://film.onet.pl/pensjonat-nad-rozlewiskiem-relacja-z-planu-serialu/y428n0',
+        'info_dict': {
+            'id': '501235.965429946',
+            'ext': 'mp4',
+            'title': '"Pensjonat nad rozlewiskiem": relacja z planu serialu',
+            'upload_date': '20170622',
+            'timestamp': 1498159955,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://film.onet.pl/zwiastuny/ghost-in-the-shell-drugi-zwiastun-pl/5q6yl3',
+        'only_matching': True,
+    }, {
+        'url': 'http://moto.onet.pl/jak-wybierane-sa-miejsca-na-fotoradary/6rs04e',
+        'only_matching': True,
+    }, {
+        'url': 'http://businessinsider.com.pl/wideo/scenariusz-na-koniec-swiata-wedlug-nasa/dwnqptk',
+        'only_matching': True,
+    }, {
+        'url': 'http://plejada.pl/weronika-rosati-o-swoim-domniemanym-slubie/n2bq89',
+        'only_matching': True,
+    }]
+
+    def _search_mvp_id(self, webpage, default=NO_DEFAULT):
+        return self._search_regex(
+            r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage, 'mvp id',
+            default=default)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        mvp_id = self._search_mvp_id(webpage, default=None)
+
+        if not mvp_id:
+            pulsembed_url = self._search_regex(
+                r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1',
+                webpage, 'pulsembed url', group='url')
+            webpage = self._download_webpage(
+                pulsembed_url, video_id, 'Downloading pulsembed webpage')
+            mvp_id = self._search_mvp_id(webpage)
+
+        return self.url_result(
+            'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id)
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
new file mode 100644 (file)
index 0000000..cf5c39e
--- /dev/null
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import js_to_json
+
+
+class OnionStudiosIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P<id>\d+)(?!-)'
+
+    _TESTS = [{
+        'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
+        'md5': '5a118d466d62b5cd03647cf2c593977f',
+        'info_dict': {
+            'id': '3459881',
+            'ext': 'mp4',
+            'title': 'Hannibal charges forward, stops for a cocktail',
+            'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'a.v. club',
+            'upload_date': '20150619',
+            'timestamp': 1434728546,
+        },
+    }, {
+        'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.onionstudios.com/video/6139.json',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://onionstudios.com/embed/dc94dc2899fe644c0e7241fa04c1b732.js',
+            video_id)
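+        # The embed helper script assigns window.mcpMapping, an object keyed
+        # by Onion Studios video id whose entries carry the matching Kinja MCP id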
+        mcp_id = compat_str(self._parse_json(self._search_regex(
+            r'window\.mcpMapping\s*=\s*({.+?});', webpage,
+            'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id'])
+        return self.url_result(
+            'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id,
+            'KinjaEmbed', mcp_id)
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
new file mode 100644 (file)
index 0000000..eb957b8
--- /dev/null
@@ -0,0 +1,212 @@
+from __future__ import unicode_literals
+
+import base64
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_str,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    try_get,
+    unsmuggle_url,
+)
+
+
+class OoyalaBaseIE(InfoExtractor):
+    _PLAYER_BASE = 'http://player.ooyala.com/'
+    _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
+    _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s'
+
+    def _extract(self, content_tree_url, video_id, domain=None, supportedformats=None, embed_token=None):
+        content_tree = self._download_json(content_tree_url, video_id)['content_tree']
+        metadata = content_tree[list(content_tree)[0]]
+        embed_code = metadata['embed_code']
+        pcode = metadata.get('asset_pcode') or embed_code
+        title = metadata['title']
+
+        auth_data = self._download_json(
+            self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code),
+            video_id, headers=self.geo_verification_headers(), query={
+                'domain': domain or 'player.ooyala.com',
+                'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth',
+                'embedToken': embed_token,
+            })['authorization_data'][embed_code]
+
+        urls = []
+        formats = []
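+        # When the authorization response carries no streams, fall back to the
+        # public HLS master playlist (encoded to match the base64 decode below)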
+        streams = auth_data.get('streams') or [{
+            'delivery_type': 'hls',
+            'url': {
+                'data': base64.b64encode(('http://player.ooyala.com/hls/player/all/%s.m3u8' % embed_code).encode()).decode(),
+            }
+        }]
+        for stream in streams:
+            url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
+            if not url_data:
+                continue
+            s_url = compat_b64decode(url_data).decode('utf-8')
+            if not s_url or s_url in urls:
+                continue
+            urls.append(s_url)
+            ext = determine_ext(s_url, None)
+            delivery_type = stream.get('delivery_type')
+            if delivery_type == 'hls' or ext == 'm3u8':
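+                # Rewrite device-specific /ipad/ and /iphone/ playlist paths to the /all/ variant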
+                formats.extend(self._extract_m3u8_formats(
+                    re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif delivery_type == 'hds' or ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
+            elif delivery_type == 'dash' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    s_url, embed_code, mpd_id='dash', fatal=False))
+            elif delivery_type == 'smooth':
+                formats.extend(self._extract_ism_formats(
+                    s_url, embed_code, ism_id='mss', fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    s_url, embed_code, fatal=False))
+            else:
+                formats.append({
+                    'url': s_url,
+                    'ext': ext or delivery_type,
+                    'vcodec': stream.get('video_codec'),
+                    'format_id': delivery_type,
+                    'width': int_or_none(stream.get('width')),
+                    'height': int_or_none(stream.get('height')),
+                    'abr': int_or_none(stream.get('audio_bitrate')),
+                    'vbr': int_or_none(stream.get('video_bitrate')),
+                    'fps': float_or_none(stream.get('framerate')),
+                })
+        if not formats and not auth_data.get('authorized'):
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, auth_data['message']), expected=True)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
+            sub_url = sub.get('url')
+            if not sub_url:
+                continue
+            subtitles[lang] = [{
+                'url': sub_url,
+            }]
+
+        return {
+            'id': embed_code,
+            'title': title,
+            'description': metadata.get('description'),
+            'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
+            'duration': float_or_none(metadata.get('duration'), 1000),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
+
+
+class OoyalaIE(OoyalaBaseIE):
+    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
+
+    _TESTS = [
+        {
+            # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
+            'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+            'info_dict': {
+                'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+                'ext': 'mp4',
+                'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+                'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+                'duration': 853.386,
+            },
+            # The video in the original webpage now uses PlayWire
+            'skip': 'Ooyala said: movie expired',
+        }, {
+            # Only available for ipad
+            'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+            'info_dict': {
+                'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
+                'ext': 'mp4',
+                'title': 'Simulation Overview - Levels of Simulation',
+                'duration': 194.948,
+            },
+        },
+        {
+            # Information available only through SAS api
+            # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187
+            'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+            'md5': 'a84001441b35ea492bc03736e59e7935',
+            'info_dict': {
+                'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx',
+                'ext': 'mp4',
+                'title': 'Divide Tool Path.mp4',
+                'duration': 204.405,
+            }
+        },
+        {
+            # empty stream['url']['data']
+            'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is',
+            'only_matching': True,
+        }
+    ]
+
+    @staticmethod
+    def _url_for_embed_code(embed_code):
+        return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code
+
+    @classmethod
+    def _build_url_result(cls, embed_code):
+        return cls.url_result(cls._url_for_embed_code(embed_code),
+                              ie=cls.ie_key())
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+        embed_code = self._match_id(url)
+        domain = smuggled_data.get('domain')
+        supportedformats = smuggled_data.get('supportedformats')
+        embed_token = smuggled_data.get('embed_token')
+        content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code)
+        return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token)
+
+
+class OoyalaExternalIE(OoyalaBaseIE):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        ooyalaexternal:|
+                        https?://.+?\.ooyala\.com/.*?\bexternalId=
+                    )
+                    (?P<partner_id>[^:]+)
+                    :
+                    (?P<id>.+)
+                    (?:
+                        :|
+                        .*?&pcode=
+                    )
+                    (?P<pcode>.+?)
+                    (?:&|$)
+                    '''
+
+    _TEST = {
+        'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always',
+        'info_dict': {
+            'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
+            'ext': 'mp4',
+            'title': 'dm_140128_30for30Shorts___JudgingJewellv2',
+            'duration': 1302.0,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups()
+        content_tree_url = self._CONTENT_TREE_BASE + 'external_id/%s/%s:%s' % (pcode, partner_id, video_id)
+        return self._extract(content_tree_url, video_id)
diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py
new file mode 100644 (file)
index 0000000..0c20d01
--- /dev/null
@@ -0,0 +1,240 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import os
+import subprocess
+import tempfile
+
+from ..compat import (
+    compat_urlparse,
+    compat_kwargs,
+)
+from ..utils import (
+    check_executable,
+    encodeArgument,
+    ExtractorError,
+    get_exe_version,
+    is_outdated_version,
+    std_headers,
+)
+
+
+def cookie_to_dict(cookie):
+    cookie_dict = {
+        'name': cookie.name,
+        'value': cookie.value,
+    }
+    if cookie.port_specified:
+        cookie_dict['port'] = cookie.port
+    if cookie.domain_specified:
+        cookie_dict['domain'] = cookie.domain
+    if cookie.path_specified:
+        cookie_dict['path'] = cookie.path
+    if cookie.expires is not None:
+        cookie_dict['expires'] = cookie.expires
+    if cookie.secure is not None:
+        cookie_dict['secure'] = cookie.secure
+    if cookie.discard is not None:
+        cookie_dict['discard'] = cookie.discard
+    try:
+        if (cookie.has_nonstandard_attr('httpOnly')
+                or cookie.has_nonstandard_attr('httponly')
+                or cookie.has_nonstandard_attr('HttpOnly')):
+            cookie_dict['httponly'] = True
+    except TypeError:
+        pass
+    return cookie_dict
+
+
+def cookie_jar_to_list(cookie_jar):
+    return [cookie_to_dict(cookie) for cookie in cookie_jar]
+
+
+class PhantomJSwrapper(object):
+    """PhantomJS wrapper class
+
+    This class is experimental.
+    """
+
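+    # Doubled braces are literal JS braces; {cookies}, {timeout}, {ua}, {html},
+    # {url} and {jscode} are filled in via str.format() in get()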
+    _TEMPLATE = r'''
+        phantom.onError = function(msg, trace) {{
+          var msgStack = ['PHANTOM ERROR: ' + msg];
+          if(trace && trace.length) {{
+            msgStack.push('TRACE:');
+            trace.forEach(function(t) {{
+              msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+                + (t.function ? ' (in function ' + t.function +')' : ''));
+            }});
+          }}
+          console.error(msgStack.join('\n'));
+          phantom.exit(1);
+        }};
+        var page = require('webpage').create();
+        var fs = require('fs');
+        var read = {{ mode: 'r', charset: 'utf-8' }};
+        var write = {{ mode: 'w', charset: 'utf-8' }};
+        JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
+          phantom.addCookie(x);
+        }});
+        page.settings.resourceTimeout = {timeout};
+        page.settings.userAgent = "{ua}";
+        page.onLoadStarted = function() {{
+          page.evaluate(function() {{
+            delete window._phantom;
+            delete window.callPhantom;
+          }});
+        }};
+        var saveAndExit = function() {{
+          fs.write("{html}", page.content, write);
+          fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
+          phantom.exit();
+        }};
+        page.onLoadFinished = function(status) {{
+          if(page.url === "") {{
+            page.setContent(fs.read("{html}", read), "{url}");
+          }}
+          else {{
+            {jscode}
+          }}
+        }};
+        page.open("");
+    '''
+
+    _TMP_FILE_NAMES = ['script', 'html', 'cookies']
+
+    @staticmethod
+    def _version():
+        return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
+
+    def __init__(self, extractor, required_version=None, timeout=10000):
+        self._TMP_FILES = {}
+
+        self.exe = check_executable('phantomjs', ['-v'])
+        if not self.exe:
+            raise ExtractorError('PhantomJS executable not found in PATH, '
+                                 'download it from http://phantomjs.org',
+                                 expected=True)
+
+        self.extractor = extractor
+
+        if required_version:
+            version = self._version()
+            if is_outdated_version(version, required_version):
+                self.extractor._downloader.report_warning(
+                    'Your copy of PhantomJS is outdated, update it to version '
+                    '%s or newer if you encounter any errors.' % required_version)
+
+        self.options = {
+            'timeout': timeout,
+        }
+        for name in self._TMP_FILE_NAMES:
+            tmp = tempfile.NamedTemporaryFile(delete=False)
+            tmp.close()
+            self._TMP_FILES[name] = tmp
+
+    def __del__(self):
+        for name in self._TMP_FILE_NAMES:
+            try:
+                os.remove(self._TMP_FILES[name].name)
+            except (IOError, OSError, KeyError):
+                pass
+
+    def _save_cookies(self, url):
+        cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar)
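+        # Cookies passed to phantom.addCookie() need a path and domain; default any that are missing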
+        for cookie in cookies:
+            if 'path' not in cookie:
+                cookie['path'] = '/'
+            if 'domain' not in cookie:
+                cookie['domain'] = compat_urlparse.urlparse(url).netloc
+        with open(self._TMP_FILES['cookies'].name, 'wb') as f:
+            f.write(json.dumps(cookies).encode('utf-8'))
+
+    def _load_cookies(self):
+        with open(self._TMP_FILES['cookies'].name, 'rb') as f:
+            cookies = json.loads(f.read().decode('utf-8'))
+        for cookie in cookies:
+            if cookie['httponly'] is True:
+                cookie['rest'] = {'httpOnly': None}
+            if 'expiry' in cookie:
+                cookie['expire_time'] = cookie['expiry']
+            self.extractor._set_cookie(**compat_kwargs(cookie))
+
+    def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
+        """
+        Downloads webpage (if needed) and executes JS
+
+        Params:
+            url: website url
+            html: optional, html code of website
+            video_id: video id
+            note: optional, displayed when downloading webpage
+            note2: optional, displayed when executing JS
+            headers: custom http headers
+            jscode: code to be executed when page is loaded
+
+        Returns tuple with:
+            * downloaded website (after JS execution)
+            * anything you print with `console.log` (but not inside `page.evaluate`!)
+
+        In most cases you don't need to add any `jscode`.
+        It is executed in `page.onLoadFinished`.
+        `saveAndExit();` is mandatory; use it instead of `phantom.exit()`.
+        It is possible to wait for some element on the webpage, for example:
+            var check = function() {
+              var elementFound = page.evaluate(function() {
+                return document.querySelector('#b.done') !== null;
+              });
+              if(elementFound)
+                saveAndExit();
+              else
+                window.setTimeout(check, 500);
+            }
+
+            page.evaluate(function(){
+              document.querySelector('#a').click();
+            });
+            check();
+        """
+        if 'saveAndExit();' not in jscode:
+            raise ExtractorError('`saveAndExit();` not found in `jscode`')
+        if not html:
+            html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
+        with open(self._TMP_FILES['html'].name, 'wb') as f:
+            f.write(html.encode('utf-8'))
+
+        self._save_cookies(url)
+
+        replaces = self.options
+        replaces['url'] = url
+        user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+        replaces['ua'] = user_agent.replace('"', '\\"')
+        replaces['jscode'] = jscode
+
+        for x in self._TMP_FILE_NAMES:
+            replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
+
+        with open(self._TMP_FILES['script'].name, 'wb') as f:
+            f.write(self._TEMPLATE.format(**replaces).encode('utf-8'))
+
+        if video_id is None:
+            self.extractor.to_screen('%s' % (note2,))
+        else:
+            self.extractor.to_screen('%s: %s' % (video_id, note2))
+
+        p = subprocess.Popen([
+            self.exe, '--ssl-protocol=any',
+            self._TMP_FILES['script'].name
+        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        out, err = p.communicate()
+        if p.returncode != 0:
+            raise ExtractorError(
+                'Executing JS failed:\n' + encodeArgument(err))
+        with open(self._TMP_FILES['html'].name, 'rb') as f:
+            html = f.read().decode('utf-8')
+
+        self._load_cookies()
+
+        return (html, encodeArgument(out))
diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py
new file mode 100644 (file)
index 0000000..1d42be3
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    get_element_by_attribute,
+    qualities,
+    unescapeHTML,
+)
+
+
+class OraTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'
+    _TESTS = [{
+        'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
+        'md5': 'fa33717591c631ec93b04b0e330df786',
+        'info_dict': {
+            'id': '50178',
+            'ext': 'mp4',
+            'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
+            'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
+        }
+    }, {
+        'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        video_data = self._search_regex(
+            r'"(?:video|current)"\s*:\s*({[^}]+?})', webpage, 'current video')
+        m3u8_url = self._search_regex(
+            r'hls_stream"?\s*:\s*"([^"]+)', video_data, 'm3u8 url', None)
+        if m3u8_url:
+            formats = self._extract_m3u8_formats(
+                m3u8_url, display_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False)
+            # similar to GameSpotIE
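+            # The m3u8 path embeds a comma-separated quality list; substitute each tag to build direct HTTP URLs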
+            m3u8_path = compat_urlparse.urlparse(m3u8_url).path
+            QUALITIES_RE = r'((,[a-z]+\d+)+,?)'
+            available_qualities = self._search_regex(
+                QUALITIES_RE, m3u8_path, 'qualities').strip(',').split(',')
+            http_path = m3u8_path[1:].split('/', 1)[1]
+            http_template = re.sub(QUALITIES_RE, r'%s', http_path)
+            http_template = http_template.replace('.csmil/master.m3u8', '')
+            http_template = compat_urlparse.urljoin(
+                'http://videocdn-pmd.ora.tv/', http_template)
+            preference = qualities(
+                ['mobile400', 'basic400', 'basic600', 'sd900', 'sd1200', 'sd1500', 'hd720', 'hd1080'])
+            for q in available_qualities:
+                formats.append({
+                    'url': http_template % q,
+                    'format_id': q,
+                    'preference': preference(q),
+                })
+            self._sort_formats(formats)
+        else:
+            return self.url_result(self._search_regex(
+                r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube')
+
+        return {
+            'id': self._search_regex(
+                r'"id"\s*:\s*(\d+)', video_data, 'video id', default=display_id),
+            'display_id': display_id,
+            'title': unescapeHTML(self._og_search_title(webpage)),
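+            # Note: 'decription' matches the (misspelled) class name used in the page markup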
+            'description': get_element_by_attribute(
+                'class', 'video_txt_decription', webpage),
+            'thumbnail': self._proto_relative_url(self._search_regex(
+                r'"thumb"\s*:\s*"([^"]+)', video_data, 'thumbnail', None)),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
new file mode 100644 (file)
index 0000000..700ce44
--- /dev/null
@@ -0,0 +1,572 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    determine_ext,
+    float_or_none,
+    HEADRequest,
+    int_or_none,
+    orderedSet,
+    remove_end,
+    str_or_none,
+    strip_jsonp,
+    unescapeHTML,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class ORFTVthekIE(InfoExtractor):
+    IE_NAME = 'orf:tvthek'
+    IE_DESC = 'ORF TVthek'
+    _VALID_URL = r'https?://tvthek\.orf\.at/(?:[^/]+/)+(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
+        'playlist': [{
+            'md5': '2942210346ed779588f428a92db88712',
+            'info_dict': {
+                'id': '8896777',
+                'ext': 'mp4',
+                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
+                'description': 'md5:c1272f0245537812d4e36419c207b67d',
+                'duration': 2668,
+                'upload_date': '20141208',
+            },
+        }],
+        'skip': 'Blocked outside of Austria / Germany',
+    }, {
+        'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
+        'info_dict': {
+            'id': '7982259',
+            'ext': 'mp4',
+            'title': 'Best of Ingrid Thurnher',
+            'upload_date': '20140527',
+            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
+        },
+        'params': {
+            'skip_download': True,  # rtsp downloads
+        },
+        'skip': 'Blocked outside of Austria / Germany',
+    }, {
+        'url': 'http://tvthek.orf.at/topic/Fluechtlingskrise/10463081/Heimat-Fremde-Heimat/13879132/Senioren-betreuen-Migrantenkinder/13879141',
+        'only_matching': True,
+    }, {
+        'url': 'http://tvthek.orf.at/profile/Universum/35429',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+
+        data_jsb = self._parse_json(
+            self._search_regex(
+                r'<div[^>]+class=(["\']).*?VideoPlaylist.*?\1[^>]+data-jsb=(["\'])(?P<json>.+?)\2',
+                webpage, 'playlist', group='json'),
+            playlist_id, transform_source=unescapeHTML)['playlist']['videos']
+
+        entries = []
+        for sd in data_jsb:
+            video_id, title = sd.get('id'), sd.get('title')
+            if not video_id or not title:
+                continue
+            video_id = compat_str(video_id)
+            formats = []
+            for fd in sd['sources']:
+                src = url_or_none(fd.get('src'))
+                if not src:
+                    continue
+                format_id_list = []
+                for key in ('delivery', 'quality', 'quality_string'):
+                    value = fd.get(key)
+                    if value:
+                        format_id_list.append(value)
+                format_id = '-'.join(format_id_list)
+                ext = determine_ext(src)
+                if ext == 'm3u8':
+                    m3u8_formats = self._extract_m3u8_formats(
+                        src, video_id, 'mp4', m3u8_id=format_id, fatal=False)
+                    if any('/geoprotection' in f['url'] for f in m3u8_formats):
+                        self.raise_geo_restricted()
+                    formats.extend(m3u8_formats)
+                elif ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        src, video_id, f4m_id=format_id, fatal=False))
+                else:
+                    formats.append({
+                        'format_id': format_id,
+                        'url': src,
+                        'protocol': fd.get('protocol'),
+                    })
+
+            # Check for geoblocking.
+            # There is a property is_geoprotection, but that's always false
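+            # Probe a direct HTTP MP4 format with a HEAD request instead, so blocking is detected early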
+            geo_str = sd.get('geoprotection_string')
+            if geo_str:
+                try:
+                    http_url = next(
+                        f['url']
+                        for f in formats
+                        if re.match(r'^https?://.*\.mp4$', f['url']))
+                except StopIteration:
+                    pass
+                else:
+                    req = HEADRequest(http_url)
+                    self._request_webpage(
+                        req, video_id,
+                        note='Testing for geoblocking',
+                        errnote=((
+                            'This video seems to be blocked outside of %s. '
+                            'You may want to try the streaming-* formats.')
+                            % geo_str),
+                        fatal=False)
+
+            self._check_formats(formats, video_id)
+            self._sort_formats(formats)
+
+            subtitles = {}
+            for sub in sd.get('subtitles', []):
+                sub_src = sub.get('src')
+                if not sub_src:
+                    continue
+                subtitles.setdefault(sub.get('lang', 'de-AT'), []).append({
+                    'url': sub_src,
+                })
+
+            upload_date = unified_strdate(sd.get('created_date'))
+            entries.append({
+                '_type': 'video',
+                'id': video_id,
+                'title': title,
+                'formats': formats,
+                'subtitles': subtitles,
+                'description': sd.get('description'),
+                'duration': int_or_none(sd.get('duration_in_seconds')),
+                'upload_date': upload_date,
+                'thumbnail': sd.get('image_full_url'),
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': playlist_id,
+        }
+
+
+class ORFRadioIE(InfoExtractor):
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        show_date = mobj.group('date')
+        show_id = mobj.group('show')
+
+        data = self._download_json(
+            'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
+            % (self._API_STATION, show_id, show_date), show_id)
+
+        entries = []
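+        # A broadcast is delivered as one or more loop-stream segments; each segment becomes its own entry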
+        for info in data['streams']:
+            loop_stream_id = str_or_none(info.get('loopStreamId'))
+            if not loop_stream_id:
+                continue
+            title = str_or_none(data.get('title'))
+            if not title:
+                continue
+            start = int_or_none(info.get('start'), scale=1000)
+            end = int_or_none(info.get('end'), scale=1000)
+            duration = end - start if end and start else None
+            entries.append({
+                'id': loop_stream_id.replace('.mp3', ''),
+                'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
+                'title': title,
+                'description': clean_html(data.get('subtitle')),
+                'duration': duration,
+                'timestamp': start,
+                'ext': 'mp3',
+                'series': data.get('programTitle'),
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': show_id,
+            'title': data.get('title'),
+            'description': clean_html(data.get('subtitle')),
+            'entries': entries,
+        }
+
+
+class ORFFM4IE(ORFRadioIE):
+    IE_NAME = 'orf:fm4'
+    IE_DESC = 'radio FM4'
+    _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)'
+    _API_STATION = 'fm4'
+    _LOOP_STATION = 'fm4'
+
+    _TEST = {
+        'url': 'http://fm4.orf.at/player/20170107/4CC',
+        'md5': '2b0be47375432a7ef104453432a19212',
+        'info_dict': {
+            'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
+            'ext': 'mp3',
+            'title': 'Solid Steel Radioshow',
+            'description': 'Die Mixshow von Coldcut und Ninja Tune.',
+            'duration': 3599,
+            'timestamp': 1483819257,
+            'upload_date': '20170107',
+        },
+        'skip': 'Shows from ORF radios are only available for 7 days.',
+        'only_matching': True,
+    }
+
+
+class ORFNOEIE(ORFRadioIE):
+    IE_NAME = 'orf:noe'
+    IE_DESC = 'Radio Niederösterreich'
+    _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'noe'
+    _LOOP_STATION = 'oe2n'
+
+    _TEST = {
+        'url': 'https://noe.orf.at/player/20200423/NGM',
+        'only_matching': True,
+    }
+
+
+class ORFWIEIE(ORFRadioIE):
+    IE_NAME = 'orf:wien'
+    IE_DESC = 'Radio Wien'
+    _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'wie'
+    _LOOP_STATION = 'oe2w'
+
+    _TEST = {
+        'url': 'https://wien.orf.at/player/20200423/WGUM',
+        'only_matching': True,
+    }
+
+
+class ORFBGLIE(ORFRadioIE):
+    IE_NAME = 'orf:burgenland'
+    IE_DESC = 'Radio Burgenland'
+    _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'bgl'
+    _LOOP_STATION = 'oe2b'
+
+    _TEST = {
+        'url': 'https://burgenland.orf.at/player/20200423/BGM',
+        'only_matching': True,
+    }
+
+
+class ORFOOEIE(ORFRadioIE):
+    IE_NAME = 'orf:oberoesterreich'
+    IE_DESC = 'Radio Oberösterreich'
+    _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'ooe'
+    _LOOP_STATION = 'oe2o'
+
+    _TEST = {
+        'url': 'https://ooe.orf.at/player/20200423/OGMO',
+        'only_matching': True,
+    }
+
+
+class ORFSTMIE(ORFRadioIE):
+    IE_NAME = 'orf:steiermark'
+    IE_DESC = 'Radio Steiermark'
+    _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'stm'
+    _LOOP_STATION = 'oe2st'
+
+    _TEST = {
+        'url': 'https://steiermark.orf.at/player/20200423/STGMS',
+        'only_matching': True,
+    }
+
+
+class ORFKTNIE(ORFRadioIE):
+    IE_NAME = 'orf:kaernten'
+    IE_DESC = 'Radio Kärnten'
+    _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'ktn'
+    _LOOP_STATION = 'oe2k'
+
+    _TEST = {
+        'url': 'https://kaernten.orf.at/player/20200423/KGUMO',
+        'only_matching': True,
+    }
+
+
+class ORFSBGIE(ORFRadioIE):
+    IE_NAME = 'orf:salzburg'
+    IE_DESC = 'Radio Salzburg'
+    _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'sbg'
+    _LOOP_STATION = 'oe2s'
+
+    _TEST = {
+        'url': 'https://salzburg.orf.at/player/20200423/SGUM',
+        'only_matching': True,
+    }
+
+
+class ORFTIRIE(ORFRadioIE):
+    IE_NAME = 'orf:tirol'
+    IE_DESC = 'Radio Tirol'
+    _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'tir'
+    _LOOP_STATION = 'oe2t'
+
+    _TEST = {
+        'url': 'https://tirol.orf.at/player/20200423/TGUMO',
+        'only_matching': True,
+    }
+
+
+class ORFVBGIE(ORFRadioIE):
+    IE_NAME = 'orf:vorarlberg'
+    IE_DESC = 'Radio Vorarlberg'
+    _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'vbg'
+    _LOOP_STATION = 'oe2v'
+
+    _TEST = {
+        'url': 'https://vorarlberg.orf.at/player/20200423/VGUM',
+        'only_matching': True,
+    }
+
+
+class ORFOE3IE(ORFRadioIE):
+    IE_NAME = 'orf:oe3'
+    IE_DESC = 'Radio Österreich 3'
+    _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'oe3'
+    _LOOP_STATION = 'oe3'
+
+    _TEST = {
+        'url': 'https://oe3.orf.at/player/20200424/3WEK',
+        'only_matching': True,
+    }
+
+
+class ORFOE1IE(ORFRadioIE):
+    IE_NAME = 'orf:oe1'
+    IE_DESC = 'Radio Österreich 1'
+    _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+    _API_STATION = 'oe1'
+    _LOOP_STATION = 'oe1'
+
+    _TEST = {
+        'url': 'http://oe1.orf.at/player/20170108/456544',
+        'md5': '34d8a6e67ea888293741c86a099b745b',
+        'info_dict': {
+            'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
+            'ext': 'mp3',
+            'title': 'Morgenjournal',
+            'duration': 609,
+            'timestamp': 1483858796,
+            'upload_date': '20170108',
+        },
+        'skip': 'Shows from ORF radios are only available for 7 days.'
+    }
+
+
+class ORFIPTVIE(InfoExtractor):
+    IE_NAME = 'orf:iptv'
+    IE_DESC = 'iptv.ORF.at'
+    _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://iptv.orf.at/stories/2275236/',
+        'md5': 'c8b22af4718a4b4af58342529453e3e5',
+        'info_dict': {
+            'id': '350612',
+            'ext': 'flv',
+            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
+            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
+            'duration': 68.197,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20150425',
+        },
+    }
+
+    def _real_extract(self, url):
+        story_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://iptv.orf.at/stories/%s' % story_id, story_id)
+
+        video_id = self._search_regex(
+            r'data-video(?:id)?="(\d+)"', webpage, 'video id')
+
+        data = self._download_json(
+            'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
+            video_id)[0]
+
+        duration = float_or_none(data['duration'], 1000)
+
+        video = data['sources']['default']
+        load_balancer_url = video['loadBalancerUrl']
+        abr = int_or_none(video.get('audioBitrate'))
+        vbr = int_or_none(video.get('bitrate'))
+        fps = int_or_none(video.get('videoFps'))
+        width = int_or_none(video.get('videoWidth'))
+        height = int_or_none(video.get('videoHeight'))
+        thumbnail = video.get('preview')
+
+        rendition = self._download_json(
+            load_balancer_url, video_id, transform_source=strip_jsonp)
+
+        f = {
+            'abr': abr,
+            'vbr': vbr,
+            'fps': fps,
+            'width': width,
+            'height': height,
+        }
+
+        formats = []
+        for format_id, format_url in rendition['redirect'].items():
+            if format_id == 'rtmp':
+                ff = f.copy()
+                ff.update({
+                    'url': format_url,
+                    'format_id': format_id,
+                })
+                formats.append(ff)
+            elif determine_ext(format_url) == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    format_url, video_id, f4m_id=format_id))
+            elif determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', m3u8_id=format_id))
+            else:
+                continue
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at')
+        description = self._og_search_description(webpage)
+        upload_date = unified_strdate(self._html_search_meta(
+            'dc.date', webpage, 'upload date'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'formats': formats,
+        }
+
+
+class ORFFM4StoryIE(InfoExtractor):
+    IE_NAME = 'orf:fm4:story'
+    IE_DESC = 'fm4.orf.at stories'
+    _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://fm4.orf.at/stories/2865738/',
+        'playlist': [{
+            'md5': 'e1c2c706c45c7b34cf478bbf409907ca',
+            'info_dict': {
+                'id': '547792',
+                'ext': 'flv',
+                'title': 'Manu Delago und Inner Tongue live',
+                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
+                'duration': 1748.52,
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'upload_date': '20170913',
+            },
+        }, {
+            'md5': 'c6dd2179731f86f4f55a7b49899d515f',
+            'info_dict': {
+                'id': '547798',
+                'ext': 'flv',
+                'title': 'Manu Delago und Inner Tongue live (2)',
+                'duration': 1504.08,
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'upload_date': '20170913',
+                'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.',
+            },
+        }],
+    }
+
+    def _real_extract(self, url):
+        story_id = self._match_id(url)
+        webpage = self._download_webpage(url, story_id)
+
+        entries = []
+        all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage))
+        for idx, video_id in enumerate(all_ids):
+            data = self._download_json(
+                'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id,
+                video_id)[0]
+
+            duration = float_or_none(data['duration'], 1000)
+
+            video = data['sources']['q8c']
+            load_balancer_url = video['loadBalancerUrl']
+            abr = int_or_none(video.get('audioBitrate'))
+            vbr = int_or_none(video.get('bitrate'))
+            fps = int_or_none(video.get('videoFps'))
+            width = int_or_none(video.get('videoWidth'))
+            height = int_or_none(video.get('videoHeight'))
+            thumbnail = video.get('preview')
+
+            rendition = self._download_json(
+                load_balancer_url, video_id, transform_source=strip_jsonp)
+
+            f = {
+                'abr': abr,
+                'vbr': vbr,
+                'fps': fps,
+                'width': width,
+                'height': height,
+            }
+
+            formats = []
+            for format_id, format_url in rendition['redirect'].items():
+                if format_id == 'rtmp':
+                    ff = f.copy()
+                    ff.update({
+                        'url': format_url,
+                        'format_id': format_id,
+                    })
+                    formats.append(ff)
+                elif determine_ext(format_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        format_url, video_id, f4m_id=format_id))
+                elif determine_ext(format_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4', m3u8_id=format_id))
+                else:
+                    continue
+            self._sort_formats(formats)
+
+            title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at')
+            if idx >= 1:
+                # Titles are duplicates, make them unique
+                title += ' (' + str(idx + 1) + ')'
+            description = self._og_search_description(webpage)
+            upload_date = unified_strdate(self._html_search_meta(
+                'dc.date', webpage, 'upload date'))
+
+            entries.append({
+                'id': video_id,
+                'title': title,
+                'description': description,
+                'duration': duration,
+                'thumbnail': thumbnail,
+                'upload_date': upload_date,
+                'formats': formats,
+            })
+
+        return self.playlist_result(entries)
diff --git a/youtube_dl/extractor/outsidetv.py b/youtube_dl/extractor/outsidetv.py
new file mode 100644 (file)
index 0000000..c5333b0
--- /dev/null
@@ -0,0 +1,28 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class OutsideTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?outsidetv\.com/(?:[^/]+/)*?play/[a-zA-Z0-9]{8}/\d+/\d+/(?P<id>[a-zA-Z0-9]{8})'
+    _TESTS = [{
+        'url': 'http://www.outsidetv.com/category/snow/play/ZjQYboH6/1/10/Hdg0jukV/4',
+        'md5': '192d968fedc10b2f70ec31865ffba0da',
+        'info_dict': {
+            'id': 'Hdg0jukV',
+            'ext': 'mp4',
+            'title': 'Home - Jackson Ep 1 | Arbor Snowboards',
+            'description': 'md5:41a12e94f3db3ca253b04bb1e8d8f4cd',
+            'upload_date': '20181225',
+            'timestamp': 1545742800,
+        }
+    }, {
+        'url': 'http://www.outsidetv.com/home/play/ZjQYboH6/1/10/Hdg0jukV/4',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        jw_media_id = self._match_id(url)
+        return self.url_result(
+            'jwplatform:' + jw_media_id, 'JWPlatform', jw_media_id)
diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py
new file mode 100644 (file)
index 0000000..11ad3b3
--- /dev/null
@@ -0,0 +1,165 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    # compat_str,
+    compat_HTTPError,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    # remove_end,
+    str_or_none,
+    strip_or_none,
+    unified_timestamp,
+    # urljoin,
+)
+
+
+class PacktPubBaseIE(InfoExtractor):
+    # _PACKT_BASE = 'https://www.packtpub.com'
+    _STATIC_PRODUCTS_BASE = 'https://static.packt-cdn.com/products/'
+
+
+class PacktPubIE(PacktPubBaseIE):
+    _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>[^/]+)/(?P<id>[^/]+)(?:/(?P<display_id>[^/?&#]+))?'
+
+    _TESTS = [{
+        'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro',
+        'md5': '1e74bd6cfd45d7d07666f4684ef58f70',
+        'info_dict': {
+            'id': '20530',
+            'ext': 'mp4',
+            'title': 'Project Intro',
+            'thumbnail': r're:(?i)^https?://.*\.jpg',
+            'timestamp': 1490918400,
+            'upload_date': '20170331',
+        },
+    }, {
+        'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro',
+        'only_matching': True,
+    }, {
+        'url': 'https://subscription.packtpub.com/video/programming/9781838988906/p1/video1_1/business-card-project',
+        'only_matching': True,
+    }]
+    _NETRC_MACHINE = 'packtpub'
+    _TOKEN = None
+
+    def _real_initialize(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+        try:
+            self._TOKEN = self._download_json(
+                'https://services.packtpub.com/auth-v1/users/tokens', None,
+                'Downloading Authorization Token', data=json.dumps({
+                    'username': username,
+                    'password': password,
+                }).encode())['data']['access']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 404):
+                message = self._parse_json(e.cause.read().decode(), None)['message']
+                raise ExtractorError(message, expected=True)
+            raise
+
+    def _real_extract(self, url):
+        course_id, chapter_id, video_id, display_id = re.match(self._VALID_URL, url).groups()
+
+        headers = {}
+        if self._TOKEN:
+            headers['Authorization'] = 'Bearer ' + self._TOKEN
+        try:
+            video_url = self._download_json(
+                'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id,
+                'Downloading JSON video', headers=headers)['data']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                self.raise_login_required('This video is locked')
+            raise
+
+        # TODO: find a better way to avoid duplicating course requests
+        # metadata = self._download_json(
+        #     '%s/products/%s/chapters/%s/sections/%s/metadata'
+        #     % (self._MAPT_REST, course_id, chapter_id, video_id),
+        #     video_id)['data']
+
+        # title = metadata['pageTitle']
+        # course_title = metadata.get('title')
+        # if course_title:
+        #     title = remove_end(title, ' - %s' % course_title)
+        # timestamp = unified_timestamp(metadata.get('publicationDate'))
+        # thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath'))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': display_id or video_id,  # title,
+            # 'thumbnail': thumbnail,
+            # 'timestamp': timestamp,
+        }
+
+
+class PacktPubCourseIE(PacktPubBaseIE):
+    _VALID_URL = r'(?P<url>https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<id>\d+))'
+    _TESTS = [{
+        'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215',
+        'info_dict': {
+            'id': '9781787122215',
+            'title': 'Learn Nodejs by building 12 projects [Video]',
+            'description': 'md5:489da8d953f416e51927b60a1c7db0aa',
+        },
+        'playlist_count': 90,
+    }, {
+        'url': 'https://subscription.packtpub.com/video/web_development/9781787122215',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PacktPubIE.suitable(url) else super(
+            PacktPubCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        url, course_id = mobj.group('url', 'id')
+
+        course = self._download_json(
+            self._STATIC_PRODUCTS_BASE + '%s/toc' % course_id, course_id)
+        metadata = self._download_json(
+            self._STATIC_PRODUCTS_BASE + '%s/summary' % course_id,
+            course_id, fatal=False) or {}
+
+        entries = []
+        for chapter_num, chapter in enumerate(course['chapters'], 1):
+            chapter_id = str_or_none(chapter.get('id'))
+            sections = chapter.get('sections')
+            if not chapter_id or not isinstance(sections, list):
+                continue
+            chapter_info = {
+                'chapter': chapter.get('title'),
+                'chapter_number': chapter_num,
+                'chapter_id': chapter_id,
+            }
+            for section in sections:
+                section_id = str_or_none(section.get('id'))
+                if not section_id or section.get('contentType') != 'video':
+                    continue
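+                # url_transparent lets PacktPubIE resolve the actual stream while
+                # merging in the course-level metadata collected here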
+                entry = {
+                    '_type': 'url_transparent',
+                    'url': '/'.join([url, chapter_id, section_id]),
+                    'title': strip_or_none(section.get('title')),
+                    'description': clean_html(section.get('summary')),
+                    'thumbnail': metadata.get('coverImage'),
+                    'timestamp': unified_timestamp(metadata.get('publicationDate')),
+                    'ie_key': PacktPubIE.ie_key(),
+                }
+                entry.update(chapter_info)
+                entries.append(entry)
+
+        return self.playlist_result(
+            entries, course_id, metadata.get('title'),
+            clean_html(metadata.get('about')))
diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py
new file mode 100644 (file)
index 0000000..538738c
--- /dev/null
@@ -0,0 +1,135 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    parse_duration,
+    str_to_int,
+    urlencode_postdata,
+)
+
+
+class PandoraTVIE(InfoExtractor):
+    IE_NAME = 'pandora.tv'
+    IE_DESC = '판도라TV'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:www\.)?pandora\.tv/view/(?P<user_id>[^/]+)/(?P<id>\d+)|  # new format
+                                (?:.+?\.)?channel\.pandora\.tv/channel/video\.ptv\?|        # old format
+                                m\.pandora\.tv/?\?                                          # mobile
+                            )
+                    '''
+    _TESTS = [{
+        'url': 'http://jp.channel.pandora.tv/channel/video.ptv?c1=&prgid=53294230&ch_userid=mikakim&ref=main&lot=cate_01_2',
+        'info_dict': {
+            'id': '53294230',
+            'ext': 'flv',
+            'title': '頭を撫でてくれる?',
+            'description': '頭を撫でてくれる?',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 39,
+            'upload_date': '20151218',
+            'uploader': 'カワイイ動物まとめ',
+            'uploader_id': 'mikakim',
+            'view_count': int,
+            'like_count': int,
+        }
+    }, {
+        'url': 'http://channel.pandora.tv/channel/video.ptv?ch_userid=gogoucc&prgid=54721744',
+        'info_dict': {
+            'id': '54721744',
+            'ext': 'flv',
+            'title': '[HD] JAPAN COUNTDOWN 170423',
+            'description': '[HD] JAPAN COUNTDOWN 170423',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1704.9,
+            'upload_date': '20170423',
+            'uploader': 'GOGO_UCC',
+            'uploader_id': 'gogoucc',
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            # Test metadata only
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.pandora.tv/view/mikakim/53294230#36797454_new',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.pandora.tv/?c=view&ch_userid=mikakim&prgid=54600346',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group('user_id')
+        video_id = mobj.group('id')
+
+        if not user_id or not video_id:
+            qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+            video_id = qs.get('prgid', [None])[0]
+            user_id = qs.get('ch_userid', [None])[0]
+            if any(not f for f in (video_id, user_id,)):
+                raise ExtractorError('Invalid URL', expected=True)
+
+        data = self._download_json(
+            'http://m.pandora.tv/?c=view&m=viewJsonApi&ch_userid=%s&prgid=%s'
+            % (user_id, video_id), video_id)
+
+        info = data['data']['rows']['vod_play_info']['result']
+
+        formats = []
+        for format_id, format_url in info.items():
+            if not format_url:
+                continue
+            height = self._search_regex(
+                r'^v(\d+)[Uu]rl$', format_id, 'height', default=None)
+            if not height:
+                continue
+
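+            # Each vod_url has to be exchanged for the final playable URL via the play_url API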
+            play_url = self._download_json(
+                'http://m.pandora.tv/?c=api&m=play_url', video_id,
+                data=urlencode_postdata({
+                    'prgid': video_id,
+                    'runtime': info.get('runtime'),
+                    'vod_url': format_url,
+                }),
+                headers={
+                    'Origin': url,
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                })
+            format_url = play_url.get('url')
+            if not format_url:
+                continue
+
+            formats.append({
+                'format_id': '%sp' % height,
+                'url': format_url,
+                'height': int(height),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': info['subject'],
+            'description': info.get('body'),
+            'thumbnail': info.get('thumbnail') or info.get('poster'),
+            'duration': float_or_none(info.get('runtime'), 1000) or parse_duration(info.get('time')),
+            'upload_date': info['fid'].split('/')[-1][:8] if isinstance(info.get('fid'), compat_str) else None,
+            'uploader': info.get('nickname'),
+            'uploader_id': info.get('upload_userid'),
+            'view_count': str_to_int(info.get('hit')),
+            'like_count': str_to_int(info.get('likecnt')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py
new file mode 100644 (file)
index 0000000..bdd5ff5
--- /dev/null
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ParliamentLiveUKIE(InfoExtractor):
+    IE_NAME = 'parliamentlive.tv'
+    IE_DESC = 'UK parliament videos'
+    _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+    _TESTS = [{
+        'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b',
+        'info_dict': {
+            'id': '1_af9nv9ym',
+            'ext': 'mp4',
+            'title': 'Home Affairs Committee',
+            'uploader_id': 'FFMPEG-01',
+            'timestamp': 1422696664,
+            'upload_date': '20150131',
+        },
+    }, {
+        'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id)
+        widget_config = self._parse_json(self._search_regex(
+            r'(?s)kWidgetConfig\s*=\s*({.+});',
+            webpage, 'kaltura widget config'), video_id)
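+        # wid is of the form '_<partner_id>'; strip the leading underscore to
+        # build a kaltura:<partner_id>:<entry_id> URL for the Kaltura extractor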
+        kaltura_url = 'kaltura:%s:%s' % (
+            widget_config['wid'][1:], widget_config['entry_id'])
+        event_title = self._download_json(
+            'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title']
+        return {
+            '_type': 'url_transparent',
+            'title': event_title,
+            'description': '',
+            'url': kaltura_url,
+            'ie_key': 'Kaltura',
+        }
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
new file mode 100644 (file)
index 0000000..761a4b1
--- /dev/null
@@ -0,0 +1,156 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    determine_ext,
+    int_or_none,
+    KNOWN_EXTENSIONS,
+    mimetype2ext,
+    parse_iso8601,
+    str_or_none,
+    try_get,
+)
+
+
+class PatreonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.patreon.com/creation?hid=743933',
+        'md5': 'e25505eec1053a6e6813b8ed369875cc',
+        'info_dict': {
+            'id': '743933',
+            'ext': 'mp3',
+            'title': 'Episode 166: David Smalley of Dogma Debate',
+            'description': 'md5:713b08b772cd6271b9f3906683cfacdf',
+            'uploader': 'Cognitive Dissonance Podcast',
+            'thumbnail': 're:^https?://.*$',
+            'timestamp': 1406473987,
+            'upload_date': '20140727',
+            'uploader_id': '87145',
+        },
+    }, {
+        'url': 'http://www.patreon.com/creation?hid=754133',
+        'md5': '3eb09345bf44bf60451b8b0b81759d0a',
+        'info_dict': {
+            'id': '754133',
+            'ext': 'mp3',
+            'title': 'CD 167 Extra',
+            'uploader': 'Cognitive Dissonance Podcast',
+            'thumbnail': 're:^https?://.*$',
+        },
+        'skip': 'Patron-only content',
+    }, {
+        'url': 'https://www.patreon.com/creation?hid=1682498',
+        'info_dict': {
+            'id': 'SU4fj_aEMVw',
+            'ext': 'mp4',
+            'title': 'I\'m on Patreon!',
+            'uploader': 'TraciJHines',
+            'thumbnail': 're:^https?://.*$',
+            'upload_date': '20150211',
+            'description': 'md5:c5a706b1f687817a3de09db1eb93acd4',
+            'uploader_id': 'TraciJHines',
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.patreon.com/posts/episode-166-of-743933',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.patreon.com/posts/743933',
+        'only_matching': True,
+    }]
+
+    # Currently Patreon exposes the download URL via hidden CSS, so login is
+    # not needed. Keeping this commented out for when this inevitably changes.
+    '''
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'redirectUrl': 'http://www.patreon.com/',
+            'email': username,
+            'password': password,
+        }
+
+        request = sanitized_Request(
+            'https://www.patreon.com/processLogin',
+            compat_urllib_parse_urlencode(login_form).encode('utf-8')
+        )
+        login_page = self._download_webpage(request, None, note='Logging in')
+
+        if re.search(r'onLoginFailed', login_page):
+            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+    '''
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
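+        # Query Patreon's JSON:API posts endpoint, restricting fields and
+        # includes to what we actually use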
+        post = self._download_json(
+            'https://www.patreon.com/api/posts/' + video_id, video_id, query={
+                'fields[media]': 'download_url,mimetype,size_bytes',
+                'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title',
+                'fields[user]': 'full_name,url',
+                'json-api-use-default-includes': 'false',
+                'include': 'media,user',
+            })
+        attributes = post['data']['attributes']
+        title = attributes['title'].strip()
+        image = attributes.get('image') or {}
+        info = {
+            'id': video_id,
+            'title': title,
+            'description': clean_html(attributes.get('content')),
+            'thumbnail': image.get('large_url') or image.get('url'),
+            'timestamp': parse_iso8601(attributes.get('published_at')),
+            'like_count': int_or_none(attributes.get('like_count')),
+            'comment_count': int_or_none(attributes.get('comment_count')),
+        }
+
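+        # Related resources are inlined under 'included' (JSON:API); media
+        # entries carry the download URL, user entries the uploader info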
+        for i in post.get('included', []):
+            i_type = i.get('type')
+            if i_type == 'media':
+                media_attributes = i.get('attributes') or {}
+                download_url = media_attributes.get('download_url')
+                ext = mimetype2ext(media_attributes.get('mimetype'))
+                if download_url and ext in KNOWN_EXTENSIONS:
+                    info.update({
+                        'ext': ext,
+                        'filesize': int_or_none(media_attributes.get('size_bytes')),
+                        'url': download_url,
+                    })
+            elif i_type == 'user':
+                user_attributes = i.get('attributes')
+                if user_attributes:
+                    info.update({
+                        'uploader': user_attributes.get('full_name'),
+                        'uploader_id': str_or_none(i.get('id')),
+                        'uploader_url': user_attributes.get('url'),
+                    })
+
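+        # Fall back to an embedded player URL (e.g. a YouTube embed) when no
+        # direct media download was found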
+        if not info.get('url'):
+            embed_url = try_get(attributes, lambda x: x['embed']['url'])
+            if embed_url:
+                info.update({
+                    '_type': 'url',
+                    'url': embed_url,
+                })
+
+        if not info.get('url'):
+            post_file = attributes['post_file']
+            ext = determine_ext(post_file.get('name'))
+            if ext in KNOWN_EXTENSIONS:
+                info.update({
+                    'ext': ext,
+                    'url': post_file['url'],
+                })
+
+        return info
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
new file mode 100644 (file)
index 0000000..4dbe661
--- /dev/null
@@ -0,0 +1,710 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+    float_or_none,
+    js_to_json,
+    orderedSet,
+    strip_jsonp,
+    strip_or_none,
+    unified_strdate,
+    url_or_none,
+    US_RATINGS,
+)
+
+
+class PBSIE(InfoExtractor):
+    _STATIONS = (
+        (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'),  # http://www.pbs.org/
+        (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'),  # http://aptv.org/
+        (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'),  # http://www.gpb.org/
+        (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'),  # http://www.mpbonline.org
+        (r'video\.wnpt\.org', 'Nashville Public Television (WNPT)'),  # http://www.wnpt.org
+        (r'video\.wfsu\.org', 'WFSU-TV (WFSU)'),  # http://wfsu.org/
+        (r'video\.wsre\.org', 'WSRE (WSRE)'),  # http://www.wsre.org
+        (r'video\.wtcitv\.org', 'WTCI (WTCI)'),  # http://www.wtcitv.org
+        (r'video\.pba\.org', 'WPBA/Channel 30 (WPBA)'),  # http://pba.org/
+        (r'video\.alaskapublic\.org', 'Alaska Public Media (KAKM)'),  # http://alaskapublic.org/kakm
+        # (r'kuac\.org', 'KUAC (KUAC)'),  # http://kuac.org/kuac-tv/
+        # (r'ktoo\.org', '360 North (KTOO)'),  # http://www.ktoo.org/
+        # (r'azpm\.org', 'KUAT 6 (KUAT)'),  # http://www.azpm.org/
+        (r'video\.azpbs\.org', 'Arizona PBS (KAET)'),  # http://www.azpbs.org
+        (r'portal\.knme\.org', 'KNME-TV/Channel 5 (KNME)'),  # http://www.newmexicopbs.org/
+        (r'video\.vegaspbs\.org', 'Vegas PBS (KLVX)'),  # http://vegaspbs.org/
+        (r'watch\.aetn\.org', 'AETN/ARKANSAS ETV NETWORK (KETS)'),  # http://www.aetn.org/
+        (r'video\.ket\.org', 'KET (WKLE)'),  # http://www.ket.org/
+        (r'video\.wkno\.org', 'WKNO/Channel 10 (WKNO)'),  # http://www.wkno.org/
+        (r'video\.lpb\.org', 'LPB/LOUISIANA PUBLIC BROADCASTING (WLPB)'),  # http://www.lpb.org/
+        (r'videos\.oeta\.tv', 'OETA (KETA)'),  # http://www.oeta.tv
+        (r'video\.optv\.org', 'Ozarks Public Television (KOZK)'),  # http://www.optv.org/
+        (r'watch\.wsiu\.org', 'WSIU Public Broadcasting (WSIU)'),  # http://www.wsiu.org/
+        (r'video\.keet\.org', 'KEET TV (KEET)'),  # http://www.keet.org
+        (r'pbs\.kixe\.org', 'KIXE/Channel 9 (KIXE)'),  # http://kixe.org/
+        (r'video\.kpbs\.org', 'KPBS San Diego (KPBS)'),  # http://www.kpbs.org/
+        (r'video\.kqed\.org', 'KQED (KQED)'),  # http://www.kqed.org
+        (r'vids\.kvie\.org', 'KVIE Public Television (KVIE)'),  # http://www.kvie.org
+        (r'video\.pbssocal\.org', 'PBS SoCal/KOCE (KOCE)'),  # http://www.pbssocal.org/
+        (r'video\.valleypbs\.org', 'ValleyPBS (KVPT)'),  # http://www.valleypbs.org/
+        (r'video\.cptv\.org', 'CONNECTICUT PUBLIC TELEVISION (WEDH)'),  # http://cptv.org
+        (r'watch\.knpb\.org', 'KNPB Channel 5 (KNPB)'),  # http://www.knpb.org/
+        (r'video\.soptv\.org', 'SOPTV (KSYS)'),  # http://www.soptv.org
+        # (r'klcs\.org', 'KLCS/Channel 58 (KLCS)'),  # http://www.klcs.org
+        # (r'krcb\.org', 'KRCB Television & Radio (KRCB)'),  # http://www.krcb.org
+        # (r'kvcr\.org', 'KVCR TV/DT/FM :: Vision for the Future (KVCR)'),  # http://kvcr.org
+        (r'video\.rmpbs\.org', 'Rocky Mountain PBS (KRMA)'),  # http://www.rmpbs.org
+        (r'video\.kenw\.org', 'KENW-TV3 (KENW)'),  # http://www.kenw.org
+        (r'video\.kued\.org', 'KUED Channel 7 (KUED)'),  # http://www.kued.org
+        (r'video\.wyomingpbs\.org', 'Wyoming PBS (KCWC)'),  # http://www.wyomingpbs.org
+        (r'video\.cpt12\.org', 'Colorado Public Television / KBDI 12 (KBDI)'),  # http://www.cpt12.org/
+        (r'video\.kbyueleven\.org', 'KBYU-TV (KBYU)'),  # http://www.kbyutv.org/
+        (r'video\.thirteen\.org', 'Thirteen/WNET New York (WNET)'),  # http://www.thirteen.org
+        (r'video\.wgbh\.org', 'WGBH/Channel 2 (WGBH)'),  # http://wgbh.org
+        (r'video\.wgby\.org', 'WGBY (WGBY)'),  # http://www.wgby.org
+        (r'watch\.njtvonline\.org', 'NJTV Public Media NJ (WNJT)'),  # http://www.njtvonline.org/
+        # (r'ripbs\.org', 'Rhode Island PBS (WSBE)'),  # http://www.ripbs.org/home/
+        (r'watch\.wliw\.org', 'WLIW21 (WLIW)'),  # http://www.wliw.org/
+        (r'video\.mpt\.tv', 'mpt/Maryland Public Television (WMPB)'),  # http://www.mpt.org
+        (r'watch\.weta\.org', 'WETA Television and Radio (WETA)'),  # http://www.weta.org
+        (r'video\.whyy\.org', 'WHYY (WHYY)'),  # http://www.whyy.org
+        (r'video\.wlvt\.org', 'PBS 39 (WLVT)'),  # http://www.wlvt.org/
+        (r'video\.wvpt\.net', 'WVPT - Your Source for PBS and More! (WVPT)'),  # http://www.wvpt.net
+        (r'video\.whut\.org', 'Howard University Television (WHUT)'),  # http://www.whut.org
+        (r'video\.wedu\.org', 'WEDU PBS (WEDU)'),  # http://www.wedu.org
+        (r'video\.wgcu\.org', 'WGCU Public Media (WGCU)'),  # http://www.wgcu.org/
+        # (r'wjct\.org', 'WJCT Public Broadcasting (WJCT)'),  # http://www.wjct.org
+        (r'video\.wpbt2\.org', 'WPBT2 (WPBT)'),  # http://www.wpbt2.org
+        (r'video\.wucftv\.org', 'WUCF TV (WUCF)'),  # http://wucftv.org
+        (r'video\.wuft\.org', 'WUFT/Channel 5 (WUFT)'),  # http://www.wuft.org
+        (r'watch\.wxel\.org', 'WXEL/Channel 42 (WXEL)'),  # http://www.wxel.org/home/
+        (r'video\.wlrn\.org', 'WLRN/Channel 17 (WLRN)'),  # http://www.wlrn.org/
+        (r'video\.wusf\.usf\.edu', 'WUSF Public Broadcasting (WUSF)'),  # http://wusf.org/
+        (r'video\.scetv\.org', 'ETV (WRLK)'),  # http://www.scetv.org
+        (r'video\.unctv\.org', 'UNC-TV (WUNC)'),  # http://www.unctv.org/
+        # (r'pbsguam\.org', 'PBS Guam (KGTF)'),  # http://www.pbsguam.org/
+        (r'video\.pbshawaii\.org', 'PBS Hawaii - Oceanic Cable Channel 10 (KHET)'),  # http://www.pbshawaii.org/
+        (r'video\.idahoptv\.org', 'Idaho Public Television (KAID)'),  # http://idahoptv.org
+        (r'video\.ksps\.org', 'KSPS (KSPS)'),  # http://www.ksps.org/home/
+        (r'watch\.opb\.org', 'OPB (KOPB)'),  # http://www.opb.org
+        (r'watch\.nwptv\.org', 'KWSU/Channel 10 & KTNW/Channel 31 (KWSU)'),  # http://www.kwsu.org
+        (r'video\.will\.illinois\.edu', 'WILL-TV (WILL)'),  # http://will.illinois.edu/
+        (r'video\.networkknowledge\.tv', 'Network Knowledge - WSEC/Springfield (WSEC)'),  # http://www.wsec.tv
+        (r'video\.wttw\.com', 'WTTW11 (WTTW)'),  # http://www.wttw.com/
+        # (r'wtvp\.org', 'WTVP & WTVP.org, Public Media for Central Illinois (WTVP)'),  # http://www.wtvp.org/
+        (r'video\.iptv\.org', 'Iowa Public Television/IPTV (KDIN)'),  # http://www.iptv.org/
+        (r'video\.ninenet\.org', 'Nine Network (KETC)'),  # http://www.ninenet.org
+        (r'video\.wfwa\.org', 'PBS39 Fort Wayne (WFWA)'),  # http://wfwa.org/
+        (r'video\.wfyi\.org', 'WFYI Indianapolis (WFYI)'),  # http://www.wfyi.org
+        (r'video\.mptv\.org', 'Milwaukee Public Television (WMVS)'),  # http://www.mptv.org
+        (r'video\.wnin\.org', 'WNIN (WNIN)'),  # http://www.wnin.org/
+        (r'video\.wnit\.org', 'WNIT Public Television (WNIT)'),  # http://www.wnit.org/
+        (r'video\.wpt\.org', 'WPT (WPNE)'),  # http://www.wpt.org/
+        (r'video\.wvut\.org', 'WVUT/Channel 22 (WVUT)'),  # http://wvut.org/
+        (r'video\.weiu\.net', 'WEIU/Channel 51 (WEIU)'),  # http://www.weiu.net
+        (r'video\.wqpt\.org', 'WQPT-TV (WQPT)'),  # http://www.wqpt.org
+        (r'video\.wycc\.org', 'WYCC PBS Chicago (WYCC)'),  # http://www.wycc.org
+        # (r'lakeshorepublicmedia\.org', 'Lakeshore Public Television (WYIN)'),  # http://lakeshorepublicmedia.org/
+        (r'video\.wipb\.org', 'WIPB-TV (WIPB)'),  # http://wipb.org
+        (r'video\.indianapublicmedia\.org', 'WTIU (WTIU)'),  # http://indianapublicmedia.org/tv/
+        (r'watch\.cetconnect\.org', 'CET (WCET)'),  # http://www.cetconnect.org
+        (r'video\.thinktv\.org', 'ThinkTVNetwork (WPTD)'),  # http://www.thinktv.org
+        (r'video\.wbgu\.org', 'WBGU-TV (WBGU)'),  # http://wbgu.org
+        (r'video\.wgvu\.org', 'WGVU TV (WGVU)'),  # http://www.wgvu.org/
+        (r'video\.netnebraska\.org', 'NET1 (KUON)'),  # http://netnebraska.org
+        (r'video\.pioneer\.org', 'Pioneer Public Television (KWCM)'),  # http://www.pioneer.org
+        (r'watch\.sdpb\.org', 'SDPB Television (KUSD)'),  # http://www.sdpb.org
+        (r'video\.tpt\.org', 'TPT (KTCA)'),  # http://www.tpt.org
+        (r'watch\.ksmq\.org', 'KSMQ (KSMQ)'),  # http://www.ksmq.org/
+        (r'watch\.kpts\.org', 'KPTS/Channel 8 (KPTS)'),  # http://www.kpts.org/
+        (r'watch\.ktwu\.org', 'KTWU/Channel 11 (KTWU)'),  # http://ktwu.org
+        # (r'shptv\.org', 'Smoky Hills Public Television (KOOD)'),  # http://www.shptv.org
+        # (r'kcpt\.org', 'KCPT Kansas City Public Television (KCPT)'),  # http://kcpt.org/
+        # (r'blueridgepbs\.org', 'Blue Ridge PBS (WBRA)'),  # http://www.blueridgepbs.org/
+        (r'watch\.easttennesseepbs\.org', 'East Tennessee PBS (WSJK)'),  # http://easttennesseepbs.org
+        (r'video\.wcte\.tv', 'WCTE-TV (WCTE)'),  # http://www.wcte.org
+        (r'video\.wljt\.org', 'WLJT, Channel 11 (WLJT)'),  # http://wljt.org/
+        (r'video\.wosu\.org', 'WOSU TV (WOSU)'),  # http://wosu.org/
+        (r'video\.woub\.org', 'WOUB/WOUC (WOUB)'),  # http://woub.org/tv/index.php?section=5
+        (r'video\.wvpublic\.org', 'WVPB (WVPB)'),  # http://wvpublic.org/
+        (r'video\.wkyupbs\.org', 'WKYU-PBS (WKYU)'),  # http://www.wkyupbs.org
+        # (r'wyes\.org', 'WYES-TV/New Orleans (WYES)'),  # http://www.wyes.org
+        (r'video\.kera\.org', 'KERA 13 (KERA)'),  # http://www.kera.org/
+        (r'video\.mpbn\.net', 'MPBN (WCBB)'),  # http://www.mpbn.net/
+        (r'video\.mountainlake\.org', 'Mountain Lake PBS (WCFE)'),  # http://www.mountainlake.org/
+        (r'video\.nhptv\.org', 'NHPTV (WENH)'),  # http://nhptv.org/
+        (r'video\.vpt\.org', 'Vermont PBS (WETK)'),  # http://www.vpt.org
+        (r'video\.witf\.org', 'witf (WITF)'),  # http://www.witf.org
+        (r'watch\.wqed\.org', 'WQED Multimedia (WQED)'),  # http://www.wqed.org/
+        (r'video\.wmht\.org', 'WMHT Educational Telecommunications (WMHT)'),  # http://www.wmht.org/home/
+        (r'video\.deltabroadcasting\.org', 'Q-TV (WDCQ)'),  # http://www.deltabroadcasting.org
+        (r'video\.dptv\.org', 'WTVS Detroit Public TV (WTVS)'),  # http://www.dptv.org/
+        (r'video\.wcmu\.org', 'CMU Public Television (WCMU)'),  # http://www.wcmu.org
+        (r'video\.wkar\.org', 'WKAR-TV (WKAR)'),  # http://wkar.org/
+        (r'wnmuvideo\.nmu\.edu', 'WNMU-TV Public TV 13 (WNMU)'),  # http://wnmutv.nmu.edu
+        (r'video\.wdse\.org', 'WDSE - WRPT (WDSE)'),  # http://www.wdse.org/
+        (r'video\.wgte\.org', 'WGTE TV (WGTE)'),  # http://www.wgte.org
+        (r'video\.lptv\.org', 'Lakeland Public Television (KAWE)'),  # http://www.lakelandptv.org
+        # (r'prairiepublic\.org', 'PRAIRIE PUBLIC (KFME)'),  # http://www.prairiepublic.org/
+        (r'video\.kmos\.org', 'KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS)'),  # http://www.kmos.org/
+        (r'watch\.montanapbs\.org', 'MontanaPBS (KUSM)'),  # http://montanapbs.org
+        (r'video\.krwg\.org', 'KRWG/Channel 22 (KRWG)'),  # http://www.krwg.org
+        (r'video\.kacvtv\.org', 'KACV (KACV)'),  # http://www.panhandlepbs.org/home/
+        (r'video\.kcostv\.org', 'KCOS/Channel 13 (KCOS)'),  # http://www.kcostv.org
+        (r'video\.wcny\.org', 'WCNY/Channel 24 (WCNY)'),  # http://www.wcny.org
+        (r'video\.wned\.org', 'WNED (WNED)'),  # http://www.wned.org/
+        (r'watch\.wpbstv\.org', 'WPBS (WPBS)'),  # http://www.wpbstv.org
+        (r'video\.wskg\.org', 'WSKG Public TV (WSKG)'),  # http://wskg.org
+        (r'video\.wxxi\.org', 'WXXI (WXXI)'),  # http://wxxi.org
+        (r'video\.wpsu\.org', 'WPSU (WPSU)'),  # http://www.wpsu.org
+        # (r'wqln\.org', 'WQLN/Channel 54 (WQLN)'),  # http://www.wqln.org
+        (r'on-demand\.wvia\.org', 'WVIA Public Media Studios (WVIA)'),  # http://www.wvia.org/
+        (r'video\.wtvi\.org', 'WTVI (WTVI)'),  # http://www.wtvi.org/
+        # (r'whro\.org', 'WHRO (WHRO)'),  # http://whro.org
+        (r'video\.westernreservepublicmedia\.org', 'Western Reserve PBS (WNEO)'),  # http://www.WesternReservePublicMedia.org/
+        (r'video\.ideastream\.org', 'WVIZ/PBS ideastream (WVIZ)'),  # http://www.wviz.org/
+        (r'video\.kcts9\.org', 'KCTS 9 (KCTS)'),  # http://kcts9.org/
+        (r'video\.basinpbs\.org', 'Basin PBS (KPBT)'),  # http://www.basinpbs.org
+        (r'video\.houstonpbs\.org', 'KUHT / Channel 8 (KUHT)'),  # http://www.houstonpublicmedia.org/
+        # (r'tamu\.edu', 'KAMU - TV (KAMU)'),  # http://KAMU.tamu.edu
+        # (r'kedt\.org', 'KEDT/Channel 16 (KEDT)'),  # http://www.kedt.org
+        (r'video\.klrn\.org', 'KLRN (KLRN)'),  # http://www.klrn.org
+        (r'video\.klru\.tv', 'KLRU (KLRU)'),  # http://www.klru.org
+        # (r'kmbh\.org', 'KMBH-TV (KMBH)'),  # http://www.kmbh.org
+        # (r'knct\.org', 'KNCT (KNCT)'),  # http://www.knct.org
+        # (r'ktxt\.org', 'KTTZ-TV (KTXT)'),  # http://www.ktxt.org
+        (r'video\.wtjx\.org', 'WTJX Channel 12 (WTJX)'),  # http://www.wtjx.org/
+        (r'video\.ideastations\.org', 'WCVE PBS (WCVE)'),  # http://ideastations.org/
+        (r'video\.kbtc\.org', 'KBTC Public Television (KBTC)'),  # http://kbtc.org
+    )
+
+    IE_NAME = 'pbs'
+    IE_DESC = 'Public Broadcasting Service (PBS) and member stations: %s' % ', '.join(list(zip(*_STATIONS))[1])
+
+    _VALID_URL = r'''(?x)https?://
+        (?:
+           # Direct video URL
+           (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
+           # Article with embedded player (or direct video)
+           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+           # Player
+           (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
+        )
+    ''' % '|'.join(list(zip(*_STATIONS))[0])
+
+    _GEO_COUNTRIES = ['US']
+
+    _TESTS = [
+        {
+            'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
+            'md5': '173dc391afd361fa72eab5d3d918968d',
+            'info_dict': {
+                'id': '2365006249',
+                'ext': 'mp4',
+                'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
+                'description': 'md5:31b664af3c65fd07fa460d306b837d00',
+                'duration': 3190,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
+            'md5': '6f722cb3c3982186d34b0f13374499c7',
+            'info_dict': {
+                'id': '2365297690',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - Losing Iraq',
+                'description': 'md5:5979a4d069b157f622d02bff62fbe654',
+                'duration': 5050,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
+            'md5': 'b19856d7f5351b17a5ab1dc6a64be633',
+            'info_dict': {
+                'id': '2201174722',
+                'ext': 'mp4',
+                'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
+                'description': 'md5:86ab9a3d04458b876147b355788b8781',
+                'duration': 801,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/',
+            'md5': 'c62859342be2a0358d6c9eb306595978',
+            'info_dict': {
+                'id': '2365297708',
+                'ext': 'mp4',
+                'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full',
+                'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b',
+                'duration': 6559,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
+            'md5': '908f3e5473a693b266b84e25e1cf9703',
+            'info_dict': {
+                'id': '2365160389',
+                'display_id': 'killer-typhoon',
+                'ext': 'mp4',
+                'description': 'md5:c741d14e979fc53228c575894094f157',
+                'title': 'NOVA - Killer Typhoon',
+                'duration': 3172,
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'upload_date': '20140122',
+                'age_limit': 10,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
+            'info_dict': {
+                'id': 'united-states-of-secrets',
+            },
+            'playlist_count': 2,
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+            'info_dict': {
+                'id': 'great-war',
+            },
+            'playlist_count': 3,
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
+            'info_dict': {
+                'id': '2276541483',
+                'display_id': 'player',
+                'ext': 'mp4',
+                'title': 'American Experience - Death and the Civil War, Chapter 1',
+                'description': 'md5:67fa89a9402e2ee7d08f53b920674c18',
+                'duration': 682,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/video/2365245528/',
+            'md5': '115223d41bd55cda8ae5cd5ed4e11497',
+            'info_dict': {
+                'id': '2365245528',
+                'display_id': '2365245528',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - United States of Secrets (Part One)',
+                'description': 'md5:55756bd5c551519cc4b7703e373e217e',
+                'duration': 6851,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            # Video embedded in an iframe containing angle brackets in an attribute's
+            # value (e.g. "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...",
+            # see https://github.com/ytdl-org/youtube-dl/issues/7059)
+            'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+            'md5': '59b0ef5009f9ac8a319cc5efebcd865e',
+            'info_dict': {
+                'id': '2365546844',
+                'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
+                'ext': 'mp4',
+                'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
+                'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
+                'duration': 1480,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            # Frontline video embedded via flp2012.js
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists',
+            'info_dict': {
+                'id': '2070868960',
+                'display_id': 'the-atomic-artists',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - The Atomic Artists',
+                'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
+                'duration': 723,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
+        },
+        {
+            # Serves HD only via the widget/partnerplayer page
+            'url': 'http://www.pbs.org/video/2365641075/',
+            'md5': 'fdf907851eab57211dd589cf12006666',
+            'info_dict': {
+                'id': '2365641075',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - Netanyahu at War',
+                'duration': 6852,
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'formats': 'mincount:8',
+            },
+        },
+        {
+            # https://github.com/ytdl-org/youtube-dl/issues/13801
+            'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+            'info_dict': {
+                'id': '3003333873',
+                'ext': 'mp4',
+                'title': 'PBS NewsHour - full episode July 31, 2017',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 3265,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/',
+            'info_dict': {
+                'id': '2365936247',
+                'ext': 'mp4',
+                'title': 'Antiques Roadshow - Indianapolis, Hour 2',
+                'description': 'md5:524b32249db55663e7231b6b8d1671a2',
+                'duration': 3180,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
+            'info_dict': {
+                'id': '3007193718',
+                'ext': 'mp4',
+                'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
+                'description': 'md5:37efbac85e0c09b009586523ec143652',
+                'duration': 6292,
+                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
+            'info_dict': {
+                'id': '3011407934',
+                'ext': 'mp4',
+                'title': 'Stories from the Stage - Road Trip',
+                'duration': 1619,
+                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://watch.knpb.org/video/2365616055/',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+            'only_matching': True,
+        }
+    ]
+    _ERRORS = {
+        101: 'We\'re sorry, but this video is not yet available.',
+        403: 'We\'re sorry, but this video is not available in your region due to rights restrictions.',
+        404: 'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.',
+        410: 'This video has expired and is no longer available for online streaming.',
+    }
+
+    def _real_initialize(self):
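+        # Ask PBS's localization service for our member station and persist it
+        # as the pbsol.station cookie, which some videos appear to require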
+        cookie = (self._download_json(
+            'http://localization.services.pbs.org/localize/auto/cookie/',
+            None, headers=self.geo_verification_headers(), fatal=False) or {}).get('cookie')
+        if cookie:
+            station = self._search_regex(r'#?s=\["([^"]+)"', cookie, 'station')
+            if station:
+                self._set_cookie('.pbs.org', 'pbsol.station', station)
+
+    def _extract_webpage(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        description = None
+
+        presumptive_id = mobj.group('presumptive_id')
+        display_id = presumptive_id
+        if presumptive_id:
+            webpage = self._download_webpage(url, display_id)
+
+            description = strip_or_none(self._og_search_description(
+                webpage, default=None) or self._html_search_meta(
+                'description', webpage, default=None))
+            upload_date = unified_strdate(self._search_regex(
+                r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
+                webpage, 'upload date', default=None))
+
+            # tabbed frontline videos
+            MULTI_PART_REGEXES = (
+                r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
+                r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
+            )
+            for p in MULTI_PART_REGEXES:
+                tabbed_videos = orderedSet(re.findall(p, webpage))
+                if tabbed_videos:
+                    return tabbed_videos, presumptive_id, upload_date, description
+
+            MEDIA_ID_REGEXES = [
+                r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
+                r'class="coveplayerid">([^<]+)<',                       # coveplayer
+                r'<section[^>]+data-coveid="(\d+)"',                    # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
+                r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer
+                r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
+                r'<div[^>]+\bdata-cove-id=["\'](\d+)"',  # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
+                r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)',  # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
+            ]
+
+            media_id = self._search_regex(
+                MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
+            if media_id:
+                return media_id, presumptive_id, upload_date, description
+
+            # Frontline video embedded via flp
+            video_id = self._search_regex(
+                r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
+            if video_id:
+                # prg_id calculation is reverse engineered from
+                # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js
+                prg_id = self._search_regex(
+                    r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:]
+                if 'q' in prg_id:
+                    prg_id = prg_id.split('q')[1]
+                prg_id = int(prg_id, 16)
+                getdir = self._download_json(
+                    'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
+                    presumptive_id, 'Downloading getdir JSON',
+                    transform_source=strip_jsonp)
+                return getdir['mid'], presumptive_id, upload_date, description
+
+            for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
+                url = self._search_regex(
+                    r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe,
+                    'player URL', default=None, group='url')
+                if url:
+                    break
+
+            if not url:
+                url = self._og_search_url(webpage)
+
+            mobj = re.match(
+                self._VALID_URL, self._proto_relative_url(url.strip()))
+
+        player_id = mobj.group('player_id')
+        if not display_id:
+            display_id = player_id
+        if player_id:
+            player_page = self._download_webpage(
+                url, display_id, note='Downloading player page',
+                errnote='Could not download player page')
+            video_id = self._search_regex(
+                r'<div\s+id=["\']video_(\d+)', player_page, 'video ID',
+                default=None)
+            if not video_id:
+                video_info = self._extract_video_data(
+                    player_page, 'video data', display_id)
+                video_id = compat_str(
+                    video_info.get('id') or video_info['contentID'])
+        else:
+            video_id = mobj.group('id')
+            display_id = video_id
+
+        return video_id, display_id, None, description
+
+    def _extract_video_data(self, string, name, video_id, fatal=True):
+        return self._parse_json(
+            self._search_regex(
+                [r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+                 r'window\.videoBridge\s*=\s*({.+?});'],
+                string, name, default='{}'),
+            video_id, transform_source=js_to_json, fatal=fatal)
+
+    def _real_extract(self, url):
+        video_id, display_id, upload_date, description = self._extract_webpage(url)
+
+        if isinstance(video_id, list):
+            entries = [self.url_result(
+                'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id)
+                for vid_id in video_id]
+            return self.playlist_result(entries, display_id)
+
+        info = None
+        redirects = []
+        redirect_urls = set()
+
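+        # Collect candidate stream redirect URLs from the recommended and
+        # alternate encodings (plus any plain 'encodings' list), de-duplicated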
+        def extract_redirect_urls(info):
+            for encoding_name in ('recommended_encoding', 'alternate_encoding'):
+                redirect = info.get(encoding_name)
+                if not redirect:
+                    continue
+                redirect_url = redirect.get('url')
+                if redirect_url and redirect_url not in redirect_urls:
+                    redirects.append(redirect)
+                    redirect_urls.add(redirect_url)
+            encodings = info.get('encodings')
+            if isinstance(encodings, list):
+                for encoding in encodings:
+                    encoding_url = url_or_none(encoding)
+                    if encoding_url and encoding_url not in redirect_urls:
+                        redirects.append({'url': encoding_url})
+                        redirect_urls.add(encoding_url)
+
+        chapters = []
+        # Player pages may also serve different qualities
+        for page in ('widget/partnerplayer', 'portalplayer'):
+            player = self._download_webpage(
+                'http://player.pbs.org/%s/%s' % (page, video_id),
+                display_id, 'Downloading %s page' % page, fatal=False)
+            if player:
+                video_info = self._extract_video_data(
+                    player, '%s video data' % page, display_id, fatal=False)
+                if video_info:
+                    extract_redirect_urls(video_info)
+                    if not info:
+                        info = video_info
+                if not chapters:
+                    # video_info may be None if the embedded JSON failed to parse
+                    raw_chapters = (video_info or {}).get('chapters') or []
+                    if not raw_chapters:
+                        for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+                            chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+                            if not chapter:
+                                continue
+                            raw_chapters.append(chapter)
+                    for chapter in raw_chapters:
+                        start_time = float_or_none(chapter.get('start_time'), 1000)
+                        duration = float_or_none(chapter.get('duration'), 1000)
+                        if start_time is None or duration is None:
+                            continue
+                        chapters.append({
+                            'start_time': start_time,
+                            'end_time': start_time + duration,
+                            'title': chapter.get('title'),
+                        })
+
+        formats = []
+        http_url = None
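+        # Resolve each redirect to the actual stream URL; API errors map to
+        # the human-readable messages in _ERRORS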
+        for num, redirect in enumerate(redirects):
+            redirect_id = redirect.get('eeid')
+
+            redirect_info = self._download_json(
+                '%s?format=json' % redirect['url'], display_id,
+                'Downloading %s video url info' % (redirect_id or num),
+                headers=self.geo_verification_headers())
+
+            if redirect_info['status'] == 'error':
+                message = self._ERRORS.get(
+                    redirect_info['http_code'], redirect_info['message'])
+                if redirect_info['http_code'] == 403:
+                    self.raise_geo_restricted(
+                        msg=message, countries=self._GEO_COUNTRIES)
+                raise ExtractorError(
+                    '%s said: %s' % (self.IE_NAME, message), expected=True)
+
+            format_url = redirect_info.get('url')
+            if not format_url:
+                continue
+
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': redirect_id,
+                })
+                if re.search(r'^https?://.*(?:\d+k|baseline)', format_url):
+                    http_url = format_url
+        self._remove_duplicate_formats(formats)
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None)
+                # Lower qualities (150k and 192k) are not available as HTTP formats
+                # (see [1]), so we don't try extracting them.
+                # Since summer 2016, higher quality formats (4500k and 6500k) are also
+                # available, albeit not documented in [2].
+                # 1. https://github.com/ytdl-org/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656
+                # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
+                if not bitrate or int(bitrate) < 400:
+                    continue
+                f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url)
+                # This may produce invalid links sometimes (e.g.
+                # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
+                if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate):
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': f_url,
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        # info stays None if neither player page yielded usable video data
+        if not info:
+            raise ExtractorError('Unable to extract video info')
+
+        rating_str = info.get('rating')
+        if rating_str is not None:
+            rating_str = rating_str.rpartition('-')[2]
+        age_limit = US_RATINGS.get(rating_str)
+
+        subtitles = {}
+        closed_captions_url = info.get('closed_captions_url')
+        if closed_captions_url:
+            subtitles['en'] = [{
+                'ext': 'ttml',
+                'url': closed_captions_url,
+            }]
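+            # Sibling .srt and .vtt tracks appear to live next to the TTML
+            # one, with caption IDs offset by 1 and 2 respectively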
+            mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url)
+            if mobj:
+                ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1)
+                ttml_caption_id = int(ttml_caption_id)
+                subtitles['en'].extend([{
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
+                    'ext': 'srt',
+                }, {
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
+                    'ext': 'vtt',
+                }])
+
+        # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc.)
+        # Try turning it into a 'program - title' naming scheme if possible
+        alt_title = info.get('program', {}).get('title')
+        if alt_title:
+            info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title'])
+
+        description = info.get('description') or info.get(
+            'program', {}).get('description') or description
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': info['title'],
+            'description': description,
+            'thumbnail': info.get('image_url'),
+            'duration': int_or_none(info.get('duration')),
+            'age_limit': age_limit,
+            'upload_date': upload_date,
+            'formats': formats,
+            'subtitles': subtitles,
+            'chapters': chapters,
+        }
diff --git a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py
new file mode 100644 (file)
index 0000000..1d77722
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    unified_timestamp,
+)
+
+
+class PearVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.pearvideo.com/video_1076290',
+        'info_dict': {
+            'id': '1076290',
+            'ext': 'mp4',
+            'title': '小浣熊在主人家玻璃上滚石头:没砸',
+            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
+            'timestamp': 1494275280,
+            'upload_date': '20170508',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
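+        # qualities() returns a ranking function: format ids listed later
+        # ('src') outrank earlier ones ('ldflv')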
+        quality = qualities(
+            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))
+
+        formats = [{
+            'url': mobj.group('url'),
+            'format_id': mobj.group('id'),
+            'quality': quality(mobj.group('id')),
+        } for mobj in re.finditer(
+            r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
+            webpage)]
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='value')
+        description = self._search_regex(
+            (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'description', default=None,
+            group='value') or self._html_search_meta('Description', webpage)
+        timestamp = unified_timestamp(self._search_regex(
+            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
+            webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py
new file mode 100644 (file)
index 0000000..48fb954
--- /dev/null
@@ -0,0 +1,600 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    parse_resolution,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    url_or_none,
+    urljoin,
+)
+
+
+class PeerTubeIE(InfoExtractor):
+    _INSTANCES_RE = r'''(?:
+                            # Taken from https://instances.joinpeertube.org/instances
+                            peertube\.rainbowswingers\.net|
+                            tube\.stanisic\.nl|
+                            peer\.suiri\.us|
+                            medias\.libox\.fr|
+                            videomensoif\.ynh\.fr|
+                            peertube\.travelpandas\.eu|
+                            peertube\.rachetjay\.fr|
+                            peertube\.montecsys\.fr|
+                            tube\.eskuero\.me|
+                            peer\.tube|
+                            peertube\.umeahackerspace\.se|
+                            tube\.nx-pod\.de|
+                            video\.monsieurbidouille\.fr|
+                            tube\.openalgeria\.org|
+                            vid\.lelux\.fi|
+                            video\.anormallostpod\.ovh|
+                            tube\.crapaud-fou\.org|
+                            peertube\.stemy\.me|
+                            lostpod\.space|
+                            exode\.me|
+                            peertube\.snargol\.com|
+                            vis\.ion\.ovh|
+                            videosdulib\.re|
+                            v\.mbius\.io|
+                            videos\.judrey\.eu|
+                            peertube\.osureplayviewer\.xyz|
+                            peertube\.mathieufamily\.ovh|
+                            www\.videos-libr\.es|
+                            fightforinfo\.com|
+                            peertube\.fediverse\.ru|
+                            peertube\.oiseauroch\.fr|
+                            video\.nesven\.eu|
+                            v\.bearvideo\.win|
+                            video\.qoto\.org|
+                            justporn\.cc|
+                            video\.vny\.fr|
+                            peervideo\.club|
+                            tube\.taker\.fr|
+                            peertube\.chantierlibre\.org|
+                            tube\.ipfixe\.info|
+                            tube\.kicou\.info|
+                            tube\.dodsorf\.as|
+                            videobit\.cc|
+                            video\.yukari\.moe|
+                            videos\.elbinario\.net|
+                            hkvideo\.live|
+                            pt\.tux\.tf|
+                            www\.hkvideo\.live|
+                            FIGHTFORINFO\.com|
+                            pt\.765racing\.com|
+                            peertube\.gnumeria\.eu\.org|
+                            nordenmedia\.com|
+                            peertube\.co\.uk|
+                            tube\.darfweb\.eu|
+                            tube\.kalah-france\.org|
+                            0ch\.in|
+                            vod\.mochi\.academy|
+                            film\.node9\.org|
+                            peertube\.hatthieves\.es|
+                            video\.fitchfamily\.org|
+                            peertube\.ddns\.net|
+                            video\.ifuncle\.kr|
+                            video\.fdlibre\.eu|
+                            tube\.22decembre\.eu|
+                            peertube\.harmoniescreatives\.com|
+                            tube\.fabrigli\.fr|
+                            video\.thedwyers\.co|
+                            video\.bruitbruit\.com|
+                            peertube\.foxfam\.club|
+                            peer\.philoxweb\.be|
+                            videos\.bugs\.social|
+                            peertube\.malbert\.xyz|
+                            peertube\.bilange\.ca|
+                            libretube\.net|
+                            diytelevision\.com|
+                            peertube\.fedilab\.app|
+                            libre\.video|
+                            video\.mstddntfdn\.online|
+                            us\.tv|
+                            peertube\.sl-network\.fr|
+                            peertube\.dynlinux\.io|
+                            peertube\.david\.durieux\.family|
+                            peertube\.linuxrocks\.online|
+                            peerwatch\.xyz|
+                            v\.kretschmann\.social|
+                            tube\.otter\.sh|
+                            yt\.is\.nota\.live|
+                            tube\.dragonpsi\.xyz|
+                            peertube\.boneheadmedia\.com|
+                            videos\.funkwhale\.audio|
+                            watch\.44con\.com|
+                            peertube\.gcaillaut\.fr|
+                            peertube\.icu|
+                            pony\.tube|
+                            spacepub\.space|
+                            tube\.stbr\.io|
+                            v\.mom-gay\.faith|
+                            tube\.port0\.xyz|
+                            peertube\.simounet\.net|
+                            play\.jergefelt\.se|
+                            peertube\.zeteo\.me|
+                            tube\.danq\.me|
+                            peertube\.kerenon\.com|
+                            tube\.fab-l3\.org|
+                            tube\.calculate\.social|
+                            peertube\.mckillop\.org|
+                            tube\.netzspielplatz\.de|
+                            vod\.ksite\.de|
+                            peertube\.laas\.fr|
+                            tube\.govital\.net|
+                            peertube\.stephenson\.cc|
+                            bistule\.nohost\.me|
+                            peertube\.kajalinifi\.de|
+                            video\.ploud\.jp|
+                            video\.omniatv\.com|
+                            peertube\.ffs2play\.fr|
+                            peertube\.leboulaire\.ovh|
+                            peertube\.tronic-studio\.com|
+                            peertube\.public\.cat|
+                            peertube\.metalbanana\.net|
+                            video\.1000i100\.fr|
+                            peertube\.alter-nativ-voll\.de|
+                            tube\.pasa\.tf|
+                            tube\.worldofhauru\.xyz|
+                            pt\.kamp\.site|
+                            peertube\.teleassist\.fr|
+                            videos\.mleduc\.xyz|
+                            conf\.tube|
+                            media\.privacyinternational\.org|
+                            pt\.forty-two\.nl|
+                            video\.halle-leaks\.de|
+                            video\.grosskopfgames\.de|
+                            peertube\.schaeferit\.de|
+                            peertube\.jackbot\.fr|
+                            tube\.extinctionrebellion\.fr|
+                            peertube\.f-si\.org|
+                            video\.subak\.ovh|
+                            videos\.koweb\.fr|
+                            peertube\.zergy\.net|
+                            peertube\.roflcopter\.fr|
+                            peertube\.floss-marketing-school\.com|
+                            vloggers\.social|
+                            peertube\.iriseden\.eu|
+                            videos\.ubuntu-paris\.org|
+                            peertube\.mastodon\.host|
+                            armstube\.com|
+                            peertube\.s2s\.video|
+                            peertube\.lol|
+                            tube\.open-plug\.eu|
+                            open\.tube|
+                            peertube\.ch|
+                            peertube\.normandie-libre\.fr|
+                            peertube\.slat\.org|
+                            video\.lacaveatonton\.ovh|
+                            peertube\.uno|
+                            peertube\.servebeer\.com|
+                            peertube\.fedi\.quebec|
+                            tube\.h3z\.jp|
+                            tube\.plus200\.com|
+                            peertube\.eric\.ovh|
+                            tube\.metadocs\.cc|
+                            tube\.unmondemeilleur\.eu|
+                            gouttedeau\.space|
+                            video\.antirep\.net|
+                            nrop\.cant\.at|
+                            tube\.ksl-bmx\.de|
+                            tube\.plaf\.fr|
+                            tube\.tchncs\.de|
+                            video\.devinberg\.com|
+                            hitchtube\.fr|
+                            peertube\.kosebamse\.com|
+                            yunopeertube\.myddns\.me|
+                            peertube\.varney\.fr|
+                            peertube\.anon-kenkai\.com|
+                            tube\.maiti\.info|
+                            tubee\.fr|
+                            videos\.dinofly\.com|
+                            toobnix\.org|
+                            videotape\.me|
+                            voca\.tube|
+                            video\.heromuster\.com|
+                            video\.lemediatv\.fr|
+                            video\.up\.edu\.ph|
+                            balafon\.video|
+                            video\.ivel\.fr|
+                            thickrips\.cloud|
+                            pt\.laurentkruger\.fr|
+                            video\.monarch-pass\.net|
+                            peertube\.artica\.center|
+                            video\.alternanet\.fr|
+                            indymotion\.fr|
+                            fanvid\.stopthatimp\.net|
+                            video\.farci\.org|
+                            v\.lesterpig\.com|
+                            video\.okaris\.de|
+                            tube\.pawelko\.net|
+                            peertube\.mablr\.org|
+                            tube\.fede\.re|
+                            pytu\.be|
+                            evertron\.tv|
+                            devtube\.dev-wiki\.de|
+                            raptube\.antipub\.org|
+                            video\.selea\.se|
+                            peertube\.mygaia\.org|
+                            video\.oh14\.de|
+                            peertube\.livingutopia\.org|
+                            peertube\.the-penguin\.de|
+                            tube\.thechangebook\.org|
+                            tube\.anjara\.eu|
+                            pt\.pube\.tk|
+                            video\.samedi\.pm|
+                            mplayer\.demouliere\.eu|
+                            widemus\.de|
+                            peertube\.me|
+                            peertube\.zapashcanon\.fr|
+                            video\.latavernedejohnjohn\.fr|
+                            peertube\.pcservice46\.fr|
+                            peertube\.mazzonetto\.eu|
+                            video\.irem\.univ-paris-diderot\.fr|
+                            video\.livecchi\.cloud|
+                            alttube\.fr|
+                            video\.coop\.tools|
+                            video\.cabane-libre\.org|
+                            peertube\.openstreetmap\.fr|
+                            videos\.alolise\.org|
+                            irrsinn\.video|
+                            video\.antopie\.org|
+                            scitech\.video|
+                            tube2\.nemsia\.org|
+                            video\.amic37\.fr|
+                            peertube\.freeforge\.eu|
+                            video\.arbitrarion\.com|
+                            video\.datsemultimedia\.com|
+                            stoptrackingus\.tv|
+                            peertube\.ricostrongxxx\.com|
+                            docker\.videos\.lecygnenoir\.info|
+                            peertube\.togart\.de|
+                            tube\.postblue\.info|
+                            videos\.domainepublic\.net|
+                            peertube\.cyber-tribal\.com|
+                            video\.gresille\.org|
+                            peertube\.dsmouse\.net|
+                            cinema\.yunohost\.support|
+                            tube\.theocevaer\.fr|
+                            repro\.video|
+                            tube\.4aem\.com|
+                            quaziinc\.com|
+                            peertube\.metawurst\.space|
+                            videos\.wakapo\.com|
+                            video\.ploud\.fr|
+                            video\.freeradical\.zone|
+                            tube\.valinor\.fr|
+                            refuznik\.video|
+                            pt\.kircheneuenburg\.de|
+                            peertube\.asrun\.eu|
+                            peertube\.lagob\.fr|
+                            videos\.side-ways\.net|
+                            91video\.online|
+                            video\.valme\.io|
+                            video\.taboulisme\.com|
+                            videos-libr\.es|
+                            tv\.mooh\.fr|
+                            nuage\.acostey\.fr|
+                            video\.monsieur-a\.fr|
+                            peertube\.librelois\.fr|
+                            videos\.pair2jeux\.tube|
+                            videos\.pueseso\.club|
+                            peer\.mathdacloud\.ovh|
+                            media\.assassinate-you\.net|
+                            vidcommons\.org|
+                            ptube\.rousset\.nom\.fr|
+                            tube\.cyano\.at|
+                            videos\.squat\.net|
+                            video\.iphodase\.fr|
+                            peertube\.makotoworkshop\.org|
+                            peertube\.serveur\.slv-valbonne\.fr|
+                            vault\.mle\.party|
+                            hostyour\.tv|
+                            videos\.hack2g2\.fr|
+                            libre\.tube|
+                            pire\.artisanlogiciel\.net|
+                            videos\.numerique-en-commun\.fr|
+                            video\.netsyms\.com|
+                            video\.die-partei\.social|
+                            video\.writeas\.org|
+                            peertube\.swarm\.solvingmaz\.es|
+                            tube\.pericoloso\.ovh|
+                            watching\.cypherpunk\.observer|
+                            videos\.adhocmusic\.com|
+                            tube\.rfc1149\.net|
+                            peertube\.librelabucm\.org|
+                            videos\.numericoop\.fr|
+                            peertube\.koehn\.com|
+                            peertube\.anarchmusicall\.net|
+                            tube\.kampftoast\.de|
+                            vid\.y-y\.li|
+                            peertube\.xtenz\.xyz|
+                            diode\.zone|
+                            tube\.egf\.mn|
+                            peertube\.nomagic\.uk|
+                            visionon\.tv|
+                            videos\.koumoul\.com|
+                            video\.rastapuls\.com|
+                            video\.mantlepro\.com|
+                            video\.deadsuperhero\.com|
+                            peertube\.musicstudio\.pro|
+                            peertube\.we-keys\.fr|
+                            artitube\.artifaille\.fr|
+                            peertube\.ethernia\.net|
+                            tube\.midov\.pl|
+                            peertube\.fr|
+                            watch\.snoot\.tube|
+                            peertube\.donnadieu\.fr|
+                            argos\.aquilenet\.fr|
+                            tube\.nemsia\.org|
+                            tube\.bruniau\.net|
+                            videos\.darckoune\.moe|
+                            tube\.traydent\.info|
+                            dev\.videos\.lecygnenoir\.info|
+                            peertube\.nayya\.org|
+                            peertube\.live|
+                            peertube\.mofgao\.space|
+                            video\.lequerrec\.eu|
+                            peertube\.amicale\.net|
+                            aperi\.tube|
+                            tube\.ac-lyon\.fr|
+                            video\.lw1\.at|
+                            www\.yiny\.org|
+                            videos\.pofilo\.fr|
+                            tube\.lou\.lt|
+                            choob\.h\.etbus\.ch|
+                            tube\.hoga\.fr|
+                            peertube\.heberge\.fr|
+                            video\.obermui\.de|
+                            videos\.cloudfrancois\.fr|
+                            betamax\.video|
+                            video\.typica\.us|
+                            tube\.piweb\.be|
+                            video\.blender\.org|
+                            peertube\.cat|
+                            tube\.kdy\.ch|
+                            pe\.ertu\.be|
+                            peertube\.social|
+                            videos\.lescommuns\.org|
+                            tv\.datamol\.org|
+                            videonaute\.fr|
+                            dialup\.express|
+                            peertube\.nogafa\.org|
+                            megatube\.lilomoino\.fr|
+                            peertube\.tamanoir\.foucry\.net|
+                            peertube\.devosi\.org|
+                            peertube\.1312\.media|
+                            tube\.bootlicker\.party|
+                            skeptikon\.fr|
+                            video\.blueline\.mg|
+                            tube\.homecomputing\.fr|
+                            tube\.ouahpiti\.info|
+                            video\.tedomum\.net|
+                            video\.g3l\.org|
+                            fontube\.fr|
+                            peertube\.gaialabs\.ch|
+                            tube\.kher\.nl|
+                            peertube\.qtg\.fr|
+                            video\.migennes\.net|
+                            tube\.p2p\.legal|
+                            troll\.tv|
+                            videos\.iut-orsay\.fr|
+                            peertube\.solidev\.net|
+                            videos\.cemea\.org|
+                            video\.passageenseine\.fr|
+                            videos\.festivalparminous\.org|
+                            peertube\.touhoppai\.moe|
+                            sikke\.fi|
+                            peer\.hostux\.social|
+                            share\.tube|
+                            peertube\.walkingmountains\.fr|
+                            videos\.benpro\.fr|
+                            peertube\.parleur\.net|
+                            peertube\.heraut\.eu|
+                            tube\.aquilenet\.fr|
+                            peertube\.gegeweb\.eu|
+                            framatube\.org|
+                            thinkerview\.video|
+                            tube\.conferences-gesticulees\.net|
+                            peertube\.datagueule\.tv|
+                            video\.lqdn\.fr|
+                            tube\.mochi\.academy|
+                            media\.zat\.im|
+                            video\.colibris-outilslibres\.org|
+                            tube\.svnet\.fr|
+                            peertube\.video|
+                            peertube3\.cpy\.re|
+                            peertube2\.cpy\.re|
+                            videos\.tcit\.fr|
+                            peertube\.cpy\.re
+                        )'''
+    _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
+    _API_BASE = 'https://%s/api/v1/videos/%s/%s'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        peertube:(?P<host>[^:]+):|
+                        https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/
+                    )
+                    (?P<id>%s)
+                    ''' % (_INSTANCES_RE, _UUID_RE)
+    _TESTS = [{
+        'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
+        'md5': '9bed8c0137913e17b86334e5885aacff',
+        'info_dict': {
+            'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d',
+            'ext': 'mp4',
+            'title': 'What is PeerTube?',
+            'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10',
+            'thumbnail': r're:https?://.*\.(?:jpg|png)',
+            'timestamp': 1538391166,
+            'upload_date': '20181001',
+            'uploader': 'Framasoft',
+            'uploader_id': '3',
+            'uploader_url': 'https://framatube.org/accounts/framasoft',
+            'channel': 'Les vidéos de Framasoft',
+            'channel_id': '2',
+            'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8',
+            'language': 'en',
+            'license': 'Attribution - Share Alike',
+            'duration': 113,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'tags': ['framasoft', 'peertube'],
+            'categories': ['Science & Technology'],
+        }
+    }, {
+        'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
+        'only_matching': True,
+    }, {
+        # nsfw
+        'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39',
+        'only_matching': True,
+    }, {
+        'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7',
+        'only_matching': True,
+    }, {
+        'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
+        'only_matching': True,
+    }, {
+        'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
+        'only_matching': True,
+    }]
+
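+    # A page is treated as an embedded PeerTube player only when its URL
+    # matches a watch/embed path and the HTML contains one of a few strings
+    # known from PeerTube's frontend shell.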
+    @staticmethod
+    def _extract_peertube_url(webpage, source_url):
+        mobj = re.match(
+            r'https?://(?P<host>[^/]+)/videos/(?:watch|embed)/(?P<id>%s)'
+            % PeerTubeIE._UUID_RE, source_url)
+        if mobj and any(p in webpage for p in (
+                '<title>PeerTube<',
+                'There will be other non JS-based clients to access PeerTube',
+                '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
+            return 'peertube:%s:%s' % mobj.group('host', 'id')
+
+    @staticmethod
+    def _extract_urls(webpage, source_url):
+        entries = re.findall(
+            r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)'''
+            % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage)
+        if not entries:
+            peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url)
+            if peertube_url:
+                entries = [peertube_url]
+        return entries
+
+    def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True):
+        return self._download_json(
+            self._API_BASE % (host, video_id, path), video_id,
+            note=note, errnote=errnote, fatal=fatal)
+
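+    # Captions come from the dedicated /captions endpoint; captionPath is
+    # relative and gets resolved against the instance host.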
+    def _get_subtitles(self, host, video_id):
+        captions = self._call_api(
+            host, video_id, 'captions', note='Downloading captions JSON',
+            fatal=False)
+        if not isinstance(captions, dict):
+            return
+        data = captions.get('data')
+        if not isinstance(data, list):
+            return
+        subtitles = {}
+        for e in data:
+            language_id = try_get(e, lambda x: x['language']['id'], compat_str)
+            caption_url = urljoin('https://%s' % host, e.get('captionPath'))
+            if not caption_url:
+                continue
+            subtitles.setdefault(language_id or 'en', []).append({
+                'url': caption_url,
+            })
+        return subtitles
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host') or mobj.group('host_2')
+        video_id = mobj.group('id')
+
+        video = self._call_api(
+            host, video_id, '', note='Downloading video JSON')
+
+        title = video['name']
+
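+        # Each entry of video['files'] is one progressive download; the
+        # resolution label (e.g. '720p') doubles as the format id and is
+        # parsed into width/height via parse_resolution.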
+        formats = []
+        for file_ in video['files']:
+            if not isinstance(file_, dict):
+                continue
+            file_url = url_or_none(file_.get('fileUrl'))
+            if not file_url:
+                continue
+            file_size = int_or_none(file_.get('size'))
+            format_id = try_get(
+                file_, lambda x: x['resolution']['label'], compat_str)
+            f = parse_resolution(format_id)
+            f.update({
+                'url': file_url,
+                'format_id': format_id,
+                'filesize': file_size,
+            })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        full_description = self._call_api(
+            host, video_id, 'description', note='Downloading description JSON',
+            fatal=False)
+
+        description = None
+        if isinstance(full_description, dict):
+            description = str_or_none(full_description.get('description'))
+        if not description:
+            description = video.get('description')
+
+        subtitles = self.extract_subtitles(host, video_id)
+
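+        # Helpers for pulling typed values out of nested sections of the
+        # video JSON, e.g. video['account']['displayName'].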
+        def data(section, field, type_):
+            return try_get(video, lambda x: x[section][field], type_)
+
+        def account_data(field, type_):
+            return data('account', field, type_)
+
+        def channel_data(field, type_):
+            return data('channel', field, type_)
+
+        category = data('category', 'label', compat_str)
+        categories = [category] if category else None
+
+        nsfw = video.get('nsfw')
+        if isinstance(nsfw, bool):
+            age_limit = 18 if nsfw else 0
+        else:
+            age_limit = None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': urljoin(url, video.get('thumbnailPath')),
+            'timestamp': unified_timestamp(video.get('publishedAt')),
+            'uploader': account_data('displayName', compat_str),
+            'uploader_id': str_or_none(account_data('id', int)),
+            'uploader_url': url_or_none(account_data('url', compat_str)),
+            'channel': channel_data('displayName', compat_str),
+            'channel_id': str_or_none(channel_data('id', int)),
+            'channel_url': url_or_none(channel_data('url', compat_str)),
+            'language': data('language', 'id', compat_str),
+            'license': data('licence', 'label', compat_str),
+            'duration': int_or_none(video.get('duration')),
+            'view_count': int_or_none(video.get('views')),
+            'like_count': int_or_none(video.get('likes')),
+            'dislike_count': int_or_none(video.get('dislikes')),
+            'age_limit': age_limit,
+            'tags': try_get(video, lambda x: x['tags'], list),
+            'categories': categories,
+            'formats': formats,
+            'subtitles': subtitles
+        }
diff --git a/youtube_dl/extractor/people.py b/youtube_dl/extractor/people.py
new file mode 100644 (file)
index 0000000..6ca9571
--- /dev/null
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class PeopleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html'
+
+    _TEST = {
+        'url': 'http://www.people.com/people/videos/0,,20995451,00.html',
+        'info_dict': {
+            'id': 'ref:20995451',
+            'ext': 'mp4',
+            'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”',
+            'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 246.318,
+            'timestamp': 1458720585,
+            'upload_date': '20160323',
+            'uploader_id': '416418724',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['BrightcoveNew'],
+    }
+
+    def _real_extract(self, url):
+        return self.url_result(
+            'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s'
+            % self._match_id(url), 'BrightcoveNew')
diff --git a/youtube_dl/extractor/performgroup.py b/youtube_dl/extractor/performgroup.py
new file mode 100644 (file)
index 0000000..26942bf
--- /dev/null
@@ -0,0 +1,83 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PerformGroupIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.performgroup\.com/eplayer(?:/eplayer\.html|\.js)#/?(?P<id>[0-9a-f]{26})\.(?P<auth_token>[0-9a-z]{26})'
+    _TESTS = [{
+        # http://www.faz.net/aktuell/sport/fussball/wm-2018-playoffs-schweiz-besiegt-nordirland-1-0-15286104.html
+        'url': 'http://player.performgroup.com/eplayer/eplayer.html#d478c41c5d192f56b9aa859de8.1w4crrej5w14e1ed4s1ce4ykab',
+        'md5': '259cb03d142e2e52471e8837ecacb29f',
+        'info_dict': {
+            'id': 'xgrwobuzumes1lwjxtcdpwgxd',
+            'ext': 'mp4',
+            'title': 'Liga MX: Keine Einsicht nach Horrorfoul',
+            'description': 'md5:7cd3b459c82725b021e046ab10bf1c5b',
+            'timestamp': 1511533477,
+            'upload_date': '20171124',
+        }
+    }]
+
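+    # The feeds endpoint is called with Referer and Origin headers matching
+    # the player page, which the API presumably requires.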
+    def _call_api(self, service, auth_token, content_id, referer_url):
+        return self._download_json(
+            'http://ep3.performfeeds.com/ep%s/%s/%s/' % (service, auth_token, content_id),
+            content_id, headers={
+                'Referer': referer_url,
+                'Origin': 'http://player.performgroup.com',
+            }, query={
+                '_fmt': 'json',
+            })
+
+    def _real_extract(self, url):
+        player_id, auth_token = re.search(self._VALID_URL, url).groups()
+        bootstrap = self._call_api('bootstrap', auth_token, player_id, url)
+        video = bootstrap['config']['dataSource']['sourceItems'][0]['videos'][0]
+        video_id = video['uuid']
+        vod = self._call_api('vod', auth_token, video_id, url)
+        media = vod['videos']['video'][0]['media']
+
+        formats = []
+        hls_url = media.get('hls', {}).get('url')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+
+        hds_url = media.get('hds', {}).get('url')
+        if hds_url:
+            formats.extend(self._extract_f4m_formats(hds_url + '?hdcore', video_id, f4m_id='hds', fatal=False))
+
+        for c in media.get('content', []):
+            c_url = c.get('url')
+            if not c_url:
+                continue
+            tbr = int_or_none(c.get('bitrate'), 1000)
+            format_id = 'http'
+            if tbr:
+                format_id += '-%d' % tbr
+            formats.append({
+                'format_id': format_id,
+                'url': c_url,
+                'tbr': tbr,
+                'width': int_or_none(c.get('width')),
+                'height': int_or_none(c.get('height')),
+                'filesize': int_or_none(c.get('fileSize')),
+                'vcodec': c.get('type'),
+                'fps': int_or_none(c.get('videoFrameRate')),
+                'vbr': int_or_none(c.get('videoRate'), 1000),
+                'abr': int_or_none(c.get('audioRate'), 1000),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'description': video.get('description'),
+            'thumbnail': video.get('poster'),
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': int_or_none(video.get('publishedTime'), 1000),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
new file mode 100644 (file)
index 0000000..b159063
--- /dev/null
@@ -0,0 +1,189 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    unescapeHTML,
+)
+
+
+class PeriscopeBaseIE(InfoExtractor):
+    def _call_api(self, method, query, item_id):
+        return self._download_json(
+            'https://api.periscope.tv/api/v2/%s' % method,
+            item_id, query=query)
+
+    def _parse_broadcast_data(self, broadcast, video_id):
+        title = broadcast.get('status') or 'Periscope Broadcast'
+        uploader = broadcast.get('user_display_name') or broadcast.get('username')
+        title = '%s - %s' % (uploader, title) if uploader else title
+        is_live = (broadcast.get('state') or '').lower() == 'running'
+
+        thumbnails = [{
+            'url': broadcast[image],
+        } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
+
+        return {
+            'id': broadcast.get('id') or video_id,
+            'title': self._live_title(title) if is_live else title,
+            'timestamp': parse_iso8601(broadcast.get('created_at')),
+            'uploader': uploader,
+            'uploader_id': broadcast.get('user_id') or broadcast.get('username'),
+            'thumbnails': thumbnails,
+            'view_count': int_or_none(broadcast.get('total_watched')),
+            'tags': broadcast.get('tags'),
+            'is_live': is_live,
+        }
+
+    @staticmethod
+    def _extract_common_format_info(broadcast):
+        return (broadcast.get('state') or '').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height'))
+
+    @staticmethod
+    def _add_width_and_height(f, width, height):
+        for key, val in (('width', width), ('height', height)):
+            if not f.get(key):
+                f[key] = val
+
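+    # Finished broadcasts ('ended'/'timed_out') are plain VOD playlists and
+    # can use the native HLS downloader; anything still running is treated
+    # as a regular live m3u8 stream.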
+    def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True):
+        m3u8_formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4',
+            entry_protocol='m3u8_native'
+            if state in ('ended', 'timed_out') else 'm3u8',
+            m3u8_id=format_id, fatal=fatal)
+        if len(m3u8_formats) == 1:
+            self._add_width_and_height(m3u8_formats[0], width, height)
+        return m3u8_formats
+
+
+class PeriscopeIE(PeriscopeBaseIE):
+    IE_DESC = 'Periscope'
+    IE_NAME = 'periscope'
+    _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
+    # Live example URLs can be found at https://www.periscope.tv/
+    _TESTS = [{
+        'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
+        'md5': '65b57957972e503fcbbaeed8f4fa04ca',
+        'info_dict': {
+            'id': '56102209',
+            'ext': 'mp4',
+            'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗',
+            'timestamp': 1438978559,
+            'upload_date': '20150807',
+            'uploader': 'Bec Boop',
+            'uploader_id': '1465763',
+        },
+        'skip': 'Expires in 24 hours',
+    }, {
+        'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        token = self._match_id(url)
+
+        stream = self._call_api(
+            'accessVideoPublic', {'broadcast_id': token}, token)
+
+        broadcast = stream['broadcast']
+        info = self._parse_broadcast_data(broadcast, token)
+
+        state = (broadcast.get('state') or '').lower()
+        width = int_or_none(broadcast.get('width'))
+        height = int_or_none(broadcast.get('height'))
+
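+        # The same URL may be exposed under several stream keys; the
+        # video_urls set ensures each one is turned into formats only once.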
+        video_urls = set()
+        formats = []
+        for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
+            video_url = stream.get(format_id + '_url')
+            if not video_url or video_url in video_urls:
+                continue
+            video_urls.add(video_url)
+            if format_id != 'rtmp':
+                m3u8_formats = self._extract_pscp_m3u8_formats(
+                    video_url, token, format_id, state, width, height, False)
+                formats.extend(m3u8_formats)
+                continue
+            rtmp_format = {
+                'url': video_url,
+                'ext': 'flv' if format_id == 'rtmp' else 'mp4',
+            }
+            self._add_width_and_height(rtmp_format, width, height)
+            formats.append(rtmp_format)
+        self._sort_formats(formats)
+
+        info['formats'] = formats
+        return info
+
+
+class PeriscopeUserIE(PeriscopeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$'
+    IE_DESC = 'Periscope user videos'
+    IE_NAME = 'periscope:user'
+
+    _TEST = {
+        'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+        'info_dict': {
+            'id': 'LularoeHusbandMike',
+            'title': 'LULAROE HUSBAND MIKE',
+            'description': 'md5:6cf4ec8047768098da58e446e82c82f0',
+        },
+        # Periscope only shows videos in the last 24 hours, so it's possible to
+        # get 0 videos
+        'playlist_mincount': 0,
+    }
+
+    def _real_extract(self, url):
+        user_name = self._match_id(url)
+
+        webpage = self._download_webpage(url, user_name)
+
+        data_store = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-store=(["\'])(?P<data>.+?)\1',
+                webpage, 'data store', default='{}', group='data')),
+            user_name)
+
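+        # The page embeds its state in a data-store attribute; the user id
+        # and a session token for the broadcast history API are read from it.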
+        user = list(data_store['UserCache']['users'].values())[0]['user']
+        user_id = user['id']
+        session_id = data_store['SessionToken']['public']['broadcastHistory']['token']['session_id']
+
+        broadcasts = self._call_api(
+            'getUserBroadcastsPublic',
+            {'user_id': user_id, 'session_id': session_id},
+            user_name)['broadcasts']
+
+        broadcast_ids = [
+            broadcast['id'] for broadcast in broadcasts if broadcast.get('id')]
+
+        title = user.get('display_name') or user.get('username') or user_name
+        description = user.get('description')
+
+        entries = [
+            self.url_result(
+                'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id))
+            for broadcast_id in broadcast_ids]
+
+        return self.playlist_result(entries, user_id, title, description)
diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py
new file mode 100644 (file)
index 0000000..03da64b
--- /dev/null
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    try_get,
+    urljoin,
+)
+
+
+class PhilharmonieDeParisIE(InfoExtractor):
+    IE_DESC = 'Philharmonie de Paris'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)|
+                            pad\.philharmoniedeparis\.fr/doc/CIMU/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower',
+        'md5': 'a0a4b195f544645073631cbec166a2c2',
+        'info_dict': {
+            'id': '1086697',
+            'ext': 'mp4',
+            'title': 'Jazz à la Villette : Knower',
+        },
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html',
+        'info_dict': {
+            'id': '1032066',
+            'title': 'md5:0a031b81807b3593cffa3c9a87a167a0',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr',
+        'only_matching': True,
+    }, {
+        'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR',
+        'only_matching': True,
+    }, {
+        'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR',
+        'only_matching': True,
+    }]
+    _LIVE_URL = 'https://live.philharmoniedeparis.fr'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        config = self._download_json(
+            '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={
+                'id': video_id,
+                'lang': 'fr-FR',
+            })
+
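+        # An entry is only playable when it has a title and at least one
+        # mobile/desktop HLS file; extract_entry returns None otherwise.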
+        def extract_entry(source):
+            if not isinstance(source, dict):
+                return
+            title = source.get('title')
+            if not title:
+                return
+            files = source.get('files')
+            if not isinstance(files, dict):
+                return
+            format_urls = set()
+            formats = []
+            for format_id in ('mobile', 'desktop'):
+                format_url = try_get(
+                    files, lambda x: x[format_id]['file'], compat_str)
+                if not format_url or format_url in format_urls:
+                    continue
+                format_urls.add(format_url)
+                m3u8_url = urljoin(self._LIVE_URL, format_url)
+                formats.extend(self._extract_m3u8_formats(
+                    m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            if not formats:
+                return
+            self._sort_formats(formats)
+            return {
+                'title': title,
+                'formats': formats,
+            }
+
+        thumbnail = urljoin(self._LIVE_URL, config.get('image'))
+
+        info = extract_entry(config)
+        if info:
+            info.update({
+                'id': video_id,
+                'thumbnail': thumbnail,
+            })
+            return info
+
+        entries = []
+        for num, chapter in enumerate(config['chapters'], start=1):
+            entry = extract_entry(chapter)
+            if entry is None:
+                continue
+            entry['id'] = '%s-%d' % (video_id, num)
+            entries.append(entry)
+
+        return self.playlist_result(entries, video_id, config.get('title'))
diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py
new file mode 100644 (file)
index 0000000..8d52ad3
--- /dev/null
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class PhoenixIE(InfoExtractor):
+    IE_NAME = 'phoenix.de'
+    _VALID_URL = r'''https?://(?:www\.)?phoenix\.de/\D+(?P<id>\d+)\.html'''
+    _TESTS = [
+        {
+            'url': 'https://www.phoenix.de/sendungen/dokumentationen/unsere-welt-in-zukunft---stadt-a-1283620.html',
+            'md5': '5e765e838aa3531c745a4f5b249ee3e3',
+            'info_dict': {
+                'id': '0OB4HFc43Ns',
+                'ext': 'mp4',
+                'title': 'Unsere Welt in Zukunft - Stadt',
+                'description': 'md5:9bfb6fd498814538f953b2dcad7ce044',
+                'upload_date': '20190912',
+                'uploader': 'phoenix',
+                'uploader_id': 'phoenix',
+            }
+        },
+        {
+            'url': 'https://www.phoenix.de/drohnenangriffe-in-saudi-arabien-a-1286995.html?ref=aktuelles',
+            'only_matching': True,
+        },
+        # an older page such as https://www.phoenix.de/sendungen/gespraeche/phoenix-persoenlich/im-dialog-a-177727.html
+        # seems to have no embedded video, even though the video is available on youtube: https://www.youtube.com/watch?v=4GxnoUHvOkM
+    ]
+
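+    # phoenix.de exposes page metadata as JSON; the actual video is a
+    # YouTube embed, so extraction is delegated to the Youtube extractor.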
+    def extract_from_json_api(self, video_id, api_url):
+        doc = self._download_json(
+            api_url, video_id,
+            note='Downloading webpage metadata',
+            errnote='Failed to load webpage metadata')
+
+        for a in doc['absaetze']:
+            if a['typ'] == 'video-youtube':
+                return {
+                    '_type': 'url_transparent',
+                    'id': a['id'],
+                    'title': doc['titel'],
+                    'url': 'https://www.youtube.com/watch?v=%s' % a['id'],
+                    'ie_key': 'Youtube',
+                }
+        raise ExtractorError('No downloadable video found', expected=True)
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        api_url = 'https://www.phoenix.de/response/id/%s' % page_id
+        return self.extract_from_json_api(page_id, api_url)
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
new file mode 100644 (file)
index 0000000..6c8bbe1
--- /dev/null
@@ -0,0 +1,46 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class PhotobucketIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
+    _TEST = {
+        'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
+        'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
+        'info_dict': {
+            'id': 'zpsc0c3b9fa',
+            'ext': 'mp4',
+            'timestamp': 1367669341,
+            'upload_date': '20130504',
+            'uploader': 'rachaneronas',
+            'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        video_extension = mobj.group('ext')
+
+        webpage = self._download_webpage(url, video_id)
+
+        # Extract URL, uploader, and title from webpage
+        self.report_extraction(video_id)
+        info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
+                                       webpage, 'info json')
+        info = json.loads(info_json)
+        url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+        return {
+            'id': video_id,
+            'url': url,
+            'uploader': info['username'],
+            'timestamp': info['creationDate'],
+            'title': info['title'],
+            'ext': video_extension,
+            'thumbnail': info['thumbUrl'],
+        }
diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py
new file mode 100644 (file)
index 0000000..8099ef1
--- /dev/null
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    js_to_json,
+    try_get,
+    update_url_query,
+    urlencode_postdata,
+)
+
+
+class PicartoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?'
+    _TEST = {
+        'url': 'https://picarto.tv/Setz',
+        'info_dict': {
+            'id': 'Setz',
+            'ext': 'mp4',
+            'title': 're:^Setz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'timestamp': int,
+            'is_live': True
+        },
+        'skip': 'Stream is offline',
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        channel_id = mobj.group('id')
+
+        metadata = self._download_json(
+            'https://api.picarto.tv/v1/channel/name/' + channel_id,
+            channel_id)
+
+        if metadata.get('online') is False:
+            raise ExtractorError('Stream is offline', expected=True)
+
+        cdn_data = self._download_json(
+            'https://picarto.tv/process/channel', channel_id,
+            data=urlencode_postdata({'loadbalancinginfo': channel_id}),
+            note='Downloading load balancing info')
+
+        token = mobj.group('token') or 'public'
+        params = {
+            'con': int(time.time() * 1000),
+            'token': token,
+        }
+
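+        # Every edge advertises the same techs (HLS and MP4); formats served
+        # by the preferred edge get a small preference bump when sorting.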
+        preferred_edge = cdn_data.get('preferedEdge')  # (sic) key spelling comes from the API
+        formats = []
+
+        for edge in cdn_data['edges']:
+            edge_ep = edge.get('ep')
+            if not edge_ep or not isinstance(edge_ep, compat_str):
+                continue
+            edge_id = edge.get('id')
+            for tech in cdn_data['techs']:
+                tech_label = tech.get('label')
+                tech_type = tech.get('type')
+                preference = 0
+                if edge_id == preferred_edge:
+                    preference += 1
+                format_id = []
+                if edge_id:
+                    format_id.append(edge_id)
+                if tech_type == 'application/x-mpegurl' or tech_label == 'HLS':
+                    format_id.append('hls')
+                    formats.extend(self._extract_m3u8_formats(
+                        update_url_query(
+                            'https://%s/hls/%s/index.m3u8'
+                            % (edge_ep, channel_id), params),
+                        channel_id, 'mp4', preference=preference,
+                        m3u8_id='-'.join(format_id), fatal=False))
+                    continue
+                elif tech_type == 'video/mp4' or tech_label == 'MP4':
+                    format_id.append('mp4')
+                    formats.append({
+                        'url': update_url_query(
+                            'https://%s/mp4/%s.mp4' % (edge_ep, channel_id),
+                            params),
+                        'format_id': '-'.join(format_id),
+                        'preference': preference,
+                    })
+                else:
+                    # rtmp format does not seem to work
+                    continue
+        self._sort_formats(formats)
+
+        mature = metadata.get('adult')
+        if mature is None:
+            age_limit = None
+        else:
+            age_limit = 18 if mature is True else 0
+
+        return {
+            'id': channel_id,
+            'title': self._live_title(metadata.get('title') or channel_id),
+            'is_live': True,
+            'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']),
+            'channel': channel_id,
+            'channel_url': 'https://picarto.tv/%s' % channel_id,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
+
+
+class PicartoVodIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv',
+        'md5': '3ab45ba4352c52ee841a28fb73f2d9ca',
+        'info_dict': {
+            'id': 'ArtofZod_2017.12.12.00.13.23.flv',
+            'ext': 'mp4',
+            'title': 'ArtofZod_2017.12.12.00.13.23.flv',
+            'thumbnail': r're:^https?://.*\.jpg'
+        },
+    }, {
+        'url': 'https://picarto.tv/videopopout/Plague',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        vod_info = self._parse_json(
+            self._search_regex(
+                r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage,
+                video_id),
+            video_id, transform_source=js_to_json)
+
+        formats = self._extract_m3u8_formats(
+            vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'thumbnail': vod_info.get('vodThumb'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py
new file mode 100644 (file)
index 0000000..88b6859
--- /dev/null
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    dict_get,
+    int_or_none,
+    unescapeHTML,
+    parse_iso8601,
+)
+
+
+class PikselIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P<id>[a-z0-9_]+)'
+    _TESTS = [
+        {
+            'url': 'http://player.piksel.com/v/ums2867l',
+            'md5': '34e34c8d89dc2559976a6079db531e85',
+            'info_dict': {
+                'id': 'ums2867l',
+                'ext': 'mp4',
+                'title': 'GX-005 with Caption',
+                'timestamp': 1481335659,
+                'upload_date': '20161210'
+            }
+        },
+        {
+            # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al
+            'url': 'https://player.piksel.com/v/v80kqp41',
+            'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d',
+            'info_dict': {
+                'id': 'v80kqp41',
+                'ext': 'mp4',
+                'title': 'WAW- State of Washington vs. Donald J. Trump, et al',
+                'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.',
+                'timestamp': 1486171129,
+                'upload_date': '20170204'
+            }
+        },
+        {
+            # https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2019240/
+            'url': 'http://player.piksel.com/v/refid/nhkworld/prefid/nw_vod_v_en_2019_240_20190823233000_02_1566873477',
+            'only_matching': True,
+        }
+    ]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        video_id = self._search_regex(
+            r'data-de-program-uuid=[\'"]([a-z0-9]+)',
+            webpage, 'program uuid', default=display_id)
+        app_token = self._search_regex([
+            r'clientAPI\s*:\s*"([^"]+)"',
+            r'data-de-api-key\s*=\s*"([^"]+)"'
+        ], webpage, 'app token')
+        response = self._download_json(
+            'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token,
+            video_id, query={
+                'v': video_id
+            })['response']
+        failure = response.get('failure')
+        if failure:
+            raise ExtractorError(response['failure']['reason'], expected=True)
+        video_data = response['WsProgramResponse']['program']['asset']
+        title = video_data['title']
+
+        formats = []
+
+        m3u8_url = dict_get(video_data, [
+            'm3u8iPadURL',
+            'ipadM3u8Url',
+            'm3u8AndroidURL',
+            'm3u8iPhoneURL',
+            'iphoneM3u8Url'])
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
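+        # Progressive assets report audio/video bitrates in bit/s; the 1024
+        # scale converts them to kbit/s, and for video assets the total
+        # bitrate is the sum of the two.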
+        asset_type = dict_get(video_data, ['assetType', 'asset_type'])
+        for asset_file in video_data.get('assetFiles', []):
+            # TODO: extract rtmp formats
+            http_url = asset_file.get('http_url')
+            if not http_url:
+                continue
+            tbr = None
+            vbr = int_or_none(asset_file.get('videoBitrate'), 1024)
+            abr = int_or_none(asset_file.get('audioBitrate'), 1024)
+            if asset_type == 'video':
+                tbr = vbr + abr if vbr and abr else None
+            elif asset_type == 'audio':
+                tbr = abr
+
+            format_id = ['http']
+            if tbr:
+                format_id.append(compat_str(tbr))
+
+            formats.append({
+                'format_id': '-'.join(format_id),
+                'url': unescapeHTML(http_url),
+                'vbr': vbr,
+                'abr': abr,
+                'width': int_or_none(asset_file.get('videoWidth')),
+                'height': int_or_none(asset_file.get('videoHeight')),
+                'filesize': int_or_none(asset_file.get('filesize')),
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for caption in video_data.get('captions', []):
+            caption_url = caption.get('url')
+            if caption_url:
+                subtitles.setdefault(caption.get('locale', 'en'), []).append({
+                    'url': caption_url})
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('thumbnailUrl'),
+            'timestamp': parse_iso8601(video_data.get('dateadd')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py
new file mode 100644 (file)
index 0000000..9f3501f
--- /dev/null
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_end,
+    remove_start,
+    str_to_int,
+    unified_strdate,
+)
+
+
+class PinkbikeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.pinkbike.com/video/402811/',
+        'md5': '4814b8ca7651034cd87e3361d5c2155a',
+        'info_dict': {
+            'id': '402811',
+            'ext': 'mp4',
+            'title': 'Brandon Semenuk - RAW 100',
+            'description': 'Official release: www.redbull.ca/rupertwalker',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 100,
+            'upload_date': '20150406',
+            'uploader': 'revelco',
+            'location': 'Victoria, British Columbia, Canada',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.pinkbike.com/video/%s' % video_id, video_id)
+
+        formats = []
+        for _, format_id, src in re.findall(
+                r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage):
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            formats.append({
+                'url': src,
+                'format_id': format_id,
+                'height': height,
+            })
+        self._sort_formats(formats)
+
+        title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike')
+        description = self._html_search_regex(
+            r'(?s)id="media-description"[^>]*>(.+?)<',
+            webpage, 'description', default=None) or remove_start(
+            self._og_search_description(webpage), title + '. ')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration'))
+
+        uploader = self._search_regex(
+            r'<a[^>]+\brel=["\']author[^>]+>([^<]+)', webpage,
+            'uploader', fatal=False)
+        upload_date = unified_strdate(self._search_regex(
+            r'class="fullTime"[^>]+title="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+
+        location = self._html_search_regex(
+            r'(?s)<dt>Location</dt>\s*<dd>(.+?)<',
+            webpage, 'location', fatal=False)
+
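+        # View and comment counts share the same stat markup, so a single
+        # helper parses both by label.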
+        def extract_count(webpage, label):
+            return str_to_int(self._search_regex(
+                r'<span[^>]+class="stat-num"[^>]*>([\d,.]+)</span>\s*<span[^>]+class="stat-label"[^>]*>%s' % label,
+                webpage, label, fatal=False))
+
+        view_count = extract_count(webpage, 'Views')
+        comment_count = extract_count(webpage, 'Comments')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'location': location,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py
new file mode 100644 (file)
index 0000000..e86c653
--- /dev/null
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    xpath_text,
+    qualities,
+)
+
+
+class PladformIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:
+                                out\.pladform\.ru/player|
+                                static\.pladform\.ru/player\.swf
+                            )
+                            \?.*\bvideoid=|
+                            video\.pladform\.ru/catalog/video/videoid/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0',
+        'md5': '53362fac3a27352da20fa2803cc5cd6f',
+        'info_dict': {
+            'id': '3777899',
+            'ext': 'mp4',
+            'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко',
+            'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 3190,
+        },
+    }, {
+        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0',
+        'only_matching': True,
+    }, {
+        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        pl = qs.get('pl', ['1'])[0]
+
+        video = self._download_xml(
+            'http://out.pladform.ru/getVideo', video_id, query={
+                'pl': pl,
+                'videoid': video_id,
+            })
+
+        def fail(text):
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, text),
+                expected=True)
+
+        if video.tag == 'error':
+            fail(video.text)
+
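+        # Known quality labels, worst to best; qualities() maps each label
+        # to a sortable integer for format ordering.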
+        quality = qualities(('ld', 'sd', 'hd'))
+
+        formats = []
+        for src in video.findall('./src'):
+            if src is None:
+                continue
+            format_url = src.text
+            if not format_url:
+                continue
+            if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': src.text,
+                    'format_id': src.get('quality'),
+                    'quality': quality(src.get('quality')),
+                })
+
+        if not formats:
+            error = xpath_text(video, './cap', 'error', default=None)
+            if error:
+                fail(error)
+
+        self._sort_formats(formats)
+
+        webpage = self._download_webpage(
+            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
+            video_id)
+
+        title = self._og_search_title(webpage, fatal=False) or xpath_text(
+            video, './/title', 'title', fatal=True)
+        description = self._search_regex(
+            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
+            video, './/cover', 'cover')
+
+        duration = int_or_none(xpath_text(video, './/time', 'duration'))
+        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py
new file mode 100644 (file)
index 0000000..23c8256
--- /dev/null
+++ b/youtube_dl/extractor/platzi.py
@@ -0,0 +1,224 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_str,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    try_get,
+    url_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class PlatziBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://platzi.com/login/'
+    _NETRC_MACHINE = 'platzi'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
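+        # Seed the form with the hidden inputs (e.g. CSRF tokens) already
+        # present on the login page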
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'email': username,
+            'password': password,
+        })
+
+        urlh = self._request_webpage(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata(login_form),
+            headers={'Referer': self._LOGIN_URL})
+
+        # if the post redirected away from the login page, login succeeded
+        if 'platzi.com/login' not in urlh.geturl():
+            return
+
+        login_error = self._webpage_read_content(
+            urlh, self._LOGIN_URL, None, 'Downloading login error page')
+
+        login = self._parse_json(
+            self._search_regex(
+                r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'),
+            None)
+
+        for kind in ('error', 'password', 'nonFields'):
+            error = str_or_none(login.get('%sError' % kind))
+            if error:
+                raise ExtractorError(
+                    'Unable to login: %s' % error, expected=True)
+        raise ExtractorError('Unable to log in')
+
+
+class PlatziIE(PlatziBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            platzi\.com/clases|           # es version
+                            courses\.platzi\.com/classes  # en version
+                        )/[^/]+/(?P<id>\d+)-[^/?\#&]+
+                    '''
+
+    _TESTS = [{
+        'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/',
+        'md5': '8f56448241005b561c10f11a595b37e3',
+        'info_dict': {
+            'id': '12074',
+            'ext': 'mp4',
+            'title': 'Creando nuestra primera página',
+            'description': 'md5:4c866e45034fc76412fbf6e60ae008bc',
+            'duration': 420,
+        },
+        'skip': 'Requires platzi account credentials',
+    }, {
+        'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/',
+        'info_dict': {
+            'id': '13430',
+            'ext': 'mp4',
+            'title': 'Background',
+            'description': 'md5:49c83c09404b15e6e71defaf87f6b305',
+            'duration': 360,
+        },
+        'skip': 'Requires platzi account credentials',
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        lecture_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, lecture_id)
+
+        data = self._parse_json(
+            self._search_regex(
+                # client_data may itself contain "};", so try the stricter
+                # regex first
+                (r'client_data\s*=\s*({.+?})\s*;\s*\n',
+                 r'client_data\s*=\s*({.+?})\s*;'),
+                webpage, 'client data'),
+            lecture_id)
+
+        material = data['initialState']['material']
+        desc = material['description']
+        title = desc['title']
+
+        formats = []
+        for server_id, server in material['videos'].items():
+            if not isinstance(server, dict):
+                continue
+            for format_id in ('hls', 'dash'):
+                format_url = url_or_none(server.get(format_id))
+                if not format_url:
+                    continue
+                if format_id == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, lecture_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id=format_id,
+                        note='Downloading %s m3u8 information' % server_id,
+                        fatal=False))
+                elif format_id == 'dash':
+                    formats.extend(self._extract_mpd_formats(
+                        format_url, lecture_id, mpd_id=format_id,
+                        note='Downloading %s MPD manifest' % server_id,
+                        fatal=False))
+        self._sort_formats(formats)
+
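+        # The lecture description is delivered base64 encoded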
+        content = str_or_none(desc.get('content'))
+        description = (clean_html(compat_b64decode(content).decode('utf-8'))
+                       if content else None)
+        duration = int_or_none(material.get('duration'), invscale=60)
+
+        return {
+            'id': lecture_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class PlatziCourseIE(PlatziBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            platzi\.com/clases|           # es version
+                            courses\.platzi\.com/classes  # en version
+                        )/(?P<id>[^/?\#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://platzi.com/clases/next-js/',
+        'info_dict': {
+            'id': '1311',
+            'title': 'Curso de Next.js',
+        },
+        'playlist_count': 22,
+    }, {
+        'url': 'https://courses.platzi.com/classes/communication-codestream/',
+        'info_dict': {
+            'id': '1367',
+            'title': 'Codestream Course',
+        },
+        'playlist_count': 14,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_name = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_name)
+
+        props = self._parse_json(
+            self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'),
+            course_name)['initialProps']
+
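+        # Walk the course chapters ("concepts") and collect every video
+        # material as a url_transparent entry resolved by PlatziIE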
+        entries = []
+        for chapter_num, chapter in enumerate(props['concepts'], 1):
+            if not isinstance(chapter, dict):
+                continue
+            materials = chapter.get('materials')
+            if not materials or not isinstance(materials, list):
+                continue
+            chapter_title = chapter.get('title')
+            chapter_id = str_or_none(chapter.get('id'))
+            for material in materials:
+                if not isinstance(material, dict):
+                    continue
+                if material.get('material_type') != 'video':
+                    continue
+                video_url = urljoin(url, material.get('url'))
+                if not video_url:
+                    continue
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': video_url,
+                    'title': str_or_none(material.get('name')),
+                    'id': str_or_none(material.get('id')),
+                    'ie_key': PlatziIE.ie_key(),
+                    'chapter': chapter_title,
+                    'chapter_number': chapter_num,
+                    'chapter_id': chapter_id,
+                })
+
+        course_id = str_or_none(try_get(props, lambda x: x['course']['id']))
+        course_title = try_get(props, lambda x: x['course']['name'], compat_str)
+
+        return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py
new file mode 100644 (file)
index 0000000..e766ccc
--- /dev/null
+++ b/youtube_dl/extractor/playfm.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class PlayFMIE(InfoExtractor):
+    IE_NAME = 'play.fm'
+    _VALID_URL = r'https?://(?:www\.)?play\.fm/(?P<slug>(?:[^/]+/)+(?P<id>[^/]+))/?(?:$|[?#])'
+
+    _TEST = {
+        'url': 'https://www.play.fm/dan-drastic/sven-tasnadi-leipzig-electronic-music-batofar-paris-fr-2014-07-12',
+        'md5': 'c505f8307825a245d0c7ad1850001f22',
+        'info_dict': {
+            'id': '71276',
+            'ext': 'mp3',
+            'title': 'Sven Tasnadi - LEIPZIG ELECTRONIC MUSIC @ Batofar (Paris,FR) - 2014-07-12',
+            'description': '',
+            'duration': 5627,
+            'timestamp': 1406033781,
+            'upload_date': '20140722',
+            'uploader': 'Dan Drastic',
+            'uploader_id': '71170',
+            'view_count': int,
+            'comment_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        slug = mobj.group('slug')
+
+        recordings = self._download_json(
+            'http://v2api.play.fm/recordings/slug/%s' % slug, video_id)
+
+        error = recordings.get('error')
+        if isinstance(error, dict):
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error.get('message')),
+                expected=True)
+
+        audio_url = recordings['audio']
+        video_id = compat_str(recordings.get('id') or video_id)
+        title = recordings['title']
+        description = recordings.get('description')
+        duration = int_or_none(recordings.get('recordingDuration'))
+        timestamp = parse_iso8601(recordings.get('created_at'))
+        page = recordings.get('page') or {}
+        uploader = page.get('title')
+        uploader_id = compat_str(page['id']) if page.get('id') else None
+        view_count = int_or_none(recordings.get('playCount'))
+        comment_count = int_or_none(recordings.get('commentCount'))
+        categories = [tag['name'] for tag in recordings.get('tags', []) if tag.get('name')]
+
+        return {
+            'id': video_id,
+            'url': audio_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/playplustv.py b/youtube_dl/extractor/playplustv.py
new file mode 100644 (file)
index 0000000..1e30ab2
--- /dev/null
+++ b/youtube_dl/extractor/playplustv.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    PUTRequest,
+)
+
+
+class PlayPlusTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})'
+    _TEST = {
+        'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e',
+        'md5': 'd078cb89d7ab6b9df37ce23c647aef72',
+        'info_dict': {
+            'id': 'db8d274a5163424e967f35a30ddafb8e',
+            'ext': 'mp4',
+            'title': 'Capítulo 179 - Final',
+            'description': 'md5:01085d62d8033a1e34121d3c3cabc838',
+            'timestamp': 1529992740,
+            'upload_date': '20180626',
+        },
+        'skip': 'Requires account credential',
+    }
+    _NETRC_MACHINE = 'playplustv'
+    _GEO_COUNTRIES = ['BR']
+    _token = None
+    _profile = None
+
+    def _call_api(self, resource, video_id=None, query=None):
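+        # every media API call must carry the Bearer token obtained on login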
+        return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={
+            'Authorization': 'Bearer ' + self._token,
+        }, query=query)
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            self.raise_login_required()
+
+        req = PUTRequest(
+            'https://api.playplus.tv/api/web/login', json.dumps({
+                'email': email,
+                'password': password,
+            }).encode(), {
+                'Content-Type': 'application/json; charset=utf-8',
+            })
+
+        try:
+            self._token = self._download_json(req, None)['token']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                raise ExtractorError(self._parse_json(
+                    e.cause.read(), None)['errorMessage'], expected=True)
+            raise
+
+        self._profile = self._call_api('Profiles')['list'][0]['_id']
+
+    def _real_extract(self, url):
+        project_id, media_id = re.match(self._VALID_URL, url).groups()
+        media = self._call_api(
+            'Media', media_id, {
+                'profileId': self._profile,
+                'projectId': project_id,
+                'mediaId': media_id,
+            })['obj']
+        title = media['title']
+
+        formats = []
+        for f in media.get('files', []):
+            f_url = f.get('url')
+            if not f_url:
+                continue
+            file_info = f.get('fileInfo') or {}
+            formats.append({
+                'url': f_url,
+                'width': int_or_none(file_info.get('width')),
+                'height': int_or_none(file_info.get('height')),
+            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for thumb in media.get('thumbs', []):
+            thumb_url = thumb.get('url')
+            if not thumb_url:
+                continue
+            thumbnails.append({
+                'url': thumb_url,
+                'width': int_or_none(thumb.get('width')),
+                'height': int_or_none(thumb.get('height')),
+            })
+
+        return {
+            'id': media_id,
+            'title': title,
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'description': clean_html(media.get('description')) or media.get('shortDescription'),
+            'timestamp': int_or_none(media.get('publishDate'), 1000),
+            'view_count': int_or_none(media.get('numberOfViews')),
+            'comment_count': int_or_none(media.get('numberOfComments')),
+            'tags': media.get('tags'),
+        }
diff --git a/youtube_dl/extractor/plays.py b/youtube_dl/extractor/plays.py
new file mode 100644 (file)
index 0000000..ddfc6f1
--- /dev/null
+++ b/youtube_dl/extractor/plays.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PlaysTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})'
+    _TESTS = [{
+        'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall',
+        'md5': 'dfeac1198506652b5257a62762cec7bc',
+        'info_dict': {
+            'id': '56af17f56c95335490',
+            'ext': 'mp4',
+            'title': 'Bjergsen - When you outplay the Azir wall',
+            'description': 'Posted by Bjergsen',
+        }
+    }, {
+        'url': 'https://plays.tv/embeds/56af17f56c95335490',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'https://plays.tv/video/%s' % video_id, video_id)
+
+        info = self._search_json_ld(webpage, video_id)
+
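+        # The page embeds a DASH manifest (data-mpd) plus progressive
+        # <source> fallbacks carrying their vertical resolution in res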
+        mpd_url, sources = re.search(
+            r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>',
+            webpage).groups()
+        formats = self._extract_mpd_formats(
+            self._proto_relative_url(mpd_url), video_id, mpd_id='DASH')
+        for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources):
+            formats.append({
+                'url': self._proto_relative_url(format_url),
+                'format_id': 'http-' + format_id,
+                'height': int_or_none(height),
+            })
+        self._sort_formats(formats)
+
+        info.update({
+            'id': video_id,
+            'description': self._og_search_description(webpage),
+            'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage),
+            'formats': formats,
+        })
+
+        return info
diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py
new file mode 100644 (file)
index 0000000..4c5f579
--- /dev/null
+++ b/youtube_dl/extractor/playtvak.py
@@ -0,0 +1,191 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urlparse,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    qualities,
+)
+
+
+class PlaytvakIE(InfoExtractor):
+    IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz'
+    _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)'
+    _TESTS = [{
+        'url': 'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko',
+        'md5': '4525ae312c324b4be2f4603cc78ceb4a',
+        'info_dict': {
+            'id': 'A150730_150323_hodinovy-manzel_kuko',
+            'ext': 'mp4',
+            'title': 'Vyžeňte vosy a sršně ze zahrady',
+            'description': 'md5:4436e61b7df227a093778efb7e373571',
+            'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$',
+            'duration': 279,
+            'timestamp': 1438732860,
+            'upload_date': '20150805',
+            'is_live': False,
+        }
+    }, {  # live video test
+        'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat',
+        'info_dict': {
+            'id': 'A150624_164934_planespotting_cat',
+            'ext': 'flv',
+            'title': 're:^Planespotting [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+    }, {  # another live stream, this one without Misc.videoFLV
+        'url': 'https://slowtv.playtvak.cz/zive-sledujte-vlaky-v-primem-prenosu-dwi-/hlavni-nadrazi.aspx?c=A151218_145728_hlavni-nadrazi_plap',
+        'info_dict': {
+            'id': 'A151218_145728_hlavni-nadrazi_plap',
+            'ext': 'flv',
+            'title': 're:^Hlavní nádraží [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+    }, {  # idnes.cz
+        'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku',
+        'md5': '819832ba33cd7016e58a6658577fe289',
+        'info_dict': {
+            'id': 'A150809_104116_domaci_pku',
+            'ext': 'mp4',
+            'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se',
+            'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2',
+            'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$',
+            'duration': 39,
+            'timestamp': 1438969140,
+            'upload_date': '20150807',
+            'is_live': False,
+        }
+    }, {  # lidovky.cz
+        'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE',
+        'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8',
+        'info_dict': {
+            'id': 'A150808_214044_ln-video_ELE',
+            'ext': 'mp4',
+            'title': 'Táhni! Demonstrace proti imigrantům budila emoce',
+            'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c',
+            'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1439052180,
+            'upload_date': '20150808',
+            'is_live': False,
+        }
+    }, {  # metro.cz
+        'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row',
+        'md5': '84fc1deedcac37b7d4a6ccae7c716668',
+        'info_dict': {
+            'id': 'A141111_173251_metro-extra_row',
+            'ext': 'mp4',
+            'title': 'Recesisté udělali z billboardu kolotoč',
+            'description': 'md5:7369926049588c3989a66c9c1a043c4c',
+            'thumbnail': r're:(?i)^https?://.*\.(?:jpg|png)$',
+            'timestamp': 1415725500,
+            'upload_date': '20141111',
+            'is_live': False,
+        }
+    }, {
+        'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        info_url = self._html_search_regex(
+            r'Misc\.video(?:FLV)?\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url')
+
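+        # Rewrite the player info URL to ask for the JS payload without ads
+        # (reklama is Czech for advertisement)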
+        parsed_url = compat_urlparse.urlparse(info_url)
+
+        qs = compat_urlparse.parse_qs(parsed_url.query)
+        qs.update({
+            'reklama': ['0'],
+            'type': ['js'],
+        })
+
+        info_url = compat_urlparse.urlunparse(
+            parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+        json_info = self._download_json(
+            info_url, video_id,
+            transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
+
+        item = next(
+            (i for i in json_info['items']
+             if i.get('type') in ('video', 'stream')),
+            None)
+        if not item:
+            raise ExtractorError('No suitable stream found')
+
+        quality = qualities(('low', 'middle', 'high'))
+
+        formats = []
+        for fmt in item['video']:
+            video_url = fmt.get('file')
+            if not video_url:
+                continue
+
+            format_ = fmt['format']
+            format_id = '%s_%s' % (format_, fmt['quality'])
+            preference = None
+
+            if format_ in ('mp4', 'webm'):
+                ext = format_
+            elif format_ == 'rtmp':
+                ext = 'flv'
+            elif format_ == 'apple':
+                ext = 'mp4'
+                # Some streams have mp3 audio which does not play
+                # well with ffmpeg filter aac_adtstoasc
+                preference = -1
+            elif format_ == 'adobe':  # f4m manifest fails with 404 in 80% of requests
+                continue
+            else:  # Other formats not supported yet
+                continue
+
+            formats.append({
+                'url': video_url,
+                'ext': ext,
+                'format_id': format_id,
+                'quality': quality(fmt.get('quality')),
+                'preference': preference,
+            })
+        self._sort_formats(formats)
+
+        title = item['title']
+        is_live = item['type'] == 'stream'
+        if is_live:
+            title = self._live_title(title)
+        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
+            'description', webpage, 'description', default=None)
+        timestamp = None
+        duration = None
+        if not is_live:
+            duration = int_or_none(item.get('length'))
+            timestamp = item.get('published')
+            if timestamp:
+                timestamp = parse_iso8601(timestamp[:-5])
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': item.get('image'),
+            'duration': duration,
+            'timestamp': timestamp,
+            'is_live': is_live,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
new file mode 100644 (file)
index 0000000..4aef186
--- /dev/null
+++ b/youtube_dl/extractor/playvid.py
@@ -0,0 +1,99 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+)
+
+
+class PlayvidIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
+    _TESTS = [{
+        'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
+        'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
+        'info_dict': {
+            'id': 'RnmBNgtrrJu',
+            'ext': 'mp4',
+            'title': 'md5:9256d01c6317e3f703848b5906880dc8',
+            'duration': 82,
+            'age_limit': 18,
+        },
+        'skip': 'Video removed due to ToS',
+    }, {
+        'url': 'http://www.playvid.com/watch/hwb0GpNkzgH',
+        'md5': '39d49df503ad7b8f23a4432cbf046477',
+        'info_dict': {
+            'id': 'hwb0GpNkzgH',
+            'ext': 'mp4',
+            'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park',
+            'age_limit': 18,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        m_error = re.search(
+            r'<div class="block-error">\s*<div class="heading">\s*<div>(?P<msg>.+?)</div>\s*</div>', webpage)
+        if m_error:
+            raise ExtractorError(clean_html(m_error.group('msg')), expected=True)
+
+        video_title = None
+        duration = None
+        video_thumbnail = None
+        formats = []
+
+        # most of the information is stored in the flashvars
+        flashvars = self._html_search_regex(
+            r'flashvars="(.+?)"', webpage, 'flashvars')
+
+        infos = compat_urllib_parse_unquote(flashvars).split('&')
+        for info in infos:
+            videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
+            if videovars_match:
+                key = videovars_match.group(1)
+                val = videovars_match.group(2)
+
+                if key == 'title':
+                    video_title = compat_urllib_parse_unquote_plus(val)
+                if key == 'duration':
+                    try:
+                        duration = int(val)
+                    except ValueError:
+                        pass
+                if key == 'big_thumb':
+                    video_thumbnail = val
+
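+                # format URLs live under keys like video_vars[video_urls][360p]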
+                videourl_match = re.match(
+                    r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
+                if videourl_match:
+                    height = int(videourl_match.group('resolution'))
+                    formats.append({
+                        'height': height,
+                        'url': val,
+                    })
+        self._sort_formats(formats)
+
+        # Extract title - should be in the flashvars; if not, look elsewhere
+        if video_title is None:
+            video_title = self._html_search_regex(
+                r'<title>(.*?)</title', webpage, 'title')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
+            'duration': duration,
+            'description': None,
+            'age_limit': 18
+        }
diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py
new file mode 100644 (file)
index 0000000..4d96a10
--- /dev/null
+++ b/youtube_dl/extractor/playwire.py
@@ -0,0 +1,75 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    dict_get,
+    float_or_none,
+)
+
+
+class PlaywireIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json',
+        'md5': 'e6398701e3595888125729eaa2329ed9',
+        'info_dict': {
+            'id': '3353705',
+            'ext': 'mp4',
+            'title': 'S04_RM_UCL_Rus',
+            'thumbnail': r're:^https?://.*\.png$',
+            'duration': 145.94,
+        },
+    }, {
+        # m3u8 in f4m
+        'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json',
+        'info_dict': {
+            'id': '4840492',
+            'ext': 'mp4',
+            'title': 'ITV EL SHOW FULL',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # Multiple resolutions while bitrates missing
+        'url': 'http://cdn.playwire.com/11625/embed/85228.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://config.playwire.com/12421/videos/v2/3389892/zeus.json',
+        'only_matching': True,
+    }, {
+        'url': 'http://cdn.playwire.com/v2/12342/config/1532636.json',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        publisher_id, video_id = mobj.group('publisher_id'), mobj.group('id')
+
+        player = self._download_json(
+            'http://config.playwire.com/%s/videos/v2/%s/zeus.json' % (publisher_id, video_id),
+            video_id)
+
+        title = player['settings']['title']
+        duration = float_or_none(player.get('duration'), 1000)
+
+        content = player['content']
+        thumbnail = content.get('poster')
+        src = content['media']['f4m']
+
+        formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls')
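+        # when neither bitrate nor resolution is available, at least prefer
+        # the variant marked as HD in its URL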
+        for a_format in formats:
+            if not dict_get(a_format, ['tbr', 'width', 'height']):
+                a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
new file mode 100644 (file)
index 0000000..abd08bc
--- /dev/null
+++ b/youtube_dl/extractor/pluralsight.py
@@ -0,0 +1,501 @@
+from __future__ import unicode_literals
+
+import collections
+import json
+import os
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    dict_get,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    parse_duration,
+    qualities,
+    srt_subtitles_timecode,
+    try_get,
+    update_url_query,
+    urlencode_postdata,
+)
+
+
+class PluralsightBaseIE(InfoExtractor):
+    _API_BASE = 'https://app.pluralsight.com'
+
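+    # Course metadata and clip URLs are both served from the player's
+    # GraphQL endpoint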
+    _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE
+    _GRAPHQL_HEADERS = {
+        'Content-Type': 'application/json;charset=UTF-8',
+    }
+    _GRAPHQL_COURSE_TMPL = '''
+query BootstrapPlayer {
+  rpc {
+    bootstrapPlayer {
+      profile {
+        firstName
+        lastName
+        email
+        username
+        userHandle
+        authed
+        isAuthed
+        plan
+      }
+      course(courseId: "%s") {
+        name
+        title
+        courseHasCaptions
+        translationLanguages {
+          code
+          name
+        }
+        supportsWideScreenVideoFormats
+        timestamp
+        modules {
+          name
+          title
+          duration
+          formattedDuration
+          author
+          authorized
+          clips {
+            authorized
+            clipId
+            duration
+            formattedDuration
+            id
+            index
+            moduleIndex
+            moduleTitle
+            name
+            title
+            watched
+          }
+        }
+      }
+    }
+  }
+}'''
+
+    def _download_course(self, course_id, url, display_id):
+        try:
+            return self._download_course_rpc(course_id, url, display_id)
+        except ExtractorError:
+            # Old API fallback
+            return self._download_json(
+                'https://app.pluralsight.com/player/user/api/v1/player/payload',
+                display_id, data=urlencode_postdata({'courseId': course_id}),
+                headers={'Referer': url})
+
+    def _download_course_rpc(self, course_id, url, display_id):
+        response = self._download_json(
+            self._GRAPHQL_EP, display_id, data=json.dumps({
+                'query': self._GRAPHQL_COURSE_TMPL % course_id,
+                'variables': {}
+            }).encode('utf-8'), headers=self._GRAPHQL_HEADERS)
+
+        course = try_get(
+            response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
+            dict)
+        if course:
+            return course
+
+        raise ExtractorError(
+            '%s said: %s' % (self.IE_NAME, response['error']['message']),
+            expected=True)
+
+
+class PluralsightIE(PluralsightBaseIE):
+    IE_NAME = 'pluralsight'
+    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:training/)?player\?'
+    _LOGIN_URL = 'https://app.pluralsight.com/id/'
+
+    _NETRC_MACHINE = 'pluralsight'
+
+    _TESTS = [{
+        'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas',
+        'md5': '4d458cf5cf4c593788672419a8dd4cf8',
+        'info_dict': {
+            'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',
+            'ext': 'mp4',
+            'title': 'Demo Monitoring',
+            'duration': 338,
+        },
+        'skip': 'Requires pluralsight account credentials',
+    }, {
+        'url': 'https://app.pluralsight.com/training/player?course=angularjs-get-started&author=scott-allen&name=angularjs-get-started-m1-introduction&clip=0&mode=live',
+        'only_matching': True,
+    }, {
+        # available without pluralsight account
+        'url': 'http://app.pluralsight.com/training/player?author=scott-allen&name=angularjs-get-started-m1-introduction&mode=live&clip=0&course=angularjs-get-started',
+        'only_matching': True,
+    }, {
+        'url': 'https://app.pluralsight.com/player?course=ccna-intro-networking&author=ross-bagurdes&name=ccna-intro-networking-m06&clip=0',
+        'only_matching': True,
+    }]
+
+    GRAPHQL_VIEWCLIP_TMPL = '''
+query viewClip {
+  viewClip(input: {
+    author: "%(author)s",
+    clipIndex: %(clipIndex)d,
+    courseName: "%(courseName)s",
+    includeCaptions: %(includeCaptions)s,
+    locale: "%(locale)s",
+    mediaType: "%(mediaType)s",
+    moduleName: "%(moduleName)s",
+    quality: "%(quality)s"
+  }) {
+    urls {
+      url
+      cdn
+      rank
+      source
+    },
+    status
+  }
+}'''
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'Username': username,
+            'Password': password,
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+            'post url', default=self._LOGIN_URL, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in',
+            data=urlencode_postdata(login_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        error = self._search_regex(
+            r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>',
+            response, 'error message', default=None)
+        if error:
+            raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+        if all(not re.search(p, response) for p in (
+                r'__INITIAL_STATE__', r'["\']currentUser["\']',
+                # new layout?
+                r'>\s*Sign out\s*<')):
+            BLOCKED = 'Your account has been blocked due to suspicious activity'
+            if BLOCKED in response:
+                raise ExtractorError(
+                    'Unable to login: %s' % BLOCKED, expected=True)
+            MUST_AGREE = 'To continue using Pluralsight, you must agree to'
+            if any(p in response for p in (MUST_AGREE, '>Disagree<', '>Agree<')):
+                raise ExtractorError(
+                    'Unable to login: %s some documents. Go to pluralsight.com, '
+                    'log in and agree with what Pluralsight requires.'
+                    % MUST_AGREE, expected=True)
+
+            raise ExtractorError('Unable to log in')
+
+    def _get_subtitles(self, author, clip_idx, clip_id, lang, name, duration, video_id):
+        captions = None
+        if clip_id:
+            captions = self._download_json(
+                '%s/transcript/api/v1/caption/json/%s/%s'
+                % (self._API_BASE, clip_id, lang), video_id,
+                'Downloading captions JSON', 'Unable to download captions JSON',
+                fatal=False)
+        if not captions:
+            captions_post = {
+                'a': author,
+                'cn': int(clip_idx),
+                'lc': lang,
+                'm': name,
+            }
+            captions = self._download_json(
+                '%s/player/retrieve-captions' % self._API_BASE, video_id,
+                'Downloading captions JSON', 'Unable to download captions JSON',
+                fatal=False, data=json.dumps(captions_post).encode('utf-8'),
+                headers={'Content-Type': 'application/json;charset=utf-8'})
+        if captions:
+            return {
+                lang: [{
+                    'ext': 'json',
+                    'data': json.dumps(captions),
+                }, {
+                    'ext': 'srt',
+                    'data': self._convert_subtitles(duration, captions),
+                }]
+            }
+
+    @staticmethod
+    def _convert_subtitles(duration, subs):
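+        # Captions come as a list of {displayTimeOffset, text} objects;
+        # render them as numbered SRT cues, using the next cue's offset
+        # (or the clip duration) as the end time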
+        srt = ''
+        TIME_OFFSET_KEYS = ('displayTimeOffset', 'DisplayTimeOffset')
+        TEXT_KEYS = ('text', 'Text')
+        for num, current in enumerate(subs):
+            start, text = (
+                float_or_none(dict_get(current, TIME_OFFSET_KEYS, skip_false_values=False)),
+                dict_get(current, TEXT_KEYS))
+            if start is None or text is None:
+                continue
+            end = duration if num == len(subs) - 1 else float_or_none(
+                dict_get(subs[num + 1], TIME_OFFSET_KEYS, skip_false_values=False))
+            if end is None:
+                continue
+            srt += os.linesep.join(
+                (
+                    '%d' % num,
+                    '%s --> %s' % (
+                        srt_subtitles_timecode(start),
+                        srt_subtitles_timecode(end)),
+                    text,
+                    os.linesep,
+                ))
+        return srt
+
+    def _real_extract(self, url):
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+
+        author = qs.get('author', [None])[0]
+        name = qs.get('name', [None])[0]
+        clip_idx = qs.get('clip', [None])[0]
+        course_name = qs.get('course', [None])[0]
+
+        if any(not f for f in (author, name, clip_idx, course_name,)):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        display_id = '%s-%s' % (name, clip_idx)
+
+        course = self._download_course(course_name, url, display_id)
+
+        collection = course['modules']
+
+        clip = None
+
+        for module_ in collection:
+            if name in (module_.get('moduleName'), module_.get('name')):
+                for clip_ in module_.get('clips', []):
+                    clip_index = clip_.get('clipIndex')
+                    if clip_index is None:
+                        clip_index = clip_.get('index')
+                    if clip_index is None:
+                        continue
+                    if compat_str(clip_index) == clip_idx:
+                        clip = clip_
+                        break
+
+        if not clip:
+            raise ExtractorError('Unable to resolve clip')
+
+        title = clip['title']
+        clip_id = clip.get('clipName') or clip.get('name') or clip['clipId']
+
+        QUALITIES = {
+            'low': {'width': 640, 'height': 480},
+            'medium': {'width': 848, 'height': 640},
+            'high': {'width': 1024, 'height': 768},
+            'high-widescreen': {'width': 1280, 'height': 720},
+        }
+
+        QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',)
+        quality_key = qualities(QUALITIES_PREFERENCE)
+
+        AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities'])
+
+        ALLOWED_QUALITIES = (
+            AllowedQuality('webm', ['high', ]),
+            AllowedQuality('mp4', ['low', 'medium', 'high', ]),
+        )
+
+        # Some courses also offer widescreen resolution for high quality (see
+        # https://github.com/ytdl-org/youtube-dl/issues/7766)
+        widescreen = course.get('supportsWideScreenVideoFormats') is True
+        best_quality = 'high-widescreen' if widescreen else 'high'
+        if widescreen:
+            for allowed_quality in ALLOWED_QUALITIES:
+                allowed_quality.qualities.append(best_quality)
+
+        # To minimize the number of calls to the ViewClip API and reduce the
+        # probability of being throttled or banned by Pluralsight, request only
+        # a single format unless a formats listing is explicitly requested.
+        if self._downloader.params.get('listformats', False):
+            allowed_qualities = ALLOWED_QUALITIES
+        else:
+            def guess_allowed_qualities():
+                req_format = self._downloader.params.get('format') or 'best'
+                req_format_split = req_format.split('-', 1)
+                if len(req_format_split) > 1:
+                    req_ext, req_quality = req_format_split
+                    req_quality = '-'.join(req_quality.split('-')[:2])
+                    for allowed_quality in ALLOWED_QUALITIES:
+                        if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
+                            return (AllowedQuality(req_ext, (req_quality, )), )
+                req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4'
+                return (AllowedQuality(req_ext, (best_quality, )), )
+            allowed_qualities = guess_allowed_qualities()
+
+        formats = []
+        for ext, qualities_ in allowed_qualities:
+            for quality in qualities_:
+                f = QUALITIES[quality].copy()
+                clip_post = {
+                    'author': author,
+                    'includeCaptions': 'false',
+                    'clipIndex': int(clip_idx),
+                    'courseName': course_name,
+                    'locale': 'en',
+                    'moduleName': name,
+                    'mediaType': ext,
+                    'quality': '%dx%d' % (f['width'], f['height']),
+                }
+                format_id = '%s-%s' % (ext, quality)
+
+                try:
+                    viewclip = self._download_json(
+                        self._GRAPHQL_EP, display_id,
+                        'Downloading %s viewclip graphql' % format_id,
+                        data=json.dumps({
+                            'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post,
+                            'variables': {}
+                        }).encode('utf-8'),
+                        headers=self._GRAPHQL_HEADERS)['data']['viewClip']
+                except ExtractorError:
+                    # The legacy endpoint still works but will most likely go away soon
+                    viewclip = self._download_json(
+                        '%s/video/clips/viewclip' % self._API_BASE, display_id,
+                        'Downloading %s viewclip JSON' % format_id, fatal=False,
+                        data=json.dumps(clip_post).encode('utf-8'),
+                        headers={'Content-Type': 'application/json;charset=utf-8'})
+
+                # Pluralsight tracks multiple sequential calls to the ViewClip
+                # API and starts to return 429 HTTP errors after some time (see
+                # https://github.com/ytdl-org/youtube-dl/pull/6989). Moreover, it
+                # may even lead to an account ban (see
+                # https://github.com/ytdl-org/youtube-dl/issues/6842).
+                # To somewhat reduce the probability of these consequences we
+                # sleep for a random amount of time before each call to ViewClip.
+                self._sleep(
+                    random.randint(2, 5), display_id,
+                    '%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')
+
+                if not viewclip:
+                    continue
+
+                clip_urls = viewclip.get('urls')
+                if not isinstance(clip_urls, list):
+                    continue
+
+                for clip_url_data in clip_urls:
+                    clip_url = clip_url_data.get('url')
+                    if not clip_url:
+                        continue
+                    cdn = clip_url_data.get('cdn')
+                    clip_f = f.copy()
+                    clip_f.update({
+                        'url': clip_url,
+                        'ext': ext,
+                        'format_id': '%s-%s' % (format_id, cdn) if cdn else format_id,
+                        'quality': quality_key(quality),
+                        'source_preference': int_or_none(clip_url_data.get('rank')),
+                    })
+                    formats.append(clip_f)
+
+        self._sort_formats(formats)
+
+        duration = int_or_none(
+            clip.get('duration')) or parse_duration(clip.get('formattedDuration'))
+
+        # TODO: other languages?
+        subtitles = self.extract_subtitles(
+            author, clip_idx, clip.get('clipId'), 'en', name, duration, display_id)
+
+        return {
+            'id': clip_id,
+            'title': title,
+            'duration': duration,
+            'creator': author,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class PluralsightCourseIE(PluralsightBaseIE):
+    IE_NAME = 'pluralsight:course'
+    _VALID_URL = r'https?://(?:(?:www|app)\.)?pluralsight\.com/(?:library/)?courses/(?P<id>[^/]+)'
+    _TESTS = [{
+        # Free course from Pluralsight Starter Subscription for Microsoft TechNet
+        # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz
+        'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas',
+        'info_dict': {
+            'id': 'hosting-sql-server-windows-azure-iaas',
+            'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals',
+            'description': 'md5:61b37e60f21c4b2f91dc621a977d0986',
+        },
+        'playlist_count': 31,
+    }, {
+        # available without pluralsight account
+        'url': 'https://www.pluralsight.com/courses/angularjs-get-started',
+        'only_matching': True,
+    }, {
+        'url': 'https://app.pluralsight.com/library/courses/understanding-microsoft-azure-amazon-aws/table-of-contents',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        # TODO: PSM cookie
+
+        course = self._download_course(course_id, url, course_id)
+
+        title = course['title']
+        course_name = course['name']
+        course_data = course['modules']
+        description = course.get('description') or course.get('shortDescription')
+
+        entries = []
+        for num, module in enumerate(course_data, 1):
+            author = module.get('author')
+            module_name = module.get('name')
+            if not author or not module_name:
+                continue
+            for clip in module.get('clips', []):
+                clip_index = int_or_none(clip.get('index'))
+                if clip_index is None:
+                    continue
+                clip_url = update_url_query(
+                    '%s/player' % self._API_BASE, query={
+                        'mode': 'live',
+                        'course': course_name,
+                        'author': author,
+                        'name': module_name,
+                        'clip': clip_index,
+                    })
+                entries.append({
+                    '_type': 'url_transparent',
+                    'url': clip_url,
+                    'ie_key': PluralsightIE.ie_key(),
+                    'chapter': module.get('title'),
+                    'chapter_number': num,
+                    'chapter_id': module.get('moduleRef'),
+                })
+
+        return self.playlist_result(entries, course_id, title, description)
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py
new file mode 100644 (file)
index 0000000..e782e3f
--- /dev/null
+++ b/youtube_dl/extractor/podomatic.py
@@ -0,0 +1,76 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PodomaticIE(InfoExtractor):
+    IE_NAME = 'podomatic'
+    _VALID_URL = r'''(?x)
+                    (?P<proto>https?)://
+                        (?:
+                            (?P<channel>[^.]+)\.podomatic\.com/entry|
+                            (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes
+                        )/
+                        (?P<id>[^/?#&]+)
+                '''
+
+    _TESTS = [{
+        'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
+        'md5': '84bb855fcf3429e6bf72460e1eed782d',
+        'info_dict': {
+            'id': '2009-01-02T16_03_35-08_00',
+            'ext': 'mp3',
+            'uploader': 'Science Teaching Tips',
+            'uploader_id': 'scienceteachingtips',
+            'title': '64.  When the Moon Hits Your Eye',
+            'duration': 446,
+        }
+    }, {
+        'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
+        'md5': 'd2cf443931b6148e27638650e2638297',
+        'info_dict': {
+            'id': '2013-11-15T16_31_21-08_00',
+            'ext': 'mp3',
+            'uploader': 'Ostbahnhof / Techno Mix',
+            'uploader_id': 'ostbahnhof',
+            'title': 'Einunddreizig',
+            'duration': 3799,
+        }
+    }, {
+        'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        channel = mobj.group('channel') or mobj.group('channel_2')
+
+        json_url = '%s://%s.podomatic.com/entry/embed_params/%s?permalink=true&rtmp=0' % (
+            mobj.group('proto'), channel, video_id)
+        data = self._download_json(
+            json_url, video_id, 'Downloading video info')
+
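+        # prefer the direct download link; otherwise derive an HTTP URL
+        # from the RTMP streamer address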
+        video_url = data.get('downloadLink')
+        if not video_url:
+            video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation'])
+        uploader = data['podcast']
+        title = data['title']
+        thumbnail = data['imageLocation']
+        duration = int_or_none(data.get('length'), 1000)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': channel,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py
new file mode 100644 (file)
index 0000000..80222d4
--- /dev/null
+++ b/youtube_dl/extractor/pokemon.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+)
+
+
+class PokemonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))'
+    _TESTS = [{
+        'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/',
+        'md5': '2fe8eaec69768b25ef898cda9c43062e',
+        'info_dict': {
+            'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4',
+            'ext': 'mp4',
+            'title': 'The Ol’ Raise and Switch!',
+            'description': 'md5:7db77f7107f98ba88401d3adc80ff7af',
+        },
+        'add_ie': ['LimelightMedia'],
+    }, {
+        # no data-video-title
+        'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008',
+        'info_dict': {
+            'id': 'dfbaf830d7e54e179837c50c0c6cc0e1',
+            'ext': 'mp4',
+            'title': "Pokémon : L'ascension de Darkrai",
+            'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5',
+        },
+        'add_ie': ['LimelightMedia'],
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, video_id or display_id)
+        video_data = extract_attributes(self._search_regex(
+            r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'),
+            webpage, 'video data element'))
+        video_id = video_data['data-video-id']
+        title = video_data.get('data-video-title') or self._html_search_meta(
+            'pkm-title', webpage, 'title', default=None) or self._search_regex(
+            r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title')
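+        # playback is handled by the Limelight extractor via a
+        # url_transparent result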
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': 'limelight:media:%s' % video_id,
+            'title': title,
+            'description': video_data.get('data-video-summary'),
+            'thumbnail': video_data.get('data-video-poster'),
+            'series': 'Pokémon',
+            'season_number': int_or_none(video_data.get('data-video-season')),
+            'episode': title,
+            'episode_number': int_or_none(video_data.get('data-video-episode')),
+            'ie_key': 'LimelightMedia',
+        }
diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py
new file mode 100644 (file)
index 0000000..978d6f8
--- /dev/null
@@ -0,0 +1,180 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+    compat_urlparse
+)
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    strip_or_none,
+    unified_timestamp,
+)
+
+
+class PolskieRadioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
+        'info_dict': {
+            'id': '1587943',
+            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
+            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
+        },
+        'playlist': [{
+            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
+            'info_dict': {
+                'id': '1540576',
+                'ext': 'mp3',
+                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
+                'timestamp': 1456594200,
+                'upload_date': '20160227',
+                'duration': 2364,
+                'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$'
+            },
+        }],
+    }, {
+        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
+        'info_dict': {
+            'id': '1635803',
+            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
+            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
+        'only_matching': True,
+    }, {
+        # with mp4 video
+        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        content = self._search_regex(
+            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
+            webpage, 'content')
+
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
+            webpage, 'timestamp', fatal=False))
+
+        thumbnail_url = self._og_search_thumbnail(webpage)
+
+        entries = []
+
+        media_urls = set()
+
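+        # Each attachment is embedded as a JSON blob in a data-media
+        # attribute; parse every one and dedupe by media URL.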
+        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
+            media = self._parse_json(data_media, playlist_id, fatal=False)
+            if not media or not media.get('file') or not media.get('desc'):
+                continue
+            media_url = self._proto_relative_url(media['file'], 'http:')
+            if media_url in media_urls:
+                continue
+            media_urls.add(media_url)
+            entries.append({
+                'id': compat_str(media['id']),
+                'url': media_url,
+                'title': compat_urllib_parse_unquote(media['desc']),
+                'duration': int_or_none(media.get('length')),
+                'vcodec': 'none' if media.get('provider') == 'audio' else None,
+                'timestamp': timestamp,
+                'thumbnail': thumbnail_url
+            })
+
+        title = self._og_search_title(webpage).strip()
+        description = strip_or_none(self._og_search_description(webpage))
+
+        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class PolskieRadioCategoryIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA',
+        'info_dict': {
+            'id': '5102',
+            'title': 'HISTORIA ŻYWA',
+        },
+        'playlist_mincount': 38,
+    }, {
+        'url': 'http://www.polskieradio.pl/7/4807',
+        'info_dict': {
+            'id': '4807',
+            'title': 'Vademecum 1050. rocznicy Chrztu Polski'
+        },
+        'playlist_mincount': 5
+    }, {
+        'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source',
+        'only_matching': True
+    }, {
+        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
+        'info_dict': {
+            'id': '4143',
+            'title': 'Kierunek Kraków',
+        },
+        'playlist_mincount': 61
+    }, {
+        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
+        'info_dict': {
+            'id': '214',
+            'title': 'Muzyka',
+        },
+        'playlist_mincount': 61
+    }, {
+        'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url)
+
+    def _entries(self, url, page, category_id):
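+        # Walk the category pagination, yielding article links from each page
+        # until no "next" link remains.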
+        content = page
+        for page_num in itertools.count(2):
+            for a_entry, entry_id in re.findall(
+                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
+                    content):
+                entry = extract_attributes(a_entry)
+                href = entry.get('href')
+                if not href:
+                    continue
+                yield self.url_result(
+                    compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(),
+                    entry_id, entry.get('title'))
+            mobj = re.search(
+                r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
+                content)
+            if not mobj:
+                break
+            next_url = compat_urlparse.urljoin(url, mobj.group('url'))
+            content = self._download_webpage(
+                next_url, category_id, 'Downloading page %s' % page_num)
+
+    def _real_extract(self, url):
+        category_id = self._match_id(url)
+        webpage = self._download_webpage(url, category_id)
+        title = self._html_search_regex(
+            r'<title>([^<]+) - [^<]+ - [^<]+</title>',
+            webpage, 'title', fatal=False)
+        return self.playlist_result(
+            self._entries(url, webpage, category_id),
+            category_id, title)
diff --git a/youtube_dl/extractor/popcorntimes.py b/youtube_dl/extractor/popcorntimes.py
new file mode 100644 (file)
index 0000000..7bf7f98
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_chr,
+)
+from ..utils import int_or_none
+
+
+class PopcorntimesIE(InfoExtractor):
+    _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy',
+        'md5': '93f210991ad94ba8c3485950a2453257',
+        'info_dict': {
+            'id': 'A1XCFvz',
+            'display_id': 'haensel-und-gretel-opera-fantasy',
+            'ext': 'mp4',
+            'title': 'Hänsel und Gretel',
+            'description': 'md5:1b8146791726342e7b22ce8125cf6945',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'creator': 'John Paul',
+            'release_date': '19541009',
+            'duration': 4260,
+            'tbr': 5380,
+            'width': 720,
+            'height': 540,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._search_regex(
+            r'<h1>([^<]+)', webpage, 'title',
+            default=None) or self._html_search_meta(
+            'ya:ovs:original_name', webpage, 'title', fatal=True)
+
+        loc = self._search_regex(
+            r'PCTMLOC\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'loc',
+            group='value')
+
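+        # PCTMLOC appears to be a base64 string obfuscated with ROT13 applied
+        # to ASCII letters only; undo the rotation before decoding.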
+        loc_b64 = ''
+        for c in loc:
+            c_ord = ord(c)
+            if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'):
+                upper = ord('Z') if c_ord <= ord('Z') else ord('z')
+                c_ord += 13
+                if upper < c_ord:
+                    c_ord -= 26
+            loc_b64 += compat_chr(c_ord)
+
+        video_url = compat_b64decode(loc_b64).decode('utf-8')
+
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+class=["\']pt-movie-desc[^>]+>(.+?)</div>', webpage,
+            'description', fatal=False)
+
+        thumbnail = self._search_regex(
+            r'<img[^>]+class=["\']video-preview[^>]+\bsrc=(["\'])(?P<value>(?:(?!\1).)+)\1',
+            webpage, 'thumbnail', default=None,
+            group='value') or self._og_search_thumbnail(webpage)
+
+        creator = self._html_search_meta(
+            'video:director', webpage, 'creator', default=None)
+
+        release_date = self._html_search_meta(
+            'video:release_date', webpage, default=None)
+        if release_date:
+            release_date = release_date.replace('-', '')
+
+        def int_meta(name):
+            return int_or_none(self._html_search_meta(
+                name, webpage, default=None))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'creator': creator,
+            'release_date': release_date,
+            'duration': int_meta('video:duration'),
+            'tbr': int_meta('ya:ovs:bitrate'),
+            'width': int_meta('og:video:width'),
+            'height': int_meta('og:video:height'),
+            'http_headers': {
+                'Referer': url,
+            },
+        }
diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py
new file mode 100644 (file)
index 0000000..9f834fb
--- /dev/null
@@ -0,0 +1,76 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class PopcornTVIE(InfoExtractor):
+    _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P<display_id>[^/]+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183',
+        'md5': '47d65a48d147caf692ab8562fe630b45',
+        'info_dict': {
+            'id': '9183',
+            'display_id': 'food-wars-battaglie-culinarie-episodio-01',
+            'ext': 'mp4',
+            'title': 'Food Wars, Battaglie Culinarie | Episodio 01',
+            'description': 'md5:b8bea378faae4651d3b34c6e112463d0',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1497610857,
+            'upload_date': '20170616',
+            'duration': 1440,
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, video_id = mobj.group('display_id', 'id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        m3u8_url = extract_attributes(
+            self._search_regex(
+                r'(<link[^>]+itemprop=["\'](?:content|embed)Url[^>]*>)',
+                webpage, 'content'
+            ))['href']
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+
+        title = self._search_regex(
+            r'<h1[^>]+itemprop=["\']name[^>]*>([^<]+)', webpage,
+            'title', default=None) or self._og_search_title(webpage)
+
+        description = self._html_search_regex(
+            r'(?s)<article[^>]+itemprop=["\']description[^>]*>(.+?)</article>',
+            webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'uploadDate', webpage, 'timestamp'))
+        duration = int_or_none(self._html_search_meta(
+            'duration', webpage), invscale=60)
+        view_count = int_or_none(self._html_search_meta(
+            'interactionCount', webpage, 'view count'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
new file mode 100644 (file)
index 0000000..20eac64
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class Porn91IE(InfoExtractor):
+    IE_NAME = '91porn'
+    _VALID_URL = r'https?://(?:www\.)?91porn\.com/.+?\?viewkey=(?P<id>\w+)'
+
+    _TEST = {
+        'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
+        'md5': '7fcdb5349354f40d41689bd0fa8db05a',
+        'info_dict': {
+            'id': '7e42283b4f5ab36da134',
+            'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
+            'ext': 'mp4',
+            'duration': 431,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
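+        # Request the Chinese interface so that the regexes below match.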
+        self._set_cookie('91porn.com', 'language', 'cn_CN')
+
+        webpage = self._download_webpage(
+            'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id)
+
+        if '作为游客,你每天只可观看10个视频' in webpage:
+            raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
+
+        title = self._search_regex(
+            r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
+        title = title.replace('\n', '')
+
+        video_link_url = self._search_regex(
+            r'<textarea[^>]+id=["\']fm-video_link[^>]+>([^<]+)</textarea>',
+            webpage, 'video link')
+        videopage = self._download_webpage(video_link_url, video_id)
+
+        info_dict = self._parse_html5_media_entries(url, videopage, video_id)[0]
+
+        duration = parse_duration(self._search_regex(
+            r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
+
+        comment_count = int_or_none(self._search_regex(
+            r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'comment_count': comment_count,
+            'age_limit': self._rta_search(webpage),
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py
new file mode 100644 (file)
index 0000000..5726cab
--- /dev/null
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    parse_filesize,
+    str_to_int,
+)
+
+
+class PornComIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339',
+        'md5': '3f30ce76267533cd12ba999263156de7',
+        'info_dict': {
+            'id': '2603339',
+            'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec',
+            'ext': 'mp4',
+            'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 551,
+            'view_count': int,
+            'age_limit': 18,
+            'categories': list,
+            'tags': list,
+        },
+    }, {
+        'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        config = self._parse_json(
+            self._search_regex(
+                (r'=\s*({.+?})\s*;\s*v1ar\b',
+                 r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='),
+                webpage, 'config', default='{}'),
+            display_id, transform_source=js_to_json, fatal=False)
+
+        if config:
+            title = config['title']
+            formats = [{
+                'url': stream['url'],
+                'format_id': stream.get('id'),
+                'height': int_or_none(self._search_regex(
+                    r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None))
+            } for stream in config['streams'] if stream.get('url')]
+            thumbnail = (compat_urlparse.urljoin(
+                config['thumbCDN'], config['poster'])
+                if config.get('thumbCDN') and config.get('poster') else None)
+            duration = int_or_none(config.get('length'))
+        else:
+            title = self._search_regex(
+                (r'<title>([^<]+)</title>', r'<h1[^>]*>([^<]+)</h1>'),
+                webpage, 'title')
+            formats = [{
+                'url': compat_urlparse.urljoin(url, format_url),
+                'format_id': '%sp' % height,
+                'height': int(height),
+                'filesize_approx': parse_filesize(filesize),
+            } for format_url, height, filesize in re.findall(
+                r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<',
+                webpage)]
+            thumbnail = None
+            duration = None
+
+        self._sort_formats(formats)
+
+        view_count = str_to_int(self._search_regex(
+            (r'Views:\s*</span>\s*<span>\s*([\d,.]+)',
+             r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage,
+            'view count', fatal=False))
+
+        def extract_list(kind):
+            s = self._search_regex(
+                (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(),
+                 r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()),
+                webpage, kind, fatal=False)
+            return re.findall(r'<a[^>]+>([^<]+)</a>', s or '')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+            'age_limit': 18,
+            'categories': extract_list('categories'),
+            'tags': extract_list('tags'),
+        }
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
new file mode 100644 (file)
index 0000000..c6052ac
--- /dev/null
@@ -0,0 +1,121 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    merge_dicts,
+    urljoin,
+)
+
+
+class PornHdIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
+    _TESTS = [{
+        'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+        'md5': '87f1540746c1d32ec7a2305c12b96b25',
+        'info_dict': {
+            'id': '9864',
+            'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+            'ext': 'mp4',
+            'title': 'Restroom selfie masturbation',
+            'description': 'md5:3748420395e03e31ac96857a8f125b2b',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 18,
+        },
+        'skip': 'HTTP Error 404: Not Found',
+    }, {
+        'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
+        'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de',
+        'info_dict': {
+            'id': '1962',
+            'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
+            'ext': 'mp4',
+            'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759',
+            'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
+            'age_limit': 18,
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id or video_id)
+
+        title = self._html_search_regex(
+            [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
+             r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
+
+        sources = self._parse_json(js_to_json(self._search_regex(
+            r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
+            webpage, 'sources', default='{}')), video_id)
+
+        info = {}
+        if not sources:
+            entries = self._parse_html5_media_entries(url, webpage, video_id)
+            if entries:
+                info = entries[0]
+
+        if not sources and not info:
+            message = self._html_search_regex(
+                r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
+                webpage, 'error message', group='value')
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+
+        formats = []
+        for format_id, video_url in sources.items():
+            video_url = urljoin(url, video_url)
+            if not video_url:
+                continue
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]', format_id, 'height', default=None))
+            formats.append({
+                'url': video_url,
+                'ext': determine_ext(video_url, 'mp4'),
+                'format_id': format_id,
+                'height': height,
+            })
+        if formats:
+            info['formats'] = formats
+        self._sort_formats(info.get('formats') or [])
+
+        description = self._html_search_regex(
+            (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>',
+             r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1'),
+            webpage, 'description', fatal=False,
+            group='value') or self._html_search_meta(
+            'description', webpage, default=None) or self._og_search_description(webpage)
+        view_count = int_or_none(self._html_search_regex(
+            r'(\d+) views\s*<', webpage, 'view count', fatal=False))
+        thumbnail = self._search_regex(
+            r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
+            'thumbnail', default=None, group='url')
+
+        like_count = int_or_none(self._search_regex(
+            (r'(\d+)</span>\s*likes',
+             r'(\d+)\s*</11[^>]+>(?:&nbsp;|\s)*\blikes',
+             r'class=["\']save-count["\'][^>]*>\s*(\d+)'),
+            webpage, 'like count', fatal=False))
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'view_count': view_count,
+            'like_count': like_count,
+            'formats': formats,
+            'age_limit': 18,
+        })
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
new file mode 100644 (file)
index 0000000..3567a32
--- /dev/null
@@ -0,0 +1,611 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+import itertools
+import operator
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+    compat_urllib_request,
+)
+from .openload import PhantomJSwrapper
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    NO_DEFAULT,
+    orderedSet,
+    remove_quotes,
+    str_to_int,
+    url_or_none,
+)
+
+
+class PornHubBaseIE(InfoExtractor):
+    def _download_webpage_handle(self, *args, **kwargs):
+        def dl(*args, **kwargs):
+            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
+
+        webpage, urlh = dl(*args, **kwargs)
+
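+        # PornHub may serve a JS challenge page instead of the real content;
+        # detect its markers, solve it once with PhantomJS and retry the
+        # original request.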
+        if any(re.search(p, webpage) for p in (
+                r'<body\b[^>]+\bonload=["\']go\(\)',
+                r'document\.cookie\s*=\s*["\']RNKEY=',
+                r'document\.location\.reload\(true\)')):
+            url_or_request = args[0]
+            url = (url_or_request.get_full_url()
+                   if isinstance(url_or_request, compat_urllib_request.Request)
+                   else url_or_request)
+            phantom = PhantomJSwrapper(self, required_version='2.0')
+            phantom.get(url, html=webpage)
+            webpage, urlh = dl(*args, **kwargs)
+
+        return webpage, urlh
+
+
+class PornHubIE(PornHubBaseIE):
+    IE_DESC = 'PornHub and Thumbzilla'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                            (?:www\.)?thumbzilla\.com/video/
+                        )
+                        (?P<id>[\da-z]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
+        'md5': '1e19b41231a02eba417839222ac9d58e',
+        'info_dict': {
+            'id': '648719015',
+            'ext': 'mp4',
+            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
+            'uploader': 'Babes',
+            'upload_date': '20130628',
+            'duration': 361,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+            'tags': list,
+            'categories': list,
+        },
+    }, {
+        # non-ASCII title
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
+        'info_dict': {
+            'id': '1331683002',
+            'ext': 'mp4',
+            'title': '重庆婷婷女王足交',
+            'uploader': 'Unknown',
+            'upload_date': '20150213',
+            'duration': 1753,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+            'tags': list,
+            'categories': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # subtitles
+        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
+        'info_dict': {
+            'id': 'ph5af5fef7c2aa7',
+            'ext': 'mp4',
+            'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
+            'uploader': 'BFFs',
+            'duration': 622,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+            'tags': list,
+            'categories': list,
+            'subtitles': {
+                'en': [{
+                    "ext": 'srt'
+                }]
+            },
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
+        'only_matching': True,
+    }, {
+        # removed at the request of cam4.com
+        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
+        'only_matching': True,
+    }, {
+        # removed at the request of the copyright owner
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
+        'only_matching': True,
+    }, {
+        # removed by uploader
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
+        'only_matching': True,
+    }, {
+        # private video
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
+            webpage)
+
+    def _extract_count(self, pattern, webpage, name):
+        return str_to_int(self._search_regex(
+            pattern, webpage, '%s count' % name, fatal=False))
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host') or 'pornhub.com'
+        video_id = mobj.group('id')
+
+        if 'premium' in host:
+            if not self._downloader.params.get('cookiefile'):
+                raise ExtractorError(
+                    'PornHub Premium requires authentication.'
+                    ' You may want to use --cookies.',
+                    expected=True)
+
+        self._set_cookie(host, 'age_verified', '1')
+
+        def dl_webpage(platform):
+            self._set_cookie(host, 'platform', platform)
+            return self._download_webpage(
+                'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
+                video_id, 'Downloading %s webpage' % platform)
+
+        webpage = dl_webpage('pc')
+
+        error_msg = self._html_search_regex(
+            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+            webpage, 'error message', default=None, group='error')
+        if error_msg:
+            error_msg = re.sub(r'\s+', ' ', error_msg)
+            raise ExtractorError(
+                'PornHub said: %s' % error_msg,
+                expected=True, video_id=video_id)
+
+        # flashvars' video_title replaces non-ASCII characters with whitespace
+        # (see http://www.pornhub.com/view_video.php?viewkey=1331683002), so it
+        # is not relied upon anymore.
+        title = self._html_search_meta(
+            'twitter:title', webpage, default=None) or self._html_search_regex(
+            (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
+             r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='title')
+
+        video_urls = []
+        video_urls_set = set()
+        subtitles = {}
+
+        flashvars = self._parse_json(
+            self._search_regex(
+                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
+            video_id)
+        if flashvars:
+            subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
+            if subtitle_url:
+                subtitles.setdefault('en', []).append({
+                    'url': subtitle_url,
+                    'ext': 'srt',
+                })
+            thumbnail = flashvars.get('image_url')
+            duration = int_or_none(flashvars.get('video_duration'))
+            media_definitions = flashvars.get('mediaDefinitions')
+            if isinstance(media_definitions, list):
+                for definition in media_definitions:
+                    if not isinstance(definition, dict):
+                        continue
+                    video_url = definition.get('videoUrl')
+                    if not video_url or not isinstance(video_url, compat_str):
+                        continue
+                    if video_url in video_urls_set:
+                        continue
+                    video_urls_set.add(video_url)
+                    video_urls.append(
+                        (video_url, int_or_none(definition.get('quality'))))
+        else:
+            thumbnail, duration = [None] * 2
+
+        def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
+            assignments = self._search_regex(
+                pattern, webpage, 'encoded url', default=default)
+            if not assignments:
+                return {}
+
+            assignments = assignments.split(';')
+
+            js_vars = {}
+
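+            # Minimal evaluator for the obfuscated assignments: strips JS
+            # comments, resolves '+' concatenation and variable references.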
+            def parse_js_value(inp):
+                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
+                if '+' in inp:
+                    inps = inp.split('+')
+                    return functools.reduce(
+                        operator.concat, map(parse_js_value, inps))
+                inp = inp.strip()
+                if inp in js_vars:
+                    return js_vars[inp]
+                return remove_quotes(inp)
+
+            for assn in assignments:
+                assn = assn.strip()
+                if not assn:
+                    continue
+                assn = re.sub(r'var\s+', '', assn)
+                vname, value = assn.split('=', 1)
+                js_vars[vname] = parse_js_value(value)
+            return js_vars
+
+        def add_video_url(video_url):
+            v_url = url_or_none(video_url)
+            if not v_url:
+                return
+            if v_url in video_urls_set:
+                return
+            video_urls.append((v_url, None))
+            video_urls_set.add(v_url)
+
+        if not video_urls:
+            FORMAT_PREFIXES = ('media', 'quality')
+            js_vars = extract_js_vars(
+                webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
+                default=None)
+            if js_vars:
+                for key, format_url in js_vars.items():
+                    if any(key.startswith(p) for p in FORMAT_PREFIXES):
+                        add_video_url(format_url)
+            if not video_urls and re.search(
+                    r'<[^>]+\bid=["\']lockedPlayer', webpage):
+                raise ExtractorError(
+                    'Video %s is locked' % video_id, expected=True)
+
+        if not video_urls:
+            js_vars = extract_js_vars(
+                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
+            add_video_url(js_vars['mediastring'])
+
+        for mobj in re.finditer(
+                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
+                webpage):
+            video_url = mobj.group('url')
+            if video_url not in video_urls_set:
+                video_urls.append((video_url, None))
+                video_urls_set.add(video_url)
+
+        upload_date = None
+        formats = []
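+        # The CDN path appears to embed the upload date as /YYYYMM/DD/;
+        # recover it from the first format URL that carries it.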
+        for video_url, height in video_urls:
+            if not upload_date:
+                upload_date = self._search_regex(
+                    r'/(\d{6}/\d{2})/', video_url, 'upload date', default=None)
+                if upload_date:
+                    upload_date = upload_date.replace('/', '')
+            ext = determine_ext(video_url)
+            if ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    video_url, video_id, mpd_id='dash', fatal=False))
+                continue
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                continue
+            tbr = None
+            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
+            if mobj:
+                if not height:
+                    height = int(mobj.group('height'))
+                tbr = int(mobj.group('tbr'))
+            formats.append({
+                'url': video_url,
+                'format_id': '%dp' % height if height else None,
+                'height': height,
+                'tbr': tbr,
+            })
+        self._sort_formats(formats)
+
+        video_uploader = self._html_search_regex(
+            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
+            webpage, 'uploader', fatal=False)
+
+        view_count = self._extract_count(
+            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+        like_count = self._extract_count(
+            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+        dislike_count = self._extract_count(
+            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        comment_count = self._extract_count(
+            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
+
+        def extract_list(meta_key):
+            div = self._search_regex(
+                r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
+                % meta_key, webpage, meta_key, default=None)
+            if div:
+                return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
+
+        return {
+            'id': video_id,
+            'uploader': video_uploader,
+            'upload_date': upload_date,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
+            'formats': formats,
+            'age_limit': 18,
+            'tags': extract_list('tags'),
+            'categories': extract_list('categories'),
+            'subtitles': subtitles,
+        }
+
+
+class PornHubPlaylistBaseIE(PornHubBaseIE):
+    def _extract_entries(self, webpage, host):
+        # Only process the container div holding the main playlist content,
+        # skipping the drop-down menu that uses a similar markup pattern for
+        # videos (see https://github.com/ytdl-org/youtube-dl/issues/11594).
+        container = self._search_regex(
+            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
+            'container', default=webpage)
+
+        return [
+            self.url_result(
+                'http://www.%s/%s' % (host, video_url),
+                PornHubIE.ie_key(), video_title=title)
+            for video_url, title in orderedSet(re.findall(
+                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
+                container))
+        ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        playlist_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = self._extract_entries(webpage, host)
+
+        playlist = self._parse_json(
+            self._search_regex(
+                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
+                'playlist', default='{}'),
+            playlist_id, fatal=False)
+        title = playlist.get('title') or self._search_regex(
+            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
+
+        return self.playlist_result(
+            entries, playlist_id, title, playlist.get('description'))
+
+
+class PornHubUserIE(PornHubPlaylistBaseIE):
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _TESTS = [{
+        'url': 'https://www.pornhub.com/model/zoe_ph',
+        'playlist_mincount': 118,
+    }, {
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious',
+        'info_dict': {
+            'id': 'liz-vicious',
+        },
+        'playlist_mincount': 118,
+    }, {
+        'url': 'https://www.pornhub.com/users/russianveet69',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/channels/povd',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group('id')
+        return self.url_result(
+            '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(),
+            video_id=user_id)
+
+
+class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
+    @staticmethod
+    def _has_more(webpage):
+        return re.search(
+            r'''(?x)
+                <li[^>]+\bclass=["\']page_next|
+                <link[^>]+\brel=["\']next|
+                <button[^>]+\bid=["\']moreDataBtn
+            ''', webpage) is not None
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        item_id = mobj.group('id')
+
+        page = int_or_none(self._search_regex(
+            r'\bpage=(\d+)', url, 'page', default=None))
+
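+        # If the URL pins a specific page, extract only that one; otherwise
+        # paginate until a 404, an empty page or the last page.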
+        entries = []
+        for page_num in (page, ) if page is not None else itertools.count(1):
+            try:
+                webpage = self._download_webpage(
+                    url, item_id, 'Downloading page %d' % page_num,
+                    query={'page': page_num})
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+                    break
+                raise
+            page_entries = self._extract_entries(webpage, host)
+            if not page_entries:
+                break
+            entries.extend(page_entries)
+            if not self._has_more(webpage):
+                break
+
+        return self.playlist_result(orderedSet(entries), item_id)
+
+
+class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.pornhub.com/model/zoe_ph/videos',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pornhub.com/users/rushandlia/videos',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
+        'info_dict': {
+            'id': 'pornstar/jenny-blighe/videos',
+        },
+        'playlist_mincount': 149,
+    }, {
+        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
+        'info_dict': {
+            'id': 'pornstar/jenny-blighe/videos',
+        },
+        'playlist_mincount': 40,
+    }, {
+        # default sorting as Top Rated Videos
+        'url': 'https://www.pornhub.com/channels/povd/videos',
+        'info_dict': {
+            'id': 'channels/povd/videos',
+        },
+        'playlist_mincount': 293,
+    }, {
+        # Top Rated Videos
+        'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
+        'only_matching': True,
+    }, {
+        # Most Recent Videos
+        'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
+        'only_matching': True,
+    }, {
+        # Most Viewed Videos
+        'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
+        'only_matching': True,
+    }, {
+        # Most Viewed Videos
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
+        'only_matching': True,
+    }, {
+        # Top Rated Videos
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
+        'only_matching': True,
+    }, {
+        # Longest Videos
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
+        'only_matching': True,
+    }, {
+        # Newest Videos
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/video',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/video?page=3',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/video/search?search=123',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/categories/teen',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/categories/teen?page=3',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/hd',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/hd?page=3',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/described-video',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/described-video?page=2',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhub.com/playlist/44121572',
+        'info_dict': {
+            'id': 'playlist/44121572',
+        },
+        'playlist_mincount': 132,
+    }, {
+        'url': 'https://www.pornhub.com/playlist/4667351',
+        'only_matching': True,
+    }, {
+        'url': 'https://de.pornhub.com/playlist/4667351',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
+                else super(PornHubPagedVideoListIE, cls).suitable(url))
+
+
+class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _TESTS = [{
+        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
+        'info_dict': {
+            'id': 'jenny-blighe',
+        },
+        'playlist_mincount': 129,
+    }, {
+        'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py
new file mode 100644 (file)
index 0000000..1b5b9a3
--- /dev/null
@@ -0,0 +1,85 @@
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class PornotubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:\w+\.)?pornotube\.com/(?:[^?#]*?)/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.pornotube.com/orientation/straight/video/4964/title/weird-hot-and-wet-science',
+        'md5': '60fc5a4f0d93a97968fc7999d98260c9',
+        'info_dict': {
+            'id': '4964',
+            'ext': 'mp4',
+            'upload_date': '20141203',
+            'title': 'Weird Hot and Wet Science',
+            'description': 'md5:a8304bef7ef06cb4ab476ca6029b01b0',
+            'categories': ['Adult Humor', 'Blondes'],
+            'uploader': 'Alpha Blue Archives',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1417582800,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        token = self._download_json(
+            'https://api.aebn.net/auth/v2/origins/authenticate',
+            video_id, note='Downloading token',
+            data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+                'Origin': 'http://www.pornotube.com',
+            })['tokenKey']
+
+        video_url = self._download_json(
+            'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id,
+            video_id, note='Downloading delivery information',
+            headers={'Authorization': token})['mediaUrl']
+
+        FIELDS = (
+            'title', 'description', 'startSecond', 'endSecond', 'publishDate',
+            'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber'
+        )
+
+        info = self._download_json(
+            'https://api.aebn.net/content/v2/clips/%s?fields=%s'
+            % (video_id, ','.join(FIELDS)), video_id,
+            note='Downloading metadata',
+            headers={'Authorization': token})
+
+        if isinstance(info, list):
+            info = info[0]
+
+        title = info['title']
+
+        timestamp = int_or_none(info.get('publishDate'), scale=1000)
+        uploader = info.get('studios', [{}])[0].get('name')
+        movie_id = info.get('movieId')
+        primary_image_number = info.get('primaryImageNumber')
+        thumbnail = None
+        if movie_id and primary_image_number:
+            thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % (
+                movie_id, movie_id, primary_image_number)
+        start = int_or_none(info.get('startSecond'))
+        end = int_or_none(info.get('endSecond'))
+        duration = end - start if start is not None and end is not None else None
+        categories = [c['name'] for c in info.get('categories', []) if c.get('name')]
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': info.get('description'),
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
new file mode 100644 (file)
index 0000000..b6b7106
--- /dev/null
@@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    unified_strdate,
+)
+
+
+class PornoVoisinesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P<id>\d+)/(?P<display_id>[^/.]+)'
+
+    _TEST = {
+        'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html',
+        'md5': '6f8aca6a058592ab49fe701c8ba8317b',
+        'info_dict': {
+            'id': '919',
+            'display_id': 'recherche-appartement',
+            'ext': 'mp4',
+            'title': 'Recherche appartement',
+            'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20140925',
+            'duration': 120,
+            'view_count': int,
+            'average_rating': float,
+            'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'],
+            'age_limit': 18,
+            'subtitles': {
+                'fr': [{
+                    'ext': 'vtt',
+                }]
+            },
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        settings_url = self._download_json(
+            'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id,
+            video_id, note='Getting settings URL')['video_settings_url']
+        settings = self._download_json(settings_url, video_id)['data']
+
+        formats = []
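+        # The settings JSON lists delivery variants: an HLS master playlist
+        # and/or a list of progressive MP4 renditions.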
+        for kind, data in settings['variants'].items():
+            if kind == 'HLS':
+                formats.extend(self._extract_m3u8_formats(
+                    data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls'))
+            elif kind == 'MP4':
+                for item in data:
+                    formats.append({
+                        'url': item['url'],
+                        'height': item.get('height'),
+                        'bitrate': item.get('bitrate'),
+                    })
+        self._sort_formats(formats)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+
+        # The webpage has a bug - there's no space between "thumb" and src=
+        thumbnail = self._html_search_regex(
+            r'<img[^>]+class=([\'"])thumb\1[^>]*src=([\'"])(?P<url>[^"]+)\2',
+            webpage, 'thumbnail', fatal=False, group='url')
+
+        upload_date = unified_strdate(self._search_regex(
+            r'Le\s*<b>([\d/]+)', webpage, 'upload date', fatal=False))
+        duration = settings.get('main', {}).get('duration')
+        view_count = int_or_none(self._search_regex(
+            r'(\d+) vues', webpage, 'view count', fatal=False))
+        average_rating = self._search_regex(
+            r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False)
+        if average_rating:
+            average_rating = float_or_none(average_rating.replace(',', '.'))
+
+        categories = self._html_search_regex(
+            r'(?s)Catégories\s*:\s*<b>(.+?)</b>', webpage, 'categories', fatal=False)
+        if categories:
+            categories = [category.strip() for category in categories.split(',')]
+
+        subtitles = {'fr': [{
+            'url': subtitle,
+        } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]}
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'average_rating': average_rating,
+            'categories': categories,
+            'age_limit': 18,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/pornoxo.py b/youtube_dl/extractor/pornoxo.py
new file mode 100644 (file)
index 0000000..2831368
--- /dev/null
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    str_to_int,
+)
+
+
+class PornoXOIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pornoxo\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)\.html'
+    _TEST = {
+        'url': 'http://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary.html',
+        'md5': '582f28ecbaa9e6e24cb90f50f524ce87',
+        'info_dict': {
+            'id': '7564',
+            'ext': 'flv',
+            'title': 'Striptease From Sexy Secretary!',
+            'display_id': 'striptease-from-sexy-secretary',
+            'description': 'md5:0ee35252b685b3883f4a1d38332f9980',
+            'categories': list,  # NSFW
+            'thumbnail': r're:https?://.*\.jpg$',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.groups()
+
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._extract_jwplayer_data(webpage, video_id, require_title=False)
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)\s*-\s*PornoXO', webpage, 'title')
+
+        view_count = str_to_int(self._html_search_regex(
+            r'[vV]iews:\s*([0-9,]+)', webpage, 'view count', fatal=False))
+
+        categories_str = self._html_search_regex(
+            r'<meta name="description" content=".*featuring\s*([^"]+)"',
+            webpage, 'categories', fatal=False)
+        categories = (
+            None if categories_str is None
+            else categories_str.split(','))
+
+        video_data.update({
+            'id': video_id,
+            'title': title,
+            'display_id': display_id,
+            'description': self._html_search_meta('description', webpage),
+            'categories': categories,
+            'view_count': view_count,
+            'age_limit': 18,
+        })
+
+        return video_data
diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py
new file mode 100644
index 0000000..b5c2792
--- /dev/null
+++ b/youtube_dl/extractor/presstv.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class PressTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?'
+
+    _TEST = {
+        'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/',
+        'md5': '5d7e3195a447cb13e9267e931d8dd5a5',
+        'info_dict': {
+            'id': '459911',
+            'display_id': 'Australian-sewerage-treatment-facility-',
+            'ext': 'mp4',
+            'title': 'Organic mattresses used to clean waste water',
+            'upload_date': '20160409',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'description': 'md5:20002e654bbafb6908395a5c0cfcd125'
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        # extract video URL from webpage
+        video_url = self._hidden_inputs(webpage)['inpPlayback']
+
+        # build list of available formats
+        # specified in http://www.presstv.ir/Scripts/playback.js
+        base_url = 'http://192.99.219.222:82/presstv'
+        _formats = [
+            (180, '_low200.mp4'),
+            (360, '_low400.mp4'),
+            (720, '_low800.mp4'),
+            (1080, '.mp4')
+        ]
+
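+        # each rendition swaps the trailing '.mp4' of the original file name for a
+        # variant suffix, e.g. clip.mp4 -> clip_low400.mp4 for the 360p version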
+        formats = [{
+            'url': base_url + video_url[:-4] + extension,
+            'format_id': '%dp' % height,
+            'height': height,
+        } for height, extension in _formats]
+
+        # extract video metadata
+        title = remove_start(
+            self._html_search_meta('title', webpage, fatal=True), 'PressTV-')
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage)
+
+        upload_date = '%04d%02d%02d' % (
+            int(mobj.group('y')),
+            int(mobj.group('m')),
+            int(mobj.group('d')),
+        )
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'description': description
+        }
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
new file mode 100644
index 0000000..e470882
--- /dev/null
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -0,0 +1,507 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from hashlib import sha1
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    merge_dicts,
+    unified_strdate,
+)
+
+
+class ProSiebenSat1BaseIE(InfoExtractor):
+    _GEO_BYPASS = False
+    _ACCESS_ID = None
+    _SUPPORTED_PROTOCOLS = 'dash:clear,hls:clear,progressive:clear'
+    _V4_BASE_URL = 'https://vas-v4.p7s1video.net/4.0/get'
+
+    def _extract_video_info(self, url, clip_id):
+        client_location = url
+
+        video = self._download_json(
+            'http://vas.sim-technik.de/vas/live/v2/videos',
+            clip_id, 'Downloading videos JSON', query={
+                'access_token': self._TOKEN,
+                'client_location': client_location,
+                'client_name': self._CLIENT_NAME,
+                'ids': clip_id,
+            })[0]
+
+        if video.get('is_protected') is True:
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        formats = []
+        if self._ACCESS_ID:
+            raw_ct = self._ENCRYPTION_KEY + clip_id + self._IV + self._ACCESS_ID
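+            # the client_token for each request is a sha1 over this key material;
+            # the urls request below additionally mixes in the echoed server_token
+            # and the supported protocol list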
+            protocols = self._download_json(
+                self._V4_BASE_URL + 'protocols', clip_id,
+                'Downloading protocols JSON',
+                headers=self.geo_verification_headers(), query={
+                    'access_id': self._ACCESS_ID,
+                    'client_token': sha1((raw_ct).encode()).hexdigest(),
+                    'video_id': clip_id,
+                }, fatal=False, expected_status=(403,)) or {}
+            error = protocols.get('error') or {}
+            if error.get('title') == 'Geo check failed':
+                self.raise_geo_restricted(countries=['AT', 'CH', 'DE'])
+            server_token = protocols.get('server_token')
+            if server_token:
+                urls = (self._download_json(
+                    self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={
+                        'access_id': self._ACCESS_ID,
+                        'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(),
+                        'protocols': self._SUPPORTED_PROTOCOLS,
+                        'server_token': server_token,
+                        'video_id': clip_id,
+                    }, fatal=False) or {}).get('urls') or {}
+                for protocol, variant in urls.items():
+                    source_url = variant.get('clear', {}).get('url')
+                    if not source_url:
+                        continue
+                    if protocol == 'dash':
+                        formats.extend(self._extract_mpd_formats(
+                            source_url, clip_id, mpd_id=protocol, fatal=False))
+                    elif protocol == 'hls':
+                        formats.extend(self._extract_m3u8_formats(
+                            source_url, clip_id, 'mp4', 'm3u8_native',
+                            m3u8_id=protocol, fatal=False))
+                    else:
+                        formats.append({
+                            'url': source_url,
+                            'format_id': protocol,
+                        })
+        if not formats:
+            source_ids = [compat_str(source['id']) for source in video['sources']]
+
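+            # the v2 API expects a client_id: the first two salt characters plus a
+            # sha1 over clip id, salt, token, client location and client name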
+            client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
+
+            sources = self._download_json(
+                'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
+                clip_id, 'Downloading sources JSON', query={
+                    'access_token': self._TOKEN,
+                    'client_id': client_id,
+                    'client_location': client_location,
+                    'client_name': self._CLIENT_NAME,
+                })
+            server_id = sources['server_id']
+
+            def fix_bitrate(bitrate):
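+                # bitrates come back inconsistently in bps or kbps; values divisible
+                # by 1000 are presumably bps and are scaled down to kbps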
+                bitrate = int_or_none(bitrate)
+                if not bitrate:
+                    return None
+                return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
+
+            for source_id in source_ids:
+                client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest()
+                urls = self._download_json(
+                    'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
+                    clip_id, 'Downloading urls JSON', fatal=False, query={
+                        'access_token': self._TOKEN,
+                        'client_id': client_id,
+                        'client_location': client_location,
+                        'client_name': self._CLIENT_NAME,
+                        'server_id': server_id,
+                        'source_ids': source_id,
+                    })
+                if not urls:
+                    continue
+                if urls.get('status_code') != 0:
+                    raise ExtractorError('This video is unavailable', expected=True)
+                urls_sources = urls['sources']
+                if isinstance(urls_sources, dict):
+                    urls_sources = urls_sources.values()
+                for source in urls_sources:
+                    source_url = source.get('url')
+                    if not source_url:
+                        continue
+                    protocol = source.get('protocol')
+                    mimetype = source.get('mimetype')
+                    if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            source_url, clip_id, f4m_id='hds', fatal=False))
+                    elif mimetype == 'application/x-mpegURL':
+                        formats.extend(self._extract_m3u8_formats(
+                            source_url, clip_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False))
+                    elif mimetype == 'application/dash+xml':
+                        formats.extend(self._extract_mpd_formats(
+                            source_url, clip_id, mpd_id='dash', fatal=False))
+                    else:
+                        tbr = fix_bitrate(source['bitrate'])
+                        if protocol in ('rtmp', 'rtmpe'):
+                            mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
+                            if not mobj:
+                                continue
+                            path = mobj.group('path')
+                            mp4colon_index = path.rfind('mp4:')
+                            app = path[:mp4colon_index]
+                            play_path = path[mp4colon_index:]
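+                            # e.g. rtmpe://host/a/b/mp4:c.mp4 -> app 'a/b/', play_path 'mp4:c.mp4'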
+                            formats.append({
+                                'url': '%s/%s' % (mobj.group('url'), app),
+                                'app': app,
+                                'play_path': play_path,
+                                'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+                                'page_url': 'http://www.prosieben.de',
+                                'tbr': tbr,
+                                'ext': 'flv',
+                                'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''),
+                            })
+                        else:
+                            formats.append({
+                                'url': source_url,
+                                'tbr': tbr,
+                                'format_id': 'http%s' % ('-%d' % tbr if tbr else ''),
+                            })
+        self._sort_formats(formats)
+
+        return {
+            'duration': float_or_none(video.get('duration')),
+            'formats': formats,
+        }
+
+
+class ProSiebenSat1IE(ProSiebenSat1BaseIE):
+    IE_NAME = 'prosiebensat1'
+    IE_DESC = 'ProSiebenSat.1 Digital'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?:
+                            (?:beta\.)?
+                            (?:
+                                prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|advopedia
+                            )\.(?:de|at|ch)|
+                            ran\.de|fem\.com|advopedia\.de|galileo\.tv/video
+                        )
+                        /(?P<id>.+)
+                    '''
+
+    _TESTS = [
+        {
+            # Tests changes introduced in https://github.com/ytdl-org/youtube-dl/pull/6242
+            # in response to fixing https://github.com/ytdl-org/youtube-dl/issues/6215:
+            # - malformed f4m manifest support
+            # - proper handling of URLs starting with `https?://` in 2.0 manifests
+            # - recursive child f4m manifests extraction
+            'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
+            'info_dict': {
+                'id': '2104602',
+                'ext': 'mp4',
+                'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2',
+                'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
+                'upload_date': '20131231',
+                'duration': 5845.04,
+                'series': 'CIRCUS HALLIGALLI',
+                'season_number': 2,
+                'episode': 'Episode 18 - Staffel 2',
+                'episode_number': 18,
+            },
+        },
+        {
+            'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
+            'info_dict': {
+                'id': '2570327',
+                'ext': 'mp4',
+                'title': 'Lady-Umstyling für Audrina',
+                'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
+                'upload_date': '20131014',
+                'duration': 606.76,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Seems to be broken',
+        },
+        {
+            'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
+            'info_dict': {
+                'id': '2429369',
+                'ext': 'mp4',
+                'title': 'Countdown für die Autowerkstatt',
+                'description': 'md5:809fc051a457b5d8666013bc40698817',
+                'upload_date': '20140223',
+                'duration': 2595.04,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'This video is unavailable',
+        },
+        {
+            'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
+            'info_dict': {
+                'id': '2904997',
+                'ext': 'mp4',
+                'title': 'Sexy laufen in Ugg Boots',
+                'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
+                'upload_date': '20140122',
+                'duration': 245.32,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'This video is unavailable',
+        },
+        {
+            'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
+            'info_dict': {
+                'id': '2906572',
+                'ext': 'mp4',
+                'title': 'Im Interview: Kai Wiesinger',
+                'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
+                'upload_date': '20140203',
+                'duration': 522.56,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'This video is unavailable',
+        },
+        {
+            'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
+            'info_dict': {
+                'id': '2992323',
+                'ext': 'mp4',
+                'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
+                'description': 'md5:2669cde3febe9bce13904f701e774eb6',
+                'upload_date': '20141014',
+                'duration': 2410.44,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'This video is unavailable',
+        },
+        {
+            'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
+            'info_dict': {
+                'id': '3004256',
+                'ext': 'mp4',
+                'title': 'Schalke: Tönnies möchte Raul zurück',
+                'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
+                'upload_date': '20140226',
+                'duration': 228.96,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'This video is unavailable',
+        },
+        {
+            'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
+            'info_dict': {
+                'id': '2572814',
+                'ext': 'mp4',
+                'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man',
+                'description': 'md5:6ddb02b0781c6adf778afea606652e38',
+                'timestamp': 1382041620,
+                'upload_date': '20131017',
+                'duration': 469.88,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.fem.com/videos/beauty-lifestyle/kurztrips-zum-valentinstag',
+            'info_dict': {
+                'id': '2156342',
+                'ext': 'mp4',
+                'title': 'Kurztrips zum Valentinstag',
+                'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.',
+                'duration': 307.24,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.prosieben.de/tv/joko-gegen-klaas/videos/playlists/episode-8-ganze-folge-playlist',
+            'info_dict': {
+                'id': '439664',
+                'title': 'Episode 8 - Ganze Folge - Playlist',
+                'description': 'md5:63b8963e71f481782aeea877658dec84',
+            },
+            'playlist_count': 2,
+            'skip': 'This video is unavailable',
+        },
+        {
+            # title in <h2 class="subtitle">
+            'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip',
+            'info_dict': {
+                'id': '4895826',
+                'ext': 'mp4',
+                'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe',
+                'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9',
+                'upload_date': '20170302',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'geo restricted to Germany',
+        },
+        {
+            # geo restricted to Germany
+            'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge',
+            'only_matching': True,
+        },
+        {
+            # geo restricted to Germany
+            'url': 'http://www.sat1gold.de/tv/edel-starck/video/11-staffel-1-episode-1-partner-wider-willen-ganze-folge',
+            'only_matching': True,
+        },
+        {
+            # geo restricted to Germany
+            'url': 'https://www.galileo.tv/video/diese-emojis-werden-oft-missverstanden',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.sat1gold.de/tv/edel-starck/playlist/die-gesamte-1-staffel',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.advopedia.de/videos/lenssen-klaert-auf/lenssen-klaert-auf-folge-8-staffel-3-feiertage-und-freie-tage',
+            'only_matching': True,
+        },
+    ]
+
+    _TOKEN = 'prosieben'
+    _SALT = '01!8d8F_)r9]4s[qeuXfP%'
+    _CLIENT_NAME = 'kolibri-2.0.19-splec4'
+
+    _ACCESS_ID = 'x_prosiebenmaxx-de'
+    _ENCRYPTION_KEY = 'Eeyeey9oquahthainoofashoyoikosag'
+    _IV = 'Aeluchoc6aevechuipiexeeboowedaok'
+
+    _CLIPID_REGEXES = [
+        r'"clip_id"\s*:\s+"(\d+)"',
+        r'clipid: "(\d+)"',
+        r'clip[iI]d=(\d+)',
+        r'clip[iI][dD]\s*=\s*["\'](\d+)',
+        r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
+        r'proMamsId&quot;\s*:\s*&quot;(\d+)',
+        r'proMamsId"\s*:\s*"(\d+)',
+    ]
+    _TITLE_REGEXES = [
+        r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
+        r'<header class="clearfix">\s*<h3>(.+?)</h3>',
+        r'<!-- start video -->\s*<h1>(.+?)</h1>',
+        r'<h1 class="att-name">\s*(.+?)</h1>',
+        r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
+        r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
+        r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
+        r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>',
+    ]
+    _DESCRIPTION_REGEXES = [
+        r'<p itemprop="description">\s*(.+?)</p>',
+        r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
+        r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
+        r'<p class="att-description">\s*(.+?)\s*</p>',
+        r'<p class="video-description" itemprop="description">\s*(.+?)</p>',
+        r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>',
+    ]
+    _UPLOAD_DATE_REGEXES = [
+        r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
+        r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
+        r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
+        r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
+    ]
+    _PAGE_TYPE_REGEXES = [
+        r'<meta name="page_type" content="([^"]+)">',
+        r"'itemType'\s*:\s*'([^']*)'",
+    ]
+    _PLAYLIST_ID_REGEXES = [
+        r'content[iI]d=(\d+)',
+        r"'itemId'\s*:\s*'([^']*)'",
+    ]
+    _PLAYLIST_CLIP_REGEXES = [
+        r'(?s)data-qvt=.+?<a href="([^"]+)"',
+    ]
+
+    def _extract_clip(self, url, webpage):
+        clip_id = self._html_search_regex(
+            self._CLIPID_REGEXES, webpage, 'clip id')
+        title = self._html_search_regex(
+            self._TITLE_REGEXES, webpage, 'title',
+            default=None) or self._og_search_title(webpage)
+        info = self._extract_video_info(url, clip_id)
+        description = self._html_search_regex(
+            self._DESCRIPTION_REGEXES, webpage, 'description', default=None)
+        if description is None:
+            description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(
+            self._html_search_meta('og:published_time', webpage,
+                                   'upload date', default=None)
+            or self._html_search_regex(self._UPLOAD_DATE_REGEXES,
+                                       webpage, 'upload date', default=None))
+
+        json_ld = self._search_json_ld(webpage, clip_id, default={})
+
+        return merge_dicts(info, {
+            'id': clip_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }, json_ld)
+
+    def _extract_playlist(self, url, webpage):
+        playlist_id = self._html_search_regex(
+            self._PLAYLIST_ID_REGEXES, webpage, 'playlist id')
+        playlist = self._parse_json(
+            self._search_regex(
+                r'var\s+contentResources\s*=\s*(\[.+?\]);\s*</script',
+                webpage, 'playlist'),
+            playlist_id)
+        entries = []
+        for item in playlist:
+            clip_id = item.get('id') or item.get('upc')
+            if not clip_id:
+                continue
+            info = self._extract_video_info(url, clip_id)
+            info.update({
+                'id': clip_id,
+                'title': item.get('title') or item.get('teaser', {}).get('headline'),
+                'description': item.get('teaser', {}).get('description'),
+                'thumbnail': item.get('poster'),
+                'duration': float_or_none(item.get('duration')),
+                'series': item.get('tvShowTitle'),
+                'uploader': item.get('broadcastPublisher'),
+            })
+            entries.append(info)
+        return self.playlist_result(entries, playlist_id)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        page_type = self._search_regex(
+            self._PAGE_TYPE_REGEXES, webpage,
+            'page type', default='clip').lower()
+        if page_type == 'clip':
+            return self._extract_clip(url, webpage)
+        elif page_type == 'playlist':
+            return self._extract_playlist(url, webpage)
+        else:
+            raise ExtractorError(
+                'Unsupported page type %s' % page_type, expected=True)
diff --git a/youtube_dl/extractor/puhutv.py b/youtube_dl/extractor/puhutv.py
new file mode 100644
index 0000000..ca71665
--- /dev/null
+++ b/youtube_dl/extractor/puhutv.py
@@ -0,0 +1,244 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    parse_resolution,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    url_or_none,
+    urljoin,
+)
+
+
+class PuhuTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
+    IE_NAME = 'puhutv'
+    _TESTS = [{
+        # film
+        'url': 'https://puhutv.com/sut-kardesler-izle',
+        'md5': 'a347470371d56e1585d1b2c8dab01c96',
+        'info_dict': {
+            'id': '5085',
+            'display_id': 'sut-kardesler',
+            'ext': 'mp4',
+            'title': 'Süt Kardeşler',
+            'description': 'md5:ca09da25b7e57cbb5a9280d6e48d17aa',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 4832.44,
+            'creator': 'Arzu Film',
+            'timestamp': 1561062602,
+            'upload_date': '20190620',
+            'release_year': 1976,
+            'view_count': int,
+            'tags': list,
+        },
+    }, {
+        # episode, geo restricted, bypassable with --geo-verification-proxy
+        'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
+        'only_matching': True,
+    }, {
+        # 4k, with subtitles
+        'url': 'https://puhutv.com/dip-1-bolum-izle',
+        'only_matching': True,
+    }]
+    _SUBTITLE_LANGS = {
+        'English': 'en',
+        'Deutsch': 'de',
+        'عربى': 'ar'
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        info = self._download_json(
+            urljoin(url, '/api/slug/%s-izle' % display_id),
+            display_id)['data']
+
+        video_id = compat_str(info['id'])
+        show = info.get('title') or {}
+        title = info.get('name') or show['name']
+        if info.get('display_name'):
+            title = '%s %s' % (title, info['display_name'])
+
+        try:
+            videos = self._download_json(
+                'https://puhutv.com/api/assets/%s/videos' % video_id,
+                display_id, 'Downloading video JSON',
+                headers=self.geo_verification_headers())
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                self.raise_geo_restricted()
+            raise
+
+        urls = []
+        formats = []
+
+        for video in videos['data']['videos']:
+            media_url = url_or_none(video.get('url'))
+            if not media_url or media_url in urls:
+                continue
+            urls.append(media_url)
+
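+            # master playlists expand into one format per quality; single HLS
+            # renditions and plain MP4 URLs each become one format below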
+            playlist = video.get('is_playlist')
+            if (video.get('stream_type') == 'hls' and playlist is True) or 'playlist.m3u8' in media_url:
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                continue
+
+            quality = int_or_none(video.get('quality'))
+            f = {
+                'url': media_url,
+                'ext': 'mp4',
+                'height': quality
+            }
+            video_format = video.get('video_format')
+            is_hls = (video_format == 'hls' or '/hls/' in media_url or '/chunklist.m3u8' in media_url) and playlist is False
+            if is_hls:
+                format_id = 'hls'
+                f['protocol'] = 'm3u8_native'
+            elif video_format == 'mp4':
+                format_id = 'http'
+            else:
+                continue
+            if quality:
+                format_id += '-%sp' % quality
+            f['format_id'] = format_id
+            formats.append(f)
+        self._sort_formats(formats)
+
+        creator = try_get(
+            show, lambda x: x['producer']['name'], compat_str)
+
+        content = info.get('content') or {}
+
+        images = try_get(
+            content, lambda x: x['images']['wide'], dict) or {}
+        thumbnails = []
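+        # the image keys apparently encode a resolution (e.g. 'main_1920x1080'),
+        # which parse_resolution turns into width/height fields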
+        for image_id, image_url in images.items():
+            if not isinstance(image_url, compat_str):
+                continue
+            if not image_url.startswith(('http', '//')):
+                image_url = 'https://%s' % image_url
+            t = parse_resolution(image_id)
+            t.update({
+                'id': image_id,
+                'url': image_url
+            })
+            thumbnails.append(t)
+
+        tags = []
+        for genre in show.get('genres') or []:
+            if not isinstance(genre, dict):
+                continue
+            genre_name = genre.get('name')
+            if genre_name and isinstance(genre_name, compat_str):
+                tags.append(genre_name)
+
+        subtitles = {}
+        for subtitle in content.get('subtitles') or []:
+            if not isinstance(subtitle, dict):
+                continue
+            lang = subtitle.get('language')
+            sub_url = url_or_none(subtitle.get('url') or subtitle.get('file'))
+            if not lang or not isinstance(lang, compat_str) or not sub_url:
+                continue
+            subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
+                'url': sub_url
+            }]
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': info.get('description') or show.get('description'),
+            'season_id': str_or_none(info.get('season_id')),
+            'season_number': int_or_none(info.get('season_number')),
+            'episode_number': int_or_none(info.get('episode_number')),
+            'release_year': int_or_none(show.get('released_at')),
+            'timestamp': unified_timestamp(info.get('created_at')),
+            'creator': creator,
+            'view_count': int_or_none(content.get('watch_count')),
+            'duration': float_or_none(content.get('duration_in_ms'), 1000),
+            'tags': tags,
+            'subtitles': subtitles,
+            'thumbnails': thumbnails,
+            'formats': formats
+        }
+
+
+class PuhuTVSerieIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay'
+    IE_NAME = 'puhutv:serie'
+    _TESTS = [{
+        'url': 'https://puhutv.com/deniz-yildizi-detay',
+        'info_dict': {
+            'title': 'Deniz Yıldızı',
+            'id': 'deniz-yildizi',
+        },
+        'playlist_mincount': 205,
+    }, {
+        # a film detail page that uses the same URL pattern as a serie page
+        'url': 'https://puhutv.com/kaybedenler-kulubu-detay',
+        'only_matching': True,
+    }]
+
+    def _extract_entries(self, seasons):
+        for season in seasons:
+            season_id = season.get('id')
+            if not season_id:
+                continue
+            page = 1
+            has_more = True
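+            # page through the season 40 episodes at a time until hasMore turns false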
+            while has_more is True:
+                season = self._download_json(
+                    'https://galadriel.puhutv.com/seasons/%s' % season_id,
+                    season_id, 'Downloading page %s' % page, query={
+                        'page': page,
+                        'per': 40,
+                    })
+                episodes = season.get('episodes')
+                if isinstance(episodes, list):
+                    for ep in episodes:
+                        slug_path = str_or_none(ep.get('slugPath'))
+                        if not slug_path:
+                            continue
+                        video_id = str_or_none(int_or_none(ep.get('id')))
+                        yield self.url_result(
+                            'https://puhutv.com/%s' % slug_path,
+                            ie=PuhuTVIE.ie_key(), video_id=video_id,
+                            video_title=ep.get('name') or ep.get('eventLabel'))
+                page += 1
+                has_more = season.get('hasMore')
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        info = self._download_json(
+            urljoin(url, '/api/slug/%s-detay' % playlist_id),
+            playlist_id)['data']
+
+        seasons = info.get('seasons')
+        if seasons:
+            return self.playlist_result(
+                self._extract_entries(seasons), playlist_id, info.get('name'))
+
+        # films share the same URL pattern as series detail pages
+        video_id = info.get('slug') or info['assets'][0]['slug']
+        return self.url_result(
+            'https://puhutv.com/%s-izle' % video_id,
+            PuhuTVIE.ie_key(), video_id)
diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py
new file mode 100644
index 0000000..80091b8
--- /dev/null
+++ b/youtube_dl/extractor/puls4.py
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .prosiebensat1 import ProSiebenSat1BaseIE
+from ..compat import compat_str
+from ..utils import (
+    unified_strdate,
+    parse_duration,
+)
+
+
+class Puls4IE(ProSiebenSat1BaseIE):
+    _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P<id>[^?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118',
+        'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03',
+        'info_dict': {
+            'id': '118118',
+            'ext': 'flv',
+            'title': 'Tobias Homberger von myclubs im #2min2miotalk',
+            'description': 'md5:f9def7c5e8745d6026d8885487d91955',
+            'upload_date': '20160830',
+            'uploader': 'PULS_4',
+        },
+    }, {
+        'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident.-Norbert-Hofer',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.puls4.com/pro-und-contra/wer-wird-prasident/Ganze-Folgen/Wer-wird-Praesident-Analyse-des-Interviews-mit-Norbert-Hofer-416598',
+        'only_matching': True,
+    }]
+    _TOKEN = 'puls4'
+    _SALT = '01!kaNgaiNgah1Ie4AeSha'
+    _CLIENT_NAME = ''
+
+    def _real_extract(self, url):
+        path = self._match_id(url)
+        content_path = self._download_json(
+            'http://www.puls4.com/api/json-fe/page/' + path, path)['content'][0]['url']
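+        # the page API returns a relative content path; its JSON carries the
+        # mediaCurrent object used below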
+        media = self._download_json(
+            'http://www.puls4.com' + content_path,
+            content_path)['mediaCurrent']
+        player_content = media['playerContent']
+        info = self._extract_video_info(url, player_content['id'])
+        info.update({
+            'id': compat_str(media['objectId']),
+            'title': player_content['title'],
+            'description': media.get('description'),
+            'thumbnail': media.get('previewLink'),
+            'upload_date': unified_strdate(media.get('date')),
+            'duration': parse_duration(player_content.get('duration')),
+            'episode': player_content.get('episodePartName'),
+            'show': media.get('channel'),
+            'season_id': player_content.get('seasonId'),
+            'uploader': player_content.get('sourceCompany'),
+        })
+        return info
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
new file mode 100644
index 0000000..b8ac93a
--- /dev/null
+++ b/youtube_dl/extractor/pyvideo.py
@@ -0,0 +1,74 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import int_or_none
+
+
+class PyvideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)'
+
+    _TESTS = [{
+        'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html',
+        'info_dict': {
+            'id': 'become-a-logging-expert-in-30-minutes',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html',
+        'md5': '5fe1c7e0a8aa5570330784c847ff6d12',
+        'info_dict': {
+            'id': '2542',
+            'ext': 'm4v',
+            'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v',
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category = mobj.group('category')
+        video_id = mobj.group('id')
+
+        entries = []
+
+        data = self._download_json(
+            'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json'
+            % (category, video_id), video_id, fatal=False)
+
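+        # video metadata is maintained in the pyvideo/data GitHub repository;
+        # fall back to scraping the page when no JSON entry exists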
+        if data:
+            for video in data['videos']:
+                video_url = video.get('url')
+                if video_url:
+                    if video.get('type') == 'youtube':
+                        entries.append(self.url_result(video_url, 'Youtube'))
+                    else:
+                        entries.append({
+                            'id': compat_str(data.get('id') or video_id),
+                            'url': video_url,
+                            'title': data['title'],
+                            'description': data.get('description') or data.get('summary'),
+                            'thumbnail': data.get('thumbnail_url'),
+                            'duration': int_or_none(data.get('duration')),
+                        })
+        else:
+            webpage = self._download_webpage(url, video_id)
+            title = self._og_search_title(webpage)
+            media_urls = self._search_regex(
+                r'(?s)Media URL:(.+?)</li>', webpage, 'media urls')
+            for m in re.finditer(
+                    r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls):
+                media_url = m.group('url')
+                if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url):
+                    entries.append(self.url_result(media_url, 'Youtube'))
+                else:
+                    entries.append({
+                        'id': video_id,
+                        'url': media_url,
+                        'title': title,
+                    })
+
+        return self.playlist_result(entries, video_id)
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
new file mode 100644
index 0000000..084308a
--- /dev/null
+++ b/youtube_dl/extractor/qqmusic.py
@@ -0,0 +1,375 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    strip_jsonp,
+    unescapeHTML,
+)
+
+
+class QQMusicIE(InfoExtractor):
+    IE_NAME = 'qqmusic'
+    IE_DESC = 'QQ音乐'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'
+    _TESTS = [{
+        'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html',
+        'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',
+        'info_dict': {
+            'id': '004295Et37taLD',
+            'ext': 'mp3',
+            'title': '可惜没如果',
+            'release_date': '20141227',
+            'creator': '林俊杰',
+            'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'note': 'There is no mp3-320 version of this song.',
+        'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',
+        'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
+        'info_dict': {
+            'id': '004MsGEo3DdNxV',
+            'ext': 'mp3',
+            'title': '如果',
+            'release_date': '20050626',
+            'creator': '李季美',
+            'description': 'md5:46857d5ed62bc4ba84607a805dccf437',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'note': 'lyrics not in .lrc format',
+        'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
+        'info_dict': {
+            'id': '001JyApY11tIp6',
+            'ext': 'mp3',
+            'title': 'Shadows Over Transylvania',
+            'release_date': '19970225',
+            'creator': 'Dark Funeral',
+            'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    _FORMATS = {
+        'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320},
+        'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128},
+        'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10}
+    }
+
+    # Reference: m_r_GetRUin() in top_player.js
+    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
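+    # The guid is a pseudo-random value below 1e10 derived from random() and the
+    # current millisecond, mirroring the site player, presumably so the vkey
+    # endpoint accepts the request.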
+    @staticmethod
+    def m_r_get_ruin():
+        curMs = int(time.time() * 1000) % 1000
+        return int(round(random.random() * 2147483647) * curMs % 1E10)
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        detail_info_page = self._download_webpage(
+            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
+            mid, note='Download song detail info',
+            errnote='Unable to get song detail info', encoding='gbk')
+
+        song_name = self._html_search_regex(
+            r"songname:\s*'([^']+)'", detail_info_page, 'song name')
+
+        publish_time = self._html_search_regex(
+            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page,
+            'publish time', default=None)
+        if publish_time:
+            publish_time = publish_time.replace('-', '')
+
+        singer = self._html_search_regex(
+            r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None)
+
+        lrc_content = self._html_search_regex(
+            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
+            detail_info_page, 'LRC lyrics', default=None)
+        if lrc_content:
+            lrc_content = lrc_content.replace('\\n', '\n')
+
+        thumbnail_url = None
+        albummid = self._search_regex(
+            [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
+            detail_info_page, 'album mid', default=None)
+        if albummid:
+            thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
+                            % (albummid[-2:-1], albummid[-1], albummid)
+
+        guid = self.m_r_get_ruin()
+
+        vkey = self._download_json(
+            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
+            mid, note='Retrieve vkey', errnote='Unable to get vkey',
+            transform_source=strip_jsonp)['key']
+
+        formats = []
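+        # download URLs follow a fixed scheme: quality prefix (M800/M500/C200)
+        # plus song mid and extension, authorised by the vkey/guid pair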
+        for format_id, details in self._FORMATS.items():
+            formats.append({
+                'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0'
+                       % (details['prefix'], mid, details['ext'], vkey, guid),
+                'format': format_id,
+                'format_id': format_id,
+                'preference': details['preference'],
+                'abr': details.get('abr'),
+            })
+        self._check_formats(formats, mid)
+        self._sort_formats(formats)
+
+        actual_lrc_lyrics = ''.join(
+            line + '\n' for line in re.findall(
+                r'(?m)^(\[[0-9]{2}:[0-9]{2}(?:\.[0-9]{2,})?\][^\n]*|\[[^\]]*\])', lrc_content or ''))
+
+        info_dict = {
+            'id': mid,
+            'formats': formats,
+            'title': song_name,
+            'release_date': publish_time,
+            'creator': singer,
+            'description': lrc_content,
+            'thumbnail': thumbnail_url
+        }
+        if actual_lrc_lyrics:
+            info_dict['subtitles'] = {
+                'origin': [{
+                    'ext': 'lrc',
+                    'data': actual_lrc_lyrics,
+                }]
+            }
+        return info_dict
+
+
+class QQPlaylistBaseIE(InfoExtractor):
+    @staticmethod
+    def qq_static_url(category, mid):
+        return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
+
+    def get_singer_all_songs(self, singmid, num):
+        return self._download_webpage(
+            r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid,
+            query={
+                'format': 'json',
+                'inCharset': 'utf8',
+                'outCharset': 'utf-8',
+                'platform': 'yqq',
+                'needNewCode': 0,
+                'singermid': singmid,
+                'order': 'listen',
+                'begin': 0,
+                'num': num,
+                'songstatus': 1,
+            })
+
+    def get_entries_from_page(self, singmid):
+        entries = []
+
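+        # probe with a single-song request to learn the artist's total track
+        # count, then refetch the complete list in one request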
+        default_num = 1
+        json_text = self.get_singer_all_songs(singmid, default_num)
+        json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        if json_obj_all_songs['code'] == 0:
+            total = json_obj_all_songs['data']['total']
+            json_text = self.get_singer_all_songs(singmid, total)
+            json_obj_all_songs = self._parse_json(json_text, singmid)
+
+        for item in json_obj_all_songs['data']['list']:
+            if item['musicData'].get('songmid') is not None:
+                songmid = item['musicData']['songmid']
+                entries.append(self.url_result(
+                    r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))
+
+        return entries
+
+
+class QQMusicSingerIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:singer'
+    IE_DESC = 'QQ音乐 - 歌手'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'
+    _TEST = {
+        'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
+        'info_dict': {
+            'id': '001BLpXF2DyJe2',
+            'title': '林俊杰',
+            'description': 'md5:870ec08f7d8547c29c93010899103751',
+        },
+        'playlist_mincount': 12,
+    }
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        entries = self.get_entries_from_page(mid)
+        singer_page = self._download_webpage(url, mid, 'Download singer page')
+        singer_name = self._html_search_regex(
+            r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
+        singer_desc = None
+
+        if mid:
+            singer_desc_page = self._download_xml(
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+                'Download singer description XML',
+                query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+                headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
+
+            singer_desc = singer_desc_page.find('./data/info/desc').text
+
+        return self.playlist_result(entries, mid, singer_name, singer_desc)
+
+
+class QQMusicAlbumIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:album'
+    IE_DESC = 'QQ音乐 - 专辑'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'
+
+    _TESTS = [{
+        'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',
+        'info_dict': {
+            'id': '000gXCTb2AhRR1',
+            'title': '我们都是这样长大的',
+            'description': 'md5:179c5dce203a5931970d306aa9607ea6',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',
+        'info_dict': {
+            'id': '002Y5a3b3AlCu3',
+            'title': '그리고...',
+            'description': 'md5:a48823755615508a95080e81b51ba729',
+        },
+        'playlist_count': 8,
+    }]
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        album = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid,
+            mid, 'Download album page')['data']
+
+        entries = [
+            self.url_result(
+                'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']
+            ) for song in album['list']
+        ]
+        album_name = album.get('name')
+        album_detail = album.get('desc')
+        if album_detail is not None:
+            album_detail = album_detail.strip()
+
+        return self.playlist_result(entries, mid, album_name, album_detail)
+
+
+class QQMusicToplistIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:toplist'
+    IE_DESC = 'QQ音乐 - 排行榜'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'
+
+    _TESTS = [{
+        'url': 'https://y.qq.com/n/yqq/toplist/123.html',
+        'info_dict': {
+            'id': '123',
+            'title': '美国iTunes榜',
+            'description': 'md5:89db2335fdbb10678dee2d43fe9aba08',
+        },
+        'playlist_count': 100,
+    }, {
+        'url': 'https://y.qq.com/n/yqq/toplist/3.html',
+        'info_dict': {
+            'id': '3',
+            'title': '巅峰榜·欧美',
+            'description': 'md5:5a600d42c01696b26b71f8c4d43407da',
+        },
+        'playlist_count': 100,
+    }, {
+        'url': 'https://y.qq.com/n/yqq/toplist/106.html',
+        'info_dict': {
+            'id': '106',
+            'title': '韩国Mnet榜',
+            'description': 'md5:cb84b325215e1d21708c615cac82a6e7',
+        },
+        'playlist_count': 50,
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        toplist_json = self._download_json(
+            'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id,
+            note='Download toplist page',
+            query={'type': 'toplist', 'topid': list_id, 'format': 'json'})
+
+        entries = [self.url_result(
+            'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic',
+            song['data']['songmid'])
+            for song in toplist_json['songlist']]
+
+        topinfo = toplist_json.get('topinfo', {})
+        list_name = topinfo.get('ListName')
+        list_description = topinfo.get('info')
+        return self.playlist_result(entries, list_id, list_name, list_description)
+
+
+class QQMusicPlaylistIE(QQPlaylistBaseIE):
+    IE_NAME = 'qqmusic:playlist'
+    IE_DESC = 'QQ音乐 - 歌单'
+    _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html'
+
+    _TESTS = [{
+        'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html',
+        'info_dict': {
+            'id': '3462654915',
+            'title': '韩国5月新歌精选下旬',
+            'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4',
+        },
+        'playlist_count': 40,
+        'skip': 'playlist gone',
+    }, {
+        'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html',
+        'info_dict': {
+            'id': '1374105607',
+            'title': '易入人心的华语民谣',
+            'description': '民谣的歌曲易于传唱、、歌词朗朗伤口、旋律简单温馨。属于那种才入耳孔。却上心头的感觉。没有太多的复杂情绪。简单而直接地表达乐者的情绪,就是这样的简单才易入人心。',
+        },
+        'playlist_count': 20,
+    }]
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        list_json = self._download_json(
+            'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg',
+            list_id, 'Download list page',
+            query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id},
+            transform_source=strip_jsonp)
+        if not len(list_json.get('cdlist', [])):
+            if list_json.get('code'):
+                raise ExtractorError(
+                    'QQ Music said: error %d in fetching playlist info' % list_json['code'],
+                    expected=True)
+            raise ExtractorError('Unable to get playlist info')
+
+        cdlist = list_json['cdlist'][0]
+        entries = [self.url_result(
+            'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'])
+            for song in cdlist['songlist']]
+
+        list_name = cdlist.get('dissname')
+        list_description = clean_html(unescapeHTML(cdlist.get('desc')))
+        return self.playlist_result(entries, list_id, list_name, list_description)
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
new file mode 100644
index 0000000..e2202d6
--- /dev/null
+++ b/youtube_dl/extractor/r7.py
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class R7IE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        https?://
+                        (?:
+                            (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
+                            noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
+                            player\.r7\.com/video/i/
+                        )
+                        (?P<id>[\da-f]{24})
+                    '''
+    _TESTS = [{
+        'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
+        'md5': '403c4e393617e8e8ddc748978ee8efde',
+        'info_dict': {
+            'id': '54e7050b0cf2ff57e0279389',
+            'ext': 'mp4',
+            'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+            'description': 'md5:01812008664be76a6479aa58ec865b72',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 98,
+            'like_count': int,
+            'view_count': int,
+        },
+    }, {
+        'url': 'http://esportes.r7.com/videos/cigano-manda-recado-aos-fas/idmedia/4e176727b51a048ee6646a1b.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.r7.com/record-news/video/representante-do-instituto-sou-da-paz-fala-sobre-fim-do-estatuto-do-desarmamento-5480fc580cf2285b117f438d/',
+        'only_matching': True,
+    }, {
+        'url': 'http://player.r7.com/video/i/54e7050b0cf2ff57e0279389?play=true&video=http://vsh.r7.com/54e7050b0cf2ff57e0279389/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-ATOS_copy.mp4&linkCallback=http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html&thumbnail=http://vtb.r7.com/ER7_RE_BG_MORTE_JOVENS_570kbps_2015-02-2009f17818-cc82-4c8f-86dc-89a66934e633-thumb.jpg&idCategory=192&share=true&layout=full&full=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://player-api.r7.com/video/i/%s' % video_id, video_id)
+
+        title = video['title']
+
+        formats = []
+        media_url_hls = video.get('media_url_hls')
+        if media_url_hls:
+            formats.extend(self._extract_m3u8_formats(
+                media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        media_url = video.get('media_url')
+        if media_url:
+            f = {
+                'url': media_url,
+                'format_id': 'http',
+            }
+            # the m3u8 format always matches the http format, so copy metadata
+            # from one to the other
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none', formats))
+            if len(m3u8_formats) == 1:
+                f_copy = m3u8_formats[0].copy()
+                f_copy.update(f)
+                f_copy['protocol'] = 'http'
+                f = f_copy
+            formats.append(f)
+        self._sort_formats(formats)
+
+        description = video.get('description')
+        thumbnail = video.get('thumb')
+        duration = int_or_none(video.get('media_duration'))
+        like_count = int_or_none(video.get('likes'))
+        view_count = int_or_none(video.get('views'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'like_count': like_count,
+            'view_count': view_count,
+            'formats': formats,
+        }
+
+
+class R7ArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+            webpage, 'video id')
+
+        return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())
diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py
new file mode 100644 (file)
index 0000000..2c35f98
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class RadioBremenIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)'
+    IE_NAME = 'radiobremen'
+
+    _TEST = {
+        'url': 'http://www.radiobremen.de/mediathek/?id=141876',
+        'info_dict': {
+            'id': '141876',
+            'ext': 'mp4',
+            'duration': 178,
+            'width': 512,
+            'title': 'Druck auf Patrick Öztürk',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        meta_url = 'http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s' % video_id
+        meta_doc = self._download_webpage(
+            meta_url, video_id, 'Downloading metadata')
+        title = self._html_search_regex(
+            r'<h1.*>(?P<title>.+)</h1>', meta_doc, 'title')
+        description = self._html_search_regex(
+            r'<p>(?P<description>.*)</p>', meta_doc, 'description', fatal=False)
+        duration = parse_duration(self._html_search_regex(
+            r'L&auml;nge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>',
+            meta_doc, 'duration', fatal=False))
+
+        page_doc = self._download_webpage(
+            url, video_id, 'Downloading video information')
+        mobj = re.search(
+            r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)",
+            page_doc)
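+        # The progressive MP4 URL appears to be derived from the video id, the
+        # numeric "secret" and the player width scraped from the embed call above.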
+        video_url = (
+            'http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4' %
+            (video_id, video_id, mobj.group('secret'), mobj.group('width')))
+
+        formats = [{
+            'url': video_url,
+            'ext': 'mp4',
+            'width': int(mobj.group('width')),
+        }]
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'formats': formats,
+            'thumbnail': mobj.group('thumbnail'),
+        }
diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py
new file mode 100644 (file)
index 0000000..a28b1a2
--- /dev/null
@@ -0,0 +1,171 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class RadioCanadaIE(InfoExtractor):
+    IE_NAME = 'radiocanada'
+    _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
+    _TESTS = [
+        {
+            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+            'info_dict': {
+                'id': '7184272',
+                'ext': 'mp4',
+                'title': 'Le parcours du tireur capté sur vidéo',
+                'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+                'upload_date': '20141023',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
+        },
+        {
+            # empty Title
+            'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/',
+            'info_dict': {
+                'id': '7754998',
+                'ext': 'mp4',
+                'title': 'letelejournal22h',
+                'description': 'INTEGRALE WEB 22H-TJ',
+                'upload_date': '20170720',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            # with protectionType but not actually DRM protected
+            'url': 'radiocanada:toutv:140872',
+            'info_dict': {
+                'id': '140872',
+                'title': 'Épisode 1',
+                'series': 'District 31',
+            },
+            'only_matching': True,
+        }
+    ]
+    _GEO_COUNTRIES = ['CA']
+    _access_token = None
+    _claims = None
+
+    def _call_api(self, path, video_id=None, app_code=None, query=None):
+        if not query:
+            query = {}
+        query.update({
+            'client_key': '773aea60-0e80-41bb-9c7f-e6d7c3ad17fb',
+            'output': 'json',
+        })
+        if video_id:
+            query.update({
+                'appCode': app_code,
+                'idMedia': video_id,
+            })
+        if self._access_token:
+            query['access_token'] = self._access_token
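+        # 401/422 responses still carry a JSON body with a human-readable
+        # error message.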
+        try:
+            return self._download_json(
+                'https://services.radio-canada.ca/media/' + path, video_id, query=query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 422):
+                data = self._parse_json(e.cause.read().decode(), None)
+                error = data.get('error_description') or data['errorMessage']['text']
+                raise ExtractorError(error, expected=True)
+            raise
+
+    def _extract_info(self, app_code, video_id):
+        metas = self._call_api('meta/v1/index.ashx', video_id, app_code)['Metas']
+
+        def get_meta(name):
+            for meta in metas:
+                if meta.get('name') == name:
+                    text = meta.get('text')
+                    if text:
+                        return text
+
+        # protectionType does not necessarily mean the video is DRM protected (see
+        # https://github.com/ytdl-org/youtube-dl/pull/18609).
+        if get_meta('protectionType'):
+            self.report_warning('This video is probably DRM protected.')
+
+        query = {
+            'connectionType': 'hd',
+            'deviceType': 'ipad',
+            'multibitrate': 'true',
+        }
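+        # Validation claims, when available, appear to be what unlocks
+        # premium content.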
+        if self._claims:
+            query['claims'] = self._claims
+        v_data = self._call_api('validation/v2/', video_id, app_code, query)
+        v_url = v_data.get('url')
+        if not v_url:
+            error = v_data['message']
+            if error == "Le contenu sélectionné n'est pas disponible dans votre pays":
+                raise self.raise_geo_restricted(error, self._GEO_COUNTRIES)
+            if error == 'Le contenu sélectionné est disponible seulement en premium':
+                self.raise_login_required(error)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        formats = self._extract_m3u8_formats(v_url, video_id, 'mp4')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5')
+        if closed_caption_url:
+            subtitles['fr'] = [{
+                'url': closed_caption_url,
+                'ext': determine_ext(closed_caption_url, 'vtt'),
+            }]
+
+        return {
+            'id': video_id,
+            'title': get_meta('Title') or get_meta('AV-nomEmission'),
+            'description': get_meta('Description') or get_meta('ShortDescription'),
+            'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
+            'duration': int_or_none(get_meta('length')),
+            'series': get_meta('Emission'),
+            'season_number': int_or_none(get_meta('SrcSaison')),
+            'episode_number': int_or_none(get_meta('SrcEpisode')),
+            'upload_date': unified_strdate(get_meta('Date')),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_info(*re.match(self._VALID_URL, url).groups())
+
+
+class RadioCanadaAudioVideoIE(InfoExtractor):
+    IE_NAME = 'radiocanada:audiovideo'
+    _VALID_URL = r'https?://ici\.radio-canada\.ca/([^/]+/)*media-(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
+        'info_dict': {
+            'id': '7527184',
+            'ext': 'mp4',
+            'title': 'Barack Obama au Vietnam',
+            'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+            'upload_date': '20160523',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://ici.radio-canada.ca/info/videos/media-7527184/barack-obama-au-vietnam',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
diff --git a/youtube_dl/extractor/radiode.py b/youtube_dl/extractor/radiode.py
new file mode 100644 (file)
index 0000000..2c06c8b
--- /dev/null
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RadioDeIE(InfoExtractor):
+    IE_NAME = 'radio.de'
+    _VALID_URL = r'https?://(?P<id>.+?)\.(?:radio\.(?:de|at|fr|pt|es|pl|it)|rad\.io)'
+    _TEST = {
+        'url': 'http://ndr2.radio.de/',
+        'info_dict': {
+            'id': 'ndr2',
+            'ext': 'mp3',
+            'title': 're:^NDR 2 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:591c49c702db1a33751625ebfb67f273',
+            'thumbnail': r're:^https?://.*\.png',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        radio_id = self._match_id(url)
+        webpage = self._download_webpage(url, radio_id)
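+        # The station metadata is embedded in the page as a JS object passed
+        # to the stationService component.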
+        jscode = self._search_regex(
+            r"'components/station/stationService':\s*\{\s*'?station'?:\s*(\{.*?\s*\}),\n",
+            webpage, 'broadcast')
+
+        broadcast = self._parse_json(jscode, radio_id)
+        title = self._live_title(broadcast['name'])
+        description = broadcast.get('description') or broadcast.get('shortDescription')
+        thumbnail = broadcast.get('picture4Url') or broadcast.get('picture4TransUrl') or broadcast.get('logo100x100')
+
+        formats = [{
+            'url': stream['streamUrl'],
+            'ext': stream['streamContentFormat'].lower(),
+            'acodec': stream['streamContentFormat'],
+            'abr': stream['bitRate'],
+            'asr': stream['sampleRate']
+        } for stream in broadcast['streamUrls']]
+        self._sort_formats(formats)
+
+        return {
+            'id': radio_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'is_live': True,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py
new file mode 100644 (file)
index 0000000..a8afc00
--- /dev/null
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class RadioFranceIE(InfoExtractor):
+    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
+    IE_NAME = 'radiofrance'
+
+    _TEST = {
+        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
+        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+        'info_dict': {
+            'id': 'one-one',
+            'ext': 'ogg',
+            'title': 'One to one',
+            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+            'uploader': 'Thomas Hercouët',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+        description = self._html_search_regex(
+            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
+            webpage, 'description', fatal=False)
+        uploader = self._html_search_regex(
+            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
+            webpage, 'uploader', fatal=False)
+
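+        # The jPlayer data-source attribute holds a JS object mapping format
+        # ids to audio URLs.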
+        formats_str = self._html_search_regex(
+            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
+            webpage, 'audio URLs')
+        formats = [
+            {
+                'format_id': fm[0],
+                'url': fm[1],
+                'vcodec': 'none',
+                'preference': i,
+            }
+            for i, fm in
+            enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
+        ]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+        }
diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py
new file mode 100644 (file)
index 0000000..3f74f0c
--- /dev/null
@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_resolution,
+    str_to_int,
+    unified_strdate,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class RadioJavanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?radiojavan\.com/videos/video/(?P<id>[^/]+)/?'
+    _TEST = {
+        'url': 'http://www.radiojavan.com/videos/video/chaartaar-ashoobam',
+        'md5': 'e85208ffa3ca8b83534fca9fe19af95b',
+        'info_dict': {
+            'id': 'chaartaar-ashoobam',
+            'ext': 'mp4',
+            'title': 'Chaartaar - Ashoobam',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'upload_date': '20150215',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        download_host = self._download_json(
+            'https://www.radiojavan.com/videos/video_host', video_id,
+            data=urlencode_postdata({'id': video_id}),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': url,
+            }).get('host', 'https://host1.rjmusicmedia.com')
+
+        webpage = self._download_webpage(url, video_id)
+
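+        # Download links are exposed as inline JS assignments (e.g.
+        # RJ.video480p = '...'); each path is resolved against the
+        # per-request host obtained above.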
+        formats = []
+        for format_id, _, video_path in re.findall(
+                r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
+                webpage):
+            f = parse_resolution(format_id)
+            f.update({
+                'url': urljoin(download_host, video_path),
+                'format_id': format_id,
+            })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        upload_date = unified_strdate(self._search_regex(
+            r'class="date_added">Date added: ([^<]+)<',
+            webpage, 'upload date', fatal=False))
+
+        view_count = str_to_int(self._search_regex(
+            r'class="views">Plays: ([\d,]+)',
+            webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._search_regex(
+            r'class="rating">([\d,]+) likes',
+            webpage, 'like count', fatal=False))
+        dislike_count = str_to_int(self._search_regex(
+            r'class="rating">([\d,]+) dislikes',
+            webpage, 'dislike count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
new file mode 100644 (file)
index 0000000..207a6c2
--- /dev/null
@@ -0,0 +1,502 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urlparse,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    find_xpath_attr,
+    fix_xml_ampersands,
+    GeoRestrictedError,
+    int_or_none,
+    parse_duration,
+    strip_or_none,
+    try_get,
+    unescapeHTML,
+    unified_strdate,
+    unified_timestamp,
+    update_url_query,
+    urljoin,
+    xpath_text,
+)
+
+
+class RaiBaseIE(InfoExtractor):
+    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+    _GEO_COUNTRIES = ['IT']
+    _GEO_BYPASS = False
+
+    def _extract_relinker_info(self, relinker_url, video_id):
+        if not re.match(r'https?://', relinker_url):
+            return {'formats': [{'url': relinker_url}]}
+
+        formats = []
+        geoprotection = None
+        is_live = None
+        duration = None
+
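+        # Query the relinker once per platform; each platform appears to expose
+        # a different manifest type (HLS for 'mon', HDS for 'flash' and
+        # typically a progressive URL for 'native').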
+        for platform in ('mon', 'flash', 'native'):
+            relinker = self._download_xml(
+                relinker_url, video_id,
+                note='Downloading XML metadata for platform %s' % platform,
+                transform_source=fix_xml_ampersands,
+                query={'output': 45, 'pl': platform},
+                headers=self.geo_verification_headers())
+
+            if not geoprotection:
+                geoprotection = xpath_text(
+                    relinker, './geoprotection', default=None) == 'Y'
+
+            if not is_live:
+                is_live = xpath_text(
+                    relinker, './is_live', default=None) == 'Y'
+            if not duration:
+                duration = parse_duration(xpath_text(
+                    relinker, './duration', default=None))
+
+            url_elem = find_xpath_attr(relinker, './url', 'type', 'content')
+            if url_elem is None:
+                continue
+
+            media_url = url_elem.text
+
+            # This does not imply geo restriction (e.g.
+            # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
+            if media_url == 'http://download.rai.it/video_no_available.mp4':
+                continue
+
+            ext = determine_ext(media_url)
+            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
+                continue
+
+            if ext == 'm3u8' or 'format=m3u8' in media_url or platform == 'mon':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m' or platform == 'flash':
+                manifest_url = update_url_query(
+                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
+                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                bitrate = int_or_none(xpath_text(relinker, 'bitrate')) or 0
+                formats.append({
+                    'url': media_url,
+                    'tbr': bitrate if bitrate > 0 else None,
+                    'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
+                })
+
+        if not formats and geoprotection is True:
+            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+        return dict((k, v) for k, v in {
+            'is_live': is_live,
+            'duration': duration,
+            'formats': formats,
+        }.items() if v is not None)
+
+    @staticmethod
+    def _extract_subtitles(url, subtitle_url):
+        subtitles = {}
+        if subtitle_url and isinstance(subtitle_url, compat_str):
+            subtitle_url = urljoin(url, subtitle_url)
+            STL_EXT = '.stl'
+            SRT_EXT = '.srt'
+            subtitles['it'] = [{
+                'ext': 'stl',
+                'url': subtitle_url,
+            }]
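+            # An SRT rendition is typically published at the same URL with the
+            # extension swapped.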
+            if subtitle_url.endswith(STL_EXT):
+                srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT
+                subtitles['it'].append({
+                    'ext': 'srt',
+                    'url': srt_url,
+                })
+        return subtitles
+
+
+class RaiPlayIE(RaiBaseIE):
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE
+    _TESTS = [{
+        'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
+        'md5': '340aa3b7afb54bfd14a8c11786450d76',
+        'info_dict': {
+            'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
+            'ext': 'mp4',
+            'title': 'La Casa Bianca',
+            'alt_title': 'S2016 - Puntata del 23/10/2016',
+            'description': 'md5:a09d45890850458077d1f68bb036e0a5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Rai 3',
+            'creator': 'Rai 3',
+            'duration': 3278,
+            'timestamp': 1477764300,
+            'upload_date': '20161029',
+            'series': 'La Casa Bianca',
+            'season': '2016',
+        },
+    }, {
+        'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
+        'md5': '8970abf8caf8aef4696e7b1f2adfc696',
+        'info_dict': {
+            'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
+            'ext': 'mp4',
+            'title': 'Report del 07/04/2014',
+            'alt_title': 'S2013/14 - Puntata del 07/04/2014',
+            'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Rai 5',
+            'creator': 'Rai 5',
+            'duration': 6160,
+            'series': 'Report',
+            'season_number': 5,
+            'season': '2013/14',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        url, video_id = mobj.group('url', 'id')
+
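+        # Appending ?json to a raiplay.it page URL returns the page metadata
+        # as JSON.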
+        media = self._download_json(
+            '%s?json' % url, video_id, 'Downloading video JSON')
+
+        title = media['name']
+
+        video = media['video']
+
+        relinker_info = self._extract_relinker_info(video['contentUrl'], video_id)
+        self._sort_formats(relinker_info['formats'])
+
+        thumbnails = []
+        for value in (media.get('images') or {}).values():
+            if value:
+                thumbnails.append({
+                    'url': value.replace('[RESOLUTION]', '600x400')
+                })
+
+        timestamp = unified_timestamp(try_get(
+            media, lambda x: x['availabilities'][0]['start'], compat_str))
+
+        subtitles = self._extract_subtitles(url, video.get('subtitles'))
+
+        info = {
+            'id': video_id,
+            'title': self._live_title(title) if relinker_info.get(
+                'is_live') else title,
+            'alt_title': media.get('subtitle'),
+            'description': media.get('description'),
+            'uploader': strip_or_none(media.get('channel')),
+            'creator': strip_or_none(media.get('editor')),
+            'duration': parse_duration(video.get('duration')),
+            'timestamp': timestamp,
+            'thumbnails': thumbnails,
+            'series': try_get(
+                media, lambda x: x['isPartOf']['name'], compat_str),
+            'season_number': int_or_none(try_get(
+                media, lambda x: x['isPartOf']['numeroStagioni'])),
+            'season': media.get('stagione') or None,
+            'subtitles': subtitles,
+        }
+
+        info.update(relinker_info)
+        return info
+
+
+class RaiPlayLiveIE(RaiBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.raiplay.it/dirette/rainews24',
+        'info_dict': {
+            'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
+            'display_id': 'rainews24',
+            'ext': 'mp4',
+            'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:6eca31500550f9376819f174e5644754',
+            'uploader': 'Rai News 24',
+            'creator': 'Rai News 24',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE,
+            webpage, 'content id')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': RaiPlayIE.ie_key(),
+            'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
+            'id': video_id,
+            'display_id': display_id,
+        }
+
+
+class RaiPlayPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',
+        'info_dict': {
+            'id': 'nondirloalmiocapo',
+            'title': 'Non dirlo al mio capo',
+            'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86',
+        },
+        'playlist_mincount': 12,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        title = self._html_search_meta(
+            ('programma', 'nomeProgramma'), webpage, 'title')
+        description = unescapeHTML(self._html_search_meta(
+            ('description', 'og:description'), webpage, 'description'))
+
+        entries = []
+        for mobj in re.finditer(
+                r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1',
+                webpage):
+            video_url = urljoin(url, mobj.group('path'))
+            entries.append(self.url_result(
+                video_url, ie=RaiPlayIE.ie_key(),
+                video_id=RaiPlayIE._match_id(video_url)))
+
+        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class RaiIE(RaiBaseIE):
+    _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
+    _TESTS = [{
+        # var uniquename = "ContentItem-..."
+        # data-id="ContentItem-..."
+        'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
+        'info_dict': {
+            'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
+            'ext': 'mp4',
+            'title': 'TG PRIMO TEMPO',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1758,
+            'upload_date': '20140612',
+        }
+    }, {
+        # with ContentItem in many metas
+        'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
+        'info_dict': {
+            'id': '1632c009-c843-4836-bb65-80c33084a64b',
+            'ext': 'mp4',
+            'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"',
+            'description': 'I film in uscita questa settimana.',
+            'thumbnail': r're:^https?://.*\.png$',
+            'duration': 833,
+            'upload_date': '20161103',
+        }
+    }, {
+        # with ContentItem in og:url
+        'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
+        'md5': '11959b4e44fa74de47011b5799490adf',
+        'info_dict': {
+            'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
+            'ext': 'mp4',
+            'title': 'TG1 ore 20:00 del 03/11/2016',
+            'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2214,
+            'upload_date': '20161103',
+        }
+    }, {
+        # drawMediaRaiTV(...)
+        'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+        'md5': '2dd727e61114e1ee9c47f0da6914e178',
+        'info_dict': {
+            'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
+            'ext': 'mp4',
+            'title': 'Il pacco',
+            'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20141221',
+        },
+    }, {
+        # initEdizione('ContentItem-...'
+        'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
+        'info_dict': {
+            'id': 'c2187016-8484-4e3a-8ac8-35e475b07303',
+            'ext': 'mp4',
+            'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}',
+            'duration': 2274,
+            'upload_date': '20170401',
+        },
+        'skip': 'Changes daily',
+    }, {
+        # HDS live stream with only relinker URL
+        'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+        'info_dict': {
+            'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+            'ext': 'flv',
+            'title': 'EuroNews',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # HLS live stream with ContentItem in og:url
+        'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
+        'info_dict': {
+            'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
+            'ext': 'mp4',
+            'title': 'La diretta di Rainews24',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # Direct MMS URL
+        'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html',
+        'only_matching': True,
+    }]
+
+    def _extract_from_content_id(self, content_id, url):
+        media = self._download_json(
+            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
+            content_id, 'Downloading video JSON')
+
+        title = media['name'].strip()
+
+        media_type = media['type']
+        if 'Audio' in media_type:
+            relinker_info = {
+                'formats': [{
+                    'format_id': media.get('formatoAudio'),
+                    'url': media['audioUrl'],
+                    'ext': media.get('formatoAudio'),
+                }]
+            }
+        elif 'Video' in media_type:
+            relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
+        else:
+            raise ExtractorError('not a media file')
+
+        self._sort_formats(relinker_info['formats'])
+
+        thumbnails = []
+        for image_type in ('image', 'image_medium', 'image_300'):
+            thumbnail_url = media.get(image_type)
+            if thumbnail_url:
+                thumbnails.append({
+                    'url': compat_urlparse.urljoin(url, thumbnail_url),
+                })
+
+        subtitles = self._extract_subtitles(url, media.get('subtitlesUrl'))
+
+        info = {
+            'id': content_id,
+            'title': title,
+            'description': strip_or_none(media.get('desc')),
+            'thumbnails': thumbnails,
+            'uploader': media.get('author'),
+            'upload_date': unified_strdate(media.get('date')),
+            'duration': parse_duration(media.get('length')),
+            'subtitles': subtitles,
+        }
+
+        info.update(relinker_info)
+
+        return info
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        content_item_id = None
+
+        content_item_url = self._html_search_meta(
+            ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url',
+             'twitter:player', 'jsonlink'), webpage, default=None)
+        if content_item_url:
+            content_item_id = self._search_regex(
+                r'ContentItem-(%s)' % self._UUID_RE, content_item_url,
+                'content item id', default=None)
+
+        if not content_item_id:
+            content_item_id = self._search_regex(
+                r'''(?x)
+                    (?:
+                        (?:initEdizione|drawMediaRaiTV)\(|
+                        <(?:[^>]+\bdata-id|var\s+uniquename)=
+                    )
+                    (["\'])
+                    (?:(?!\1).)*\bContentItem-(?P<id>%s)
+                ''' % self._UUID_RE,
+                webpage, 'content item id', default=None, group='id')
+
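+        # Try each candidate ContentItem id in turn; geo restrictions are fatal,
+        # while other extraction errors fall through to the next candidate and
+        # finally to relinker URL scraping below.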
+        content_item_ids = set()
+        if content_item_id:
+            content_item_ids.add(content_item_id)
+        if video_id not in content_item_ids:
+            content_item_ids.add(video_id)
+
+        for content_item_id in content_item_ids:
+            try:
+                return self._extract_from_content_id(content_item_id, url)
+            except GeoRestrictedError:
+                raise
+            except ExtractorError:
+                pass
+
+        relinker_url = self._search_regex(
+            r'''(?x)
+                (?:
+                    var\s+videoURL|
+                    mediaInfo\.mediaUri
+                )\s*=\s*
+                ([\'"])
+                (?P<url>
+                    (?:https?:)?
+                    //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?
+                    (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1
+            ''',
+            webpage, 'relinker URL', group='url')
+
+        relinker_info = self._extract_relinker_info(
+            urljoin(url, relinker_url), video_id)
+        self._sort_formats(relinker_info['formats'])
+
+        title = self._search_regex(
+            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
+            webpage, 'title', group='title',
+            default=None) or self._og_search_title(webpage)
+
+        info = {
+            'id': video_id,
+            'title': title,
+        }
+
+        info.update(relinker_info)
+
+        return info
diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py
new file mode 100644 (file)
index 0000000..5411ece
--- /dev/null
@@ -0,0 +1,179 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .vimeo import VimeoIE
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+    try_get,
+    unescapeHTML,
+    unified_timestamp,
+    urljoin,
+)
+
+
+class RayWenderlichIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            videos\.raywenderlich\.com/courses|
+                            (?:www\.)?raywenderlich\.com
+                        )/
+                        (?P<course_id>[^/]+)/lessons/(?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1',
+        'info_dict': {
+            'id': '248377018',
+            'ext': 'mp4',
+            'title': 'Introduction',
+            'description': 'md5:804d031b3efa9fcb49777d512d74f722',
+            'timestamp': 1513906277,
+            'upload_date': '20171222',
+            'duration': 133,
+            'uploader': 'Ray Wenderlich',
+            'uploader_id': 'user3304672',
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': True,
+        },
+        'add_ie': [VimeoIE.ie_key()],
+        'expected_warnings': ['HTTP Error 403: Forbidden'],
+    }, {
+        'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_video_id(data, lesson_id):
+        if not data:
+            return
+        groups = try_get(data, lambda x: x['groups'], list) or []
+        if not groups:
+            return
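+        # Walk the course groups and match the lesson by its ordinal to
+        # recover the video identifier.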
+        for group in groups:
+            if not isinstance(group, dict):
+                continue
+            contents = try_get(group, lambda x: x['contents'], list) or []
+            for content in contents:
+                if not isinstance(content, dict):
+                    continue
+                ordinal = int_or_none(content.get('ordinal'))
+                if ordinal != int_or_none(lesson_id):
+                    continue
+                video_id = content.get('identifier')
+                if video_id:
+                    return compat_str(video_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_id, lesson_id = mobj.group('course_id', 'id')
+        display_id = '%s/%s' % (course_id, lesson_id)
+
+        webpage = self._download_webpage(url, display_id)
+
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:image', webpage, 'thumbnail')
+
+        if '>Subscribe to unlock' in webpage:
+            raise ExtractorError(
+                'This content is only available for subscribers',
+                expected=True)
+
+        info = {
+            'thumbnail': thumbnail,
+        }
+
+        vimeo_id = self._search_regex(
+            r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None)
+
+        if not vimeo_id:
+            data = self._parse_json(
+                self._search_regex(
+                    r'data-collection=(["\'])(?P<data>{.+?})\1', webpage,
+                    'data collection', default='{}', group='data'),
+                display_id, transform_source=unescapeHTML, fatal=False)
+            video_id = self._extract_video_id(
+                data, lesson_id) or self._search_regex(
+                r'/videos/(\d+)/', thumbnail, 'video id')
+            headers = {
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+            }
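+            # The videos API expects an XHR request and, when the page
+            # provides one, a CSRF token.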
+            csrf_token = self._html_search_meta(
+                'csrf-token', webpage, 'csrf token', default=None)
+            if csrf_token:
+                headers['X-CSRF-Token'] = csrf_token
+            video = self._download_json(
+                'https://videos.raywenderlich.com/api/v1/videos/%s.json'
+                % video_id, display_id, headers=headers)['video']
+            vimeo_id = video['clips'][0]['provider_id']
+            info.update({
+                '_type': 'url_transparent',
+                'title': video.get('name'),
+                'description': video.get('description') or video.get(
+                    'meta_description'),
+                'duration': int_or_none(video.get('duration')),
+                'timestamp': unified_timestamp(video.get('created_at')),
+            })
+
+        return merge_dicts(info, self.url_result(
+            VimeoIE._smuggle_referrer(
+                'https://player.vimeo.com/video/%s' % vimeo_id, url),
+            ie=VimeoIE.ie_key(), video_id=vimeo_id))
+
+
+class RayWenderlichCourseIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            videos\.raywenderlich\.com/courses|
+                            (?:www\.)?raywenderlich\.com
+                        )/
+                        (?P<id>[^/]+)
+                    '''
+
+    _TEST = {
+        'url': 'https://www.raywenderlich.com/3530-testing-in-ios',
+        'info_dict': {
+            'title': 'Testing in iOS',
+            'id': '3530-testing-in-ios',
+        },
+        'params': {
+            'noplaylist': False,
+        },
+        'playlist_count': 29,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if RayWenderlichIE.suitable(url) else super(
+            RayWenderlichCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_id)
+
+        entries = []
+        lesson_urls = set()
+        for lesson_url in re.findall(
+                r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage):
+            if lesson_url in lesson_urls:
+                continue
+            lesson_urls.add(lesson_url)
+            entries.append(self.url_result(
+                urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key()))
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, 'title', default=None)
+
+        return self.playlist_result(entries, course_id, title)
diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py
new file mode 100644 (file)
index 0000000..ae7413f
--- /dev/null
@@ -0,0 +1,72 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    int_or_none,
+    unified_timestamp,
+    update_url_query,
+)
+
+
+class RBMARadioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011',
+        'md5': '6bc6f9bcb18994b4c983bc3bf4384d95',
+        'info_dict': {
+            'id': 'ford-lopatin-live-at-primavera-sound-2011',
+            'ext': 'mp3',
+            'title': 'Main Stage - Ford & Lopatin at Primavera Sound',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2452,
+            'timestamp': 1307103164,
+            'upload_date': '20110603',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        show_id = mobj.group('show_id')
+        episode_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, episode_id)
+
+        episode = self._parse_json(
+            self._search_regex(
+                r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>',
+                webpage, 'json data'),
+            episode_id)['episodes'][show_id][episode_id]
+
+        title = episode['title']
+
+        show_title = episode.get('showTitle')
+        if show_title:
+            title = '%s - %s' % (show_title, title)
+
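+        # The audio URL takes a cbr parameter selecting a constant bitrate;
+        # _check_formats drops the variants that are not actually served.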
+        formats = [{
+            'url': update_url_query(episode['audioURL'], query={'cbr': abr}),
+            'format_id': compat_str(abr),
+            'abr': abr,
+            'vcodec': 'none',
+        } for abr in (96, 128, 192, 256)]
+        self._check_formats(formats, episode_id)
+
+        description = clean_html(episode.get('longTeaser'))
+        thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape'))
+        duration = int_or_none(episode.get('duration'))
+        timestamp = unified_timestamp(episode.get('publishedAt'))
+
+        return {
+            'id': episode_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
new file mode 100644 (file)
index 0000000..8c016a7
--- /dev/null
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+    js_to_json,
+)
+from ..compat import compat_str
+
+
+class RDSIE(InfoExtractor):
+    IE_DESC = 'RDS.ca'
+    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'
+
+    _TESTS = [{
+        'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
+        'info_dict': {
+            'id': '604333',
+            'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
+            'ext': 'flv',
+            'title': 'Fowler Jr. prend la direction de Jacksonville',
+            'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
+            'timestamp': 1430397346,
+            'upload_date': '20150430',
+            'duration': 154.354,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
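+        # The video metadata is embedded in the page as a JS object assigned
+        # to itemToPush.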
+        item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json)
+        video_id = compat_str(item['id'])
+        title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(
+            [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
+             r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
+            webpage, 'thumbnail', fatal=False)
+        timestamp = parse_iso8601(self._search_regex(
+            r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"',
+            webpage, 'upload date', fatal=False))
+        duration = parse_duration(self._search_regex(
+            r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"',
+            webpage, 'duration', fatal=False))
+        age_limit = self._family_friendly_search(webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'display_id': display_id,
+            'url': '9c9media:rds_web:%s' % video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'age_limit': age_limit,
+            'ie_key': 'NineCNineMedia',
+        }
diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py
new file mode 100644 (file)
index 0000000..dbe1aad
--- /dev/null
@@ -0,0 +1,128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    float_or_none,
+    ExtractorError,
+)
+
+
+class RedBullTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)(?:/events/[^/]+)?/(?:videos?|live)/(?P<id>AP-\w+)'
+    _TESTS = [{
+        # film
+        'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11',
+        'md5': 'fb0445b98aa4394e504b413d98031d1f',
+        'info_dict': {
+            'id': 'AP-1Q6XCDTAN1W11',
+            'ext': 'mp4',
+            'title': 'ABC of... WRC - ABC of... S1E6',
+            'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31',
+            'duration': 1582.04,
+        },
+    }, {
+        # episode
+        'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11',
+        'info_dict': {
+            'id': 'AP-1PMHKJFCW1W11',
+            'ext': 'mp4',
+            'title': 'Grime - Hashtags S2E4',
+            'description': 'md5:b5f522b89b72e1e23216e5018810bb25',
+            'duration': 904.6,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.redbull.com/us-en/videos/AP-1YM9QCYE52111',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.redbull.com/us-en/events/AP-1XV2K61Q51W11/live/AP-1XUJ86FDH1W11',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
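+        # An anonymous session token is required both for the metadata API and
+        # for the DMS playlist URL below.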
+        session = self._download_json(
+            'https://api.redbull.tv/v3/session', video_id,
+            note='Downloading access token', query={
+                'category': 'personal_computer',
+                'os_family': 'http',
+            })
+        if session.get('code') == 'error':
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, session['message']))
+        token = session['token']
+
+        try:
+            video = self._download_json(
+                'https://api.redbull.tv/v3/products/' + video_id,
+                video_id, note='Downloading video information',
+                headers={'Authorization': token}
+            )
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+                error_message = self._parse_json(
+                    e.cause.read().decode(), video_id)['error']
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, error_message), expected=True)
+            raise
+
+        title = video['title'].strip()
+
+        formats = self._extract_m3u8_formats(
+            'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token),
+            video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for resource in video.get('resources', []):
+            if resource.startswith('closed_caption_'):
+                split_resource = resource.split('_')
+                if split_resource[2]:
+                    subtitles.setdefault('en', []).append({
+                        'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource),
+                        'ext': split_resource[2],
+                    })
+
+        subheading = video.get('subheading')
+        if subheading:
+            title += ' - %s' % subheading
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('long_description') or video.get(
+                'short_description'),
+            'duration': float_or_none(video.get('duration'), scale=1000),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class RedBullTVRrnContentIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com(?:/[^/]+)?(?:/tv)?)/(?:video|live)/rrn:content:[^:]+:(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TESTS = [{
+        'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:live-videos:e3e6feb4-e95f-50b7-962a-c70f8fd13c73/mens-dh-finals-fort-william',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.redbull.com/int-en/tv/video/rrn:content:videos:a36a0f36-ff1b-5db8-a69d-ee11a14bf48b/tn-ts-style?playlist=rrn:content:event-profiles:83f05926-5de8-5389-b5e4-9bb312d715e8:extras',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
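+        # The og:url meta points at the canonical AP- video URL, which
+        # RedBullTVIE handles directly.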
+        video_url = self._og_search_url(webpage)
+
+        return self.url_result(
+            video_url, ie=RedBullTVIE.ie_key(),
+            video_id=RedBullTVIE._match_id(video_url))
diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py
new file mode 100644 (file)
index 0000000..663f622
--- /dev/null
@@ -0,0 +1,130 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    url_or_none,
+)
+
+
+class RedditIE(InfoExtractor):
+    _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
+    _TEST = {
+        # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
+        'url': 'https://v.redd.it/zv89llsvexdz',
+        'md5': '0a070c53eba7ec4534d95a5a1259e253',
+        'info_dict': {
+            'id': 'zv89llsvexdz',
+            'ext': 'mp4',
+            'title': 'zv89llsvexdz',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
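+        # v.redd.it exposes both HLS and DASH manifests at predictable
+        # per-video URLs.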
+        formats = self._extract_m3u8_formats(
+            'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
+            'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+
+        formats.extend(self._extract_mpd_formats(
+            'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
+            mpd_id='dash', fatal=False))
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+        }
+
+
+class RedditRIE(InfoExtractor):
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/?#&]+))'
+    _TESTS = [{
+        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
+        'info_dict': {
+            'id': 'zv89llsvexdz',
+            'ext': 'mp4',
+            'title': 'That small heart attack.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1501941939,
+            'upload_date': '20170805',
+            'uploader': 'Antw87',
+            'like_count': int,
+            'dislike_count': int,
+            'comment_count': int,
+            'age_limit': 0,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
+        'only_matching': True,
+    }, {
+        # imgur
+        'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
+        'only_matching': True,
+    }, {
+        # imgur @ old reddit
+        'url': 'https://old.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
+        'only_matching': True,
+    }, {
+        # streamable
+        'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/',
+        'only_matching': True,
+    }, {
+        # youtube
+        'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/',
+        'only_matching': True,
+    }, {
+        # reddit video @ nm reddit
+        'url': 'https://nm.reddit.com/r/Cricket/comments/8idvby/lousy_cameraman_finds_himself_in_cairns_line_of/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        url, video_id = mobj.group('url', 'id')
+
+        data = self._download_json(
+            url + '/.json', video_id)[0]['data']['children'][0]['data']
+
+        video_url = data['url']
+
+        # Avoid recursing into the same reddit URL
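+        # e.g. for video_id '6rrwyj', a data['url'] of
+        # 'https://www.reddit.com/r/videos/comments/6rrwyj/...' would point
+        # back at this submission rather than at external media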
+        if 'reddit.com/' in video_url and '/%s/' % video_id in video_url:
+            raise ExtractorError('No media found', expected=True)
+
+        over_18 = data.get('over_18')
+        if over_18 is True:
+            age_limit = 18
+        elif over_18 is False:
+            age_limit = 0
+        else:
+            age_limit = None
+
+        return {
+            '_type': 'url_transparent',
+            'url': video_url,
+            'title': data.get('title'),
+            'thumbnail': url_or_none(data.get('thumbnail')),
+            'timestamp': float_or_none(data.get('created_utc')),
+            'uploader': data.get('author'),
+            'like_count': int_or_none(data.get('ups')),
+            'dislike_count': int_or_none(data.get('downs')),
+            'comment_count': int_or_none(data.get('num_comments')),
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
new file mode 100644 (file)
index 0000000..2d2f6a9
--- /dev/null
@@ -0,0 +1,133 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+    str_to_int,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class RedTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.redtube.com/66418',
+        'md5': 'fc08071233725f26b8f014dba9590005',
+        'info_dict': {
+            'id': '66418',
+            'ext': 'mp4',
+            'title': 'Sucked on a toilet',
+            'upload_date': '20110811',
+            'duration': 596,
+            'view_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://embed.redtube.com/?bgcolor=000000&id=1443286',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://www.redtube.com/%s' % video_id, video_id)
+
+        ERRORS = (
+            (('video-deleted-info', '>This video has been removed'), 'has been removed'),
+            (('private_video_text', '>This video is private', '>Send a friend request to its owner to be able to view it'), 'is private'),
+        )
+
+        for patterns, message in ERRORS:
+            if any(p in webpage for p in patterns):
+                raise ExtractorError(
+                    'Video %s %s' % (video_id, message), expected=True)
+
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        if not info.get('title'):
+            info['title'] = self._html_search_regex(
+                (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
+                 r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
+                webpage, 'title', group='title',
+                default=None) or self._og_search_title(webpage)
+
+        formats = []
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+            video_id, fatal=False)
+        if sources and isinstance(sources, dict):
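+            # sources is assumed to map a quality label to a URL, the label
+            # typically being the height as a string (e.g. '720'), which is
+            # why int_or_none(format_id) doubles as the height below.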
+            for format_id, format_url in sources.items():
+                if format_url:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'height': int_or_none(format_id),
+                    })
+        medias = self._parse_json(
+            self._search_regex(
+                r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
+                'media definitions', default='[]'),
+            video_id, fatal=False)
+        if medias and isinstance(medias, list):
+            for media in medias:
+                format_url = url_or_none(media.get('videoUrl'))
+                if not format_url:
+                    continue
+                if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        fatal=False))
+                    continue
+                format_id = media.get('quality')
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'height': int_or_none(format_id),
+                })
+        if not formats:
+            video_url = self._html_search_regex(
+                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+            formats.append({'url': video_url})
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'<span[^>]+>(?:ADDED|Published on) ([^<]+)<',
+            webpage, 'upload date', default=None))
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, default=None) or self._search_regex(
+                r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
+        view_count = str_to_int(self._search_regex(
+            (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)',
+             r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)',
+             r'<span[^>]+\bclass=["\']video_view_count[^>]*>\s*([\d,.]+)'),
+            webpage, 'view count', default=None))
+
+        # No self-labeling, but they describe themselves as
+        # "Home of Videos Porno"
+        age_limit = 18
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'ext': 'mp4',
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        })
diff --git a/youtube_dl/extractor/regiotv.py b/youtube_dl/extractor/regiotv.py
new file mode 100644 (file)
index 0000000..e250a52
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    sanitized_Request,
+    xpath_text,
+    xpath_with_ns,
+)
+
+
+class RegioTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.regio-tv.de/video/395808.html',
+        'info_dict': {
+            'id': '395808',
+            'ext': 'mp4',
+            'title': 'Wir in Ludwigsburg',
+            'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!',
+        }
+    }, {
+        'url': 'http://www.regio-tv.de/video/395808',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        key = self._search_regex(
+            r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key')
+        title = self._og_search_title(webpage)
+
+        SOAP_TEMPLATE = '<?xml version="1.0" encoding="utf-8"?><soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/"><soap:Body><{0} xmlns="http://v.telvi.de/"><key xsi:type="xsd:string">{1}</key></{0}></soap:Body></soap:Envelope>'
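+        # Rendered request body (illustrative only, with a hypothetical
+        # key 'abc123'):
+        #   <soap:Envelope ...><soap:Body>
+        #     <GetHTML5VideoData xmlns="http://v.telvi.de/">
+        #       <key xsi:type="xsd:string">abc123</key>
+        #     </GetHTML5VideoData>
+        #   </soap:Body></soap:Envelope>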
+
+        request = sanitized_Request(
+            'http://v.telvi.de/',
+            SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8'))
+        video_data = self._download_xml(request, video_id, 'Downloading video XML')
+
+        NS_MAP = {
+            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
+            'soap': 'http://schemas.xmlsoap.org/soap/envelope/',
+        }
+
+        video_url = xpath_text(
+            video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True)
+        thumbnail = xpath_text(
+            video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail')
+        description = self._og_search_description(
+            webpage) or self._html_search_meta('description', webpage)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/rentv.py b/youtube_dl/extractor/rentv.py
new file mode 100644 (file)
index 0000000..7c8909d
--- /dev/null
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    url_or_none,
+)
+
+
+class RENTVIE(InfoExtractor):
+    _VALID_URL = r'(?:rentv:|https?://(?:www\.)?ren\.tv/(?:player|video/epizod)/)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://ren.tv/video/epizod/118577',
+        'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
+        'info_dict': {
+            'id': '118577',
+            'ext': 'mp4',
+            'title': 'Документальный спецпроект: "Промывка мозгов. Технологии XXI века"',
+            'timestamp': 1472230800,
+            'upload_date': '20160826',
+        }
+    }, {
+        'url': 'http://ren.tv/player/118577',
+        'only_matching': True,
+    }, {
+        'url': 'rentv:118577',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage('http://ren.tv/player/' + video_id, video_id)
+        config = self._parse_json(self._search_regex(
+            r'config\s*=\s*({.+})\s*;', webpage, 'config'), video_id)
+        title = config['title']
+        formats = []
+        for video in config['src']:
+            src = url_or_none(video.get('src'))
+            if not src:
+                continue
+            ext = determine_ext(src)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': title,
+            'description': config.get('description'),
+            'thumbnail': config.get('image'),
+            'duration': int_or_none(config.get('duration')),
+            'timestamp': int_or_none(config.get('date')),
+            'formats': formats,
+        }
+
+
+class RENTVArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ren\.tv/novosti/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://ren.tv/novosti/2016-10-26/video-mikroavtobus-popavshiy-v-dtp-s-gruzovikami-v-podmoskove-prevratilsya-v',
+        'md5': 'ebd63c4680b167693745ab91343df1d6',
+        'info_dict': {
+            'id': '136472',
+            'ext': 'mp4',
+            'title': 'Видео: микроавтобус, попавший в ДТП с грузовиками в Подмосковье, превратился в груду металла',
+            'description': 'Жертвами столкновения двух фур и микроавтобуса, по последним данным, стали семь человек.',
+        }
+    }, {
+        # TODO: invalid m3u8
+        'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video',
+        'info_dict': {
+            'id': 'playlist',
+            'ext': 'mp4',
+            'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ',
+            'uploader': 'ren.tv',
+        },
+        'params': {
+            # m3u8 downloads
+            'skip_download': True,
+        },
+        'skip': 'invalid m3u8',
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        drupal_settings = self._parse_json(self._search_regex(
+            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+            webpage, 'drupal settings'), display_id)
+
+        entries = []
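+        # Each jwplayer profile in the Drupal settings carries a numeric
+        # 'mediaid'; these are re-dispatched through the rentv: URL scheme
+        # handled by RENTVIE above.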
+        for config_profile in drupal_settings.get('ren_jwplayer', {}).values():
+            media_id = config_profile.get('mediaid')
+            if not media_id:
+                continue
+            media_id = compat_str(media_id)
+            entries.append(self.url_result('rentv:' + media_id, 'RENTV', media_id))
+        return self.playlist_result(entries, display_id)
diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py
new file mode 100644 (file)
index 0000000..d47fb45
--- /dev/null
@@ -0,0 +1,44 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RestudyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|portal)\.)?restudy\.dk/video/[^/]+/id/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.restudy.dk/video/play/id/1637',
+        'info_dict': {
+            'id': '1637',
+            'ext': 'flv',
+            'title': 'Leiden-frosteffekt',
+            'description': 'Denne video er et eksperiment med flydende kvælstof.',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://portal.restudy.dk/video/leiden-frosteffekt/id/1637',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage).strip()
+        description = self._og_search_description(webpage).strip()
+
+        formats = self._extract_smil_formats(
+            'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id,
+            video_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py
new file mode 100644 (file)
index 0000000..9dc482d
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class ReutersIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562',
+        'md5': '8015113643a0b12838f160b0b81cc2ee',
+        'info_dict': {
+            'id': '368575562',
+            'ext': 'mp4',
+            'title': 'San Francisco police chief resigns',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id)
+        video_data = js_to_json(self._search_regex(
+            r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);',
+            webpage, 'video data'))
+
+        def get_json_value(key, fatal=False):
+            return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal)
+
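+        # The helper pulls single values out of the raw JS blob without a
+        # full JSON parse, e.g. get_json_value('title') on a blob containing
+        # '"title": "San Francisco police chief resigns"' returns
+        # 'San Francisco police chief resigns'.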
+        title = unescapeHTML(get_json_value('title', fatal=True))
+        mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups()
+
+        mas_data = self._download_json(
+            'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid),
+            video_id, transform_source=js_to_json)
+        formats = []
+        for f in mas_data:
+            f_url = f.get('url')
+            if not f_url:
+                continue
+            method = f.get('method')
+            if method == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+            else:
+                container = f.get('container')
+                ext = '3gp' if method == 'mobile' else container
+                formats.append({
+                    'format_id': ext,
+                    'url': f_url,
+                    'ext': ext,
+                    'container': container if method != 'mobile' else None,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': get_json_value('thumb'),
+            'duration': int_or_none(get_json_value('seconds')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/reverbnation.py b/youtube_dl/extractor/reverbnation.py
new file mode 100644 (file)
index 0000000..4cb99c2
--- /dev/null
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    str_or_none,
+)
+
+
+class ReverbNationIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?reverbnation\.com/.*?/song/(?P<id>\d+).*?$'
+    _TESTS = [{
+        'url': 'http://www.reverbnation.com/alkilados/song/16965047-mona-lisa',
+        'md5': 'c0aaf339bcee189495fdf5a8c8ba8645',
+        'info_dict': {
+            'id': '16965047',
+            'ext': 'mp3',
+            'title': 'MONA LISA',
+            'uploader': 'ALKILADOS',
+            'uploader_id': '216429',
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+
+        api_res = self._download_json(
+            'https://api.reverbnation.com/song/%s' % song_id,
+            song_id,
+            note='Downloading information of song %s' % song_id
+        )
+
+        THUMBNAILS = ('thumbnail', 'image')
+        quality = qualities(THUMBNAILS)
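+        # qualities() ranks by position in THUMBNAILS, so 'image'
+        # (preference 1) outranks 'thumbnail' (preference 0), presumably
+        # the larger rendition.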
+        thumbnails = []
+        for thumb_key in THUMBNAILS:
+            if api_res.get(thumb_key):
+                thumbnails.append({
+                    'url': api_res[thumb_key],
+                    'preference': quality(thumb_key)
+                })
+
+        return {
+            'id': song_id,
+            'title': api_res['name'],
+            'url': api_res['url'],
+            'uploader': api_res.get('artist', {}).get('name'),
+            'uploader_id': str_or_none(api_res.get('artist', {}).get('id')),
+            'thumbnails': thumbnails,
+            'ext': 'mp3',
+            'vcodec': 'none',
+        }
diff --git a/youtube_dl/extractor/rice.py b/youtube_dl/extractor/rice.py
new file mode 100644 (file)
index 0000000..f855719
--- /dev/null
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+    xpath_text,
+    xpath_element,
+    int_or_none,
+    parse_iso8601,
+    ExtractorError,
+)
+
+
+class RICEIE(InfoExtractor):
+    _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)'
+    _TEST = {
+        'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw',
+        'md5': '9b83b4a2eead4912dc3b7fac7c449b6a',
+        'info_dict': {
+            'id': 'YEWIvbhb40aqdjMD1ALSqw',
+            'ext': 'mp4',
+            'title': 'Active Learning in Archeology',
+            'upload_date': '20140616',
+            'timestamp': 1402926346,
+        }
+    }
+    _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config'
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+        if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        portal_id = qs['PortalID'][0]
+        playlist_id = qs['DestinationID'][0]
+        content_id = qs['ContentID'][0]
+
+        content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={
+            'portalId': portal_id,
+            'playlistId': playlist_id,
+            'contentId': content_id
+        })
+        metadata = xpath_element(content_data, './/metaData', fatal=True)
+        title = xpath_text(metadata, 'primaryTitle', fatal=True)
+        encodings = xpath_element(content_data, './/encodings', fatal=True)
+        player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={
+            'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True),
+            'contentId': content_id,
+        })
+
+        common_fmt = {}
+        dimensions = xpath_text(encodings, 'dimensions')
+        if dimensions:
+            wh = dimensions.split('x')
+            if len(wh) == 2:
+                common_fmt.update({
+                    'width': int_or_none(wh[0]),
+                    'height': int_or_none(wh[1]),
+                })
+
+        formats = []
+        rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS))
+        if rtsp_path:
+            fmt = {
+                'url': rtsp_path,
+                'format_id': 'rtsp',
+            }
+            fmt.update(common_fmt)
+            formats.append(fmt)
+        for source in player_data.findall(self._xpath_ns('.//Source', self._NS)):
+            video_url = xpath_text(source, self._xpath_ns('File', self._NS))
+            if not video_url:
+                continue
+            if '.m3u8' in video_url:
+                formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+            else:
+                fmt = {
+                    'url': video_url,
+                    'format_id': video_url.split(':')[0],
+                }
+                fmt.update(common_fmt)
+                rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url)
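+                # e.g. a hypothetical 'rtmp://host/app/mp4:clips/video.mp4'
+                # splits into url='rtmp://host/app', app='app' and
+                # play_path='mp4:clips/video.mp4'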
+                if rtmp:
+                    fmt.update({
+                        'url': rtmp.group('url'),
+                        'play_path': rtmp.group('playpath'),
+                        'app': rtmp.group('app'),
+                        'ext': 'flv',
+                    })
+                formats.append(fmt)
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for content_asset in content_data.findall('.//contentAssets'):
+            asset_type = xpath_text(content_asset, 'type')
+            if asset_type == 'image':
+                image_url = xpath_text(content_asset, 'httpPath')
+                if not image_url:
+                    continue
+                thumbnails.append({
+                    'id': xpath_text(content_asset, 'ID'),
+                    'url': image_url,
+                })
+
+        return {
+            'id': content_id,
+            'title': title,
+            'description': xpath_text(metadata, 'abstract'),
+            'duration': int_or_none(xpath_text(metadata, 'duration')),
+            'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py
new file mode 100644 (file)
index 0000000..c3623ed
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveLegacyIE
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+from ..utils import smuggle_url
+
+
+class RMCDecouverteIE(InfoExtractor):
+    _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))'
+
+    _TESTS = [{
+        'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/',
+        'info_dict': {
+            'id': '5983675500001',
+            'ext': 'mp4',
+            'title': 'CORVETTE',
+            'description': 'md5:c1e8295521e45ffebf635d6a7658f506',
+            'uploader_id': '1969646226001',
+            'upload_date': '20181226',
+            'timestamp': 1545861635,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'only available for a week',
+    }, {
+        # live, geo restricted, bypassable
+        'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id') or mobj.group('live_id')
+        webpage = self._download_webpage(url, display_id)
+        brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
+        if brightcove_legacy_url:
+            brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
+                brightcove_legacy_url).query)['@videoPlayer'][0]
+        else:
+            brightcove_id = self._search_regex(
+                r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
+        return self.url_result(
+            smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+                {'geo_countries': ['FR']}),
+            'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py
new file mode 100644 (file)
index 0000000..69934ef
--- /dev/null
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class Ro220IE(InfoExtractor):
+    IE_NAME = '220.ro'
+    _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/',
+        'md5': '03af18b73a07b4088753930db7a34add',
+        'info_dict': {
+            'id': 'LYV6doKo7f',
+            'ext': 'mp4',
+            'title': 'Luati-le Banii sez 4 ep 1',
+            'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        url = compat_urllib_parse_unquote(self._search_regex(
+            r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url'))
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        formats = [{
+            'format_id': 'sd',
+            'url': url,
+            'ext': 'mp4',
+        }]
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/rockstargames.py b/youtube_dl/extractor/rockstargames.py
new file mode 100644 (file)
index 0000000..cd6904b
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class RockstarGamesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.rockstargames.com/videos/video/11544/',
+        'md5': '03b5caa6e357a4bd50e3143fc03e5733',
+        'info_dict': {
+            'id': '11544',
+            'ext': 'mp4',
+            'title': 'Further Adventures in Finance and Felony Trailer',
+            'description': 'md5:6d31f55f30cb101b5476c4a379e324a3',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1464876000,
+            'upload_date': '20160602',
+        }
+    }, {
+        'url': 'http://www.rockstargames.com/videos#/?video=48',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://www.rockstargames.com/videoplayer/videos/get-video.json',
+            video_id, query={
+                'id': video_id,
+                'locale': 'en_us',
+            })['video']
+
+        title = video['title']
+
+        formats = []
+        # NB: use a distinct loop variable so that `video` keeps referring
+        # to the metadata dict, which is read again below.
+        for video_file in video['files_processed']['video/mp4']:
+            if not video_file.get('src'):
+                continue
+            resolution = video_file.get('resolution')
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', resolution or '', 'height', default=None))
+            formats.append({
+                'url': self._proto_relative_url(video_file['src']),
+                'format_id': resolution,
+                'height': height,
+            })
+
+        if not formats:
+            youtube_id = video.get('youtube_id')
+            if youtube_id:
+                return self.url_result(youtube_id, 'Youtube')
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': self._proto_relative_url(video.get('screencap')),
+            'timestamp': parse_iso8601(video.get('created')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py
new file mode 100644 (file)
index 0000000..8883639
--- /dev/null
@@ -0,0 +1,137 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    urlencode_postdata,
+)
+
+
+class RoosterTeethIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)'
+    _NETRC_MACHINE = 'roosterteeth'
+    _TESTS = [{
+        'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+        'md5': 'e2bd7764732d785ef797700a2489f212',
+        'info_dict': {
+            'id': '9156',
+            'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+            'ext': 'mp4',
+            'title': 'Million Dollars, But... The Game Announcement',
+            'description': 'md5:168a54b40e228e79f4ddb141e89fe4f5',
+            'thumbnail': r're:^https?://.*\.png$',
+            'series': 'Million Dollars, But...',
+            'episode': 'Million Dollars, But... The Game Announcement',
+        },
+    }, {
+        'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
+        'only_matching': True,
+    }, {
+        'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts',
+        'only_matching': True,
+    }, {
+        'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow',
+        'only_matching': True,
+    }, {
+        'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better',
+        'only_matching': True,
+    }, {
+        # only available for FIRST members
+        'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one',
+        'only_matching': True,
+    }, {
+        'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+        'only_matching': True,
+    }]
+    _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        try:
+            self._download_json(
+                'https://auth.roosterteeth.com/oauth/token',
+                None, 'Logging in', data=urlencode_postdata({
+                    'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
+                    'grant_type': 'password',
+                    'username': username,
+                    'password': password,
+                }))
+        except ExtractorError as e:
+            msg = 'Unable to login'
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
+                if resp:
+                    error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
+                    if error:
+                        msg += ': ' + error
+            self.report_warning(msg)
+
+    def _real_initialize(self):
+        if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'):
+            return
+        self._login()
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        api_episode_url = self._EPISODE_BASE_URL + display_id
+
+        try:
+            m3u8_url = self._download_json(
+                api_episode_url + '/videos', display_id,
+                'Downloading video JSON metadata')['data'][0]['attributes']['url']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
+                    self.raise_login_required(
+                        '%s is only available for FIRST members' % display_id)
+            raise
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        episode = self._download_json(
+            api_episode_url, display_id,
+            'Downloading episode JSON metadata')['data'][0]
+        attributes = episode['attributes']
+        title = attributes.get('title') or attributes['display_title']
+        video_id = compat_str(episode['id'])
+
+        thumbnails = []
+        for image in episode.get('included', {}).get('images', []):
+            if image.get('type') == 'episode_image':
+                img_attributes = image.get('attributes') or {}
+                for k in ('thumb', 'small', 'medium', 'large'):
+                    img_url = img_attributes.get(k)
+                    if img_url:
+                        thumbnails.append({
+                            'id': k,
+                            'url': img_url,
+                        })
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': attributes.get('description') or attributes.get('caption'),
+            'thumbnails': thumbnails,
+            'series': attributes.get('show_title'),
+            'season_number': int_or_none(attributes.get('season_number')),
+            'season_id': attributes.get('season_id'),
+            'episode': title,
+            'episode_number': int_or_none(attributes.get('number')),
+            'episode_id': str_or_none(episode.get('uuid')),
+            'formats': formats,
+            'channel_id': attributes.get('channel_id'),
+            'duration': int_or_none(attributes.get('length')),
+        }
diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py
new file mode 100644 (file)
index 0000000..14c8e82
--- /dev/null
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .internetvideoarchive import InternetVideoArchiveIE
+
+
+class RottenTomatoesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
+        'info_dict': {
+            'id': '11028566',
+            'ext': 'mp4',
+            'title': 'Toy Story 3',
+            'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id')
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id,
+            'ie_key': InternetVideoArchiveIE.ie_key(),
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+        }
diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py
new file mode 100644 (file)
index 0000000..6528464
--- /dev/null
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate, determine_ext
+
+
+class RoxwelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
+
+    _TEST = {
+        'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html',
+        'info_dict': {
+            'id': 'passionpittakeawalklive',
+            'ext': 'flv',
+            'title': 'Take A Walk (live)',
+            'uploader': 'Passion Pit',
+            'uploader_id': 'passionpit',
+            'upload_date': '20120928',
+            'description': 'Passion Pit performs "Take A Walk" live at The Backyard in Austin, Texas. ',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        filename = mobj.group('filename')
+        info_url = 'http://www.roxwel.com/api/videos/%s' % filename
+        info = self._download_json(info_url, filename)
+
+        rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')])
+        best_rate = rtmp_rates[-1]
+        url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
+        rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url')
+        ext = determine_ext(rtmp_url)
+        if ext == 'f4v':
+            rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
+
+        return {
+            'id': filename,
+            'title': info['title'],
+            'url': rtmp_url,
+            'ext': 'flv',
+            'description': info['description'],
+            'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
+            'uploader': info['artist'],
+            'uploader_id': info['artistname'],
+            'upload_date': unified_strdate(info['dbdate']),
+        }
diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py
new file mode 100644 (file)
index 0000000..fccf694
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_start,
+)
+
+
+class RozhlasIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://prehravac.rozhlas.cz/audio/3421320',
+        'md5': '504c902dbc9e9a1fd50326eccf02a7e2',
+        'info_dict': {
+            'id': '3421320',
+            'ext': 'mp3',
+            'title': 'Echo Pavla Klusáka (30.06.2015 21:00)',
+            'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let'
+        }
+    }, {
+        'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id)
+
+        title = self._html_search_regex(
+            r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track',
+            webpage, 'title', default=None) or remove_start(
+            self._og_search_title(webpage), 'Radio Wave - ')
+        description = self._html_search_regex(
+            r'<p[^>]+title=(["\'])(?P<description>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track',
+            webpage, 'description', fatal=False, group='description')
+        duration = int_or_none(self._search_regex(
+            r'data-duration=["\'](\d+)', webpage, 'duration', default=None))
+
+        return {
+            'id': audio_id,
+            'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'vcodec': 'none',
+        }
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
new file mode 100644 (file)
index 0000000..3b0f308
--- /dev/null
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    strip_or_none,
+)
+
+
+class RTBFIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?rtbf\.be/
+        (?:
+            video/[^?]+\?.*\bid=|
+            ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=|
+            auvio/[^/]+\?.*\b(?P<live>l)?id=
+        )(?P<id>\d+)'''
+    _TESTS = [{
+        'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
+        'md5': '8c876a1cceeb6cf31b476461ade72384',
+        'info_dict': {
+            'id': '1921274',
+            'ext': 'mp4',
+            'title': 'Les Diables au coeur (épisode 2)',
+            'description': '(du 25/04/2014)',
+            'duration': 3099.54,
+            'upload_date': '20140425',
+            'timestamp': 1398456300,
+        }
+    }, {
+        # geo restricted
+        'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996',
+        'only_matching': True,
+    }, {
+        # Live
+        'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775',
+        'only_matching': True,
+    }, {
+        # Audio
+        'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811',
+        'only_matching': True,
+    }, {
+        # With Subtitle
+        'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588',
+        'only_matching': True,
+    }]
+    _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be'
+    _PROVIDERS = {
+        'YOUTUBE': 'Youtube',
+        'DAILYMOTION': 'Dailymotion',
+        'VIMEO': 'Vimeo',
+    }
+    _QUALITIES = [
+        ('mobile', 'SD'),
+        ('web', 'MD'),
+        ('high', 'HD'),
+    ]
+
+    def _real_extract(self, url):
+        live, media_id = re.match(self._VALID_URL, url).groups()
+        embed_page = self._download_webpage(
+            'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'),
+            media_id, query={'id': media_id})
+        data = self._parse_json(self._html_search_regex(
+            r'data-media="([^"]+)"', embed_page, 'media data'), media_id)
+
+        error = data.get('error')
+        if error:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        provider = data.get('provider')
+        if provider in self._PROVIDERS:
+            return self.url_result(data['url'], self._PROVIDERS[provider])
+
+        title = data['title']
+        is_live = data.get('isLive')
+        if is_live:
+            title = self._live_title(title)
+        height_re = r'-(\d+)p\.'
+        formats = []
+
+        m3u8_url = data.get('urlHlsAes128') or data.get('urlHls')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+        fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x
+        http_url = data.get('url')
+        if formats and http_url and re.search(height_re, http_url):
+            http_url = fix_url(http_url)
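+            # Synthesize progressive HTTP counterparts of the HLS renditions
+            # by substituting each rendition's height into http_url, e.g. a
+            # hypothetical '...-720p.mp4' becoming '...-1080p.mp4' and so on.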
+            for m3u8_f in formats[:]:
+                height = m3u8_f.get('height')
+                if not height:
+                    continue
+                f = m3u8_f.copy()
+                del f['protocol']
+                f.update({
+                    'format_id': m3u8_f['format_id'].replace('hls-', 'http-'),
+                    'url': re.sub(height_re, '-%dp.' % height, http_url),
+                })
+                formats.append(f)
+        else:
+            sources = data.get('sources') or {}
+            for key, format_id in self._QUALITIES:
+                format_url = sources.get(key)
+                if not format_url:
+                    continue
+                height = int_or_none(self._search_regex(
+                    height_re, format_url, 'height', default=None))
+                formats.append({
+                    'format_id': format_id,
+                    'url': fix_url(format_url),
+                    'height': height,
+                })
+
+        mpd_url = data.get('urlDash')
+        if not data.get('drm') and mpd_url:
+            formats.extend(self._extract_mpd_formats(
+                mpd_url, media_id, mpd_id='dash', fatal=False))
+
+        audio_url = data.get('urlAudio')
+        if audio_url:
+            formats.append({
+                'format_id': 'audio',
+                'url': audio_url,
+                'vcodec': 'none',
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for track in (data.get('tracks') or {}).values():
+            sub_url = track.get('url')
+            if not sub_url:
+                continue
+            subtitles.setdefault(track.get('lang') or 'fr', []).append({
+                'url': sub_url,
+            })
+
+        return {
+            'id': media_id,
+            'formats': formats,
+            'title': title,
+            'description': strip_or_none(data.get('description')),
+            'thumbnail': data.get('thumbnail'),
+            'duration': float_or_none(data.get('realDuration')),
+            'timestamp': int_or_none(data.get('liveFrom')),
+            'series': data.get('programLabel'),
+            'subtitles': subtitles,
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
new file mode 100644 (file)
index 0000000..1fbc729
--- /dev/null
@@ -0,0 +1,167 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    float_or_none,
+    parse_iso8601,
+    str_or_none,
+    try_get,
+    unescapeHTML,
+    url_or_none,
+    ExtractorError,
+)
+
+
+class RteBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+
+        info_dict = {}
+        formats = []
+
+        ENDPOINTS = (
+            'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=',
+            'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=',
+        )
+
+        for num, ep_url in enumerate(ENDPOINTS, start=1):
+            try:
+                data = self._download_json(ep_url + item_id, item_id)
+            except ExtractorError as ee:
+                if num < len(ENDPOINTS) or formats:
+                    continue
+                if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+                    error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False)
+                    if error_info:
+                        raise ExtractorError(
+                            '%s said: %s' % (self.IE_NAME, error_info['message']),
+                            expected=True)
+                raise
+
+            # NB the string values in the JSON are stored using XML escaping(!)
+            show = try_get(data, lambda x: x['shows'][0], dict)
+            if not show:
+                continue
+
+            if not info_dict:
+                title = unescapeHTML(show['title'])
+                description = unescapeHTML(show.get('description'))
+                thumbnail = show.get('thumbnail')
+                duration = float_or_none(show.get('duration'), 1000)
+                timestamp = parse_iso8601(show.get('published'))
+                info_dict = {
+                    'id': item_id,
+                    'title': title,
+                    'description': description,
+                    'thumbnail': thumbnail,
+                    'timestamp': timestamp,
+                    'duration': duration,
+                }
+
+            mg = try_get(show, lambda x: x['media:group'][0], dict)
+            if not mg:
+                continue
+
+            if mg.get('url'):
+                m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url'])
+                if m:
+                    m = m.groupdict()
+                    formats.append({
+                        'url': m['url'] + '/' + m['app'],
+                        'app': m['app'],
+                        'play_path': m['playpath'],
+                        'player_url': url,
+                        'ext': 'flv',
+                        'format_id': 'rtmp',
+                    })
+
+            if mg.get('hls_server') and mg.get('hls_url'):
+                formats.extend(self._extract_m3u8_formats(
+                    mg['hls_server'] + mg['hls_url'], item_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+            if mg.get('hds_server') and mg.get('hds_url'):
+                formats.extend(self._extract_f4m_formats(
+                    mg['hds_server'] + mg['hds_url'], item_id,
+                    f4m_id='hds', fatal=False))
+
+            mg_rte_server = str_or_none(mg.get('rte:server'))
+            mg_url = str_or_none(mg.get('url'))
+            if mg_rte_server and mg_url:
+                hds_url = url_or_none(mg_rte_server + mg_url)
+                if hds_url:
+                    formats.extend(self._extract_f4m_formats(
+                        hds_url, item_id, f4m_id='hds', fatal=False))
+
+        self._sort_formats(formats)
+
+        info_dict['formats'] = formats
+        return info_dict
+
+
+class RteIE(RteBaseIE):
+    IE_NAME = 'rte'
+    IE_DESC = 'Raidió Teilifís Éireann TV'
+    _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/',
+        'md5': '4a76eb3396d98f697e6e8110563d2604',
+        'info_dict': {
+            'id': '10478715',
+            'ext': 'mp4',
+            'title': 'iWitness',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'The spirit of Ireland, one voice and one minute at a time.',
+            'duration': 60.046,
+            'upload_date': '20151012',
+            'timestamp': 1444694160,
+        },
+    }
+
+
+class RteRadioIE(RteBaseIE):
+    IE_NAME = 'rte:radio'
+    IE_DESC = 'Raidió Teilifís Éireann radio'
+    # Radioplayer URLs have two distinct specifier formats,
+    # the old format #!rii=<channel_id>:<id>:<playable_item_id>:<date>:
+    # the new format #!rii=b<channel_id>_<id>_<playable_item_id>_<date>_
+    # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated.
+    # An <id> uniquely defines an individual recording, and is the only part we require.
+    _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P<id>[0-9]+)'
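+    # e.g. '#!rii=16:10507902:2414:27-12-2015:' (old style) and
+    # '#!rii=b16_3250678_8861_06-04-2012_' (new style) yield the ids
+    # 10507902 and 3250678 respectively (see the tests below).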
+
+    _TESTS = [{
+        # Old-style player URL; HLS and RTMPE formats
+        'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:',
+        'md5': 'c79ccb2c195998440065456b69760411',
+        'info_dict': {
+            'id': '10507902',
+            'ext': 'mp4',
+            'title': 'Gloria',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'md5:9ce124a7fb41559ec68f06387cabddf0',
+            'timestamp': 1451203200,
+            'upload_date': '20151227',
+            'duration': 7230.0,
+        },
+    }, {
+        # New-style player URL; RTMPE formats only
+        'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_',
+        'info_dict': {
+            'id': '3250678',
+            'ext': 'flv',
+            'title': 'The Lyric Concert with Paul Herriott',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': '',
+            'timestamp': 1333742400,
+            'upload_date': '20120406',
+            'duration': 7199.016,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }]
diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py
new file mode 100644
index 0000000..70f000c
--- /dev/null
@@ -0,0 +1,212 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+    compat_b64decode,
+    compat_ord,
+    compat_str,
+)
+from ..utils import (
+    bytes_to_intlist,
+    ExtractorError,
+    intlist_to_bytes,
+    int_or_none,
+    strip_or_none,
+)
+
+
+class RTL2IE(InfoExtractor):
+    IE_NAME = 'rtl2'
+    _VALID_URL = r'https?://(?:www\.)?rtl2\.de/sendung/[^/]+/(?:video/(?P<vico_id>\d+)[^/]+/(?P<vivi_id>\d+)-|folge/)(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',
+        'info_dict': {
+            'id': 'folge-203-0',
+            'ext': 'f4v',
+            'title': 'GRIP sucht den Sommerkönig',
+            'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f'
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
+    }, {
+        'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/',
+        'info_dict': {
+            'id': 'anna-erwischt-alex',
+            'ext': 'mp4',
+            'title': 'Anna erwischt Alex!',
+            'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.'
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
+    }]
+
+    def _real_extract(self, url):
+        vico_id, vivi_id, display_id = re.match(self._VALID_URL, url).groups()
+        if not vico_id:
+            webpage = self._download_webpage(url, display_id)
+
+            mobj = re.search(
+                r'data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"',
+                webpage)
+            if mobj:
+                vico_id = mobj.group('vico_id')
+                vivi_id = mobj.group('vivi_id')
+            else:
+                vico_id = self._html_search_regex(
+                    r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id')
+                vivi_id = self._html_search_regex(
+                    r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')
+
+        info = self._download_json(
+            'https://service.rtl2.de/api-player-vipo/video.php',
+            display_id, query={
+                'vico_id': vico_id,
+                'vivi_id': vivi_id,
+            })
+        video_info = info['video']
+        title = video_info['titel']
+
+        formats = []
+
+        rtmp_url = video_info.get('streamurl')
+        if rtmp_url:
+            rtmp_url = rtmp_url.replace('\\', '')
+            stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL')
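+            # Typed AMF values appended to the RTMP connect packet (passed
+            # through to rtmpdump's --conn option by the RTMP downloader)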
+            rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0']
+
+            formats.append({
+                'format_id': 'rtmp',
+                'url': rtmp_url,
+                'play_path': stream_url,
+                'player_url': 'https://www.rtl2.de/sites/default/modules/rtl2/jwplayer/jwplayer-7.6.0/jwplayer.flash.swf',
+                'page_url': url,
+                'flash_version': 'LNX 11,2,202,429',
+                'rtmp_conn': rtmp_conn,
+                'no_resume': True,
+                'preference': 1,
+            })
+
+        m3u8_url = video_info.get('streamurl_hls')
+        if m3u8_url:
+            formats.extend(self._extract_akamai_formats(m3u8_url, display_id))
+
+        self._sort_formats(formats)
+
+        return {
+            'id': display_id,
+            'title': title,
+            'thumbnail': video_info.get('image'),
+            'description': video_info.get('beschreibung'),
+            'duration': int_or_none(video_info.get('duration')),
+            'formats': formats,
+        }
+
+
+class RTL2YouBaseIE(InfoExtractor):
+    _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/'
+
+
+class RTL2YouIE(RTL2YouBaseIE):
+    IE_NAME = 'rtl2:you'
+    _VALID_URL = r'https?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du',
+        'info_dict': {
+            'id': '15740',
+            'ext': 'mp4',
+            'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!',
+            'description': 'md5:ddaa95c61b372b12b66e115b2772fe01',
+            'age_limit': 12,
+        },
+    }, {
+        'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712',
+        'only_matching': True,
+    }]
+    _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!'
+    _GEO_COUNTRIES = ['DE']
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        stream_data = self._download_json(
+            self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)
+
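+        # streamUrl decodes to '<base64 ciphertext>:<base64 IV>'; the ciphertext
+        # is decrypted in CBC mode with the static 32-byte (AES-256) key above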
+        data, iv = compat_b64decode(stream_data['streamUrl']).decode().split(':')
+        stream_url = intlist_to_bytes(aes_cbc_decrypt(
+            bytes_to_intlist(compat_b64decode(data)),
+            bytes_to_intlist(self._AES_KEY),
+            bytes_to_intlist(compat_b64decode(iv))
+        ))
+        if b'rtl2_you_video_not_found' in stream_url:
+            raise ExtractorError('video not found', expected=True)
+
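+        # The plaintext is PKCS#7-padded: its last byte gives the padding length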
+        formats = self._extract_m3u8_formats(
+            stream_url[:-compat_ord(stream_url[-1])].decode(),
+            video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+
+        video_data = self._download_json(
+            self._BACKWERK_BASE_URL + 'video/' + video_id, video_id)
+
+        series = video_data.get('formatTitle')
+        title = episode = video_data.get('title') or series
+        if series and series != title:
+            title = '%s - %s' % (series, title)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': strip_or_none(video_data.get('description')),
+            'thumbnail': video_data.get('image'),
+            'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000),
+            'series': series,
+            'episode': episode,
+            'age_limit': int_or_none(video_data.get('minimumAge')),
+        }
+
+
+class RTL2YouSeriesIE(RTL2YouBaseIE):
+    IE_NAME = 'rtl2:you:series'
+    _VALID_URL = r'https?://you\.rtl2\.de/videos/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://you.rtl2.de/videos/115/dragon-ball',
+        'info_dict': {
+            'id': '115',
+        },
+        'playlist_mincount': 5,
+    }
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+        stream_data = self._download_json(
+            self._BACKWERK_BASE_URL + 'videos',
+            series_id, query={
+                'formatId': series_id,
+                'limit': 1000000000,
+            })
+
+        entries = []
+        for video in stream_data.get('videos', []):
+            video_id = compat_str(video['videoId'])
+            if not video_id:
+                continue
+            entries.append(self.url_result(
+                'http://you.rtl2.de/video/%s/%s' % (series_id, video_id),
+                'RTL2You', video_id))
+        return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
new file mode 100644
index 0000000..fadca8c
--- /dev/null
@@ -0,0 +1,126 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class RtlNlIE(InfoExtractor):
+    IE_NAME = 'rtl.nl'
+    IE_DESC = 'rtl.nl and rtlxl.nl'
+    _VALID_URL = r'''(?x)
+        https?://(?:(?:www|static)\.)?
+        (?:
+            rtlxl\.nl/[^\#]*\#!/[^/]+/|
+            rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/)
+        )
+        (?P<id>[0-9a-f-]+)'''
+
+    _TESTS = [{
+        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
+        'md5': '473d1946c1fdd050b2c0161a4b13c373',
+        'info_dict': {
+            'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
+            'ext': 'mp4',
+            'title': 'RTL Nieuws',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'timestamp': 1461951000,
+            'upload_date': '20160429',
+            'duration': 1167.96,
+        },
+    }, {
+        # best format available a3t
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
+        'md5': 'dea7474214af1271d91ef332fb8be7ea',
+        'info_dict': {
+            'id': '84ae5571-ac25-4225-ae0c-ef8d9efb2aed',
+            'ext': 'mp4',
+            'timestamp': 1424039400,
+            'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
+            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
+            'upload_date': '20150215',
+            'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
+        }
+    }, {
+        # empty synopsis and missing episodes (see https://github.com/ytdl-org/youtube-dl/issues/6275)
+        # best format available nettv
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
+        'info_dict': {
+            'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
+            'ext': 'mp4',
+            'title': 'RTL Nieuws - Meer beelden van overval juwelier',
+            'thumbnail': r're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+            'timestamp': 1437233400,
+            'upload_date': '20150718',
+            'duration': 30.474,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # encrypted m3u8 streams, georestricted
+        'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
+        'only_matching': True,
+    }, {
+        'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.rtl.nl/video/c603c9c2-601d-4b5e-8175-64f1e942dc7d/',
+        'only_matching': True,
+    }, {
+        'url': 'https://static.rtl.nl/embed/?uuid=1a2970fc-5c0b-43ff-9fdc-927e39e6d1bc&autoplay=false&publicatiepunt=rtlnieuwsnl',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        uuid = self._match_id(url)
+        info = self._download_json(
+            'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid,
+            uuid)
+
+        material = info['material'][0]
+        title = info['abstracts'][0]['name']
+        subtitle = material.get('title')
+        if subtitle:
+            title += ' - %s' % subtitle
+        description = material.get('synopsis')
+
+        meta = info.get('meta', {})
+
+        videopath = material['videopath']
+        m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
+        self._sort_formats(formats)
+
+        thumbnails = []
+
+        for p in ('poster_base_url', 'thumb_base_url'):
+            if not meta.get(p):
+                continue
+
+            thumbnails.append({
+                'url': self._proto_relative_url(meta[p] + uuid),
+                'width': int_or_none(self._search_regex(
+                    r'/sz=([0-9]+)', meta[p], 'thumbnail width', fatal=False)),
+                'height': int_or_none(self._search_regex(
+                    r'/sz=[0-9]+x([0-9]+)',
+                    meta[p], 'thumbnail height', fatal=False))
+            })
+
+        return {
+            'id': uuid,
+            'title': title,
+            'formats': formats,
+            'timestamp': material['original_date'],
+            'description': description,
+            'duration': parse_duration(material.get('duration')),
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py
new file mode 100644
index 0000000..02986f4
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+)
+
+
+class RTPIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rtp\.pt/play/p(?P<program_id>[0-9]+)/(?P<id>[^/?#]+)/?'
+    _TESTS = [{
+        'url': 'http://www.rtp.pt/play/p405/e174042/paixoes-cruzadas',
+        'md5': 'e736ce0c665e459ddb818546220b4ef8',
+        'info_dict': {
+            'id': 'e174042',
+            'ext': 'mp3',
+            'title': 'Paixões Cruzadas',
+            'description': 'As paixões musicais de António Cartaxo e António Macedo',
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+    }, {
+        'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_meta(
+            'twitter:title', webpage, display_name='title', fatal=True)
+
+        config = self._parse_json(self._search_regex(
+            r'(?s)RTPPlayer\(({.+?})\);', webpage,
+            'player config'), video_id, js_to_json)
+        file_url = config['file']
+        ext = determine_ext(file_url)
+        if ext == 'm3u8':
+            file_key = config.get('fileKey')
+            formats = self._extract_m3u8_formats(
+                file_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=file_key)
+            if file_key:
+                formats.append({
+                    'url': 'https://cdn-ondemand.rtp.pt' + file_key,
+                    'preference': 1,
+                })
+            self._sort_formats(formats)
+        else:
+            formats = [{
+                'url': file_url,
+                'ext': ext,
+            }]
+        if config.get('mediaType') == 'audio':
+            for f in formats:
+                f['vcodec'] = 'none'
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': self._html_search_meta(['description', 'twitter:description'], webpage),
+            'thumbnail': config.get('poster') or self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
new file mode 100644
index 0000000..48f17b8
--- /dev/null
@@ -0,0 +1,230 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .srgssr import SRGSSRIE
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    unescapeHTML,
+    determine_ext,
+)
+
+
+class RTSIE(SRGSSRIE):
+    IE_DESC = 'RTS.ch'
+    _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html'
+
+    _TESTS = [
+        {
+            'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
+            'md5': 'ff7f8450a90cf58dacb64e29707b4a8e',
+            'info_dict': {
+                'id': '3449373',
+                'display_id': 'les-enfants-terribles',
+                'ext': 'mp4',
+                'duration': 1488,
+                'title': 'Les Enfants Terribles',
+                'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
+                'uploader': 'Divers',
+                'upload_date': '19680921',
+                'timestamp': -40280400,
+                'thumbnail': r're:^https?://.*\.image',
+                'view_count': int,
+            },
+        },
+        {
+            'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
+            'info_dict': {
+                'id': '5624065',
+                'title': 'Passe-moi les jumelles',
+            },
+            'playlist_mincount': 4,
+        },
+        {
+            'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html',
+            'info_dict': {
+                'id': '5745975',
+                'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski',
+                'ext': 'mp4',
+                'duration': 48,
+                'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
+                'description': 'Hockey - Playoff',
+                'uploader': 'Hockey',
+                'upload_date': '20140403',
+                'timestamp': 1396556882,
+                'thumbnail': r're:^https?://.*\.image',
+                'view_count': int,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'skip': 'Blocked outside Switzerland',
+        },
+        {
+            'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
+            'md5': '1bae984fe7b1f78e94abc74e802ed99f',
+            'info_dict': {
+                'id': '5745356',
+                'display_id': 'londres-cachee-par-un-epais-smog',
+                'ext': 'mp4',
+                'duration': 33,
+                'title': 'Londres cachée par un épais smog',
+                'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.',
+                'uploader': 'L\'actu en vidéo',
+                'upload_date': '20140403',
+                'timestamp': 1396537322,
+                'thumbnail': r're:^https?://.*\.image',
+                'view_count': int,
+            },
+        },
+        {
+            'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
+            'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
+            'info_dict': {
+                'id': '5706148',
+                'display_id': 'urban-hippie-de-damien-krisl-03-04-2014',
+                'ext': 'mp3',
+                'duration': 123,
+                'title': '"Urban Hippie", de Damien Krisl',
+                'description': 'Des Hippies super glam.',
+                'upload_date': '20140403',
+                'timestamp': 1396551600,
+            },
+        },
+        {
+            # article with videos on rhs
+            'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html',
+            'info_dict': {
+                'id': '6693917',
+                'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse',
+            },
+            'playlist_mincount': 5,
+        },
+        {
+            'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        media_id = m.group('rts_id') or m.group('id')
+        display_id = m.group('display_id') or media_id
+
+        def download_json(internal_id):
+            return self._download_json(
+                'http://www.rts.ch/a/%s.html?f=json/article' % internal_id,
+                display_id)
+
+        all_info = download_json(media_id)
+
+        # The media_id extracted from the URL is not always a real id
+        if 'video' not in all_info and 'audio' not in all_info:
+            entries = []
+
+            for item in all_info.get('items', []):
+                item_url = item.get('url')
+                if not item_url:
+                    continue
+                entries.append(self.url_result(item_url, 'RTS'))
+
+            if not entries:
+                page, urlh = self._download_webpage_handle(url, display_id)
+                if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id:
+                    return self.url_result(urlh.geturl(), 'RTS')
+
+                # article with videos on rhs
+                videos = re.findall(
+                    r'<article[^>]+class="content-item"[^>]*>\s*<a[^>]+data-video-urn="urn:([^"]+)"',
+                    page)
+                if not videos:
+                    videos = re.findall(
+                        r'(?s)<iframe[^>]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"',
+                        page)
+                if videos:
+                    entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos]
+
+            if entries:
+                return self.playlist_result(entries, media_id, all_info.get('title'))
+
+            internal_id = self._html_search_regex(
+                r'<(?:video|audio) data-id="([0-9]+)"', page,
+                'internal video id')
+            all_info = download_json(internal_id)
+
+        media_type = 'video' if 'video' in all_info else 'audio'
+
+        # check for errors
+        self.get_media_data('rts', media_type, media_id)
+
+        info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
+
+        title = info['title']
+
+        def extract_bitrate(url):
+            return int_or_none(self._search_regex(
+                r'-([0-9]+)k\.', url, 'bitrate', default=None))
+
+        formats = []
+        streams = info.get('streams', {})
+        for format_id, format_url in streams.items():
+            if format_id == 'hds_sd' and 'hds' in streams:
+                continue
+            if format_id == 'hls_sd' and 'hls' in streams:
+                continue
+            ext = determine_ext(format_url)
+            if ext in ('m3u8', 'f4m'):
+                format_url = self._get_tokenized_src(format_url, media_id, format_id)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        format_url + ('?' if '?' not in format_url else '&') + 'hdcore=3.4.0',
+                        media_id, f4m_id=format_id, fatal=False))
+                else:
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False))
+            else:
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                    'tbr': extract_bitrate(format_url),
+                })
+
+        for media in info.get('media', []):
+            media_url = media.get('url')
+            if not media_url or re.match(r'https?://', media_url):
+                continue
+            rate = media.get('rate')
+            ext = media.get('ext') or determine_ext(media_url, 'mp4')
+            format_id = ext
+            if rate:
+                format_id += '-%dk' % rate
+            formats.append({
+                'format_id': format_id,
+                'url': 'http://download-video.rts.ch/' + media_url,
+                'tbr': rate or extract_bitrate(media_url),
+            })
+
+        self._check_formats(formats, media_id)
+        self._sort_formats(formats)
+
+        duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
+        if isinstance(duration, compat_str):
+            duration = parse_duration(duration)
+
+        return {
+            'id': media_id,
+            'display_id': display_id,
+            'formats': formats,
+            'title': title,
+            'description': info.get('intro'),
+            'duration': duration,
+            'view_count': int_or_none(info.get('plays')),
+            'uploader': info.get('programName'),
+            'timestamp': parse_iso8601(info.get('broadcast_date')),
+            'thumbnail': unescapeHTML(info.get('preview_image_url')),
+        }
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
new file mode 100644
index 0000000..ce9db06
--- /dev/null
@@ -0,0 +1,300 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_struct_unpack,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    remove_end,
+    remove_start,
+    sanitized_Request,
+    std_headers,
+)
+
+
+def _decrypt_url(png):
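+    # The PNG's tEXt chunk hides the URL: a scrambled alphabet and a
+    # digit-encoded URL, separated by '#'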
+    encrypted_data = compat_b64decode(png)
+    text_index = encrypted_data.find(b'tEXt')
+    text_chunk = encrypted_data[text_index - 4:]
+    length = compat_struct_unpack('!I', text_chunk[:4])[0]
+    # Use bytearray to get integers when iterating in both Python 2.x and 3.x
+    data = bytearray(text_chunk[8:8 + length])
+    data = [chr(b) for b in data if b != 0]
+    hash_index = data.index('#')
+    alphabet_data = data[:hash_index]
+    url_data = data[hash_index + 1:]
+    if url_data[0] == 'H' and url_data[3] == '%':
+        # remove useless HQ%% at the start
+        url_data = url_data[4:]
+
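+    # Rebuild the alphabet: after each kept character, a cycling run
+    # (1, 2, 3, then 0) of filler characters is skipped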
+    alphabet = []
+    e = 0
+    d = 0
+    for l in alphabet_data:
+        if d == 0:
+            alphabet.append(l)
+            d = e = (e + 1) % 4
+        else:
+            d -= 1
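+    # Decode the URL: each character is a two-digit index into the alphabet,
+    # with a cycling number of filler characters between the two digits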
+    url = ''
+    f = 0
+    e = 3
+    b = 1
+    for letter in url_data:
+        if f == 0:
+            l = int(letter) * 10
+            f = 1
+        else:
+            if e == 0:
+                l += int(letter)
+                url += alphabet[l]
+                e = (b + 3) % 4
+                f = 0
+                b += 1
+            else:
+                e -= 1
+
+    return url
+
+
+class RTVEALaCartaIE(InfoExtractor):
+    IE_NAME = 'rtve.es:alacarta'
+    IE_DESC = 'RTVE a la carta'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
+        'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43',
+        'info_dict': {
+            'id': '2491869',
+            'ext': 'mp4',
+            'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
+            'duration': 5024.566,
+        },
+    }, {
+        'note': 'Live stream',
+        'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',
+        'info_dict': {
+            'id': '1694255',
+            'ext': 'flv',
+            'title': 'TODO',
+        },
+        'skip': 'The f4m manifest can\'t be used yet',
+    }, {
+        'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/',
+        'md5': 'e55e162379ad587e9640eda4f7353c0f',
+        'info_dict': {
+            'id': '4236788',
+            'ext': 'mp4',
+            'title': 'Servir y proteger - Capítulo 104 ',
+            'duration': 3222.0,
+        },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
+    }, {
+        'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
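+        # The returned 'manager' value depends on the User-Agent and is later
+        # interpolated into the thumbnail PNG URL in _real_extract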
+        user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+        manager_info = self._download_json(
+            'http://www.rtve.es/odin/loki/' + user_agent_b64,
+            None, 'Fetching manager info')
+        self._manager = manager_info['manager']
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info = self._download_json(
+            'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
+            video_id)['page']['items'][0]
+        if info['state'] == 'DESPU':
+            raise ExtractorError('The video is no longer available', expected=True)
+        title = info['title']
+        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id)
+        png_request = sanitized_Request(png_url)
+        png_request.add_header('Referer', url)
+        png = self._download_webpage(png_request, video_id, 'Downloading url information')
+        video_url = _decrypt_url(png)
+        ext = determine_ext(video_url)
+
+        formats = []
+        if not video_url.endswith('.f4m') and ext != 'm3u8':
+            if '?' not in video_url:
+                video_url = video_url.replace('resources/', 'auth/resources/')
+            video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve')
+
+        if ext == 'm3u8':
+            formats.extend(self._extract_m3u8_formats(
+                video_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        elif ext == 'f4m':
+            formats.extend(self._extract_f4m_formats(
+                video_url, video_id, f4m_id='hds', fatal=False))
+        else:
+            formats.append({
+                'url': video_url,
+            })
+        self._sort_formats(formats)
+
+        subtitles = None
+        if info.get('sbtFile') is not None:
+            subtitles = self.extract_subtitles(video_id, info['sbtFile'])
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': info.get('image'),
+            'page_url': url,
+            'subtitles': subtitles,
+            'duration': float_or_none(info.get('duration'), scale=1000),
+        }
+
+    def _get_subtitles(self, video_id, sub_file):
+        subs = self._download_json(
+            sub_file + '.json', video_id,
+            'Downloading subtitles info')['page']['items']
+        return dict(
+            (s['lang'], [{'ext': 'vtt', 'url': s['src']}])
+            for s in subs)
+
+
+class RTVEInfantilIE(InfoExtractor):
+    IE_NAME = 'rtve.es:infantil'
+    IE_DESC = 'RTVE infantil'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/'
+
+    _TESTS = [{
+        'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
+        'md5': '915319587b33720b8e0357caaa6617e6',
+        'info_dict': {
+            'id': '3040283',
+            'ext': 'mp4',
+            'title': 'Maneras de vivir',
+            'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG',
+            'duration': 357.958,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        info = self._download_json(
+            'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
+            video_id)['page']['items'][0]
+
+        webpage = self._download_webpage(url, video_id)
+        vidplayer_id = self._search_regex(
+            r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
+
+        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+        png = self._download_webpage(png_url, video_id, 'Downloading url information')
+        video_url = _decrypt_url(png)
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'title': info['title'],
+            'url': video_url,
+            'thumbnail': info.get('image'),
+            'duration': float_or_none(info.get('duration'), scale=1000),
+        }
+
+
+class RTVELiveIE(InfoExtractor):
+    IE_NAME = 'rtve.es:live'
+    IE_DESC = 'RTVE.es live streams'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
+
+    _TESTS = [{
+        'url': 'http://www.rtve.es/directo/la-1/',
+        'info_dict': {
+            'id': 'la-1',
+            'ext': 'mp4',
+            'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+        },
+        'params': {
+            'skip_download': 'live stream',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        start_time = time.gmtime()
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
+        title = remove_start(title, 'Estoy viendo ')
+        title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
+
+        vidplayer_id = self._search_regex(
+            (r'playerId=player([0-9]+)',
+             r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
+             r'data-id=["\'](\d+)'),
+            webpage, 'internal video ID')
+        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
+        png = self._download_webpage(png_url, video_id, 'Downloading url information')
+        m3u8_url = _decrypt_url(png)
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'is_live': True,
+        }
+
+
+class RTVETelevisionIE(InfoExtractor):
+    IE_NAME = 'rtve.es:television'
+    _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+)\.shtml'
+
+    _TEST = {
+        'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml',
+        'info_dict': {
+            'id': '3069778',
+            'ext': 'mp4',
+            'title': 'Documentos TV - La revolución del móvil',
+            'duration': 3496.948,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        alacarta_url = self._search_regex(
+            r'data-location="alacarta_videos"[^<]+url&quot;:&quot;(http://www\.rtve\.es/alacarta.+?)&',
+            webpage, 'alacarta url', default=None)
+        if alacarta_url is None:
+            raise ExtractorError(
+                'The webpage doesn\'t contain any video', expected=True)
+
+        return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key())
diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py
new file mode 100644
index 0000000..6a00f70
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class RTVNHIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.rtvnh.nl/video/131946',
+        'md5': 'cdbec9f44550763c8afc96050fa747dc',
+        'info_dict': {
+            'id': '131946',
+            'ext': 'mp4',
+            'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw',
+            'thumbnail': r're:^https?:.*\.jpg$'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        meta = self._parse_json(self._download_webpage(
+            'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id)
+
+        status = meta.get('status')
+        if status != 200:
+            raise ExtractorError(
+                '%s returned error code %d' % (self.IE_NAME, status), expected=True)
+
+        formats = []
+        rtmp_formats = self._extract_smil_formats(
+            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id)
+        formats.extend(rtmp_formats)
+
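+        # Derive RTSP and HTTP-based (HLS/HDS) variants from each RTMP format
+        # by rewriting the URL scheme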
+        for rtmp_format in rtmp_formats:
+            rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+            rtsp_format = rtmp_format.copy()
+            del rtsp_format['play_path']
+            del rtsp_format['ext']
+            rtsp_format.update({
+                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                'url': rtmp_url.replace('rtmp://', 'rtsp://'),
+                'protocol': 'rtsp',
+            })
+            formats.append(rtsp_format)
+            http_base_url = rtmp_url.replace('rtmp://', 'http://')
+            formats.extend(self._extract_m3u8_formats(
+                http_base_url + '/playlist.m3u8', video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            formats.extend(self._extract_f4m_formats(
+                http_base_url + '/manifest.f4m',
+                video_id, f4m_id='hds', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': meta['title'].strip(),
+            'thumbnail': meta.get('image'),
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/rtvs.py b/youtube_dl/extractor/rtvs.py
new file mode 100644
index 0000000..6573b26
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RTVSIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?rtvs\.sk/(?:radio|televizia)/archiv/\d+/(?P<id>\d+)'
+    _TESTS = [{
+        # radio archive
+        'url': 'http://www.rtvs.sk/radio/archiv/11224/414872',
+        'md5': '134d5d6debdeddf8a5d761cbc9edacb8',
+        'info_dict': {
+            'id': '414872',
+            'ext': 'mp3',
+            'title': 'Ostrov pokladov 1 časť.mp3'
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        # tv archive
+        'url': 'http://www.rtvs.sk/televizia/archiv/8249/63118',
+        'md5': '85e2c55cf988403b70cac24f5c086dc6',
+        'info_dict': {
+            'id': '63118',
+            'ext': 'mp4',
+            'title': 'Amaro Džives - Náš deň',
+            'description': 'Galavečer pri príležitosti Medzinárodného dňa Rómov.'
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        playlist_url = self._search_regex(
+            r'playlist["\']?\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'playlist url', group='url')
+
+        data = self._download_json(
+            playlist_url, video_id, 'Downloading playlist')[0]
+        return self._parse_jwplayer_data(data, video_id=video_id)
diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py
new file mode 100644
index 0000000..3c8053a
--- /dev/null
@@ -0,0 +1,45 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class RUHDIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.ruhd.ru/play.php?vid=207',
+        'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83',
+        'info_dict': {
+            'id': '207',
+            'ext': 'divx',
+            'title': 'КОТ бааааам',
+            'description': 'классный кот)',
+            'thumbnail': r're:^http://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._html_search_regex(
+            r'<param name="src" value="([^"]+)"', webpage, 'video url')
+        title = self._html_search_regex(
+            r'<title>([^<]+)&nbsp;&nbsp; RUHD\.ru - Видео Высокого качества №1 в России!</title>',
+            webpage, 'title')
+        description = self._html_search_regex(
+            r'(?s)<div id="longdesc">(.+?)<span id="showlink">',
+            webpage, 'description', fatal=False)
+        thumbnail = self._html_search_regex(
+            r'<param name="previewImage" value="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+        if thumbnail:
+            thumbnail = 'http://www.ruhd.ru' + thumbnail
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
new file mode 100644
index 0000000..8f54d56
--- /dev/null
@@ -0,0 +1,315 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    bool_or_none,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class RutubeBaseIE(InfoExtractor):
+    def _download_api_info(self, video_id, query=None):
+        if not query:
+            query = {}
+        query['format'] = 'json'
+        return self._download_json(
+            'http://rutube.ru/api/video/%s/' % video_id,
+            video_id, 'Downloading video JSON',
+            'Unable to download video JSON', query=query)
+
+    @staticmethod
+    def _extract_info(video, video_id=None, require_title=True):
+        title = video['title'] if require_title else video.get('title')
+
+        age_limit = video.get('is_adult')
+        if age_limit is not None:
+            age_limit = 18 if age_limit is True else 0
+
+        uploader_id = try_get(video, lambda x: x['author']['id'])
+        category = try_get(video, lambda x: x['category']['name'])
+
+        return {
+            'id': video.get('id') or video_id if video_id else video['id'],
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('thumbnail_url'),
+            'duration': int_or_none(video.get('duration')),
+            'uploader': try_get(video, lambda x: x['author']['name']),
+            'uploader_id': compat_str(uploader_id) if uploader_id else None,
+            'timestamp': unified_timestamp(video.get('created_ts')),
+            'category': [category] if category else None,
+            'age_limit': age_limit,
+            'view_count': int_or_none(video.get('hits')),
+            'comment_count': int_or_none(video.get('comments_count')),
+            'is_live': bool_or_none(video.get('is_livestream')),
+        }
+
+    def _download_and_extract_info(self, video_id, query=None):
+        return self._extract_info(
+            self._download_api_info(video_id, query=query), video_id)
+
+    def _download_api_options(self, video_id, query=None):
+        if not query:
+            query = {}
+        query['format'] = 'json'
+        return self._download_json(
+            'http://rutube.ru/api/play/options/%s/' % video_id,
+            video_id, 'Downloading options JSON',
+            'Unable to download options JSON',
+            headers=self.geo_verification_headers(), query=query)
+
+    def _extract_formats(self, options, video_id):
+        formats = []
+        for format_id, format_url in options['video_balancer'].items():
+            ext = determine_ext(format_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    format_url, video_id, f4m_id=format_id, fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                })
+        self._sort_formats(formats)
+        return formats
+
+    def _download_and_extract_formats(self, video_id, query=None):
+        return self._extract_formats(
+            self._download_api_options(video_id, query=query), video_id)
+
+
+class RutubeIE(RutubeBaseIE):
+    IE_NAME = 'rutube'
+    IE_DESC = 'Rutube videos'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
+
+    _TESTS = [{
+        'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+        'md5': '1d24f180fac7a02f3900712e5a5764d6',
+        'info_dict': {
+            'id': '3eac3b4561676c17df9132a9a1e62e3e',
+            'ext': 'mp4',
+            'title': 'Раненный кенгуру забежал в аптеку',
+            'description': 'http://www.ntdtv.ru ',
+            'duration': 81,
+            'uploader': 'NTDRussian',
+            'uploader_id': '29790',
+            'timestamp': 1381943602,
+            'upload_date': '20131016',
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
+        'only_matching': True,
+    }, {
+        'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
+        'only_matching': True,
+    }, {
+        'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
+        'only_matching': True,
+    }, {
+        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url)
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        info = self._download_and_extract_info(video_id)
+        info['formats'] = self._download_and_extract_formats(video_id)
+        return info
+
+
+class RutubeEmbedIE(RutubeBaseIE):
+    IE_NAME = 'rutube:embed'
+    IE_DESC = 'Rutube embedded videos'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
+        'info_dict': {
+            'id': 'a10e53b86e8f349080f718582ce4c661',
+            'ext': 'mp4',
+            'timestamp': 1387830582,
+            'upload_date': '20131223',
+            'uploader_id': '297833',
+            'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
+            'uploader': 'subziro89 ILya',
+            'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://rutube.ru/play/embed/8083783',
+        'only_matching': True,
+    }, {
+        # private video
+        'url': 'https://rutube.ru/play/embed/10631925?p=IbAigKqWd1do4mjaM5XLIQ',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        embed_id = self._match_id(url)
+        # The query may contain a private video token and should be passed on
+        # to API requests (see #19163)
+        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        options = self._download_api_options(embed_id, query)
+        video_id = options['effective_video']
+        formats = self._extract_formats(options, video_id)
+        info = self._download_and_extract_info(video_id, query)
+        info.update({
+            'extractor_key': 'Rutube',
+            'formats': formats,
+        })
+        return info
+
+
+class RutubePlaylistBaseIE(RutubeBaseIE):
+    def _next_page_url(self, page_num, playlist_id, *args, **kwargs):
+        return self._PAGE_TEMPLATE % (playlist_id, page_num)
+
+    def _entries(self, playlist_id, *args, **kwargs):
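+        # Page through the API, following 'next' URLs until a page has no
+        # results or reports has_next as false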
+        next_page_url = None
+        for pagenum in itertools.count(1):
+            page = self._download_json(
+                next_page_url or self._next_page_url(
+                    pagenum, playlist_id, *args, **kwargs),
+                playlist_id, 'Downloading page %s' % pagenum)
+
+            results = page.get('results')
+            if not results or not isinstance(results, list):
+                break
+
+            for result in results:
+                video_url = url_or_none(result.get('video_url'))
+                if not video_url:
+                    continue
+                entry = self._extract_info(result, require_title=False)
+                entry.update({
+                    '_type': 'url',
+                    'url': video_url,
+                    'ie_key': RutubeIE.ie_key(),
+                })
+                yield entry
+
+            next_page_url = page.get('next')
+            if not next_page_url or not page.get('has_next'):
+                break
+
+    def _extract_playlist(self, playlist_id, *args, **kwargs):
+        return self.playlist_result(
+            self._entries(playlist_id, *args, **kwargs),
+            playlist_id, kwargs.get('playlist_name'))
+
+    def _real_extract(self, url):
+        return self._extract_playlist(self._match_id(url))
+
+
+class RutubeChannelIE(RutubePlaylistBaseIE):
+    IE_NAME = 'rutube:channel'
+    IE_DESC = 'Rutube channels'
+    _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://rutube.ru/tags/video/1800/',
+        'info_dict': {
+            'id': '1800',
+        },
+        'playlist_mincount': 68,
+    }]
+
+    _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'
+
+
+class RutubeMovieIE(RutubePlaylistBaseIE):
+    IE_NAME = 'rutube:movie'
+    IE_DESC = 'Rutube movies'
+    _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
+    _TESTS = []
+
+    _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json'
+    _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
+
+    def _real_extract(self, url):
+        movie_id = self._match_id(url)
+        movie = self._download_json(
+            self._MOVIE_TEMPLATE % movie_id, movie_id,
+            'Downloading movie JSON')
+        return self._extract_playlist(
+            movie_id, playlist_name=movie.get('name'))
+
+
+class RutubePersonIE(RutubePlaylistBaseIE):
+    IE_NAME = 'rutube:person'
+    IE_DESC = 'Rutube person videos'
+    _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://rutube.ru/video/person/313878/',
+        'info_dict': {
+            'id': '313878',
+        },
+        'playlist_mincount': 37,
+    }]
+
+    _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
+
+
+class RutubePlaylistIE(RutubePlaylistBaseIE):
+    IE_NAME = 'rutube:playlist'
+    IE_DESC = 'Rutube playlists'
+    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag',
+        'info_dict': {
+            'id': '3097',
+        },
+        'playlist_count': 27,
+    }, {
+        'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
+        'only_matching': True,
+    }]
+
+    _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json'
+
+    @classmethod
+    def suitable(cls, url):
+        if not super(RutubePlaylistIE, cls).suitable(url):
+            return False
+        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])
+
+    def _next_page_url(self, page_num, playlist_id, item_kind):
+        return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        playlist_kind = qs['pl_type'][0]
+        playlist_id = qs['pl_id'][0]
+        return self._extract_playlist(playlist_id, item_kind=playlist_kind)
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
new file mode 100644
index 0000000..d2713c1
--- /dev/null
@@ -0,0 +1,213 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none
+)
+
+
+class RUTVIE(InfoExtractor):
+    IE_DESC = 'RUTV.RU'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:test)?player\.(?:rutv\.ru|vgtrk\.com)/
+                        (?P<path>
+                            flash\d+v/container\.swf\?id=|
+                            iframe/(?P<type>swf|video|live)/id/|
+                            index/iframe/cast_id/
+                        )
+                        (?P<id>\d+)
+                    '''
+
+    _TESTS = [
+        {
+            'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724',
+            'info_dict': {
+                'id': '774471',
+                'ext': 'mp4',
+                'title': 'Монологи на все времена',
+                'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
+                'duration': 2906,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638',
+            'info_dict': {
+                'id': '774016',
+                'ext': 'mp4',
+                'title': 'Чужой в семье Сталина',
+                'description': '',
+                'duration': 2539,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000',
+            'info_dict': {
+                'id': '766888',
+                'ext': 'mp4',
+                'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+                'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+                'duration': 279,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169',
+            'info_dict': {
+                'id': '771852',
+                'ext': 'mp4',
+                'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
+                'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
+                'duration': 3096,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014',
+            'info_dict': {
+                'id': '51499',
+                'ext': 'flv',
+                'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
+                'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
+            },
+            'skip': 'Translation has finished',
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/live/id/21/showZoomBtn/false/isPlay/true/',
+            'info_dict': {
+                'id': '21',
+                'ext': 'mp4',
+                'title': 're:^Россия 24. Прямой эфир [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'is_live': True,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://testplayer.vgtrk.com/iframe/live/id/19201/showZoomBtn/false/isPlay/true/',
+            'only_matching': True,
+        },
+    ]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+        mobj = re.search(
+            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        video_path = mobj.group('path')
+
+        if re.match(r'flash\d+v', video_path):
+            video_type = 'video'
+        elif video_path.startswith('iframe'):
+            video_type = mobj.group('type')
+            if video_type == 'swf':
+                video_type = 'video'
+        elif video_path.startswith('index/iframe/cast_id'):
+            video_type = 'live'
+
+        is_live = video_type == 'live'
+
+        json_data = self._download_json(
+            'http://player.rutv.ru/iframe/data%s/id/%s' % ('live' if is_live else 'video', video_id),
+            video_id, 'Downloading JSON')
+
+        if json_data['errors']:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True)
+
+        playlist = json_data['data']['playlist']
+        medialist = playlist['medialist']
+        media = medialist[0]
+
+        if media['errors']:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True)
+
+        view_count = playlist.get('count_views')
+        priority_transport = playlist['priority_transport']
+
+        thumbnail = media['picture']
+        width = int_or_none(media['width'])
+        height = int_or_none(media['height'])
+        description = media['anons']
+        title = media['title']
+        duration = int_or_none(media.get('duration'))
+
+        formats = []
+
+        for transport, links in media['sources'].items():
+            for quality, url in links.items():
+                preference = -1 if priority_transport == transport else -2
+                if transport == 'rtmp':
+                    mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
+                    if not mobj:
+                        continue
+                    fmt = {
+                        'url': mobj.group('url'),
+                        'play_path': mobj.group('playpath'),
+                        'app': mobj.group('app'),
+                        'page_url': 'http://player.rutv.ru',
+                        'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
+                        'rtmp_live': True,
+                        'ext': 'flv',
+                        'vbr': int(quality),
+                        'preference': preference,
+                    }
+                elif transport == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        url, video_id, 'mp4', preference=preference, m3u8_id='hls'))
+                    continue
+                else:
+                    fmt = {
+                        'url': url
+                    }
+                fmt.update({
+                    'width': width,
+                    'height': height,
+                    'format_id': '%s-%s' % (transport, quality),
+                })
+                formats.append(fmt)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'view_count': view_count,
+            'duration': duration,
+            'formats': formats,
+            'is_live': is_live,
+        }
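Note on the RTMP branch above: each source URL is split into a connection URL, an app and a playpath before being handed to the RTMP downloader. A minimal standalone sketch of that split (the sample URL is invented, not a real rutv.ru stream):

    import re

    sample = 'rtmp://video.example.ru/live-app/stream/mp4:episode_720'
    mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', sample)
    # The greedy (?P<app>.+) consumes up to the *last* slash, so the playpath
    # is only the final path component.
    print(mobj.group('url'))       # rtmp://video.example.ru/live-app/stream
    print(mobj.group('app'))       # live-app/stream
    print(mobj.group('playpath'))  # mp4:episode_720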
diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py
new file mode 100644 (file)
index 0000000..f984040
--- /dev/null
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    xpath_attr,
+    xpath_text,
+)
+
+
+class RuutuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://www.ruutu.fi/video/2058907',
+            'md5': 'ab2093f39be1ca8581963451b3c0234f',
+            'info_dict': {
+                'id': '2058907',
+                'ext': 'mp4',
+                'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',
+                'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 114,
+                'age_limit': 0,
+            },
+        },
+        {
+            'url': 'http://www.ruutu.fi/video/2057306',
+            'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',
+            'info_dict': {
+                'id': '2057306',
+                'ext': 'mp4',
+                'title': 'Superpesis: katso koko kausi Ruudussa',
+                'description': 'md5:bfb7336df2a12dc21d18fa696c9f8f23',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 40,
+                'age_limit': 0,
+            },
+        },
+        {
+            'url': 'http://www.supla.fi/supla/2231370',
+            'md5': 'df14e782d49a2c0df03d3be2a54ef949',
+            'info_dict': {
+                'id': '2231370',
+                'ext': 'mp4',
+                'title': 'Osa 1: Mikael Jungner',
+                'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'age_limit': 0,
+            },
+        },
+        # Episode where <SourceFile> is "NOT-USED", but other
+        # downloadable sources are available.
+        {
+            'url': 'http://www.ruutu.fi/video/3193728',
+            'only_matching': True,
+        },
+        {
+            # audio podcast
+            'url': 'https://www.supla.fi/supla/3382410',
+            'md5': 'b9d7155fed37b2ebf6021d74c4b8e908',
+            'info_dict': {
+                'id': '3382410',
+                'ext': 'mp3',
+                'title': 'Mikä ihmeen poltergeist?',
+                'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'age_limit': 0,
+            },
+            'expected_warnings': ['HTTP Error 502: Bad Gateway'],
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_xml = self._download_xml(
+            'https://gatling.nelonenmedia.fi/media-xml-cache', video_id,
+            query={'id': video_id})
+
+        formats = []
+        processed_urls = []
+
+        def extract_formats(node):
+            for child in node:
+                if child.tag.endswith('Files'):
+                    extract_formats(child)
+                elif child.tag.endswith('File'):
+                    video_url = child.text
+                    if (not video_url or video_url in processed_urls
+                            or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))):
+                        continue
+                    processed_urls.append(video_url)
+                    ext = determine_ext(video_url)
+                    if ext == 'm3u8':
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    elif ext == 'f4m':
+                        formats.extend(self._extract_f4m_formats(
+                            video_url, video_id, f4m_id='hds', fatal=False))
+                    elif ext == 'mpd':
+                        # video-only and audio-only streams have different
+                        # durations, causing out-of-sync playback; skip DASH:
+                        # formats.extend(self._extract_mpd_formats(
+                        #     video_url, video_id, mpd_id='dash', fatal=False))
+                        continue
+                    elif ext == 'mp3' or child.tag == 'AudioMediaFile':
+                        formats.append({
+                            'format_id': 'audio',
+                            'url': video_url,
+                            'vcodec': 'none',
+                        })
+                    else:
+                        proto = compat_urllib_parse_urlparse(video_url).scheme
+                        if not child.tag.startswith('HTTP') and proto != 'rtmp':
+                            continue
+                        preference = -1 if proto == 'rtmp' else 1
+                        label = child.get('label')
+                        tbr = int_or_none(child.get('bitrate'))
+                        format_id = '%s-%s' % (proto, label if label else tbr) if label or tbr else proto
+                        if not self._is_valid_url(video_url, video_id, format_id):
+                            continue
+                        width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]]
+                        formats.append({
+                            'format_id': format_id,
+                            'url': video_url,
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'preference': preference,
+                        })
+
+        extract_formats(video_xml.find('./Clip'))
+
+        drm = xpath_text(video_xml, './Clip/DRM', default=None)
+        if not formats and drm:
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
+            'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
+            'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'formats': formats,
+        }
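extract_formats() above walks the clip XML recursively: any element whose tag ends in "Files" is a container to descend into, and any ending in "File" is a leaf carrying a media URL. The traversal pattern in isolation, over a fabricated snippet (the real gatling.nelonenmedia.fi schema has more element types):

    import xml.etree.ElementTree as ET

    clip = ET.fromstring(
        '<Clip><HTTPMediaFiles>'
        '<HTTPMediaFile bitrate="1500">http://cdn.example.fi/video.m3u8</HTTPMediaFile>'
        '</HTTPMediaFiles></Clip>')

    def walk(node, urls):
        for child in node:
            if child.tag.endswith('Files'):    # container: recurse
                walk(child, urls)
            elif child.tag.endswith('File'):   # leaf: collect its URL once
                if child.text and child.text not in urls:
                    urls.append(child.text)

    found = []
    walk(clip, found)
    print(found)  # ['http://cdn.example.fi/video.m3u8']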
diff --git a/youtube_dl/extractor/ruv.py b/youtube_dl/extractor/ruv.py
new file mode 100644 (file)
index 0000000..8f3cc40
--- /dev/null
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unified_timestamp,
+)
+
+
+class RuvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ruv\.is/(?:sarpurinn/[^/]+|node)/(?P<id>[^/]+(?:/\d+)?)'
+    _TESTS = [{
+        # m3u8
+        'url': 'http://ruv.is/sarpurinn/ruv-aukaras/fh-valur/20170516',
+        'md5': '66347652f4e13e71936817102acc1724',
+        'info_dict': {
+            'id': '1144499',
+            'display_id': 'fh-valur/20170516',
+            'ext': 'mp4',
+            'title': 'FH - Valur',
+            'description': 'Bein útsending frá 3. leik FH og Vals í úrslitum Olísdeildar karla í handbolta.',
+            'timestamp': 1494963600,
+            'upload_date': '20170516',
+        },
+    }, {
+        # mp3
+        'url': 'http://ruv.is/sarpurinn/ras-2/morgunutvarpid/20170619',
+        'md5': '395ea250c8a13e5fdb39d4670ef85378',
+        'info_dict': {
+            'id': '1153630',
+            'display_id': 'morgunutvarpid/20170619',
+            'ext': 'mp3',
+            'title': 'Morgunútvarpið',
+            'description': 'md5:a4cf1202c0a1645ca096b06525915418',
+            'timestamp': 1497855000,
+            'upload_date': '20170619',
+        },
+    }, {
+        'url': 'http://ruv.is/sarpurinn/ruv/frettir/20170614',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ruv.is/node/1151854',
+        'only_matching': True,
+    }, {
+        'url': 'http://ruv.is/sarpurinn/klippa/secret-soltice-hefst-a-morgun',
+        'only_matching': True,
+    }, {
+        'url': 'http://ruv.is/sarpurinn/ras-1/morgunvaktin/20170619',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'
+
+        media_url = self._html_search_regex(
+            FIELD_RE % 'src', webpage, 'video URL', group='url')
+
+        video_id = self._search_regex(
+            r'<link\b[^>]+\bhref=["\']https?://www\.ruv\.is/node/(\d+)',
+            webpage, 'video id', default=display_id)
+
+        ext = determine_ext(media_url)
+
+        if ext == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        elif ext == 'mp3':
+            formats = [{
+                'format_id': 'mp3',
+                'url': media_url,
+                'vcodec': 'none',
+            }]
+        else:
+            formats = [{
+                'url': media_url,
+            }]
+
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._search_regex(
+            FIELD_RE % 'poster', webpage, 'thumbnail', fatal=False)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
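FIELD_RE above is a template instantiated once per player property (src, poster); the back-reference \1 plus the tempered dot (?:(?!\1).)+ lets it accept either quote style without over-matching. A quick demo against invented page markup:

    import re

    FIELD_RE = r'video\.%s\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'
    webpage = "video.src = 'https://stream.example.is/manifest.m3u8';"

    mobj = re.search(FIELD_RE % 'src', webpage)
    print(mobj.group('url'))  # https://stream.example.is/manifest.m3u8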
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
new file mode 100644 (file)
index 0000000..2cc6651
--- /dev/null
@@ -0,0 +1,264 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    update_url_query,
+)
+
+
+class SafariBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
+    _NETRC_MACHINE = 'safari'
+
+    _API_BASE = 'https://learning.oreilly.com/api/v1'
+    _API_FORMAT = 'json'
+
+    LOGGED_IN = False
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        _, urlh = self._download_webpage_handle(
+            'https://learning.oreilly.com/accounts/login-check/', None,
+            'Downloading login page')
+
+        def is_logged(urlh):
+            return 'learning.oreilly.com/home/' in urlh.geturl()
+
+        if is_logged(urlh):
+            self.LOGGED_IN = True
+            return
+
+        redirect_url = urlh.geturl()
+        parsed_url = compat_urlparse.urlparse(redirect_url)
+        qs = compat_parse_qs(parsed_url.query)
+        next_uri = compat_urlparse.urljoin(
+            'https://api.oreilly.com', qs['next'][0])
+
+        auth, urlh = self._download_json_handle(
+            'https://www.oreilly.com/member/auth/login/', None, 'Logging in',
+            data=json.dumps({
+                'email': username,
+                'password': password,
+                'redirect_uri': next_uri,
+            }).encode(), headers={
+                'Content-Type': 'application/json',
+                'Referer': redirect_url,
+            }, expected_status=400)
+
+        credentials = auth.get('credentials')
+        if (not auth.get('logged_in') and not auth.get('redirect_uri')
+                and credentials):
+            raise ExtractorError(
+                'Unable to login: %s' % credentials, expected=True)
+
+        # oreilly serves two identical copies of the following cookies in the
+        # Set-Cookie header and expects the first one to be the one actually set
+        for cookie in ('groot_sessionid', 'orm-jwt', 'orm-rt'):
+            self._apply_first_set_cookie_header(urlh, cookie)
+
+        _, urlh = self._download_webpage_handle(
+            auth.get('redirect_uri') or next_uri, None, 'Completing login')
+
+        if is_logged(urlh):
+            self.LOGGED_IN = True
+            return
+
+        raise ExtractorError('Unable to log in')
+
+
+class SafariIE(SafariBaseIE):
+    IE_NAME = 'safari'
+    IE_DESC = 'safaribooksonline.com online video'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
+                            (?:
+                                library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
+                                videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
+                            )
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
+        'md5': 'dcc5a425e79f2564148652616af1f2a3',
+        'info_dict': {
+            'id': '0_qbqx90ic',
+            'ext': 'mp4',
+            'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+            'timestamp': 1437758058,
+            'upload_date': '20150724',
+            'uploader_id': 'stork',
+        },
+    }, {
+        # non-digits in course id
+        'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/00_SeriesIntro.html',
+        'only_matching': True,
+    }]
+
+    _PARTNER_ID = '1926081'
+    _UICONF_ID = '29375172'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        reference_id = mobj.group('reference_id')
+        if reference_id:
+            video_id = reference_id
+            partner_id = self._PARTNER_ID
+            ui_id = self._UICONF_ID
+        else:
+            video_id = '%s-%s' % (mobj.group('course_id'), mobj.group('part'))
+
+            webpage, urlh = self._download_webpage_handle(url, video_id)
+
+            mobj = re.match(self._VALID_URL, urlh.geturl())
+            reference_id = mobj.group('reference_id')
+            if not reference_id:
+                reference_id = self._search_regex(
+                    r'data-reference-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                    webpage, 'kaltura reference id', group='id')
+            partner_id = self._search_regex(
+                r'data-partner-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                webpage, 'kaltura widget id', default=self._PARTNER_ID,
+                group='id')
+            ui_id = self._search_regex(
+                r'data-ui-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                webpage, 'kaltura uiconf id', default=self._UICONF_ID,
+                group='id')
+
+        query = {
+            'wid': '_%s' % partner_id,
+            'uiconf_id': ui_id,
+            'flashvars[referenceId]': reference_id,
+        }
+
+        if self.LOGGED_IN:
+            kaltura_session = self._download_json(
+                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+                video_id, 'Downloading kaltura session JSON',
+                'Unable to download kaltura session JSON', fatal=False,
+                headers={'Accept': 'application/json'})
+            if kaltura_session:
+                session = kaltura_session.get('session')
+                if session:
+                    query['flashvars[ks]'] = session
+
+        return self.url_result(update_url_query(
+            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+            'Kaltura')
+
+
+class SafariApiIE(SafariBaseIE):
+    IE_NAME = 'safari:api'
+    _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        part = self._download_json(
+            url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')),
+            'Downloading part JSON')
+        return self.url_result(part['web_url'], SafariIE.ie_key())
+
+
+class SafariCourseIE(SafariBaseIE):
+    IE_NAME = 'safari:course'
+    IE_DESC = 'safaribooksonline.com online courses'
+
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?(?:safaribooksonline|(?:learning\.)?oreilly)\.com/
+                            (?:
+                                library/view/[^/]+|
+                                api/v1/book|
+                                videos/[^/]+
+                            )|
+                            techbus\.safaribooksonline\.com
+                        )
+                        /(?P<id>[^/]+)
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
+        'info_dict': {
+            'id': '9780133392838',
+            'title': 'Hadoop Fundamentals LiveLessons',
+        },
+        'playlist_count': 22,
+        'skip': 'Requires safaribooksonline account credentials',
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
+        'only_matching': True,
+    }, {
+        'url': 'http://techbus.safaribooksonline.com/9780134426365',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.oreilly.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if SafariIE.suitable(url) or SafariApiIE.suitable(url)
+                else super(SafariCourseIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        course_json = self._download_json(
+            '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+            course_id, 'Downloading course JSON')
+
+        if 'chapters' not in course_json:
+            raise ExtractorError(
+                'No chapters found for course %s' % course_id, expected=True)
+
+        entries = [
+            self.url_result(chapter, SafariApiIE.ie_key())
+            for chapter in course_json['chapters']]
+
+        course_title = course_json['title']
+
+        return self.playlist_result(entries, course_id, course_title)
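On the duplicated-cookie workaround in SafariBaseIE._login(): the rule is simply "when the same cookie name appears twice in Set-Cookie, keep the first value". A rough sketch of that rule over raw header values (yt-dlc's actual helper, _apply_first_set_cookie_header, operates on the response handle instead):

    def first_cookie_value(set_cookie_headers, name):
        """Return the value of the first Set-Cookie header carrying `name`."""
        for header in set_cookie_headers:
            cookie = header.split(';', 1)[0]      # drop attributes like Path=/
            key, _, value = cookie.partition('=')
            if key.strip() == name:
                return value
        return None

    # Hypothetical duplicated headers like those served on login
    headers = ['orm-jwt=first-token; Path=/', 'orm-jwt=second-token; Path=/']
    print(first_cookie_value(headers, 'orm-jwt'))  # first-token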
diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py
new file mode 100644 (file)
index 0000000..49a9b31
--- /dev/null
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    unified_strdate,
+)
+
+
+class SapoIE(InfoExtractor):
+    IE_DESC = 'SAPO Vídeos'
+    _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})'
+
+    _TESTS = [
+        {
+            'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi',
+            'md5': '79ee523f6ecb9233ac25075dee0eda83',
+            'note': 'SD video',
+            'info_dict': {
+                'id': 'UBz95kOtiWYUMTA5Ghfi',
+                'ext': 'mp4',
+                'title': 'Benfica - Marcas na Hitória',
+                'description': 'md5:c9082000a128c3fd57bf0299e1367f22',
+                'duration': 264,
+                'uploader': 'tiago_1988',
+                'upload_date': '20080229',
+                'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'],
+            },
+        },
+        {
+            'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF',
+            'md5': '90a2f283cfb49193fe06e861613a72aa',
+            'note': 'HD video',
+            'info_dict': {
+                'id': 'IyusNAZ791ZdoCY5H5IF',
+                'ext': 'mp4',
+                'title': 'Codebits VII - Report',
+                'description': 'md5:6448d6fd81ce86feac05321f354dbdc8',
+                'duration': 144,
+                'uploader': 'codebits',
+                'upload_date': '20140427',
+                'categories': ['codebits', 'codebits2014'],
+            },
+        },
+        {
+            'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz',
+            'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac',
+            'note': 'v2 video',
+            'info_dict': {
+                'id': 'yLqjzPtbTimsn2wWBKHz',
+                'ext': 'mp4',
+                'title': 'Hipnose Condicionativa 4',
+                'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40',
+                'duration': 692,
+                'uploader': 'sapozen',
+                'upload_date': '20090609',
+                'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'],
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        item = self._download_xml(
+            'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item')
+
+        title = item.find('./title').text
+        description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text
+        thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url')
+        duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text)
+        uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text
+        upload_date = unified_strdate(item.find('./pubDate').text)
+        view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text)
+        comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text)
+        tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text
+        categories = tags.split() if tags else []
+        age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0
+
+        video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text
+        video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x')
+
+        formats = [{
+            'url': video_url,
+            'ext': 'mp4',
+            'format_id': 'sd',
+            'width': int(video_size[0]),
+            'height': int(video_size[1]),
+        }]
+
+        if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true':
+            formats.append({
+                'url': re.sub(r'/mov/1$', '/mov/39', video_url),
+                'ext': 'mp4',
+                'format_id': 'hd',
+                'width': 1280,
+                'height': 720,
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'categories': categories,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
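Every field in SapoIE comes out of a namespaced MRSS item, which is why the find() calls spell the namespace URI out in Clark notation ({uri}tag). The lookup style on a fabricated feed fragment:

    import xml.etree.ElementTree as ET

    rss = ET.fromstring(
        '<rss><channel><item xmlns:sapo="http://videos.sapo.pt/mrss/">'
        '<title>Example clip</title><sapo:time>00:04:24</sapo:time>'
        '</item></channel></rss>')

    item = rss.find('./channel/item')
    print(item.find('./title').text)                              # Example clip
    print(item.find('./{http://videos.sapo.pt/mrss/}time').text)  # 00:04:24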
diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py
new file mode 100644 (file)
index 0000000..21e44b6
--- /dev/null
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os.path
+import re
+
+from .common import InfoExtractor
+
+
+class SaveFromIE(InfoExtractor):
+    IE_NAME = 'savefrom.net'
+    _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'
+
+    _TEST = {
+        'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com',
+        'info_dict': {
+            'id': 'UlVRAPW2WJY',
+            'ext': 'mp4',
+            'title': 'About Team Radical MMA | MMA Fighting',
+            'upload_date': '20120816',
+            'uploader': 'Howcast',
+            'uploader_id': 'Howcast',
+            'description': r're:(?s).* Hi, my name is Rene Dreifuss\. And I\'m here to show you some MMA.*',
+        },
+        'params': {
+            'skip_download': True
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = os.path.splitext(url.split('/')[-1])[0]
+
+        return self.url_result(mobj.group('url'), video_id=video_id)
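SaveFromIE never touches the network: the wrapped target sits after #url= and is re-dispatched through url_result, with the id derived from the last path component of the incoming URL. A sketch with a hypothetical wrapped link:

    import os.path
    import re

    _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'
    wrapped = 'http://en.savefrom.net/#url=http://example.com/videos/clip123.mp4'

    mobj = re.match(_VALID_URL, wrapped)
    video_id = os.path.splitext(wrapped.split('/')[-1])[0]
    print(mobj.group('url'))  # http://example.com/videos/clip123.mp4
    print(video_id)           # clip123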
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
new file mode 100644 (file)
index 0000000..0e623ff
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    smuggle_url,
+    ExtractorError,
+)
+
+
+class SBSIE(InfoExtractor):
+    IE_DESC = 'sbs.com.au'
+    _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
+
+    _TESTS = [{
+        # Original URL is handled by the generic IE which finds the iframe:
+        # http://www.sbs.com.au/thefeed/blog/2014/08/21/dingo-conservation
+        'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
+        'md5': '3150cf278965eeabb5b4cea1c963fe0a',
+        'info_dict': {
+            'id': '320403011771',
+            'ext': 'mp4',
+            'title': 'Dingo Conservation (The Feed)',
+            'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
+            'thumbnail': r're:http://.*\.jpg',
+            'duration': 308,
+            'timestamp': 1408613220,
+            'upload_date': '20140821',
+            'uploader': 'SBSC',
+        },
+    }, {
+        'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        player_params = self._download_json(
+            'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id)
+
+        error = player_params.get('error')
+        if error:
+            error_message = 'Sorry, the video you are looking for does not exist.'
+            video_data = error.get('results') or {}
+            error_code = error.get('errorCode')
+            if error_code == 'ComingSoon':
+                error_message = '%s is not yet available.' % video_data.get('title', '')
+            elif error_code in ('Forbidden', 'intranetAccessOnly'):
+                error_message = 'Sorry, this video cannot be accessed via this website.'
+            elif error_code == 'Expired':
+                error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '')
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
+
+        urls = player_params['releaseUrls']
+        theplatform_url = (urls.get('progressive') or urls.get('html')
+                           or urls.get('standard') or player_params['relatedItemsURL'])
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'id': video_id,
+            'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}),
+        }
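SBSIE ends in a url_transparent result: the ThePlatform extractor does the heavy lifting, and smuggle_url rides extra hints along in the URL fragment. A simplified re-implementation of the smuggling idea (not yt-dlc's exact helper):

    import json
    from urllib.parse import quote, unquote

    def smuggle(url, data):
        # hints live in the fragment, so the URL still names the same resource
        return url + '#__smuggle=' + quote(json.dumps(data))

    def unsmuggle(smuggled):
        url, _, payload = smuggled.partition('#__smuggle=')
        return url, json.loads(unquote(payload))

    s = smuggle('https://link.theplatform.com/s/abc/media/123',
                {'force_smil_url': True})
    print(unsmuggle(s))
    # ('https://link.theplatform.com/s/abc/media/123', {'force_smil_url': True})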
diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py
new file mode 100644 (file)
index 0000000..69a0d01
--- /dev/null
@@ -0,0 +1,123 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_request,
+)
+from ..utils import (
+    ExtractorError,
+)
+
+
+class ScreencastIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P<id>[a-zA-Z0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.screencast.com/t/3ZEjQXlT',
+        'md5': '917df1c13798a3e96211dd1561fded83',
+        'info_dict': {
+            'id': '3ZEjQXlT',
+            'ext': 'm4v',
+            'title': 'Color Measurement with Ocean Optics Spectrometers',
+            'description': 'md5:240369cde69d8bed61349a199c5fb153',
+            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
+        }
+    }, {
+        'url': 'http://www.screencast.com/t/V2uXehPJa1ZI',
+        'md5': 'e8e4b375a7660a9e7e35c33973410d34',
+        'info_dict': {
+            'id': 'V2uXehPJa1ZI',
+            'ext': 'mov',
+            'title': 'The Amadeus Spectrometer',
+            'description': 're:^In this video, our friends at.*To learn more about Amadeus, visit',
+            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
+        }
+    }, {
+        'url': 'http://www.screencast.com/t/aAB3iowa',
+        'md5': 'dedb2734ed00c9755761ccaee88527cd',
+        'info_dict': {
+            'id': 'aAB3iowa',
+            'ext': 'mp4',
+            'title': 'Google Earth Export',
+            'description': 'Provides a demo of a CommunityViz export to Google Earth, one of the 3D viewing options.',
+            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
+        }
+    }, {
+        'url': 'http://www.screencast.com/t/X3ddTrYh',
+        'md5': '669ee55ff9c51988b4ebc0877cc8b159',
+        'info_dict': {
+            'id': 'X3ddTrYh',
+            'ext': 'wmv',
+            'title': 'Toolkit 6 User Group Webinar (2014-03-04) - Default Judgment and First Impression',
+            'description': 'md5:7b9f393bc92af02326a5c5889639eab0',
+            'thumbnail': r're:^https?://.*\.(?:gif|jpg)$',
+        }
+    }, {
+        'url': 'http://screencast.com/t/aAB3iowa',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._html_search_regex(
+            r'<embed name="Video".*?src="([^"]+)"', webpage,
+            'QuickTime embed', default=None)
+
+        if video_url is None:
+            flash_vars_s = self._html_search_regex(
+                r'<param name="flashVars" value="([^"]+)"', webpage, 'flash vars',
+                default=None)
+            if not flash_vars_s:
+                flash_vars_s = self._html_search_regex(
+                    r'<param name="initParams" value="([^"]+)"', webpage, 'flash vars',
+                    default=None)
+                if flash_vars_s:
+                    flash_vars_s = flash_vars_s.replace(',', '&')
+            if flash_vars_s:
+                flash_vars = compat_parse_qs(flash_vars_s)
+                video_url_raw = compat_urllib_request.quote(
+                    flash_vars['content'][0])
+                video_url = video_url_raw.replace('http%3A', 'http:')
+
+        if video_url is None:
+            video_meta = self._html_search_meta(
+                'og:video', webpage, default=None)
+            if video_meta:
+                video_url = self._search_regex(
+                    r'src=(.*?)(?:$|&)', video_meta,
+                    'meta tag video URL', default=None)
+
+        if video_url is None:
+            video_url = self._html_search_regex(
+                r'MediaContentUrl["\']\s*:(["\'])(?P<url>(?:(?!\1).)+)\1',
+                webpage, 'video url', default=None, group='url')
+
+        if video_url is None:
+            video_url = self._html_search_meta(
+                'og:video', webpage, default=None)
+
+        if video_url is None:
+            raise ExtractorError('Cannot find video')
+
+        title = self._og_search_title(webpage, default=None)
+        if title is None:
+            title = self._html_search_regex(
+                [r'<b>Title:</b> ([^<]+)</div>',
+                 r'class="tabSeperator">></span><span class="tabText">(.+?)<',
+                 r'<title>([^<]+)</title>'],
+                webpage, 'title')
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(webpage, default=None)
+        if description is None:
+            description = self._html_search_meta('description', webpage)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
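_real_extract() above is a chain of progressively weaker probes, each returning None on failure so control falls through to the next. The pattern reduced to its skeleton, with stand-in probes:

    def first_match(*probes):
        """Return the first probe result that is not None."""
        for probe in probes:
            value = probe()
            if value is not None:
                return value
        raise ValueError('Cannot find video')

    video_url = first_match(
        lambda: None,                        # QuickTime embed: absent
        lambda: None,                        # flashVars: absent
        lambda: 'http://example.com/v.mp4',  # og:video meta: found
    )
    print(video_url)  # http://example.com/v.mp4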
diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py
new file mode 100644 (file)
index 0000000..b5e76c9
--- /dev/null
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class ScreencastOMaticIE(InfoExtractor):
+    _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
+    _TEST = {
+        'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
+        'md5': '483583cb80d92588f15ccbedd90f0c18',
+        'info_dict': {
+            'id': 'c2lD3BeOPl',
+            'ext': 'mp4',
+            'title': 'Welcome to 3-4 Philosophy @ DECV!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
+            'duration': 369.163,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        jwplayer_data = self._parse_json(
+            self._search_regex(
+                r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'),
+            video_id, transform_source=js_to_json)
+
+        info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+        info_dict.update({
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+        })
+        return info_dict
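The setup() argument is a JavaScript object literal, not JSON, hence the js_to_json transform before _parse_json. The one normalisation this page actually needs, shown with plain json (the real js_to_json handles many more cases, e.g. unquoted keys and trailing commas):

    import json
    import re

    webpage = ("jwplayer('mp4Player').setup({'file': "
               "'http://example.com/v.mp4'});")  # invented page fragment

    raw = re.search(r"jwplayer\('mp4Player'\)\.setup\((\{.*?\})\);",
                    webpage).group(1)
    data = json.loads(raw.replace("'", '"'))  # single -> double quotes
    print(data['file'])  # http://example.com/v.mp4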
diff --git a/youtube_dl/extractor/scrippsnetworks.py b/youtube_dl/extractor/scrippsnetworks.py
new file mode 100644 (file)
index 0000000..b40b4c4
--- /dev/null
@@ -0,0 +1,152 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import hashlib
+import re
+
+from .aws import AWSIE
+from .anvato import AnvatoIE
+from .common import InfoExtractor
+from ..utils import (
+    smuggle_url,
+    urlencode_postdata,
+    xpath_text,
+)
+
+
+class ScrippsNetworksWatchIE(AWSIE):
+    IE_NAME = 'scrippsnetworks:watch'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        watch\.
+                        (?P<site>geniuskitchen)\.com/
+                        (?:
+                            player\.[A-Z0-9]+\.html\#|
+                            show/(?:[^/]+/){2}|
+                            player/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/',
+        'info_dict': {
+            'id': '4194875',
+            'ext': 'mp4',
+            'title': 'Ample Hills Ice Cream Bike',
+            'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.',
+            'uploader': 'ANV',
+            'upload_date': '20171011',
+            'timestamp': 1507698000,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [AnvatoIE.ie_key()],
+    }]
+
+    _SNI_TABLE = {
+        'geniuskitchen': 'genius',
+    }
+
+    _AWS_API_KEY = 'E7wSQmq0qK6xPrF13WmzKiHo4BQ7tip4pQcSXVl1'
+    _AWS_PROXY_HOST = 'web.api.video.snidigital.com'
+
+    _AWS_USER_AGENT = 'aws-sdk-js/2.80.0 callback'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site_id, video_id = mobj.group('site', 'id')
+
+        aws_identity_id_json = json.dumps({
+            'IdentityId': '%s:7655847c-0ae7-4d9b-80d6-56c062927eb3' % self._AWS_REGION
+        }).encode('utf-8')
+        token = self._download_json(
+            'https://cognito-identity.%s.amazonaws.com/' % self._AWS_REGION, video_id,
+            data=aws_identity_id_json,
+            headers={
+                'Accept': '*/*',
+                'Content-Type': 'application/x-amz-json-1.1',
+                'Referer': url,
+                'X-Amz-Content-Sha256': hashlib.sha256(aws_identity_id_json).hexdigest(),
+                'X-Amz-Target': 'AWSCognitoIdentityService.GetOpenIdToken',
+                'X-Amz-User-Agent': self._AWS_USER_AGENT,
+            })['Token']
+
+        sts = self._download_xml(
+            'https://sts.amazonaws.com/', video_id, data=urlencode_postdata({
+                'Action': 'AssumeRoleWithWebIdentity',
+                'RoleArn': 'arn:aws:iam::710330595350:role/Cognito_WebAPIUnauth_Role',
+                'RoleSessionName': 'web-identity',
+                'Version': '2011-06-15',
+                'WebIdentityToken': token,
+            }), headers={
+                'Referer': url,
+                'X-Amz-User-Agent': self._AWS_USER_AGENT,
+                'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
+            })
+
+        def get(key):
+            return xpath_text(
+                sts, './/{https://sts.amazonaws.com/doc/2011-06-15/}%s' % key,
+                fatal=True)
+
+        mcp_id = self._aws_execute_api({
+            'uri': '/1/web/brands/%s/episodes/scrid/%s' % (self._SNI_TABLE[site_id], video_id),
+            'access_key': get('AccessKeyId'),
+            'secret_key': get('SecretAccessKey'),
+            'session_token': get('SessionToken'),
+        }, video_id)['results'][0]['mcpId']
+
+        return self.url_result(
+            smuggle_url(
+                'anvato:anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a:%s' % mcp_id,
+                {'geo_countries': ['US']}),
+            AnvatoIE.ie_key(), video_id=mcp_id)
+
+
+class ScrippsNetworksIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>cookingchanneltv|discovery|(?:diy|food)network|hgtv|travelchannel)\.com/videos/[0-9a-z-]+-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.cookingchanneltv.com/videos/the-best-of-the-best-0260338',
+        'info_dict': {
+            'id': '0260338',
+            'ext': 'mp4',
+            'title': 'The Best of the Best',
+            'description': 'Catch a new episode of MasterChef Canada Tuedsay at 9/8c.',
+            'timestamp': 1475678834,
+            'upload_date': '20161005',
+            'uploader': 'SCNI-SCND',
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'https://www.diynetwork.com/videos/diy-barnwood-tablet-stand-0265790',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.foodnetwork.com/videos/chocolate-strawberry-cake-roll-7524591',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.hgtv.com/videos/cookie-decorating-101-0301929',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.travelchannel.com/videos/two-climates-one-bag-5302184',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.discovery.com/videos/guardians-of-the-glades-cooking-with-tom-cobb-5578368',
+        'only_matching': True,
+    }]
+    _ACCOUNT_MAP = {
+        'cookingchanneltv': 2433005105,
+        'discovery': 2706091867,
+        'diynetwork': 2433004575,
+        'foodnetwork': 2433005105,
+        'hgtv': 2433004575,
+        'travelchannel': 2433005739,
+    }
+    _TP_TEMPL = 'https://link.theplatform.com/s/ip77QC/media/guid/%d/%s?mbr=true'
+
+    def _real_extract(self, url):
+        site, guid = re.match(self._VALID_URL, url).groups()
+        return self.url_result(smuggle_url(
+            self._TP_TEMPL % (self._ACCOUNT_MAP[site], guid),
+            {'force_smil_url': True}), 'ThePlatform', guid)
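The watch extractor chains three unauthenticated AWS calls: Cognito GetOpenIdToken, STS AssumeRoleWithWebIdentity, then a SigV4-signed lookup via _aws_execute_api. One concrete detail worth isolating is the X-Amz-Content-Sha256 header, which is just the SHA-256 of the request body (placeholder IdentityId below):

    import hashlib
    import json

    body = json.dumps({
        'IdentityId': 'us-east-1:00000000-0000-0000-0000-000000000000'
    }).encode('utf-8')
    print(hashlib.sha256(body).hexdigest())  # value for X-Amz-Content-Sha256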
diff --git a/youtube_dl/extractor/scte.py b/youtube_dl/extractor/scte.py
new file mode 100644 (file)
index 0000000..ca1de63
--- /dev/null
@@ -0,0 +1,144 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    ExtractorError,
+    urlencode_postdata,
+)
+
+
+class SCTEBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
+    _NETRC_MACHINE = 'scte'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_popup = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login popup')
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'class=["\']welcome\b', r'>Sign Out<'))
+
+        # already logged in
+        if is_logged(login_popup):
+            return
+
+        login_form = self._hidden_inputs(login_popup)
+
+        login_form.update({
+            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
+            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
+            'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
+        })
+
+        response = self._download_webpage(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata(login_form))
+
+        if '|pageRedirect|' not in response and not is_logged(response):
+            error = self._html_search_regex(
+                r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+
+class SCTEIE(SCTEBaseIE):
+    _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php\?.*?\bid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
+        'info_dict': {
+            'title': 'Introduction to DOCSIS Engineering Professional',
+            'id': '31484',
+        },
+        'playlist_count': 5,
+        'skip': 'Requires account credentials',
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
+
+        context_id = self._search_regex(r'context-(\d+)', webpage, 'context id')
+        content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
+        context = decode_packed_codes(self._download_webpage(
+            '%smobile/data.js' % content_base, video_id))
+
+        data = self._parse_xml(
+            self._search_regex(
+                r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
+            video_id)
+
+        entries = []
+        for asset in data.findall('.//asset'):
+            asset_url = asset.get('url')
+            if not asset_url or not asset_url.endswith('.mp4'):
+                continue
+            asset_id = self._search_regex(
+                r'video_([^_]+)_', asset_url, 'asset id', default=None)
+            if not asset_id:
+                continue
+            entries.append({
+                'id': asset_id,
+                'title': title,
+                'url': content_base + asset_url,
+            })
+
+        return self.playlist_result(entries, video_id, title)
+
+
+class SCTECourseIE(SCTEBaseIE):
+    _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php\?.*?\bid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.scte.org/course/view.php?id=3639',
+        'only_matching': True,
+    }, {
+        'url': 'https://learning.scte.org/course/view.php?id=3073',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_id)
+
+        title = self._search_regex(
+            r'<h1>(.+?)</h1>', webpage, 'title', default=None)
+
+        entries = []
+        for mobj in re.finditer(
+                r'''(?x)
+                    <a[^>]+
+                        href=(["\'])
+                        (?P<url>
+                            https?://learning\.scte\.org/mod/
+                            (?P<kind>scorm|subcourse)/view\.php\?(?:(?!\1).)*?
+                            \bid=\d+
+                        )
+                    ''',
+                webpage):
+            item_url = mobj.group('url')
+            if item_url == url:
+                continue
+            ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
+                  else SCTECourseIE.ie_key())
+            entries.append(self.url_result(item_url, ie=ie))
+
+        return self.playlist_result(entries, course_id, title)
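After decode_packed_codes() unpacks data.js, SCTEIE fishes an XML document out of a CreateData("...") call, un-escaping \' before parsing. The same two steps over an invented, already-unpacked payload:

    import re
    import xml.etree.ElementTree as ET

    context = r'CreateData("<datas><asset url=\'video_intro_1.mp4\'/></datas>")'

    xml_text = re.search(r'CreateData\(\s*"(.+?)"', context).group(1)
    data = ET.fromstring(xml_text.replace(r"\'", "'"))
    for asset in data.findall('.//asset'):
        print(asset.get('url'))  # video_intro_1.mp4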
diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py
new file mode 100644 (file)
index 0000000..7872dc8
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_class,
+    strip_or_none,
+)
+
+
+class SeekerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html'
+    _TESTS = [{
+        'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html',
+        'md5': '897d44bbe0d8986a2ead96de565a92db',
+        'info_dict': {
+            'id': 'Elrn3gnY',
+            'ext': 'mp4',
+            'title': 'Should Trump Be Required To Release His Tax Returns?',
+            'description': 'md5:41efa8cfa8d627841045eec7b018eb45',
+            'timestamp': 1490090165,
+            'upload_date': '20170321',
+        }
+    }, {
+        'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html',
+        'playlist': [
+            {
+                'md5': '0497b9f20495174be73ae136949707d2',
+                'info_dict': {
+                    'id': 'FihYQ8AE',
+                    'ext': 'mp4',
+                    'title': 'The Pros & Cons Of Zoos',
+                    'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c',
+                    'timestamp': 1490039133,
+                    'upload_date': '20170320',
+                },
+            }
+        ],
+        'info_dict': {
+            'id': '1834116536',
+            'title': 'After Gorilla Killing, Changes Ahead for Zoos',
+            'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id, article_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, display_id)
+        entries = []
+        for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage):
+            entries.append(self.url_result(
+                'jwplatform:' + jwp_id, 'JWPlatform', jwp_id))
+        return self.playlist_result(
+            entries, article_id,
+            self._og_search_title(webpage),
+            strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage))
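SeekerIE just harvests every eight-character JWPlatform media id from the article markup and defers to the JWPlatform extractor. The harvesting step on fabricated markup:

    import re

    webpage = ('<div data-video-id="Elrn3gnY"></div>'
               '<div data-video-id="FihYQ8AE"></div>')

    for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage):
        print('jwplatform:' + jwp_id)
    # jwplatform:Elrn3gnY
    # jwplatform:FihYQ8AE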
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
new file mode 100644 (file)
index 0000000..db5ef8b
--- /dev/null
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unsmuggle_url,
+)
+from ..compat import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
+class SenateISVPIE(InfoExtractor):
+    _COMM_MAP = [
+        ['ag', '76440', 'http://ag-f.akamaihd.net'],
+        ['aging', '76442', 'http://aging-f.akamaihd.net'],
+        ['approps', '76441', 'http://approps-f.akamaihd.net'],
+        ['armed', '76445', 'http://armed-f.akamaihd.net'],
+        ['banking', '76446', 'http://banking-f.akamaihd.net'],
+        ['budget', '76447', 'http://budget-f.akamaihd.net'],
+        ['cecc', '76486', 'http://srs-f.akamaihd.net'],
+        ['commerce', '80177', 'http://commerce1-f.akamaihd.net'],
+        ['csce', '75229', 'http://srs-f.akamaihd.net'],
+        ['dpc', '76590', 'http://dpc-f.akamaihd.net'],
+        ['energy', '76448', 'http://energy-f.akamaihd.net'],
+        ['epw', '76478', 'http://epw-f.akamaihd.net'],
+        ['ethics', '76449', 'http://ethics-f.akamaihd.net'],
+        ['finance', '76450', 'http://finance-f.akamaihd.net'],
+        ['foreign', '76451', 'http://foreign-f.akamaihd.net'],
+        ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'],
+        ['help', '76452', 'http://help-f.akamaihd.net'],
+        ['indian', '76455', 'http://indian-f.akamaihd.net'],
+        ['intel', '76456', 'http://intel-f.akamaihd.net'],
+        ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'],
+        ['jccic', '85180', 'http://jccic-f.akamaihd.net'],
+        ['jec', '76458', 'http://jec-f.akamaihd.net'],
+        ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'],
+        ['rpc', '76591', 'http://rpc-f.akamaihd.net'],
+        ['rules', '76460', 'http://rules-f.akamaihd.net'],
+        ['saa', '76489', 'http://srs-f.akamaihd.net'],
+        ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'],
+        ['srs', '75229', 'http://srs-f.akamaihd.net'],
+        ['uscc', '76487', 'http://srs-f.akamaihd.net'],
+        ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'],
+        ['arch', '', 'http://ussenate-f.akamaihd.net/']
+    ]
+    IE_NAME = 'senate.gov'
+    _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)'
+    _TESTS = [{
+        'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
+        'info_dict': {
+            'id': 'judiciary031715',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false',
+        'info_dict': {
+            'id': 'commerce011514',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi',
+        # checksum differs each time
+        'info_dict': {
+            'id': 'intel090613',
+            'ext': 'mp4',
+            'title': 'Integrated Senate Video Player'
+        }
+    }, {
+        # From http://www.c-span.org/video/?96791-1
+        'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715',
+        'only_matching': True,
+    }]
+
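+    # Lets the generic extractor pick up senate.gov player iframes embedded in
+    # third-party pages (e.g. c-span.org).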
+    @staticmethod
+    def _search_iframe_url(webpage):
+        mobj = re.search(
+            r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _get_info_for_comm(self, committee):
+        for entry in self._COMM_MAP:
+            if entry[0] == committee:
+                return entry[1:]
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('qs'))
+        if not qs.get('filename') or not qs.get('type') or not qs.get('comm'):
+            raise ExtractorError('Invalid URL', expected=True)
+
+        video_id = re.sub(r'\.mp4$', '', qs['filename'][0])
+
+        webpage = self._download_webpage(url, video_id)
+
+        if smuggled_data.get('force_title'):
+            title = smuggled_data['force_title']
+        else:
+            title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+        poster = qs.get('poster')
+        thumbnail = poster[0] if poster else None
+
+        video_type = qs['type'][0]
+        committee = video_type if video_type == 'arch' else qs['comm'][0]
+        stream_num, domain = self._get_info_for_comm(committee)
+
+        formats = []
+        if video_type == 'arch':
+            filename = video_id if '.' in video_id else video_id + '.mp4'
+            formats = [{
+                # All parameters in the query string are necessary to prevent a 403 error
+                'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=',
+            }]
+        else:
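+            # Live and VOD streams come from the committee's Akamai host as
+            # both HDS (f4m) and HLS (m3u8) manifests.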
+            hdcore_sign = 'hdcore=3.1.0'
+            url_params = (domain, video_id, stream_num)
+            f4m_url = '%s/z/%s_1@%s/manifest.f4m?' % url_params + hdcore_sign
+            m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params
+            for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'):
+                # URLs without the extra param trigger a 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+                formats.append(entry)
+            for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'):
+                mobj = re.search(r'(?P<tag>-p|-b)\.m3u8', entry['url'])
+                if mobj:
+                    entry['format_id'] += mobj.group('tag')
+                formats.append(entry)
+
+            self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py
new file mode 100644 (file)
index 0000000..9d96529
--- /dev/null
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    parse_iso8601,
+    update_url_query,
+    int_or_none,
+    determine_protocol,
+    unescapeHTML,
+)
+
+
+class SendtoNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)'
+
+    _TEST = {
+        # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
+        'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES',
+        'info_dict': {
+            'id': 'GxfCe0Zo7D-175909-5588'
+        },
+        'playlist_count': 8,
+        # test the first video only to prevent lengthy tests
+        'playlist': [{
+            'info_dict': {
+                'id': '240385',
+                'ext': 'mp4',
+                'title': 'Indians introduce Encarnacion',
+                'description': 'Indians president of baseball operations Chris Antonetti and Edwin Encarnacion discuss the slugger\'s three-year contract with Cleveland',
+                'duration': 137.898,
+                'thumbnail': r're:https?://.*\.jpg$',
+                'upload_date': '20170105',
+                'timestamp': 1483649762,
+            },
+        }],
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s'
+
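+    # Used by the generic extractor to find responsive SendtoNews embeds in
+    # third-party pages.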
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
+            (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
+                .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
+            \1>''', webpage)
+        if mobj:
+            sc = mobj.group('SC')
+            return cls._URL_TEMPLATE % sc
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        data_url = update_url_query(
+            url.replace('embedplayer.php', 'data_read.php'),
+            {'cmd': 'loadInitial'})
+        playlist_data = self._download_json(data_url, playlist_id)
+
+        entries = []
+        for video in playlist_data['playlistData'][0]:
+            info_dict = self._parse_jwplayer_data(
+                video['jwconfiguration'],
+                require_title=False, m3u8_id='hls', rtmp_params={'no_resume': True})
+
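+            # Some formats come without a bitrate; the stream URL usually
+            # encodes it as a path segment like '/300k/', so recover tbr there.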
+            for f in info_dict['formats']:
+                if f.get('tbr'):
+                    continue
+                tbr = int_or_none(self._search_regex(
+                    r'/(\d+)k/', f['url'], 'bitrate', default=None))
+                if not tbr:
+                    continue
+                f.update({
+                    'format_id': '%s-%d' % (determine_protocol(f), tbr),
+                    'tbr': tbr,
+                })
+            self._sort_formats(info_dict['formats'], ('tbr', 'height', 'width', 'format_id'))
+
+            thumbnails = []
+            if video.get('thumbnailUrl'):
+                thumbnails.append({
+                    'id': 'normal',
+                    'url': video['thumbnailUrl'],
+                })
+            if video.get('smThumbnailUrl'):
+                thumbnails.append({
+                    'id': 'small',
+                    'url': video['smThumbnailUrl'],
+                })
+            info_dict.update({
+                'title': video['S_headLine'].strip(),
+                'description': unescapeHTML(video.get('S_fullStory')),
+                'thumbnails': thumbnails,
+                'duration': float_or_none(video.get('SM_length')),
+                'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '),
+            })
+            entries.append(info_dict)
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py
new file mode 100644 (file)
index 0000000..9401bf2
--- /dev/null
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class ServusIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?:
+                            servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
+                            servustv\.com/videos
+                        )
+                        /(?P<id>[aA]{2}-\w+|\d+-\d+)
+                    '''
+    _TESTS = [{
+        # new URL schema
+        'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
+        'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
+        'info_dict': {
+            'id': 'AA-1T6VBU5PW1W12',
+            'ext': 'mp4',
+            'title': 'Die Grünen aus Sicht des Volkes',
+            'description': 'md5:1247204d85783afe3682644398ff2ec4',
+            'thumbnail': r're:^https?://.*\.jpg',
+        }
+    }, {
+        # old URL schema
+        'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url).upper()
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._search_regex(
+            (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
+            webpage, 'title', default=None,
+            group='title') or self._og_search_title(webpage)
+        title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
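+        # A single per-video HLS manifest provides all formats.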
+        formats = self._extract_m3u8_formats(
+            'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
+            video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/sevenplus.py b/youtube_dl/extractor/sevenplus.py
new file mode 100644 (file)
index 0000000..84568ac
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .brightcove import BrightcoveNewIE
+from ..compat import compat_str
+from ..utils import (
+    try_get,
+    update_url_query,
+)
+
+
+class SevenPlusIE(BrightcoveNewIE):
+    IE_NAME = '7plus'
+    _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))'
+    _TESTS = [{
+        'url': 'https://7plus.com.au/MTYS?episode-id=MTYS7-003',
+        'info_dict': {
+            'id': 'MTYS7-003',
+            'ext': 'mp4',
+            'title': 'S7 E3 - Wind Surf',
+            'description': 'md5:29c6a69f21accda7601278f81b46483d',
+            'uploader_id': '5303576322001',
+            'upload_date': '20171201',
+            'timestamp': 1512106377,
+            'series': 'Mighty Ships',
+            'season_number': 7,
+            'episode_number': 3,
+            'episode': 'Wind Surf',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://7plus.com.au/UUUU?episode-id=AUMS43-001',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        path, episode_id = re.match(self._VALID_URL, url).groups()
+
+        media = self._download_json(
+            'https://videoservice.swm.digital/playback', episode_id, query={
+                'appId': '7plus',
+                'deviceType': 'web',
+                'platformType': 'web',
+                'accountId': 5303576322001,
+                'referenceId': 'ref:' + episode_id,
+                'deliveryId': 'csai',
+                'videoType': 'vod',
+            })['media']
+
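+        # Blank out the 'rule' query parameter on each source URL; it appears
+        # to be tied to the 'csai' ad-insertion delivery requested above.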
+        for source in media.get('sources', []):
+            src = source.get('src')
+            if not src:
+                continue
+            source['src'] = update_url_query(src, {'rule': ''})
+
+        info = self._parse_brightcove_metadata(media, episode_id)
+
+        content = self._download_json(
+            'https://component-cdn.swm.digital/content/' + path,
+            episode_id, headers={
+                'market-id': 4,
+            }, fatal=False) or {}
+        for item in content.get('items', []):
+            if item.get('componentData', {}).get('componentType') == 'infoPanel':
+                for src_key, dst_key in [('title', 'title'), ('shortSynopsis', 'description')]:
+                    value = item.get(src_key)
+                    if value:
+                        info[dst_key] = value
+                info['series'] = try_get(
+                    item, lambda x: x['seriesLogo']['name'], compat_str)
+                mobj = re.search(r'^S(\d+)\s+E(\d+)\s+-\s+(.+)$', info['title'])
+                if mobj:
+                    info.update({
+                        'season_number': int(mobj.group(1)),
+                        'episode_number': int(mobj.group(2)),
+                        'episode': mobj.group(3),
+                    })
+
+        return info
diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py
new file mode 100644 (file)
index 0000000..3df5152
--- /dev/null
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SexuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?sexu\.com/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://sexu.com/961791/',
+        'md5': 'ff615aca9691053c94f8f10d96cd7884',
+        'info_dict': {
+            'id': '961791',
+            'ext': 'mp4',
+            'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
+            'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',
+            'categories': list,  # NSFW
+            'thumbnail': r're:https?://.*\.jpg$',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
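+        # The page embeds a JWPlayer setup object whose sources carry the
+        # actual file URLs (with backslash-escaped slashes).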
+        jwvideo = self._parse_json(
+            self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'),
+            video_id)
+
+        sources = jwvideo['sources']
+
+        formats = [{
+            'url': source['file'].replace('\\', ''),
+            'format_id': source.get('label'),
+            'height': int(self._search_regex(
+                r'^(\d+)[pP]', source.get('label', ''), 'height',
+                default=None)),
+        } for source in sources if source.get('file')]
+        self._sort_formats(formats)
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)\s*-\s*Sexu\.Com</title>', webpage, 'title')
+
+        description = self._html_search_meta(
+            'description', webpage, 'description')
+
+        thumbnail = jwvideo.get('image')
+
+        categories_str = self._html_search_meta(
+            'keywords', webpage, 'categories')
+        categories = (
+            None if categories_str is None
+            else categories_str.split(','))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'formats': formats,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py
new file mode 100644 (file)
index 0000000..7a1c7e3
--- /dev/null
@@ -0,0 +1,169 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    urljoin,
+    int_or_none,
+    parse_codecs,
+    try_get,
+)
+
+
+def _raw_id(src_url):
+    return compat_urllib_parse_urlparse(src_url).path.split('/')[-1]
+
+
+class SeznamZpravyIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc='
+    _TESTS = [{
+        'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy',
+        'info_dict': {
+            'id': '170889',
+            'ext': 'mp4',
+            'title': 'Svět bez obalu: Čeští vojáci na misích (krátká verze)',
+            'thumbnail': r're:^https?://.*\.jpe?g',
+            'duration': 241,
+            'series': 'Svět bez obalu',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # with Location key
+        'url': 'https://www.seznamzpravy.cz/iframe/player?duration=null&serviceSlug=zpravy&src=https%3A%2F%2Flive-a.sdn.szn.cz%2Fv_39%2F59e468fe454f8472a96af9fa%3Ffl%3Dmdk%2C5c1e2840%7C&itemType=livevod&autoPlay=false&title=P%C5%99edseda%20KDU-%C4%8CSL%20Pavel%20B%C4%9Blobr%C3%A1dek%20ve%20volebn%C3%AD%20V%C3%BDzv%C4%9B%20Seznamu&series=V%C3%BDzva&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_G_J%2FjTBCs.jpeg%3Ffl%3Dcro%2C0%2C0%2C1280%2C720%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=16&height=9&cutFrom=0&cutTo=0&splVersion=VOD&contentId=185688&contextId=38489&showAdvert=true&collocation=&hideFullScreen=false&hideSubtitles=false&embed=&isVideoTooShortForPreroll=false&isVideoTooShortForPreroll2=false&isVideoTooLongForPostroll=false&fakePostrollZoneID=seznam.clanky.zpravy.preroll&fakePrerollZoneID=seznam.clanky.zpravy.preroll&videoCommentId=&trim=default_16x9&noPrerollVideoLength=30&noPreroll2VideoLength=undefined&noMidrollVideoLength=0&noPostrollVideoLength=999999&autoplayPossible=true&version=5.0.41&dotService=zpravy&gemiusPrismIdentifier=zD3g7byfW5ekpXmxTVLaq5Srjw5i4hsYo0HY1aBwIe..27&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy%2Fvyzva&zoneIdPostroll=seznam.pack.videospot&skipOffsetPostroll=5&sectionPrefixPostroll=%2Fzpravy%2Fvyzva&regression=false',
+        'info_dict': {
+            'id': '185688',
+            'ext': 'mp4',
+            'title': 'Předseda KDU-ČSL Pavel Bělobrádek ve volební Výzvě Seznamu',
+            'thumbnail': r're:^https?://.*\.jpe?g',
+            'series': 'Výzva',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url') for mobj in re.finditer(
+                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1',
+                webpage)]
+
+    def _extract_sdn_formats(self, sdn_url, video_id):
+        sdn_data = self._download_json(sdn_url, video_id)
+
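+        # The SDN endpoint may answer with a pointer object whose 'Location'
+        # key holds the real stream description; follow it once.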
+        if sdn_data.get('Location'):
+            sdn_url = sdn_data['Location']
+            sdn_data = self._download_json(sdn_url, video_id)
+
+        formats = []
+        mp4_formats = try_get(sdn_data, lambda x: x['data']['mp4'], dict) or {}
+        for format_id, format_data in mp4_formats.items():
+            relative_url = format_data.get('url')
+            if not relative_url:
+                continue
+
+            try:
+                width, height = format_data.get('resolution')
+            except (TypeError, ValueError):
+                width, height = None, None
+
+            f = {
+                'url': urljoin(sdn_url, relative_url),
+                'format_id': 'http-%s' % format_id,
+                'tbr': int_or_none(format_data.get('bandwidth'), scale=1000),
+                'width': int_or_none(width),
+                'height': int_or_none(height),
+            }
+            f.update(parse_codecs(format_data.get('codec')))
+            formats.append(f)
+
+        pls = sdn_data.get('pls', {})
+
+        def get_url(format_id):
+            return try_get(pls, lambda x: x[format_id]['url'], compat_str)
+
+        dash_rel_url = get_url('dash')
+        if dash_rel_url:
+            formats.extend(self._extract_mpd_formats(
+                urljoin(sdn_url, dash_rel_url), video_id, mpd_id='dash',
+                fatal=False))
+
+        hls_rel_url = get_url('hls')
+        if hls_rel_url:
+            formats.extend(self._extract_m3u8_formats(
+                urljoin(sdn_url, hls_rel_url), video_id, ext='mp4',
+                m3u8_id='hls', fatal=False))
+
+        self._sort_formats(formats)
+        return formats
+
+    def _real_extract(self, url):
+        params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+
+        src = params['src'][0]
+        title = params['title'][0]
+        video_id = params.get('contentId', [_raw_id(src)])[0]
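+        # 'src' ends with an SDN URL prefix; appending the 'spl2,2,VOD'
+        # selector yields the JSON stream description.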
+        formats = self._extract_sdn_formats(src + 'spl2,2,VOD', video_id)
+
+        duration = int_or_none(params.get('duration', [None])[0])
+        series = params.get('series', [None])[0]
+        thumbnail = params.get('poster', [None])[0]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'series': series,
+            'formats': formats,
+        }
+
+
+class SeznamZpravyArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:seznam\.cz/zpravy|seznamzpravy\.cz)/clanek/(?:[^/?#&]+)-(?P<id>\d+)'
+    _API_URL = 'https://apizpravy.seznam.cz/'
+
+    _TESTS = [{
+        # two videos on one page, with SDN URL
+        'url': 'https://www.seznamzpravy.cz/clanek/jejich-svet-na-nas-utoci-je-lepsi-branit-se-na-jejich-pisecku-rika-reziser-a-major-v-zaloze-marhoul-35990',
+        'info_dict': {
+            'id': '35990',
+            'title': 'md5:6011c877a36905f28f271fcd8dcdb0f2',
+            'description': 'md5:933f7b06fa337a814ba199d3596d27ba',
+        },
+        'playlist_count': 2,
+    }, {
+        # video with live stream URL
+        'url': 'https://www.seznam.cz/zpravy/clanek/znovu-do-vlady-s-ano-pavel-belobradek-ve-volebnim-specialu-seznamu-38489',
+        'info_dict': {
+            'id': '38489',
+            'title': 'md5:8fa1afdc36fd378cf0eba2b74c5aca60',
+            'description': 'md5:428e7926a1a81986ec7eb23078004fb4',
+        },
+        'playlist_count': 1,
+    }]
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, article_id)
+
+        info = self._search_json_ld(webpage, article_id, default={})
+
+        title = info.get('title') or self._og_search_title(webpage, fatal=False)
+        description = info.get('description') or self._og_search_description(webpage)
+
+        return self.playlist_result([
+            self.url_result(entry_url, ie=SeznamZpravyIE.ie_key())
+            for entry_url in SeznamZpravyIE._extract_urls(webpage)],
+            article_id, title, description)
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
new file mode 100644 (file)
index 0000000..5c2a620
--- /dev/null
@@ -0,0 +1,215 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import math
+import re
+
+from .aws import AWSIE
+from ..compat import compat_HTTPError
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    InAdvancePagedList,
+    int_or_none,
+    parse_iso8601,
+    str_or_none,
+    urlencode_postdata,
+)
+
+
+class ShahidBaseIE(AWSIE):
+    _AWS_PROXY_HOST = 'api2.shahid.net'
+    _AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh'
+
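+    # API errors carry a JSON body with a 'faults' list; surface the
+    # human-readable messages instead of a bare HTTP error.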
+    def _handle_error(self, e):
+        fail_data = self._parse_json(
+            e.cause.read().decode('utf-8'), None, fatal=False)
+        if fail_data:
+            faults = fail_data.get('faults', [])
+            faults_message = ', '.join([clean_html(fault['userMessage']) for fault in faults if fault.get('userMessage')])
+            if faults_message:
+                raise ExtractorError(faults_message, expected=True)
+
+    def _call_api(self, path, video_id, request=None):
+        query = {}
+        if request:
+            query['request'] = json.dumps(request)
+        try:
+            return self._aws_execute_api({
+                'uri': '/proxy/v2/' + path,
+                'access_key': 'AKIAI6X4TYCIXM2B7MUQ',
+                'secret_key': '4WUUJWuFvtTkXbhaWTDv7MhO+0LqoYDWfEnUXoWn',
+            }, video_id, query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                self._handle_error(e)
+            raise
+
+
+class ShahidIE(ShahidBaseIE):
+    _NETRC_MACHINE = 'shahid'
+    _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286',
+        'info_dict': {
+            'id': '275286',
+            'ext': 'mp4',
+            'title': 'مجلس الشباب الموسم 1 كليب 1',
+            'timestamp': 1506988800,
+            'upload_date': '20171003',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://shahid.mbc.net/ar/movies/%D8%A7%D9%84%D9%82%D9%86%D8%A7%D8%B5%D8%A9/movie-151746',
+        'only_matching': True
+    }, {
+        # Shahid Plus subscriber-only content
+        'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511',
+        'only_matching': True
+    }]
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        try:
+            user_data = self._download_json(
+                'https://shahid.mbc.net/wd/service/users/login',
+                None, 'Logging in', data=json.dumps({
+                    'email': email,
+                    'password': password,
+                    'basic': 'false',
+                }).encode('utf-8'), headers={
+                    'Content-Type': 'application/json; charset=UTF-8',
+                })['user']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                self._handle_error(e)
+            raise
+
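+        # The site appears to require the session context to be populated
+        # after login before playout succeeds.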
+        self._download_webpage(
+            'https://shahid.mbc.net/populateContext',
+            None, 'Populate Context', data=urlencode_postdata({
+                'firstName': user_data['firstName'],
+                'lastName': user_data['lastName'],
+                'userName': user_data['email'],
+                'csg_user_name': user_data['email'],
+                'subscriberId': user_data['id'],
+                'sessionId': user_data['sessionId'],
+            }))
+
+    def _real_extract(self, url):
+        page_type, video_id = re.match(self._VALID_URL, url).groups()
+        if page_type == 'clip':
+            page_type = 'episode'
+
+        playout = self._call_api(
+            'playout/url/' + video_id, video_id)['playout']
+
+        if playout.get('drm'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4')
+        self._sort_formats(formats)
+
+        # video = self._call_api(
+        #     'product/id', video_id, {
+        #         'id': video_id,
+        #         'productType': 'ASSET',
+        #         'productSubType': page_type.upper()
+        #     })['productModel']
+
+        response = self._download_json(
+            'http://api.shahid.net/api/v1_1/%s/%s' % (page_type, video_id),
+            video_id, 'Downloading video JSON', query={
+                'apiKey': 'sh@hid0nlin3',
+                'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+            })
+        data = response.get('data', {})
+        error = data.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
+                expected=True)
+
+        video = data[page_type]
+        title = video['title']
+        categories = [
+            category['name']
+            for category in video.get('genres', []) if 'name' in category]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('thumbnailUrl'),
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('referenceDate')),
+            'categories': categories,
+            'series': video.get('showTitle') or video.get('showName'),
+            'season': video.get('seasonTitle'),
+            'season_number': int_or_none(video.get('seasonNumber')),
+            'season_id': str_or_none(video.get('seasonId')),
+            'episode_number': int_or_none(video.get('number')),
+            'episode_id': video_id,
+            'formats': formats,
+        }
+
+
+class ShahidShowIE(ShahidBaseIE):
+    _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187',
+        'info_dict': {
+            'id': '79187',
+            'title': 'رامز قرش البحر',
+            'description': 'md5:c88fa7e0f02b0abd39d417aee0d046ff',
+        },
+        'playlist_mincount': 32,
+    }, {
+        'url': 'https://shahid.mbc.net/ar/series/How-to-live-Longer-(The-Big-Think)/series-291861',
+        'only_matching': True
+    }]
+    _PAGE_SIZE = 30
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+
+        product = self._call_api(
+            'playableAsset', show_id, {'showId': show_id})['productModel']
+        playlist = product['playlist']
+        playlist_id = playlist['id']
+        show = product.get('show', {})
+
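+        # Episodes are fetched page by page, newest first (SORTDATE DESC),
+        # in pages of _PAGE_SIZE entries.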
+        def page_func(page_num):
+            playlist = self._call_api(
+                'product/playlist', show_id, {
+                    'playListId': playlist_id,
+                    'pageNumber': page_num,
+                    'pageSize': self._PAGE_SIZE,
+                    'sorts': [{
+                        'order': 'DESC',
+                        'type': 'SORTDATE'
+                    }],
+                })
+            for product in playlist.get('productList', {}).get('products', []):
+                product_url = product.get('productUrl', {}).get('url')
+                if not product_url:
+                    continue
+                yield self.url_result(
+                    product_url, 'Shahid',
+                    str_or_none(product.get('id')),
+                    product.get('title'))
+
+        entries = InAdvancePagedList(
+            page_func,
+            int(math.ceil(playlist['count'] / float(self._PAGE_SIZE))),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, show_id, show.get('title'), show.get('description'))
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
new file mode 100644 (file)
index 0000000..02295d1
--- /dev/null
@@ -0,0 +1,138 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_urllib_parse_unquote_plus,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    KNOWN_EXTENSIONS,
+    parse_filesize,
+    rot47,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class SharedBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage, urlh = self._download_webpage_handle(url, video_id)
+
+        if self._FILE_NOT_FOUND in webpage:
+            raise ExtractorError(
+                'Video %s does not exist' % video_id, expected=True)
+
+        video_url = self._extract_video_url(webpage, video_id, url)
+
+        title = self._extract_title(webpage)
+        filesize = int_or_none(self._extract_filesize(webpage))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'filesize': filesize,
+            'title': title,
+        }
+
+    def _extract_title(self, webpage):
+        return compat_b64decode(self._html_search_meta(
+            'full:title', webpage, 'title')).decode('utf-8')
+
+    def _extract_filesize(self, webpage):
+        return self._html_search_meta(
+            'full:size', webpage, 'file size', fatal=False)
+
+
+class SharedIE(SharedBaseIE):
+    IE_DESC = 'shared.sx'
+    _VALID_URL = r'https?://shared\.sx/(?P<id>[\da-z]{10})'
+    _FILE_NOT_FOUND = '>File does not exist<'
+
+    _TEST = {
+        'url': 'http://shared.sx/0060718775',
+        'md5': '106fefed92a8a2adb8c98e6a0652f49b',
+        'info_dict': {
+            'id': '0060718775',
+            'ext': 'mp4',
+            'title': 'Bmp4',
+            'filesize': 1720110,
+        },
+    }
+
+    def _extract_video_url(self, webpage, video_id, url):
+        download_form = self._hidden_inputs(webpage)
+
+        video_page = self._download_webpage(
+            url, video_id, 'Downloading video page',
+            data=urlencode_postdata(download_form),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': url,
+            })
+
+        video_url = self._html_search_regex(
+            r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            video_page, 'video URL', group='url')
+
+        return video_url
+
+
+class VivoIE(SharedBaseIE):
+    IE_DESC = 'vivo.sx'
+    _VALID_URL = r'https?://vivo\.sx/(?P<id>[\da-z]{10})'
+    _FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed'
+
+    _TEST = {
+        'url': 'http://vivo.sx/d7ddda0e78',
+        'md5': '15b3af41be0b4fe01f4df075c2678b2c',
+        'info_dict': {
+            'id': 'd7ddda0e78',
+            'ext': 'mp4',
+            'title': 'Chicken',
+            'filesize': 515659,
+        },
+    }
+
+    def _extract_title(self, webpage):
+        title = self._html_search_regex(
+            r'data-name\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', webpage,
+            'title', default=None, group='title')
+        if title:
+            ext = determine_ext(title)
+            if ext.lower() in KNOWN_EXTENSIONS:
+                title = title.rpartition('.' + ext)[0]
+            return title
+        return self._og_search_title(webpage)
+
+    def _extract_filesize(self, webpage):
+        return parse_filesize(self._search_regex(
+            r'data-type=["\']video["\'][^>]*>Watch.*?<strong>\s*\((.+?)\)',
+            webpage, 'filesize', fatal=False))
+
+    def _extract_video_url(self, webpage, video_id, url):
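+        # Older pages carry a base64-encoded URL in data-stream; newer ones
+        # pass a rot47 + URL-encoded source to InitializeStream().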
+        def decode_url_old(encoded_url):
+            return compat_b64decode(encoded_url).decode('utf-8')
+
+        stream_url = self._search_regex(
+            r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'stream url', default=None, group='url')
+        if stream_url:
+            stream_url = url_or_none(decode_url_old(stream_url))
+        if stream_url:
+            return stream_url
+
+        def decode_url(encoded_url):
+            return rot47(compat_urllib_parse_unquote_plus(encoded_url))
+
+        return decode_url(self._parse_json(
+            self._search_regex(
+                r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage,
+                'stream'),
+            video_id, transform_source=js_to_json)['source'])
diff --git a/youtube_dl/extractor/showroomlive.py b/youtube_dl/extractor/showroomlive.py
new file mode 100644 (file)
index 0000000..efd9d56
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    urljoin,
+)
+
+
+class ShowRoomLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?showroom-live\.com/(?!onlive|timetable|event|campaign|news|ranking|room)(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.showroom-live.com/48_Nana_Okada',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        broadcaster_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, broadcaster_id)
+
+        room_id = self._search_regex(
+            (r'SrGlobal\.roomId\s*=\s*(\d+)',
+             r'(?:profile|room)\?room_id\=(\d+)'), webpage, 'room_id')
+
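+        # The numeric room_id embedded in the page keys both JSON APIs
+        # (room profile and streaming_url).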
+        room = self._download_json(
+            urljoin(url, '/api/room/profile?room_id=%s' % room_id),
+            broadcaster_id)
+
+        is_live = room.get('is_onlive')
+        if is_live is not True:
+            raise ExtractorError('%s is offline' % broadcaster_id, expected=True)
+
+        uploader = room.get('performer_name') or broadcaster_id
+        title = room.get('room_name') or room.get('main_name') or uploader
+
+        streaming_url_list = self._download_json(
+            urljoin(url, '/api/live/streaming_url?room_id=%s' % room_id),
+            broadcaster_id)['streaming_url_list']
+
+        formats = []
+        for stream in streaming_url_list:
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            stream_type = stream.get('type')
+            if stream_type == 'hls':
+                m3u8_formats = self._extract_m3u8_formats(
+                    stream_url, broadcaster_id, ext='mp4', m3u8_id='hls',
+                    live=True)
+                for f in m3u8_formats:
+                    f['quality'] = int_or_none(stream.get('quality', 100))
+                formats.extend(m3u8_formats)
+            elif stream_type == 'rtmp':
+                stream_name = stream.get('stream_name')
+                if not stream_name:
+                    continue
+                formats.append({
+                    'url': stream_url,
+                    'play_path': stream_name,
+                    'page_url': url,
+                    'player_url': 'https://www.showroom-live.com/assets/swf/v3/ShowRoomLive.swf',
+                    'rtmp_live': True,
+                    'ext': 'flv',
+                    'format_id': 'rtmp',
+                    'format_note': stream.get('label'),
+                    'quality': int_or_none(stream.get('quality', 100)),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': compat_str(room.get('live_id') or broadcaster_id),
+            'title': self._live_title(title),
+            'description': room.get('description'),
+            'timestamp': int_or_none(room.get('current_live_started_at')),
+            'uploader': uploader,
+            'uploader_id': broadcaster_id,
+            'view_count': int_or_none(room.get('view_num')),
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
new file mode 100644 (file)
index 0000000..07b766b
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    HEADRequest,
+    ExtractorError,
+    int_or_none,
+    update_url_query,
+    qualities,
+    get_element_by_attribute,
+    clean_html,
+)
+
+
+class SinaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
+                        (?:
+                            (?:view/|.*\#)(?P<video_id>\d+)|
+                            .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
+                            # This is used by external sites like Weibo
+                            api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
+                        )
+                  '''
+
+    _TESTS = [
+        {
+            'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
+            'md5': 'd38433e2fc886007729735650ae4b3e9',
+            'info_dict': {
+                'id': '250576622',
+                'ext': 'mp4',
+                'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
+            }
+        },
+        {
+            'url': 'http://video.sina.com.cn/v/b/101314253-1290078633.html',
+            'info_dict': {
+                'id': '101314253',
+                'ext': 'flv',
+                'title': '军方提高对朝情报监视级别',
+            },
+            'skip': 'the page does not exist or has been deleted',
+        },
+        {
+            'url': 'http://video.sina.com.cn/view/250587748.html',
+            'md5': '3d1807a25c775092aab3bc157fff49b4',
+            'info_dict': {
+                'id': '250587748',
+                'ext': 'mp4',
+                'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('video_id')
+        if not video_id:
+            if mobj.group('token') is not None:
+                # The video id is in the redirected url
+                self.to_screen('Getting video id')
+                request = HEADRequest(url)
+                _, urlh = self._download_webpage_handle(request, 'NA', False)
+                return self._real_extract(urlh.geturl())
+            else:
+                pseudo_id = mobj.group('pseudo_id')
+                webpage = self._download_webpage(url, pseudo_id)
+                error = get_element_by_attribute('class', 'errtitle', webpage)
+                if error:
+                    raise ExtractorError('%s said: %s' % (
+                        self.IE_NAME, clean_html(error)), expected=True)
+                video_id = self._search_regex(
+                    r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
+
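+        # The h5play endpoint returns metadata plus per-quality file_api
+        # URLs keyed by quality id (cif/sd/hd/fhd/ffd).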
+        video_data = self._download_json(
+            'http://s.video.sina.com.cn/video/h5play',
+            video_id, query={'video_id': video_id})
+        if video_data['code'] != 1:
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, video_data['message']), expected=True)
+
+        video_data = video_data['data']
+        title = video_data['title']
+        description = video_data.get('description')
+        if description:
+            description = description.strip()
+
+        preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
+        formats = []
+        for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
+            file_api = quality.get('file_api')
+            file_id = quality.get('file_id')
+            if not file_api or not file_id:
+                continue
+            formats.append({
+                'format_id': quality_id,
+                'url': update_url_query(file_api, {'vid': file_id}),
+                'preference': preference(quality_id),
+                'ext': 'mp4',
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': video_data.get('image'),
+            'duration': int_or_none(video_data.get('length')),
+            'timestamp': int_or_none(video_data.get('create_time')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py
new file mode 100644 (file)
index 0000000..7ec66ec
--- /dev/null
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    try_get,
+    qualities,
+)
+
+
+class SixPlayIE(InfoExtractor):
+    IE_NAME = '6play'
+    _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr|rtlmost\.hu)/.+?-c_)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051',
+        'md5': '31fcd112637baa0c2ab92c4fcd8baf27',
+        'info_dict': {
+            'id': '12041051',
+            'ext': 'mp4',
+            'title': 'Le but qui a marqué l\'histoire du football français !',
+            'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851',
+        },
+    }, {
+        'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869',
+        'only_matching': True,
+    }, {
+        'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.rtlmost.hu/megtorve-p_14167/megtorve-6-resz-c_12397787',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        domain, video_id = re.search(self._VALID_URL, url).groups()
+        service, consumer_name = {
+            '6play.fr': ('6play', 'm6web'),
+            'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'),
+            'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'),
+            'rtlmost.hu': ('rtlhu_rtl_most', 'rtlhu'),
+        }.get(domain, ('6play', 'm6web'))
+
+        data = self._download_json(
+            'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id),
+            video_id, headers={
+                'x-customer-name': consumer_name
+            }, query={
+                'csa': 5,
+                'with': 'clips',
+            })
+
+        clip_data = data['clips'][0]
+        title = clip_data['title']
+
+        urls = []
+        quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
+        formats = []
+        subtitles = {}
+        assets = clip_data.get('assets') or []
+        for asset in assets:
+            asset_url = asset.get('full_physical_path')
+            protocol = asset.get('protocol')
+            if (not asset_url
+                    or ((protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264')
+                        and not ('_drmnp.ism/' in asset_url or '_unpnp.ism/' in asset_url))
+                    or asset_url in urls):
+                continue
+            urls.append(asset_url)
+            container = asset.get('video_container')
+            ext = determine_ext(asset_url)
+            if protocol == 'http_subtitle' or ext == 'vtt':
+                subtitles.setdefault('fr', []).append({'url': asset_url})
+                continue
+            if container == 'm3u8' or ext == 'm3u8':
+                if protocol == 'usp':
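+                    # Tokenised USP URLs redirect to the final manifest;
+                    # resolve the redirect before rewriting the path below.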
+                    if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+                        urlh = self._request_webpage(
+                            asset_url, video_id, fatal=False,
+                            headers=self.geo_verification_headers())
+                        if not urlh:
+                            continue
+                        asset_url = urlh.geturl()
+                    asset_url = asset_url.replace('_drmnp.ism/', '_unpnp.ism/')
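+                    # Probe the _sd3/_sd2/_sd1 manifest variants in turn and
+                    # stop at the first one that yields HLS formats.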
+                    for i in range(3, 0, -1):
+                        asset_url = asset_url.replace('_sd1/', '_sd%d/' % i)
+                        m3u8_formats = self._extract_m3u8_formats(
+                            asset_url, video_id, 'mp4', 'm3u8_native',
+                            m3u8_id='hls', fatal=False)
+                        formats.extend(m3u8_formats)
+                        formats.extend(self._extract_mpd_formats(
+                            asset_url.replace('.m3u8', '.mpd'),
+                            video_id, mpd_id='dash', fatal=False))
+                        if m3u8_formats:
+                            break
+                else:
+                    formats.extend(self._extract_m3u8_formats(
+                        asset_url, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+            elif container == 'mp4' or ext == 'mp4':
+                quality = asset.get('video_quality')
+                formats.append({
+                    'url': asset_url,
+                    'format_id': quality,
+                    'quality': quality_key(quality),
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        def get(getter):
+            for src in (data, clip_data):
+                v = try_get(src, getter, compat_str)
+                if v:
+                    return v
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': get(lambda x: x['description']),
+            'duration': int_or_none(clip_data.get('duration')),
+            'series': get(lambda x: x['program']['title']),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/sky.py b/youtube_dl/extractor/sky.py
new file mode 100644 (file)
index 0000000..ea30d6e
--- /dev/null
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    smuggle_url,
+    strip_or_none,
+    urljoin,
+)
+
+
+class SkyBaseIE(InfoExtractor):
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = extract_attributes(self._search_regex(
+            r'(<div.+?class="[^"]*sdc-article-video__media-ooyala[^"]*"[^>]+>)',
+            webpage, 'video data'))
+
+        video_url = 'ooyala:%s' % video_data['data-video-id']
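+        # Token-protected videos need an Ooyala embed token, fetched from the
+        # URL in data-token-fetch-options and smuggled to the Ooyala extractor.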
+        if video_data.get('data-token-required') == 'true':
+            token_fetch_options = self._parse_json(video_data.get(
+                'data-token-fetch-options', '{}'), video_id, fatal=False) or {}
+            token_fetch_url = token_fetch_options.get('url')
+            if token_fetch_url:
+                embed_token = self._download_webpage(urljoin(
+                    url, token_fetch_url), video_id, fatal=False)
+                if embed_token:
+                    video_url = smuggle_url(
+                        video_url, {'embed_token': embed_token.strip('"')})
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': strip_or_none(self._og_search_description(webpage)),
+            'ie_key': 'Ooyala',
+        }
+
+
+class SkySportsIE(SkyBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
+        'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec',
+        'info_dict': {
+            'id': 'o3eWJnNDE6l7kfNO8BOoBlRxXRQ4ANNQ',
+            'ext': 'mp4',
+            'title': 'Bale: It\'s our time to shine',
+            'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d',
+        },
+        'add_ie': ['Ooyala'],
+    }
+
+
+class SkyNewsIE(SkyBaseIE):
+    _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962',
+        'md5': 'd6327e581473cea9976a3236ded370cd',
+        'info_dict': {
+            'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM',
+            'ext': 'mp4',
+            'title': 'Russian plane inspected after deadly fire',
+            'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.',
+        },
+        'add_ie': ['Ooyala'],
+    }
diff --git a/youtube_dl/extractor/skylinewebcams.py b/youtube_dl/extractor/skylinewebcams.py
new file mode 100644 (file)
index 0000000..b7f8ac7
--- /dev/null
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SkylineWebcamsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?skylinewebcams\.com/[^/]+/webcam/(?:[^/]+/)+(?P<id>[^/]+)\.html'
+    _TEST = {
+        'url': 'https://www.skylinewebcams.com/it/webcam/italia/lazio/roma/scalinata-piazza-di-spagna-barcaccia.html',
+        'info_dict': {
+            'id': 'scalinata-piazza-di-spagna-barcaccia',
+            'ext': 'mp4',
+            'title': 're:^Live Webcam Scalinata di Piazza di Spagna - La Barcaccia [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'Roma, veduta sulla Scalinata di Piazza di Spagna e sulla Barcaccia',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
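+        # The live HLS (m3u8) stream URL is inlined in the page's player
+        # config.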
+        stream_url = self._search_regex(
+            r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage,
+            'stream url', group='url')
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+
+        return {
+            'id': video_id,
+            'url': stream_url,
+            'ext': 'mp4',
+            'title': self._live_title(title),
+            'description': description,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py
new file mode 100644 (file)
index 0000000..fffc9aa
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    parse_iso8601,
+    parse_duration,
+)
+
+
+class SkyNewsArabiaBaseIE(InfoExtractor):
+    _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images'
+
+    def _call_api(self, path, value):
+        return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value)
+
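+    # Video URLs point at Limelight media; the 32-character token in the
+    # path is the Limelight media id.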
+    def _get_limelight_media_id(self, url):
+        return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id')
+
+    def _get_image_url(self, image_path_template, width='1600', height='1200'):
+        return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height)
+
+    def _extract_video_info(self, video_data):
+        video_id = compat_str(video_data['id'])
+        topic = video_data.get('topicTitle')
+        return {
+            '_type': 'url_transparent',
+            'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']),
+            'id': video_id,
+            'title': video_data['headline'],
+            'description': video_data.get('summary'),
+            'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']),
+            'timestamp': parse_iso8601(video_data.get('date')),
+            'duration': parse_duration(video_data.get('runTime')),
+            'tags': video_data.get('tags', []),
+            'categories': [topic] if topic else [],
+            'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id,
+            'ie_key': 'LimelightMedia',
+        }
+
+
+class SkyNewsArabiaIE(SkyNewsArabiaBaseIE):
+    IE_NAME = 'skynewsarabia:video'
+    _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3',
+        'info_dict': {
+            'id': '794902',
+            'ext': 'flv',
+            'title': 'نصف مليون مصباح على شجرة كريسماس',
+            'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6',
+            'upload_date': '20151128',
+            'timestamp': 1448697198,
+            'duration': 2119,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._call_api('video', video_id)
+        return self._extract_video_info(video_data)
+
+
+class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE):
+    IE_NAME = 'skynewsarabia:article'
+    _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9',
+        'info_dict': {
+            'id': '794549',
+            'ext': 'flv',
+            'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة',
+            'description': 'md5:0c373d29919a851e080ee4edd0c5d97f',
+            'upload_date': '20151126',
+            'timestamp': 1448559336,
+            'duration': 281.6,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD',
+        'info_dict': {
+            'id': '794844',
+            'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن',
+            'description': 'md5:5c927b8b2e805796e7f693538d96fc7e',
+        },
+        'playlist_mincount': 2,
+    }]
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        article_data = self._call_api('article', article_id)
+        media_asset = article_data['mediaAsset']
+        if media_asset['type'] == 'VIDEO':
+            topic = article_data.get('topicTitle')
+            return {
+                '_type': 'url_transparent',
+                'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']),
+                'id': article_id,
+                'title': article_data['headline'],
+                'description': article_data.get('summary'),
+                'thumbnail': self._get_image_url(media_asset['imageUrl']),
+                'timestamp': parse_iso8601(article_data.get('date')),
+                'tags': article_data.get('tags', []),
+                'categories': [topic] if topic else [],
+                'webpage_url': url,
+                'ie_key': 'LimelightMedia',
+            }
+        entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO']
+        return self.playlist_result(entries, article_id, article_data['headline'], article_data.get('summary'))
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
new file mode 100644 (file)
index 0000000..e89ebeb
--- /dev/null
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    get_element_by_id,
+)
+
+
+class SlideshareIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
+
+    _TEST = {
+        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
+        'info_dict': {
+            'id': '25665706',
+            'ext': 'mp4',
+            'title': 'Managing Scale and Complexity',
+            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_title = mobj.group('title')
+        webpage = self._download_webpage(url, page_title)
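+        # The player settings are passed to $.extend() as a JSON object
+        # literal; grab and parse that blob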
+        slideshare_obj = self._search_regex(
+            r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
+            webpage, 'slideshare object')
+        info = json.loads(slideshare_obj)
+        if info['slideshow']['type'] != 'video':
+            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)
+
+        doc = info['doc']
+        bucket = info['jsplayer']['video_bucket']
+        ext = info['jsplayer']['video_extension']
+        video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
+        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
+            r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
+            'description', fatal=False)
+
+        return {
+            '_type': 'video',
+            'id': info['slideshow']['id'],
+            'title': info['slideshow']['title'],
+            'ext': ext,
+            'url': video_url,
+            'thumbnail': info['slideshow']['pin_image_url'],
+            'description': description.strip() if description else None,
+        }
diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py
new file mode 100644 (file)
index 0000000..d9ea768
--- /dev/null
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class SlidesLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # video_service_name = YOUTUBE
+        'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
+        'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f',
+        'info_dict': {
+            'id': 'LMtgR8ba0b0',
+            'ext': 'mp4',
+            'title': 'GCC IA16 backend',
+            'description': 'Watch full version of this video at https://slideslive.com/38902413.',
+            'uploader': 'SlidesLive Videos - A',
+            'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
+            'upload_date': '20170925',
+        }
+    }, {
+        # video_service_name = youtube
+        'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
+        'only_matching': True,
+    }, {
+        # video_service_name = url
+        'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
+        'only_matching': True,
+    }, {
+        # video_service_name = vimeo
+        'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://ben.slideslive.com/player/' + video_id, video_id)
+        service_name = video_data['video_service_name'].lower()
+        assert service_name in ('url', 'vimeo', 'youtube')
+        service_id = video_data['video_service_id']
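+        # For service 'url' the id is a direct media URL; for YouTube/Vimeo it
+        # is the external video id, so extraction is deferred via url_transparent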
+        info = {
+            'id': video_id,
+            'thumbnail': video_data.get('thumbnail'),
+            'url': service_id,
+        }
+        if service_name == 'url':
+            info['title'] = video_data['title']
+        else:
+            info.update({
+                '_type': 'url_transparent',
+                'ie_key': service_name.capitalize(),
+                'title': video_data.get('title'),
+            })
+            if service_name == 'vimeo':
+                info['url'] = smuggle_url(
+                    'https://player.vimeo.com/video/' + service_id,
+                    {'http_headers': {'Referer': url}})
+        return info
diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py
new file mode 100644 (file)
index 0000000..661f9e5
--- /dev/null
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SlutloadIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
+        'md5': '868309628ba00fd488cf516a113fd717',
+        'info_dict': {
+            'id': 'TD73btpBqSxc',
+            'ext': 'mp4',
+            'title': 'virginie baisee en cam',
+            'age_limit': 18,
+            'thumbnail': r're:https?://.*?\.jpg'
+        },
+    }, {
+        # mobile site
+        'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
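+        # Try the lightweight embed player page first; fall back to the full
+        # video page below if it yields nothing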
+        embed_page = self._download_webpage(
+            'http://www.slutload.com/embed_player/%s' % video_id, video_id,
+            'Downloading embed page', fatal=False)
+
+        if embed_page:
+            def extract(what):
+                return self._html_search_regex(
+                    r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what,
+                    embed_page, 'video %s' % what, default=None, group='url')
+
+            video_url = extract('url')
+            if video_url:
+                title = self._html_search_regex(
+                    r'<title>([^<]+)', embed_page, 'title', default=video_id)
+                return {
+                    'id': video_id,
+                    'url': video_url,
+                    'title': title,
+                    'thumbnail': extract('preview'),
+                    'age_limit': 18
+                }
+
+        webpage = self._download_webpage(
+            'http://www.slutload.com/video/_/%s/' % video_id, video_id)
+        title = self._html_search_regex(
+            r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip()
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        info.update({
+            'id': video_id,
+            'title': title,
+            'age_limit': 18,
+        })
+        return info
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
new file mode 100644 (file)
index 0000000..45995f3
--- /dev/null
@@ -0,0 +1,416 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+import hashlib
+import uuid
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    sanitized_Request,
+    unified_strdate,
+    urlencode_postdata,
+    xpath_text,
+)
+
+
+class SmotriIE(InfoExtractor):
+    IE_DESC = 'Smotri.com'
+    IE_NAME = 'smotri'
+    _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
+    _NETRC_MACHINE = 'smotri'
+
+    _TESTS = [
+        # real video id 2610366
+        {
+            'url': 'http://smotri.com/video/view/?id=v261036632ab',
+            'md5': '02c0dfab2102984e9c5bb585cc7cc321',
+            'info_dict': {
+                'id': 'v261036632ab',
+                'ext': 'mp4',
+                'title': 'катастрофа с камер видеонаблюдения',
+                'uploader': 'rbc2008',
+                'uploader_id': 'rbc08',
+                'upload_date': '20131118',
+                'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
+            },
+        },
+        # real video id 57591
+        {
+            'url': 'http://smotri.com/video/view/?id=v57591cb20',
+            'md5': '830266dfc21f077eac5afd1883091bcd',
+            'info_dict': {
+                'id': 'v57591cb20',
+                'ext': 'flv',
+                'title': 'test',
+                'uploader': 'Support Photofile@photofile',
+                'uploader_id': 'support-photofile',
+                'upload_date': '20070704',
+                'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
+            },
+        },
+        # video-password, not approved by moderator
+        {
+            'url': 'http://smotri.com/video/view/?id=v1390466a13c',
+            'md5': 'f6331cef33cad65a0815ee482a54440b',
+            'info_dict': {
+                'id': 'v1390466a13c',
+                'ext': 'mp4',
+                'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+                'uploader': 'timoxa40',
+                'uploader_id': 'timoxa40',
+                'upload_date': '20100404',
+                'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
+            },
+            'params': {
+                'videopassword': 'qwerty',
+            },
+            'skip': 'Video is not approved by moderator',
+        },
+        # video-password
+        {
+            'url': 'http://smotri.com/video/view/?id=v6984858774#',
+            'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
+            'info_dict': {
+                'id': 'v6984858774',
+                'ext': 'mp4',
+                'title': 'Дача Солженицина ПАРОЛЬ 223322',
+                'uploader': 'psavari1',
+                'uploader_id': 'psavari1',
+                'upload_date': '20081103',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'videopassword': '223322',
+            },
+        },
+        # age limit + video-password, not approved by moderator
+        {
+            'url': 'http://smotri.com/video/view/?id=v15408898bcf',
+            'md5': '91e909c9f0521adf5ee86fbe073aad70',
+            'info_dict': {
+                'id': 'v15408898bcf',
+                'ext': 'flv',
+                'title': 'этот ролик не покажут по ТВ',
+                'uploader': 'zzxxx',
+                'uploader_id': 'ueggb',
+                'upload_date': '20101001',
+                'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
+                'age_limit': 18,
+            },
+            'params': {
+                'videopassword': '333'
+            },
+            'skip': 'Video is not approved by moderator',
+        },
+        # age limit + video-password
+        {
+            'url': 'http://smotri.com/video/view/?id=v7780025814',
+            'md5': 'b4599b068422559374a59300c5337d72',
+            'info_dict': {
+                'id': 'v7780025814',
+                'ext': 'mp4',
+                'title': 'Sexy Beach (пароль 123)',
+                'uploader': 'вАся',
+                'uploader_id': 'asya_prosto',
+                'upload_date': '20081218',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'age_limit': 18,
+            },
+            'params': {
+                'videopassword': '123'
+            },
+        },
+        # swf player
+        {
+            'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
+            'md5': '31099eeb4bc906712c5f40092045108d',
+            'info_dict': {
+                'id': 'v9188090500',
+                'ext': 'mp4',
+                'title': 'Shakira - Don\'t Bother',
+                'uploader': 'HannahL',
+                'uploader_id': 'lisaha95',
+                'upload_date': '20090331',
+                'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
+            },
+        },
+    ]
+
+    @classmethod
+    def _extract_url(cls, webpage):
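+        # Locate a smotri player embedded in a third-party page, either as a
+        # direct <embed> of the swf player or via a video_file/video_id block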
+        mobj = re.search(
+            r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
+            webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
+        mobj = re.search(
+            r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
+                    <div\s+class="video_image">[^<]+</div>\s*
+                    <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
+        if mobj is not None:
+            return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
+
+    def _search_meta(self, name, html, display_name=None):
+        if display_name is None:
+            display_name = name
+        return self._html_search_meta(name, html, display_name)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_form = {
+            'ticket': video_id,
+            'video_url': '1',
+            'frame_url': '1',
+            'devid': 'LoadupFlashPlayer',
+            'getvideoinfo': '1',
+        }
+
+        video_password = self._downloader.params.get('videopassword')
+        if video_password:
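+            # the API expects the MD5 hex digest of the password, not the plain text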
+            video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
+
+        video = self._download_json(
+            'http://smotri.com/video/view/url/bot/',
+            video_id, 'Downloading video JSON',
+            data=urlencode_postdata(video_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
+
+        if not video_url:
+            if video.get('_moderate_no'):
+                raise ExtractorError(
+                    'Video %s has not been approved by moderator' % video_id, expected=True)
+
+            if video.get('error'):
+                raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+            if video.get('_pass_protected') == 1:
+                msg = ('Invalid video password' if video_password
+                       else 'This video is protected by a password, use the --video-password option')
+                raise ExtractorError(msg, expected=True)
+
+        title = video['title']
+        thumbnail = video.get('_imgURL')
+        upload_date = unified_strdate(video.get('added'))
+        uploader = video.get('userNick')
+        uploader_id = video.get('userLogin')
+        duration = int_or_none(video.get('duration'))
+
+        # The video JSON does not provide enough metadata,
+        # so some of it is extracted from the video web page instead
+        webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id
+        webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page')
+
+        # Warning if video is unavailable
+        warning = self._html_search_regex(
+            r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage,
+            'warning message', default=None)
+        if warning is not None:
+            self._downloader.report_warning(
+                'Video %s may not be available; smotri said: %s' %
+                (video_id, warning))
+
+        # Adult content
+        if 'EroConfirmText">' in webpage:
+            self.report_age_confirmation()
+            confirm_string = self._html_search_regex(
+                r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id,
+                webpage, 'confirm string')
+            confirm_url = webpage_url + '&confirm=%s' % confirm_string
+            webpage = self._download_webpage(
+                confirm_url, video_id,
+                'Downloading video page (age confirmed)')
+            adult_content = True
+        else:
+            adult_content = False
+
+        view_count = self._html_search_regex(
+            r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>',
+            webpage, 'view count', fatal=False)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': int_or_none(view_count),
+            'age_limit': 18 if adult_content else 0,
+        }
+
+
+class SmotriCommunityIE(InfoExtractor):
+    IE_DESC = 'Smotri.com community videos'
+    IE_NAME = 'smotri:community'
+    _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)'
+    _TEST = {
+        'url': 'http://smotri.com/community/video/kommuna',
+        'info_dict': {
+            'id': 'kommuna',
+        },
+        'playlist_mincount': 4,
+    }
+
+    def _real_extract(self, url):
+        community_id = self._match_id(url)
+
+        rss = self._download_xml(
+            'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id,
+            community_id, 'Downloading community RSS')
+
+        entries = [
+            self.url_result(video_url.text, SmotriIE.ie_key())
+            for video_url in rss.findall('./channel/item/link')]
+
+        return self.playlist_result(entries, community_id)
+
+
+class SmotriUserIE(InfoExtractor):
+    IE_DESC = 'Smotri.com user videos'
+    IE_NAME = 'smotri:user'
+    _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)'
+    _TESTS = [{
+        'url': 'http://smotri.com/user/inspector',
+        'info_dict': {
+            'id': 'inspector',
+            'title': 'Inspector',
+        },
+        'playlist_mincount': 9,
+    }]
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        rss = self._download_xml(
+            'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id,
+            user_id, 'Downloading user RSS')
+
+        entries = [self.url_result(video_url.text, 'Smotri')
+                   for video_url in rss.findall('./channel/item/link')]
+
+        description_text = xpath_text(rss, './channel/description') or ''
+        user_nickname = self._search_regex(
+            '^Видео режиссера (.+)$', description_text,
+            'user nickname', fatal=False)
+
+        return self.playlist_result(entries, user_id, user_nickname)
+
+
+class SmotriBroadcastIE(InfoExtractor):
+    IE_DESC = 'Smotri.com broadcasts'
+    IE_NAME = 'smotri:broadcast'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*'
+    _NETRC_MACHINE = 'smotri'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        broadcast_id = mobj.group('id')
+
+        broadcast_url = 'http://' + mobj.group('url')
+        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
+
+        if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % re.escape(broadcast_id), broadcast_page) is not None:
+            raise ExtractorError(
+                'Broadcast %s does not exist' % broadcast_id, expected=True)
+
+        # Adult content
+        if re.search('EroConfirmText">', broadcast_page) is not None:
+
+            (username, password) = self._get_login_info()
+            if username is None:
+                self.raise_login_required(
+                    'Erotic broadcasts allowed only for registered users')
+
+            login_form = {
+                'login-hint53': '1',
+                'confirm_erotic': '1',
+                'login': username,
+                'password': password,
+            }
+
+            request = sanitized_Request(
+                broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form))
+            request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+            broadcast_page = self._download_webpage(
+                request, broadcast_id, 'Logging in and confirming age')
+
+            if '>Неверный логин или пароль<' in broadcast_page:
+                raise ExtractorError(
+                    'Unable to log in: bad username or password', expected=True)
+
+            adult_content = True
+        else:
+            adult_content = False
+
+        ticket = self._html_search_regex(
+            (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1',
+             r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"),
+            broadcast_page, 'broadcast ticket', group='ticket')
+
+        broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
+
+        broadcast_password = self._downloader.params.get('videopassword')
+        if broadcast_password:
+            broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
+
+        broadcast_json_page = self._download_webpage(
+            broadcast_url, broadcast_id, 'Downloading broadcast JSON')
+
+        protected_broadcast = False  # referenced in the KeyError handler below
+        try:
+            broadcast_json = json.loads(broadcast_json_page)
+
+            protected_broadcast = broadcast_json['_pass_protected'] == 1
+            if protected_broadcast and not broadcast_password:
+                raise ExtractorError(
+                    'This broadcast is protected by a password, use the --video-password option',
+                    expected=True)
+
+            broadcast_offline = broadcast_json['is_play'] == 0
+            if broadcast_offline:
+                raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)
+
+            rtmp_url = broadcast_json['_server']
+            mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url)
+            if not mobj:
+                raise ExtractorError('Unexpected broadcast rtmp URL')
+
+            broadcast_playpath = broadcast_json['_streamName']
+            broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
+            broadcast_thumbnail = broadcast_json.get('_imgURL')
+            broadcast_title = self._live_title(broadcast_json['title'])
+            broadcast_description = broadcast_json.get('description')
+            broadcaster_nick = broadcast_json.get('nick')
+            broadcaster_login = broadcast_json.get('login')
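+            # random per-session token passed as an extra RTMP connect argument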
+            rtmp_conn = 'S:%s' % uuid.uuid4().hex
+        except KeyError:
+            if protected_broadcast:
+                raise ExtractorError('Bad broadcast password', expected=True)
+            raise ExtractorError('Unexpected broadcast JSON')
+
+        return {
+            'id': broadcast_id,
+            'url': rtmp_url,
+            'title': broadcast_title,
+            'thumbnail': broadcast_thumbnail,
+            'description': broadcast_description,
+            'uploader': broadcaster_nick,
+            'uploader_id': broadcaster_login,
+            'age_limit': 18 if adult_content else 0,
+            'ext': 'flv',
+            'play_path': broadcast_playpath,
+            'player_url': 'http://pics.smotri.com/broadcast_play.swf',
+            'app': broadcast_app,
+            'rtmp_live': True,
+            'rtmp_conn': rtmp_conn,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py
new file mode 100644 (file)
index 0000000..f773547
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_filesize,
+    str_to_int,
+)
+
+
+class SnotrIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/(\w+)'
+    _TESTS = [{
+        'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks',
+        'info_dict': {
+            'id': '13708',
+            'ext': 'mp4',
+            'title': 'Drone flying through fireworks!',
+            'duration': 248,
+            'filesize_approx': 40700000,
+            'description': 'A drone flying through Fourth of July Fireworks',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'expected_warnings': ['description'],
+    }, {
+        'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10',
+        'info_dict': {
+            'id': '530',
+            'ext': 'mp4',
+            'title': 'David Letteman - George W. Bush Top 10',
+            'duration': 126,
+            'filesize_approx': 8500000,
+            'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+
+        description = self._og_search_description(webpage)
+        info_dict = self._parse_html5_media_entries(
+            url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0]
+
+        view_count = str_to_int(self._html_search_regex(
+            r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)',
+            webpage, 'view count', fatal=False))
+
+        duration = parse_duration(self._html_search_regex(
+            r'<p[^>]*>\s*<strong[^>]*>Length:</strong>\s*<span[^>]*>([\d:]+)',
+            webpage, 'duration', fatal=False))
+
+        filesize_approx = parse_filesize(self._html_search_regex(
+            r'<p[^>]*>\s*<strong[^>]*>Filesize:</strong>\s*<span[^>]*>([^<]+)',
+            webpage, 'filesize', fatal=False))
+
+        info_dict.update({
+            'id': video_id,
+            'description': description,
+            'title': title,
+            'view_count': view_count,
+            'duration': duration,
+            'filesize_approx': filesize_approx,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
new file mode 100644 (file)
index 0000000..76b3cc6
--- /dev/null
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+)
+
+
+class SohuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
+
+    # Sohu videos give different MD5 sums on Travis CI and my machine
+    _TESTS = [{
+        'note': 'This video is available only in Mainland China',
+        'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
+        'info_dict': {
+            'id': '382479172',
+            'ext': 'mp4',
+            'title': 'MV:Far East Movement《The Illest》',
+        },
+        'skip': 'Only available in China',
+    }, {
+        'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
+        'info_dict': {
+            'id': '409385080',
+            'ext': 'mp4',
+            'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》',
+        }
+    }, {
+        'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
+        'info_dict': {
+            'id': '78693464',
+            'ext': 'mp4',
+            'title': '【爱范品】第31期:MWC见不到的奇葩手机',
+        }
+    }, {
+        'note': 'Multipart video',
+        'url': 'http://my.tv.sohu.com/pl/8384802/78910339.shtml',
+        'info_dict': {
+            'id': '78910339',
+            'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '78910339_part1',
+                'ext': 'mp4',
+                'duration': 294,
+                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            }
+        }, {
+            'info_dict': {
+                'id': '78910339_part2',
+                'ext': 'mp4',
+                'duration': 300,
+                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            }
+        }, {
+            'info_dict': {
+                'id': '78910339_part3',
+                'ext': 'mp4',
+                'duration': 150,
+                'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
+            }
+        }]
+    }, {
+        'note': 'Video with title containing dash',
+        'url': 'http://my.tv.sohu.com/us/249884221/78932792.shtml',
+        'info_dict': {
+            'id': '78932792',
+            'ext': 'mp4',
+            'title': 'youtube-dlc testing video',
+        },
+        'params': {
+            'skip_download': True
+        }
+    }]
+
+    def _real_extract(self, url):
+
+        def _fetch_data(vid_id, mytv=False):
+            if mytv:
+                base_data_url = 'http://my.tv.sohu.com/play/videonew.do?vid='
+            else:
+                base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
+
+            return self._download_json(
+                base_data_url + vid_id, video_id,
+                'Downloading JSON data for %s' % vid_id,
+                headers=self.geo_verification_headers())
+
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        mytv = mobj.group('mytv') is not None
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage))
+
+        vid = self._html_search_regex(
+            r'var vid ?= ?["\'](\d+)["\']',
+            webpage, 'video path')
+        vid_data = _fetch_data(vid, mytv)
+        if vid_data['play'] != 1:
+            if vid_data.get('status') == 12:
+                raise ExtractorError(
+                    '%s said: There\'s something wrong in the video.' % self.IE_NAME,
+                    expected=True)
+            else:
+                self.raise_geo_restricted(
+                    '%s said: The video is only licensed to users in Mainland China.' % self.IE_NAME)
+
+        formats_json = {}
+        for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'):
+            vid_id = vid_data['data'].get('%sVid' % format_id)
+            if not vid_id:
+                continue
+            vid_id = compat_str(vid_id)
+            formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv)
+
+        part_count = vid_data['data']['totalBlocks']
+
+        playlist = []
+        for i in range(part_count):
+            formats = []
+            for format_id, format_data in formats_json.items():
+                allot = format_data['allot']
+
+                data = format_data['data']
+                clips_url = data['clipsURL']
+                su = data['su']
+
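+                # The first response may point at the placeholder host
+                # newflv.sohu.ccgslb.net; keep re-requesting with the returned
+                # CDN id (nid) until a real URL comes back, bailing out after
+                # several retries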
+                video_url = 'newflv.sohu.ccgslb.net'
+                cdnId = None
+                retries = 0
+
+                while 'newflv.sohu.ccgslb.net' in video_url:
+                    params = {
+                        'prot': 9,
+                        'file': clips_url[i],
+                        'new': su[i],
+                        'prod': 'flash',
+                        'rb': 1,
+                    }
+
+                    if cdnId is not None:
+                        params['idc'] = cdnId
+
+                    download_note = 'Downloading %s video URL part %d of %d' % (
+                        format_id, i + 1, part_count)
+
+                    if retries > 0:
+                        download_note += ' (retry #%d)' % retries
+                    part_info = self._parse_json(self._download_webpage(
+                        'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)),
+                        video_id, download_note), video_id)
+
+                    video_url = part_info['url']
+                    cdnId = part_info.get('nid')
+
+                    retries += 1
+                    if retries > 5:
+                        raise ExtractorError('Failed to get video URL')
+
+                formats.append({
+                    'url': video_url,
+                    'format_id': format_id,
+                    'filesize': int_or_none(
+                        try_get(data, lambda x: x['clipsBytes'][i])),
+                    'width': int_or_none(data.get('width')),
+                    'height': int_or_none(data.get('height')),
+                    'fps': int_or_none(data.get('fps')),
+                })
+            self._sort_formats(formats)
+
+            playlist.append({
+                'id': '%s_part%d' % (video_id, i + 1),
+                'title': title,
+                'duration': vid_data['data']['clipsDuration'][i],
+                'formats': formats,
+            })
+
+        if len(playlist) == 1:
+            info = playlist[0]
+            info['id'] = video_id
+        else:
+            info = {
+                '_type': 'multi_video',
+                'entries': playlist,
+                'id': video_id,
+                'title': title,
+            }
+
+        return info
diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py
new file mode 100644 (file)
index 0000000..58a8c0d
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class SonyLIVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",
+        'info_dict': {
+            'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight",
+            'id': 'ref:5024612095001',
+            'ext': 'mp4',
+            'upload_date': '20170923',
+            'description': 'md5:7f28509a148d5be9d0782b4d5106410d',
+            'uploader_id': '5182475815001',
+            'timestamp': 1506200547,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['BrightcoveNew'],
+    }, {
+        'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)',
+        'only_matching': True,
+    }]
+
+    # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s'
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s'
+
+    def _real_extract(self, url):
+        brightcove_id = self._match_id(url)
+        return self.url_result(
+            smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {
+                'geo_countries': ['IN'],
+                'referrer': url,
+            }),
+            'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
new file mode 100644 (file)
index 0000000..ae35736
--- /dev/null
@@ -0,0 +1,890 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+import json
+import random
+
+from .common import (
+    InfoExtractor,
+    SearchInfoExtractor
+)
+from ..compat import (
+    compat_HTTPError,
+    compat_kwargs,
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    error_to_compat_str,
+    ExtractorError,
+    float_or_none,
+    HEADRequest,
+    int_or_none,
+    KNOWN_EXTENSIONS,
+    mimetype2ext,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
+    urlhandle_detect_ext,
+    sanitized_Request,
+)
+
+
+class SoundcloudEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
+    _TEST = {
+        # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
+        'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
+        'only_matching': True,
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [m.group('url') for m in re.finditer(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        query = compat_urlparse.parse_qs(
+            compat_urlparse.urlparse(url).query)
+        api_url = query['url'][0]
+        secret_token = query.get('secret_token')
+        if secret_token:
+            api_url = update_url_query(api_url, {'secret_token': secret_token[0]})
+        return self.url_result(api_url)
+
+
+class SoundcloudIE(InfoExtractor):
+    """Information extractor for soundcloud.com
+       To access the media, the uid of the song and a stream token
+       must be extracted from the page source and the script must make
+       a request to media.soundcloud.com/crossdomain.xml. Then
+       the media can be grabbed by requesting from an url composed
+       of the stream token and uid
+     """
+
+    _VALID_URL = r'''(?x)^(?:https?://)?
+                    (?:(?:(?:www\.|m\.)?soundcloud\.com/
+                            (?!stations/track)
+                            (?P<uploader>[\w\d-]+)/
+                            (?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
+                            (?P<title>[\w\d-]+)/?
+                            (?P<token>[^?]+?)?(?:[?].*)?$)
+                       |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
+                          (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
+                    )
+                    '''
+    IE_NAME = 'soundcloud'
+    _TESTS = [
+        {
+            'url': 'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
+            'md5': 'ebef0a451b909710ed1d7787dddbf0d7',
+            'info_dict': {
+                'id': '62986583',
+                'ext': 'mp3',
+                'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
+                'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
+                'uploader': 'E.T. ExTerrestrial Music',
+                'uploader_id': '1571244',
+                'timestamp': 1349920598,
+                'upload_date': '20121011',
+                'duration': 143.216,
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            }
+        },
+        # geo-restricted
+        {
+            'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
+            'info_dict': {
+                'id': '47127627',
+                'ext': 'mp3',
+                'title': 'Goldrushed',
+                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
+                'uploader': 'The Royal Concept',
+                'uploader_id': '9615865',
+                'timestamp': 1337635207,
+                'upload_date': '20120521',
+                'duration': 227.155,
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+        },
+        # private link
+        {
+            'url': 'https://soundcloud.com/jaimemf/youtube-dlc-test-video-a-y-baw/s-8Pjrp',
+            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
+            'info_dict': {
+                'id': '123998367',
+                'ext': 'mp3',
+                'title': 'Youtube - Dl Test Video \'\' Ä↭',
+                'description': 'test chars:  \"\'/\\ä↭',
+                'uploader': 'jaimeMF',
+                'uploader_id': '69767071',
+                'timestamp': 1386604920,
+                'upload_date': '20131209',
+                'duration': 9.927,
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+        },
+        # private link (alt format)
+        {
+            'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp',
+            'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604',
+            'info_dict': {
+                'id': '123998367',
+                'ext': 'mp3',
+                'title': 'Youtube - Dl Test Video \'\' Ä↭',
+                'description': 'test chars:  \"\'/\\ä↭',
+                'uploader': 'jaimeMF',
+                'uploader_id': '69767071',
+                'timestamp': 1386604920,
+                'upload_date': '20131209',
+                'duration': 9.927,
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+        },
+        # downloadable song
+        {
+            'url': 'https://soundcloud.com/oddsamples/bus-brakes',
+            'md5': '7624f2351f8a3b2e7cd51522496e7631',
+            'info_dict': {
+                'id': '128590877',
+                'ext': 'mp3',
+                'title': 'Bus Brakes',
+                'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
+                'uploader': 'oddsamples',
+                'uploader_id': '73680509',
+                'timestamp': 1389232924,
+                'upload_date': '20140109',
+                'duration': 17.346,
+                'license': 'cc-by-sa',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+        },
+        # private link, downloadable format
+        {
+            'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
+            'md5': '64a60b16e617d41d0bef032b7f55441e',
+            'info_dict': {
+                'id': '340344461',
+                'ext': 'wav',
+                'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
+                'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
+                'uploader': 'Ori Uplift Music',
+                'uploader_id': '12563093',
+                'timestamp': 1504206263,
+                'upload_date': '20170831',
+                'duration': 7449.096,
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+        },
+        # no album art, use avatar pic for thumbnail
+        {
+            'url': 'https://soundcloud.com/garyvee/sideways-prod-mad-real',
+            'md5': '59c7872bc44e5d99b7211891664760c2',
+            'info_dict': {
+                'id': '309699954',
+                'ext': 'mp3',
+                'title': 'Sideways (Prod. Mad Real)',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'uploader': 'garyvee',
+                'uploader_id': '2366352',
+                'timestamp': 1488152409,
+                'upload_date': '20170226',
+                'duration': 207.012,
+                'thumbnail': r're:https?://.*\.jpg',
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
+            'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
+            'info_dict': {
+                'id': '583011102',
+                'ext': 'mp3',
+                'title': 'Mezzo Valzer',
+                'description': 'md5:4138d582f81866a530317bae316e8b61',
+                'uploader': 'Micronie',
+                'uploader_id': '3352531',
+                'timestamp': 1551394171,
+                'upload_date': '20190228',
+                'duration': 180.157,
+                'thumbnail': r're:https?://.*\.jpg',
+                'license': 'all-rights-reserved',
+                'view_count': int,
+                'like_count': int,
+                'comment_count': int,
+                'repost_count': int,
+            },
+        },
+        {
+            # with AAC HQ format available via OAuth token
+            'url': 'https://soundcloud.com/wandw/the-chainsmokers-ft-daya-dont-let-me-down-ww-remix-1',
+            'only_matching': True,
+        },
+    ]
+
+    _API_V2_BASE = 'https://api-v2.soundcloud.com/'
+    _BASE_URL = 'https://soundcloud.com/'
+    _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
+
+    _ARTWORK_MAP = {
+        'mini': 16,
+        'tiny': 20,
+        'small': 32,
+        'badge': 47,
+        't67x67': 67,
+        'large': 100,
+        't300x300': 300,
+        'crop': 400,
+        't500x500': 500,
+        'original': 0,
+    }
+
+    def _store_client_id(self, client_id):
+        self._downloader.cache.store('soundcloud', 'client_id', client_id)
+
+    def _update_client_id(self):
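+        # The anonymous client_id is embedded in one of the JS bundles
+        # referenced from the homepage; scan them (last-listed first) for it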
+        webpage = self._download_webpage('https://soundcloud.com/', None)
+        for src in reversed(re.findall(r'<script[^>]+src="([^"]+)"', webpage)):
+            script = self._download_webpage(src, None, fatal=False)
+            if script:
+                client_id = self._search_regex(
+                    r'client_id\s*:\s*"([0-9a-zA-Z]{32})"',
+                    script, 'client id', default=None)
+                if client_id:
+                    self._CLIENT_ID = client_id
+                    self._store_client_id(client_id)
+                    return
+        raise ExtractorError('Unable to extract client id')
+
+    def _download_json(self, *args, **kwargs):
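+        # Attach the cached client_id to every API request and, on HTTP 401,
+        # refresh it once before giving up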
+        non_fatal = kwargs.get('fatal') is False
+        if non_fatal:
+            del kwargs['fatal']
+        query = kwargs.get('query', {}).copy()
+        for _ in range(2):
+            query['client_id'] = self._CLIENT_ID
+            kwargs['query'] = query
+            try:
+                return super(SoundcloudIE, self)._download_json(*args, **compat_kwargs(kwargs))
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    self._store_client_id(None)
+                    self._update_client_id()
+                    continue
+                elif non_fatal:
+                    self._downloader.report_warning(error_to_compat_str(e))
+                    return False
+                raise
+
+    def _real_initialize(self):
+        self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'T5R4kgWS2PRf6lzLyIravUMnKlbIxQag'  # alternative ids: 'EXLwg5lHTO2dslU5EePe3xkw0m1h86Cd', 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'
+        self._login()
+
+    _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
+    _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
+    _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
+    _access_token = None
+    _HEADERS = {}
+    _NETRC_MACHINE = 'soundcloud'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        def genDevId():
+            def genNumBlock():
+                return ''.join([str(random.randrange(10)) for _ in range(6)])
+            return '-'.join([genNumBlock() for _ in range(4)])
+
+        payload = {
+            'client_id': self._CLIENT_ID,
+            'recaptcha_pubkey': 'null',
+            'recaptcha_response': 'null',
+            'credentials': {
+                'identifier': username,
+                'password': password
+            },
+            'signature': self.sign(username, password, self._CLIENT_ID),
+            'device_id': genDevId(),
+            'user_agent': self._USER_AGENT
+        }
+
+        query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+        login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
+        response = self._download_json(login, None)
+        self._access_token = (response.get('session') or {}).get('access_token')
+        if not self._access_token:
+            self.report_warning('Unable to get access token, login may have failed')
+        else:
+            self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+
+    # signature generation
+    def sign(self, user, pw, clid):
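+        # Port of the web player's obfuscated sign-in signature: a 24-bit
+        # rolling hash over a string built from fixed constants, the
+        # username and the client id (the password argument goes unused)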
+        a = 33
+        i = 1
+        s = 440123
+        w = 117
+        u = 1800000
+        l = 1042
+        b = 37
+        k = 37
+        c = 5
+        n = "0763ed7314c69015fd4a0dc16bbf4b90"  # _KEY
+        y = "8"  # _REV
+        r = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"  # _USER_AGENT
+        e = user  # _USERNAME
+        t = clid  # _CLIENT_ID
+
+        d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
+        p = n + y + d + r + e + t + d + n
+        h = p
+
+        m = 8011470
+        f = 0
+
+        for f in range(f, len(h)):
+            m = (m >> 1) + ((1 & m) << 23)
+            m += ord(h[f])
+            m &= 16777215
+
+        # c is not even needed
+        out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
+
+        return out
+
+    @classmethod
+    def _resolv_url(cls, url):
+        return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url
+
+    def _extract_info_dict(self, info, full_title=None, secret_token=None):
+        track_id = compat_str(info['id'])
+        title = info['title']
+
+        format_urls = set()
+        formats = []
+        query = {'client_id': self._CLIENT_ID}
+        if secret_token:
+            query['secret_token'] = secret_token
+
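+        # Prefer the original downloadable file when available: resolve its
+        # redirect and probe it with a HEAD request for extension and filesize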
+        if info.get('downloadable') and info.get('has_downloads_left'):
+            download_url = update_url_query(
+                self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
+            redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
+            if redirect_url:
+                urlh = self._request_webpage(
+                    HEADRequest(redirect_url), track_id, fatal=False)
+                if urlh:
+                    format_url = urlh.geturl()
+                    format_urls.add(format_url)
+                    formats.append({
+                        'format_id': 'download',
+                        'ext': urlhandle_detect_ext(urlh) or 'mp3',
+                        'filesize': int_or_none(urlh.headers.get('Content-Length')),
+                        'url': format_url,
+                        'preference': 10,
+                    })
+
+        def invalid_url(url):
+            return not url or url in format_urls
+
+        def add_format(f, protocol, is_preview=False):
+            mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url)
+            if mobj:
+                for k, v in mobj.groupdict().items():
+                    if not f.get(k):
+                        f[k] = v
+            format_id_list = []
+            if protocol:
+                format_id_list.append(protocol)
+            ext = f.get('ext')
+            if ext == 'aac':
+                f['abr'] = '256'
+            for k in ('ext', 'abr'):
+                v = f.get(k)
+                if v:
+                    format_id_list.append(v)
+            preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url'])
+            if preview:
+                format_id_list.append('preview')
+            abr = f.get('abr')
+            if abr:
+                f['abr'] = int(abr)
+            if protocol == 'hls':
+                protocol = 'm3u8' if ext == 'aac' else 'm3u8_native'
+            else:
+                protocol = 'http'
+            f.update({
+                'format_id': '_'.join(format_id_list),
+                'protocol': protocol,
+                'preference': -10 if preview else None,
+            })
+            formats.append(f)
+
+        # New API
+        transcodings = try_get(
+            info, lambda x: x['media']['transcodings'], list) or []
+        for t in transcodings:
+            if not isinstance(t, dict):
+                continue
+            format_url = url_or_none(t.get('url'))
+            if not format_url:
+                continue
+            stream = self._download_json(
+                format_url, track_id, query=query, fatal=False, headers=self._HEADERS)
+            if not isinstance(stream, dict):
+                continue
+            stream_url = url_or_none(stream.get('url'))
+            if invalid_url(stream_url):
+                continue
+            format_urls.add(stream_url)
+            stream_format = t.get('format') or {}
+            protocol = stream_format.get('protocol')
+            if protocol != 'hls' and '/hls' in format_url:
+                protocol = 'hls'
+            ext = None
+            preset = str_or_none(t.get('preset'))
+            if preset:
+                ext = preset.split('_')[0]
+            if ext not in KNOWN_EXTENSIONS:
+                ext = mimetype2ext(stream_format.get('mime_type'))
+            add_format({
+                'url': stream_url,
+                'ext': ext,
+            }, 'http' if protocol == 'progressive' else protocol,
+                t.get('snipped') or '/preview/' in format_url)
+
+        for f in formats:
+            f['vcodec'] = 'none'
+
+        if not formats and info.get('policy') == 'BLOCK':
+            self.raise_geo_restricted()
+        self._sort_formats(formats)
+
+        user = info.get('user') or {}
+
+        thumbnails = []
+        artwork_url = info.get('artwork_url')
+        thumbnail = artwork_url or user.get('avatar_url')
+        if isinstance(thumbnail, compat_str):
+            if re.search(self._IMAGE_REPL_RE, thumbnail):
+                for image_id, size in self._ARTWORK_MAP.items():
+                    i = {
+                        'id': image_id,
+                        'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail),
+                    }
+                    if image_id == 'tiny' and not artwork_url:
+                        size = 18
+                    elif image_id == 'original':
+                        i['preference'] = 10
+                    if size:
+                        i.update({
+                            'width': size,
+                            'height': size,
+                        })
+                    thumbnails.append(i)
+            else:
+                thumbnails = [{'url': thumbnail}]
+
+        def extract_count(key):
+            return int_or_none(info.get('%s_count' % key))
+
+        return {
+            'id': track_id,
+            'uploader': user.get('username'),
+            'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
+            'uploader_url': user.get('permalink_url'),
+            'timestamp': unified_timestamp(info.get('created_at')),
+            'title': title,
+            'description': info.get('description'),
+            'thumbnails': thumbnails,
+            'duration': float_or_none(info.get('duration'), 1000),
+            'webpage_url': info.get('permalink_url'),
+            'license': info.get('license'),
+            'view_count': extract_count('playback'),
+            'like_count': extract_count('favoritings') or extract_count('likes'),
+            'comment_count': extract_count('comment'),
+            'repost_count': extract_count('reposts'),
+            'genre': info.get('genre'),
+            'formats': formats
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        track_id = mobj.group('track_id')
+
+        query = {}
+        if track_id:
+            info_json_url = self._API_V2_BASE + 'tracks/' + track_id
+            full_title = track_id
+            token = mobj.group('secret_token')
+            if token:
+                query['secret_token'] = token
+        else:
+            full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title')
+            token = mobj.group('token')
+            if token:
+                resolve_title += '/%s' % token
+            info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
+
+        info = self._download_json(
+            info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
+
+        return self._extract_info_dict(info, full_title, token)
+
+
+class SoundcloudPlaylistBaseIE(SoundcloudIE):
+    def _extract_set(self, playlist, token=None):
+        playlist_id = compat_str(playlist['id'])
+        tracks = playlist.get('tracks') or []
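+        # Tracks of a secret set may come back stripped of their
+        # permalink_urls; re-fetch them with the secret token in that case.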
+        if not all([t.get('permalink_url') for t in tracks]) and token:
+            tracks = self._download_json(
+                self._API_V2_BASE + 'tracks', playlist_id,
+                'Downloading tracks', query={
+                    'ids': ','.join([compat_str(t['id']) for t in tracks]),
+                    'playlistId': playlist_id,
+                    'playlistSecretToken': token,
+                }, headers=self._HEADERS)
+        entries = []
+        for track in tracks:
+            track_id = str_or_none(track.get('id'))
+            url = track.get('permalink_url')
+            if not url:
+                if not track_id:
+                    continue
+                url = self._API_V2_BASE + 'tracks/' + track_id
+                if token:
+                    url += '?secret_token=' + token
+            entries.append(self.url_result(
+                url, SoundcloudIE.ie_key(), track_id))
+        return self.playlist_result(
+            entries, playlist_id,
+            playlist.get('title'),
+            playlist.get('description'))
+
+
+class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
+    IE_NAME = 'soundcloud:set'
+    _TESTS = [{
+        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
+        'info_dict': {
+            'id': '2284613',
+            'title': 'The Royal Concept EP',
+            'description': 'md5:71d07087c7a449e8941a70a29e34671e',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
+        token = mobj.group('token')
+        if token:
+            full_title += '/' + token
+
+        info = self._download_json(self._resolv_url(
+            self._BASE_URL + full_title), full_title, headers=self._HEADERS)
+
+        if 'errors' in info:
+            msgs = (compat_str(err['error_message']) for err in info['errors'])
+            raise ExtractorError('Unable to download playlist info: %s' % ','.join(msgs))
+
+        return self._extract_set(info, token)
+
+
+class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
+    def _extract_playlist(self, base_url, playlist_id, playlist_title):
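+        # Request very large pages so most lists fit into few responses;
+        # linked_partitioning makes the API hand back a next_href for the
+        # remainder.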
+        COMMON_QUERY = {
+            'limit': 80000,
+            'linked_partitioning': '1',
+        }
+
+        query = COMMON_QUERY.copy()
+        query['offset'] = 0
+
+        next_href = base_url
+
+        entries = []
+        for i in itertools.count():
+            response = self._download_json(
+                next_href, playlist_id,
+                'Downloading track page %s' % (i + 1), query=query, headers=self._HEADERS)
+
+            collection = response['collection']
+
+            if not isinstance(collection, list):
+                collection = []
+
+            # An empty collection may be returned; in that case, proceed
+            # straight to next_href
+
+            def resolve_entry(candidates):
+                for cand in candidates:
+                    if not isinstance(cand, dict):
+                        continue
+                    permalink_url = url_or_none(cand.get('permalink_url'))
+                    if not permalink_url:
+                        continue
+                    return self.url_result(
+                        permalink_url,
+                        SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
+                        str_or_none(cand.get('id')), cand.get('title'))
+
+            for e in collection:
+                entry = resolve_entry((e, e.get('track'), e.get('playlist')))
+                if entry:
+                    entries.append(entry)
+
+            next_href = response.get('next_href')
+            if not next_href:
+                break
+
+            parsed_next_href = compat_urlparse.urlparse(next_href)
+            query = compat_urlparse.parse_qs(parsed_next_href.query)
+            query.update(COMMON_QUERY)
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_title,
+            'entries': entries,
+        }
+
+
+class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:(?:www|m)\.)?soundcloud\.com/
+                            (?P<user>[^/]+)
+                            (?:/
+                                (?P<rsrc>tracks|albums|sets|reposts|likes|spotlight)
+                            )?
+                            /?(?:[?#].*)?$
+                    '''
+    IE_NAME = 'soundcloud:user'
+    _TESTS = [{
+        'url': 'https://soundcloud.com/soft-cell-official',
+        'info_dict': {
+            'id': '207965082',
+            'title': 'Soft Cell (All)',
+        },
+        'playlist_mincount': 28,
+    }, {
+        'url': 'https://soundcloud.com/soft-cell-official/tracks',
+        'info_dict': {
+            'id': '207965082',
+            'title': 'Soft Cell (Tracks)',
+        },
+        'playlist_mincount': 27,
+    }, {
+        'url': 'https://soundcloud.com/soft-cell-official/albums',
+        'info_dict': {
+            'id': '207965082',
+            'title': 'Soft Cell (Albums)',
+        },
+        'playlist_mincount': 1,
+    }, {
+        'url': 'https://soundcloud.com/jcv246/sets',
+        'info_dict': {
+            'id': '12982173',
+            'title': 'Jordi / cv (Sets)',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'https://soundcloud.com/jcv246/reposts',
+        'info_dict': {
+            'id': '12982173',
+            'title': 'Jordi / cv (Reposts)',
+        },
+        'playlist_mincount': 6,
+    }, {
+        'url': 'https://soundcloud.com/clalberg/likes',
+        'info_dict': {
+            'id': '11817582',
+            'title': 'clalberg (Likes)',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://soundcloud.com/grynpyret/spotlight',
+        'info_dict': {
+            'id': '7098329',
+            'title': 'Grynpyret (Spotlight)',
+        },
+        'playlist_mincount': 1,
+    }]
+
+    _BASE_URL_MAP = {
+        'all': 'stream/users/%s',
+        'tracks': 'users/%s/tracks',
+        'albums': 'users/%s/albums',
+        'sets': 'users/%s/playlists',
+        'reposts': 'stream/users/%s/reposts',
+        'likes': 'users/%s/likes',
+        'spotlight': 'users/%s/spotlight',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        uploader = mobj.group('user')
+
+        user = self._download_json(
+            self._resolv_url(self._BASE_URL + uploader),
+            uploader, 'Downloading user info', headers=self._HEADERS)
+
+        resource = mobj.group('rsrc') or 'all'
+
+        return self._extract_playlist(
+            self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
+            str_or_none(user.get('id')),
+            '%s (%s)' % (user['username'], resource.capitalize()))
+
+
+class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
+    IE_NAME = 'soundcloud:trackstation'
+    _TESTS = [{
+        'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
+        'info_dict': {
+            'id': '286017854',
+            'title': 'Track station: your text',
+        },
+        'playlist_mincount': 47,
+    }]
+
+    def _real_extract(self, url):
+        track_name = self._match_id(url)
+
+        track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
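+        # The resolved id looks like "soundcloud:track-stations:<digits>";
+        # only the numeric part identifies the playlist.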
+        track_id = self._search_regex(
+            r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
+
+        return self._extract_playlist(
+            self._API_V2_BASE + 'stations/%s/tracks' % track['id'],
+            track_id, 'Track station: %s' % track['title'])
+
+
+class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
+    _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
+    IE_NAME = 'soundcloud:playlist'
+    _TESTS = [{
+        'url': 'https://api.soundcloud.com/playlists/4110309',
+        'info_dict': {
+            'id': '4110309',
+            'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]',
+            'description': 're:.*?TILT Brass - Bowery Poetry Club',
+        },
+        'playlist_count': 6,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+
+        query = {}
+        token = mobj.group('token')
+        if token:
+            query['secret_token'] = token
+
+        data = self._download_json(
+            self._API_V2_BASE + 'playlists/' + playlist_id,
+            playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
+
+        return self._extract_set(data, token)
+
+
+class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
+    IE_NAME = 'soundcloud:search'
+    IE_DESC = 'Soundcloud search'
+    _MAX_RESULTS = float('inf')
+    _TESTS = [{
+        'url': 'scsearch15:post-avant jazzcore',
+        'info_dict': {
+            'title': 'post-avant jazzcore',
+        },
+        'playlist_count': 15,
+    }]
+
+    _SEARCH_KEY = 'scsearch'
+    _MAX_RESULTS_PER_PAGE = 200
+    _DEFAULT_RESULTS_PER_PAGE = 50
+
+    def _get_collection(self, endpoint, collection_id, **query):
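+        # Page through the API endpoint, following next_href until "limit"
+        # results have been collected or the collection runs dry.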
+        limit = min(
+            query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
+            self._MAX_RESULTS_PER_PAGE)
+        query.update({
+            'limit': limit,
+            'linked_partitioning': 1,
+            'offset': 0,
+        })
+        next_url = update_url_query(self._API_V2_BASE + endpoint, query)
+
+        collected_results = 0
+
+        for i in itertools.count(1):
+            response = self._download_json(
+                next_url, collection_id, 'Downloading page {0}'.format(i),
+                'Unable to download API page', headers=self._HEADERS)
+
+            collection = response.get('collection', [])
+            if not collection:
+                break
+
+            collection = list(filter(bool, collection))
+            collected_results += len(collection)
+
+            for item in collection:
+                yield self.url_result(item['uri'], SoundcloudIE.ie_key())
+
+            if not collection or collected_results >= limit:
+                break
+
+            next_url = response.get('next_href')
+            if not next_url:
+                break
+
+    def _get_n_results(self, query, n):
+        tracks = self._get_collection('search/tracks', query, limit=n, q=query)
+        return self.playlist_result(tracks, playlist_title=query)
diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py
new file mode 100644 (file)
index 0000000..3d78a9d
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SoundgasmIE(InfoExtractor):
+    IE_NAME = 'soundgasm'
+    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)'
+    _TEST = {
+        'url': 'http://soundgasm.net/u/ytdl/Piano-sample',
+        'md5': '010082a2c802c5275bb00030743e75ad',
+        'info_dict': {
+            'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9',
+            'ext': 'm4a',
+            'title': 'Piano sample',
+            'description': 'Royalty Free Sample Music',
+            'uploader': 'ytdl',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        audio_url = self._html_search_regex(
+            r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'audio URL', group='url')
+
+        title = self._search_regex(
+            r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)',
+            webpage, 'title', default=display_id)
+
+        description = self._html_search_regex(
+            (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>',
+             r'(?s)<li>Description:\s(.*?)<\/li>'),
+            webpage, 'description', fatal=False)
+
+        audio_id = self._search_regex(
+            r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id)
+
+        return {
+            'id': audio_id,
+            'display_id': display_id,
+            'url': audio_url,
+            'vcodec': 'none',
+            'title': title,
+            'description': description,
+            'uploader': mobj.group('user'),
+        }
+
+
+class SoundgasmProfileIE(InfoExtractor):
+    IE_NAME = 'soundgasm:profile'
+    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$'
+    _TEST = {
+        'url': 'http://soundgasm.net/u/ytdl',
+        'info_dict': {
+            'id': 'ytdl',
+        },
+        'playlist_count': 1,
+    }
+
+    def _real_extract(self, url):
+        profile_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, profile_id)
+
+        entries = [
+            self.url_result(audio_url, 'Soundgasm')
+            for audio_url in re.findall(r'href="([^"]+/u/%s/[^"]+)' % profile_id, webpage)]
+
+        return self.playlist_result(entries, profile_id)
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
new file mode 100644 (file)
index 0000000..da75a43
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+
+
+class SouthParkIE(MTVServicesInfoExtractor):
+    IE_NAME = 'southpark.cc.com'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
+
+    _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
+
+    _TESTS = [{
+        'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
+        'info_dict': {
+            'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30',
+            'ext': 'mp4',
+            'title': 'South Park|Bat Daded',
+            'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
+            'timestamp': 1112760000,
+            'upload_date': '20050406',
+        },
+    }, {
+        'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',
+        'only_matching': True,
+    }]
+
+
+class SouthParkEsIE(SouthParkIE):
+    IE_NAME = 'southpark.cc.com:español'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
+    _LANG = 'es'
+
+    _TESTS = [{
+        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+        'info_dict': {
+            'title': 'Cartman Consigue Una Sonda Anal',
+            'description': 'Cartman Consigue Una Sonda Anal',
+        },
+        'playlist_count': 4,
+        'skip': 'Geo-restricted',
+    }]
+
+
+class SouthParkDeIE(SouthParkIE):
+    IE_NAME = 'southpark.de'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden|collections)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+        'info_dict': {
+            'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2',
+            'ext': 'mp4',
+            'title': 'South Park|The Government Won\'t Respect My Privacy',
+            'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+            'timestamp': 1380160800,
+            'upload_date': '20130926',
+        },
+    }, {
+        # non-ASCII characters in initial URL
+        'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
+        'info_dict': {
+            'title': 'Hashtag „Aufwärmen“',
+            'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
+        },
+        'playlist_count': 3,
+    }, {
+        # non-ASCII characters in redirect URL
+        'url': 'http://www.southpark.de/alle-episoden/s18e09',
+        'info_dict': {
+            'title': 'Hashtag „Aufwärmen“',
+            'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
+        },
+        'playlist_count': 3,
+    }, {
+        'url': 'http://www.southpark.de/collections/2476/superhero-showdown/1',
+        'only_matching': True,
+    }]
+
+
+class SouthParkNlIE(SouthParkIE):
+    IE_NAME = 'southpark.nl'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
+        'info_dict': {
+            'title': 'Freemium Isn\'t Free',
+            'description': 'Stan is addicted to the new Terrance and Phillip mobile game.',
+        },
+        'playlist_mincount': 3,
+    }]
+
+
+class SouthParkDkIE(SouthParkIE):
+    IE_NAME = 'southparkstudios.dk'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
+        'info_dict': {
+            'title': 'Grounded Vindaloop',
+            'description': 'Butters is convinced he\'s living in a virtual reality.',
+        },
+        'playlist_mincount': 3,
+    }, {
+        'url': 'http://www.southparkstudios.dk/collections/2476/superhero-showdown/1',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py
new file mode 100644 (file)
index 0000000..61ca902
--- /dev/null
@@ -0,0 +1,184 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    merge_dicts,
+    orderedSet,
+    parse_duration,
+    parse_resolution,
+    str_to_int,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class SpankBangIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b'
+    _TESTS = [{
+        'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
+        'md5': '1cc433e1d6aa14bc376535b8679302f7',
+        'info_dict': {
+            'id': '3vvn',
+            'ext': 'mp4',
+            'title': 'fantasy solo',
+            'description': 'dillion harper masturbates on a bed',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'silly2587',
+            'timestamp': 1422571989,
+            'upload_date': '20150129',
+            'age_limit': 18,
+        }
+    }, {
+        # 480p only
+        'url': 'http://spankbang.com/1vt0/video/solvane+gangbang',
+        'only_matching': True,
+    }, {
+        # no uploader
+        'url': 'http://spankbang.com/lklg/video/sex+with+anyone+wedding+edition+2',
+        'only_matching': True,
+    }, {
+        # mobile page
+        'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name',
+        'only_matching': True,
+    }, {
+        # 4k
+        'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.spankbang.com/3vvn/play/fantasy+solo/480p/',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.spankbang.com/3vvn/play',
+        'only_matching': True,
+    }, {
+        'url': 'https://spankbang.com/2y3td/embed/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            url.replace('/%s/embed' % video_id, '/%s/video' % video_id),
+            video_id, headers={'Cookie': 'country=US'})
+
+        if re.search(r'<[^>]+\b(?:id|class)=["\']video_removed', webpage):
+            raise ExtractorError(
+                'Video %s is not available' % video_id, expected=True)
+
+        formats = []
+
+        def extract_format(format_id, format_url):
+            f_url = url_or_none(format_url)
+            if not f_url:
+                return
+            f = parse_resolution(format_id)
+            ext = determine_ext(f_url)
+            if format_id.startswith('m3u8') or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    f_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif format_id.startswith('mpd') or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    f_url, video_id, mpd_id='dash', fatal=False))
+            elif ext == 'mp4' or f.get('width') or f.get('height'):
+                f.update({
+                    'url': f_url,
+                    'format_id': format_id,
+                })
+                formats.append(f)
+
+        STREAM_URL_PREFIX = 'stream_url_'
+
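+        # The watch page inlines direct stream URLs as JS assignments of the
+        # form stream_url_<id> = '<url>'; harvest all of them.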
+        for mobj in re.finditer(
+                r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2'
+                % STREAM_URL_PREFIX, webpage):
+            extract_format(*mobj.group('id', 'url'))
+
+        if not formats:
+            stream_key = self._search_regex(
+                r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+                webpage, 'stream key', group='value')
+
+            stream = self._download_json(
+                'https://spankbang.com/api/videos/stream', video_id,
+                'Downloading stream JSON', data=urlencode_postdata({
+                    'id': stream_key,
+                    'data': 0,
+                }), headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                })
+
+            for format_id, format_url in stream.items():
+                if format_url and isinstance(format_url, list):
+                    format_url = format_url[0]
+                extract_format(format_id, format_url)
+
+        self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id'))
+
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        title = self._html_search_regex(
+            r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title', default=None)
+        description = self._search_regex(
+            r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)',
+            webpage, 'description', default=None)
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        uploader = self._html_search_regex(
+            (r'(?s)<li[^>]+class=["\']profile[^>]+>(.+?)</a>',
+             r'class="user"[^>]*><img[^>]+>([^<]+)'),
+            webpage, 'uploader', default=None)
+        duration = parse_duration(self._search_regex(
+            r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)',
+            webpage, 'duration', default=None))
+        view_count = str_to_int(self._search_regex(
+            r'([\d,.]+)\s+plays', webpage, 'view count', default=None))
+
+        age_limit = self._rta_search(webpage)
+
+        return merge_dicts({
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+            'age_limit': age_limit,
+        }, info
+        )
+
+
+class SpankBangPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+'
+    _TEST = {
+        'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties',
+        'info_dict': {
+            'id': 'ug0k',
+            'title': 'Big Ass Titties',
+        },
+        'playlist_mincount': 50,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, playlist_id, headers={'Cookie': 'country=US; mobile=on'})
+
+        entries = [self.url_result(
+            'https://spankbang.com/%s/video' % video_id,
+            ie=SpankBangIE.ie_key(), video_id=video_id)
+            for video_id in orderedSet(re.findall(
+                r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))]
+
+        title = self._html_search_regex(
+            r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title',
+            fatal=False)
+
+        return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
new file mode 100644 (file)
index 0000000..35ab9ec
--- /dev/null
@@ -0,0 +1,182 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    merge_dicts,
+    str_or_none,
+    str_to_int,
+    url_or_none,
+)
+
+
+class SpankwireIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?spankwire\.com/
+                        (?:
+                            [^/]+/video|
+                            EmbedPlayer\.aspx/?\?.*?\bArticleId=
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4
+        'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
+        'md5': '5aa0e4feef20aad82cbcae3aed7ab7cd',
+        'info_dict': {
+            'id': '103545',
+            'ext': 'mp4',
+            'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
+            'description': 'Crazy Bitch X rated music video.',
+            'duration': 222,
+            'uploader': 'oreusz',
+            'uploader_id': '124697',
+            'timestamp': 1178587885,
+            'upload_date': '20070508',
+            'average_rating': float,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+            'categories': list,
+            'tags': list,
+        },
+    }, {
+        # download URL pattern: */mp4_<format_id>_<video_id>.mp4
+        'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/',
+        'md5': '09b3c20833308b736ae8902db2f8d7e6',
+        'info_dict': {
+            'id': '1921551',
+            'ext': 'mp4',
+            'title': 'Titcums Compiloation I',
+            'description': 'cum on tits',
+            'uploader': 'dannyh78999',
+            'uploader_id': '3056053',
+            'upload_date': '20150822',
+            'age_limit': 18,
+        },
+        'params': {
+            'proxy': '127.0.0.1:8118'
+        },
+        'skip': 'removed',
+    }, {
+        'url': 'https://www.spankwire.com/EmbedPlayer.aspx/?ArticleId=156156&autostart=true',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://www.spankwire.com/api/video/%s.json' % video_id, video_id)
+
+        title = video['title']
+
+        formats = []
+        videos = video.get('videos')
+        if isinstance(videos, dict):
+            for format_id, format_url in videos.items():
+                video_url = url_or_none(format_url)
+                if not video_url:
+                    continue
+                height = int_or_none(self._search_regex(
+                    r'(\d+)[pP]', format_id, 'height', default=None))
+                m = re.search(
+                    r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', video_url)
+                if m:
+                    tbr = int(m.group('tbr'))
+                    height = height or int(m.group('height'))
+                else:
+                    tbr = None
+                formats.append({
+                    'url': video_url,
+                    'format_id': '%dp' % height if height else format_id,
+                    'height': height,
+                    'tbr': tbr,
+                })
+        m3u8_url = url_or_none(video.get('HLS'))
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        self._sort_formats(formats, ('height', 'tbr', 'width', 'format_id'))
+
+        view_count = str_to_int(video.get('viewed'))
+
+        thumbnails = []
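+        # "poster" is the regular thumbnail, "poster2x" the high-DPI
+        # variant; the latter gets the higher preference.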
+        for preference, t in enumerate(('', '2x'), start=0):
+            thumbnail_url = url_or_none(video.get('poster%s' % t))
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'preference': preference,
+            })
+
+        def extract_names(key):
+            entries_list = video.get(key)
+            if not isinstance(entries_list, list):
+                return
+            entries = []
+            for entry in entries_list:
+                name = str_or_none(entry.get('name'))
+                if name:
+                    entries.append(name)
+            return entries
+
+        categories = extract_names('categories')
+        tags = extract_names('tags')
+
+        uploader = None
+        info = {}
+
+        webpage = self._download_webpage(
+            'https://www.spankwire.com/_/video%s/' % video_id, video_id,
+            fatal=False)
+        if webpage:
+            info = self._search_json_ld(webpage, video_id, default={})
+            thumbnail_url = None
+            if 'thumbnail' in info:
+                thumbnail_url = url_or_none(info['thumbnail'])
+                del info['thumbnail']
+            if not thumbnail_url:
+                thumbnail_url = self._og_search_thumbnail(webpage)
+            if thumbnail_url:
+                thumbnails.append({
+                    'url': thumbnail_url,
+                    'preference': 10,
+                })
+            uploader = self._html_search_regex(
+                r'(?s)by\s*<a[^>]+\bclass=["\']uploaded__by[^>]*>(.+?)</a>',
+                webpage, 'uploader', fatal=False)
+            if not view_count:
+                view_count = str_to_int(self._search_regex(
+                    r'data-views=["\']([\d,.]+)', webpage, 'view count',
+                    fatal=False))
+
+        return merge_dicts({
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'duration': int_or_none(video.get('duration')),
+            'thumbnails': thumbnails,
+            'uploader': uploader,
+            'uploader_id': str_or_none(video.get('userId')),
+            'timestamp': int_or_none(video.get('time_approved_on')),
+            'average_rating': float_or_none(video.get('rating')),
+            'view_count': view_count,
+            'comment_count': int_or_none(video.get('comments')),
+            'age_limit': 18,
+            'categories': categories,
+            'tags': tags,
+            'formats': formats,
+        }, info)
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
new file mode 100644 (file)
index 0000000..4df7f4d
--- /dev/null
@@ -0,0 +1,159 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .nexx import (
+    NexxIE,
+    NexxEmbedIE,
+)
+from .spiegeltv import SpiegeltvIE
+from ..compat import compat_urlparse
+from ..utils import (
+    parse_duration,
+    strip_or_none,
+    unified_timestamp,
+)
+
+
+class SpiegelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
+    _TESTS = [{
+        'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
+        'md5': 'b57399839d055fccfeb9a0455c439868',
+        'info_dict': {
+            'id': '563747',
+            'ext': 'mp4',
+            'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
+            'description': 'md5:8029d8310232196eb235d27575a8b9f4',
+            'duration': 49,
+            'upload_date': '20130311',
+            'timestamp': 1362994320,
+        },
+    }, {
+        'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
+        'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
+        'info_dict': {
+            'id': '580988',
+            'ext': 'mp4',
+            'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
+            'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
+            'duration': 983,
+            'upload_date': '20131115',
+            'timestamp': 1384546642,
+        },
+    }, {
+        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
+        'md5': '97b91083a672d72976faa8433430afb9',
+        'info_dict': {
+            'id': '601883',
+            'ext': 'mp4',
+            'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
+            'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
+            'upload_date': '20140904',
+            'timestamp': 1409834160,
+        }
+    }, {
+        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
+        'only_matching': True,
+    }, {
+        # nexx video
+        'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
+        handle = self._request_webpage(metadata_url, video_id)
+
+        # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
+        if SpiegeltvIE.suitable(handle.geturl()):
+            return self.url_result(handle.geturl(), 'Spiegeltv')
+
+        video_data = self._parse_json(self._webpage_read_content(
+            handle, metadata_url, video_id), video_id)
+        title = video_data['title']
+        nexx_id = video_data['nexxOmniaId']
+        domain_id = video_data.get('nexxOmniaDomain') or '748'
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': 'nexx:%s:%s' % (domain_id, nexx_id),
+            'title': title,
+            'description': strip_or_none(video_data.get('teaser')),
+            'duration': parse_duration(video_data.get('duration')),
+            'timestamp': unified_timestamp(video_data.get('datum')),
+            'ie_key': NexxIE.ie_key(),
+        }
+
+
+class SpiegelArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
+    IE_NAME = 'Spiegel:Article'
+    IE_DESC = 'Articles on spiegel.de'
+    _TESTS = [{
+        'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
+        'info_dict': {
+            'id': '1516455',
+            'ext': 'mp4',
+            'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
+            'description': 're:^Patrick Kämnitz gehört.{100,}',
+            'upload_date': '20140825',
+        },
+    }, {
+        'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
+        'info_dict': {},
+        'playlist_count': 6,
+    }, {
+        # Nexx iFrame embed
+        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Single video on top of the page
+        video_link = self._search_regex(
+            r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
+            'video page URL', default=None)
+        if video_link:
+            video_url = compat_urlparse.urljoin(
+                self.http_scheme() + '//spiegel.de/', video_link)
+            return self.url_result(video_url)
+
+        # Multiple embedded videos
+        embeds = re.findall(
+            r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
+            webpage)
+        entries = [
+            self.url_result(compat_urlparse.urljoin(
+                self.http_scheme() + '//spiegel.de/', embed_path))
+            for embed_path in embeds]
+        if embeds:
+            return self.playlist_result(entries)
+
+        return self.playlist_from_matches(
+            NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
new file mode 100644 (file)
index 0000000..6ccf4c3
--- /dev/null
@@ -0,0 +1,17 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .nexx import NexxIE
+
+
+class SpiegeltvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        return self.url_result(
+            'https://api.nexx.cloud/v3/748/videos/byid/%s'
+            % self._match_id(url), ie=NexxIE.ie_key())
diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py
new file mode 100644 (file)
index 0000000..aabff7a
--- /dev/null
@@ -0,0 +1,55 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+
+
+class BellatorIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bellator\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
+    _TESTS = [{
+        'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg',
+        'info_dict': {
+            'title': 'Michael Page vs. Evangelista Cyborg',
+            'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05',
+        },
+        'playlist_count': 3,
+    }, {
+        'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page',
+        'only_matching': True,
+    }]
+
+    _FEED_URL = 'http://www.bellator.com/feeds/mrss/'
+    _GEO_COUNTRIES = ['US']
+
+    def _extract_mgid(self, webpage):
+        return self._extract_triforce_mgid(webpage)
+
+
+class ParamountNetworkIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
+    _TESTS = [{
+        'url': 'http://www.paramountnetwork.com/episodes/j830qm/lip-sync-battle-joel-mchale-vs-jim-rash-season-2-ep-13',
+        'info_dict': {
+            'id': '37ace3a8-1df6-48be-85b8-38df8229e241',
+            'ext': 'mp4',
+            'title': 'Lip Sync Battle|April 28, 2016|2|209|Joel McHale Vs. Jim Rash|Act 1',
+            'description': 'md5:a739ca8f978a7802f67f8016d27ce114',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/'
+    _GEO_COUNTRIES = ['US']
+
+    def _extract_mgid(self, webpage):
+        root_data = self._parse_json(self._search_regex(
+            r'window\.__DATA__\s*=\s*({.+})',
+            webpage, 'data'), None)
+
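+        # Walk the window.__DATA__ tree: the MainContainer child holds a
+        # VideoPlayer child whose player config carries the mgid-style URI.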
+        def find_sub_data(data, data_type):
+            return next(c for c in data['children'] if c.get('type') == data_type)
+
+        c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer')
+        return c['props']['media']['video']['config']['uri']
diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py
new file mode 100644 (file)
index 0000000..a417b5a
--- /dev/null
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class Sport5IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|vod)\.)?sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1',
+            'info_dict': {
+                'id': 's5-Y59xx1-GUh2',
+                'ext': 'mp4',
+                'title': 'ולנסיה-קורדובה 0:3',
+                'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה',
+                'duration': 228,
+                'categories': list,
+            },
+            'skip': 'Blocked outside of Israel',
+        }, {
+            'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE',
+            'info_dict': {
+                'id': 's5-SiXxx1-hKh2',
+                'ext': 'mp4',
+                'title': 'GOALS_CELTIC_270914.mp4',
+                'description': '',
+                'duration': 87,
+                'categories': list,
+            },
+            'skip': 'Blocked outside of Israel',
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        media_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, media_id)
+
+        video_id = self._html_search_regex(r'clipId=([\w-]+)', webpage, 'video id')
+
+        metadata = self._download_xml(
+            'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id,
+            video_id)
+
+        error = metadata.find('./Error')
+        if error is not None:
+            raise ExtractorError(
+                '%s returned error: %s - %s' % (
+                    self.IE_NAME,
+                    error.find('./Name').text,
+                    error.find('./Description').text),
+                expected=True)
+
+        title = metadata.find('./Title').text
+        description = metadata.find('./Description').text
+        duration = int(metadata.find('./Duration').text)
+
+        posters_el = metadata.find('./PosterLinks')
+        thumbnails = [{
+            'url': thumbnail.text,
+            'width': int(thumbnail.get('width')),
+            'height': int(thumbnail.get('height')),
+        } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else []
+
+        categories_el = metadata.find('./Categories')
+        categories = [
+            cat.get('name') for cat in categories_el.findall('./Category')
+        ] if categories_el is not None else []
+
+        formats = [{
+            'url': fmt.text,
+            'ext': 'mp4',
+            'vbr': int(fmt.get('bitrate')),
+            'width': int(fmt.get('width')),
+            'height': int(fmt.get('height')),
+        } for fmt in metadata.findall('./PlaybackLinks/FileURL')]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'categories': categories,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
new file mode 100644 (file)
index 0000000..b9017fd
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    merge_dicts,
+)
+
+
+class SportBoxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
+        'info_dict': {
+            'id': '109158',
+            'ext': 'mp4',
+            'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+            'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 292,
+            'view_count': int,
+            'timestamp': 1426237001,
+            'upload_date': '20150313',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
+        'only_matching': True,
+    }, {
+        'url': 'https://news.sportbox.ru/vdl/player/media/193095',
+        'only_matching': True,
+    }, {
+        'url': 'https://news.sportbox.ru/vdl/player/media/109158',
+        'only_matching': True,
+    }, {
+        'url': 'https://matchtv.ru/vdl/player/media/109158',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        sources = self._parse_json(
+            self._search_regex(
+                r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n',
+                webpage, 'sources'),
+            video_id, transform_source=js_to_json)
+
+        formats = []
+        for source in sources:
+            src = source.get('src')
+            if not src:
+                continue
+            if determine_ext(src) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        player = self._parse_json(
+            self._search_regex(
+                r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage,
+                'player options', default='{}'),
+            video_id, transform_source=js_to_json)
+        media_id = player['mediaId']
+
+        info = self._search_json_ld(webpage, media_id, default={})
+
+        view_count = int_or_none(self._search_regex(
+            r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
+
+        return merge_dicts(info, {
+            'id': media_id,
+            'title': self._og_search_title(webpage, default=None) or media_id,
+            'thumbnail': player.get('poster'),
+            'duration': int_or_none(player.get('duration')),
+            'view_count': view_count,
+            'formats': formats,
+        })
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
new file mode 100644 (file)
index 0000000..378fc75
--- /dev/null
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    sanitized_Request,
+)
+
+
+class SportDeutschlandIE(InfoExtractor):
+    _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])'
+    _TESTS = [{
+        'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
+        'info_dict': {
+            'id': 're-live-deutsche-meisterschaften-2020-halbfinals',
+            'ext': 'mp4',
+            'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals',
+            'categories': ['Badminton-Deutschland'],
+            'view_count': int,
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+            'timestamp': int,
+            'upload_date': '20200201',
+            'description': 're:.*',  # meaningless description for THIS video
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        sport_id = mobj.group('sport')
+
+        api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
+            sport_id, video_id)
+        req = sanitized_Request(api_url, headers={
+            'Accept': 'application/vnd.vidibus.v2.html+json',
+            'Referer': url,
+        })
+        data = self._download_json(req, video_id)
+
+        asset = data['asset']
+        categories = [data['section']['title']]
+
+        formats = []
+        smil_url = asset['video']
+        if '.smil' in smil_url:
+            m3u8_url = smil_url.replace('.smil', '.m3u8')
+            formats.extend(
+                self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'))
+
+            smil_doc = self._download_xml(
+                smil_url, video_id, note='Downloading SMIL metadata')
+            base_url_el = smil_doc.find('./head/meta')
+            if base_url_el is not None:
+                base_url = base_url_el.attrib['base']
+            formats.extend([{
+                'format_id': 'rtmp',
+                'url': base_url if base_url_el is not None else n.attrib['src'],
+                'play_path': n.attrib['src'],
+                'ext': 'flv',
+                'preference': -100,
+                'format_note': 'Seems to fail at example stream',
+            } for n in smil_doc.findall('./body/video')])
+        else:
+            formats.append({'url': smil_url})
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': asset['title'],
+            'thumbnail': asset.get('image'),
+            'description': asset.get('teaser'),
+            'duration': asset.get('duration'),
+            'categories': categories,
+            'view_count': asset.get('views'),
+            'rtmp_live': asset.get('live'),
+            'timestamp': parse_iso8601(asset.get('date')),
+        }
diff --git a/youtube_dl/extractor/springboardplatform.py b/youtube_dl/extractor/springboardplatform.py
new file mode 100644 (file)
index 0000000..07d99b5
--- /dev/null
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    xpath_attr,
+    xpath_text,
+    xpath_element,
+    unescapeHTML,
+    unified_timestamp,
+)
+
+
+class SpringboardPlatformIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        cms\.springboardplatform\.com/
+                        (?:
+                            (?:previews|embed_iframe)/(?P<index>\d+)/video/(?P<id>\d+)|
+                            xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+)
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1',
+        'md5': '5c3cb7b5c55740d482561099e920f192',
+        'info_dict': {
+            'id': '981017',
+            'ext': 'mp4',
+            'title': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX',
+            'description': 'Redman "BUD like YOU" "Usher Good Kisser" REMIX',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1409132328,
+            'upload_date': '20140827',
+            'duration': 193,
+        },
+    }, {
+        'url': 'http://cms.springboardplatform.com/embed_iframe/159/video/981017/rab007/rapbasement.com/1/1',
+        'only_matching': True,
+    }, {
+        'url': 'http://cms.springboardplatform.com/embed_iframe/20/video/1731611/ki055/kidzworld.com/10',
+        'only_matching': True,
+    }, {
+        'url': 'http://cms.springboardplatform.com/xml_feeds_advanced/index/159/rss3/981017/0/0/1/',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('id_2')
+        index = mobj.group('index') or mobj.group('index_2')
+
+        video = self._download_xml(
+            'http://cms.springboardplatform.com/xml_feeds_advanced/index/%s/rss3/%s'
+            % (index, video_id), video_id)
+
+        item = xpath_element(video, './/item', 'item', fatal=True)
+
+        content = xpath_element(
+            item, './{http://search.yahoo.com/mrss/}content', 'content',
+            fatal=True)
+        title = unescapeHTML(xpath_text(item, './title', 'title', fatal=True))
+
+        video_url = content.attrib['url']
+
+        if 'error_video.mp4' in video_url:
+            raise ExtractorError(
+                'Video %s no longer exists' % video_id, expected=True)
+
+        duration = int_or_none(content.get('duration'))
+        tbr = int_or_none(content.get('bitrate'))
+        filesize = int_or_none(content.get('fileSize'))
+        width = int_or_none(content.get('width'))
+        height = int_or_none(content.get('height'))
+
+        description = unescapeHTML(xpath_text(
+            item, './description', 'description'))
+        thumbnail = xpath_attr(
+            item, './{http://search.yahoo.com/mrss/}thumbnail', 'url',
+            'thumbnail')
+
+        timestamp = unified_timestamp(xpath_text(
+            item, './{http://cms.springboardplatform.com/namespaces.html}created',
+            'timestamp'))
+
+        formats = [{
+            'url': video_url,
+            'format_id': 'http',
+            'tbr': tbr,
+            'filesize': filesize,
+            'width': width,
+            'height': height,
+        }]
+
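+        # the same file is also served over HLS: swap the "cdn." host for
+        # "hls." and append ".m3u8"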
+        m3u8_format = formats[0].copy()
+        m3u8_format.update({
+            'url': re.sub(r'(https?://)cdn\.', r'\1hls.', video_url) + '.m3u8',
+            'ext': 'mp4',
+            'format_id': 'hls',
+            'protocol': 'm3u8_native',
+        })
+        formats.append(m3u8_format)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/sprout.py b/youtube_dl/extractor/sprout.py
new file mode 100644 (file)
index 0000000..8467bf4
--- /dev/null
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    extract_attributes,
+    update_url_query,
+    smuggle_url,
+)
+
+
+class SproutIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://www.sproutonline.com/watch/cowboy-adventure',
+        'md5': '74bf14128578d1e040c3ebc82088f45f',
+        'info_dict': {
+            'id': '9dexnwtmh8_X',
+            'ext': 'mp4',
+            'title': 'A Cowboy Adventure',
+            'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.',
+            'timestamp': 1437758640,
+            'upload_date': '20150724',
+            'uploader': 'NBCU-SPROUT-NEW',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_component = self._search_regex(
+            r'(?s)(<div[^>]+data-component="video"[^>]*?>)',
+            webpage, 'video component', default=None)
+        if video_component:
+            options = self._parse_json(extract_attributes(
+                video_component)['data-options'], video_id)
+            theplatform_url = options['video']
+            query = {
+                'mbr': 'true',
+                'manifest': 'm3u',
+            }
+            if options.get('protected'):
+                query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout')
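+            # force_smil_url makes ThePlatform treat the link as a SMIL
+            # manifest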
+            theplatform_url = smuggle_url(update_url_query(
+                theplatform_url, query), {'force_smil_url': True})
+        else:
+            iframe = self._search_regex(
+                r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)',
+                webpage, 'iframe')
+            theplatform_url = extract_attributes(iframe)['src']
+
+        return self.url_result(theplatform_url, 'ThePlatform')
diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py
new file mode 100644 (file)
index 0000000..170dce8
--- /dev/null
@@ -0,0 +1,186 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+    qualities,
+)
+
+
+class SRGSSRIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
+    _GEO_BYPASS = False
+    _GEO_COUNTRIES = ['CH']
+
+    _ERRORS = {
+        'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.',
+        'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.',
+        # 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.',
+        'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.',
+        'LEGAL': 'The video cannot be transmitted for legal reasons.',
+        'STARTDATE': 'This video is not yet available. Please try again later.',
+    }
+
+    def _get_tokenized_src(self, url, video_id, format_id):
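+        # the first two path components of the stream URL form the token
+        # ACL; the returned auth params are appended to the URL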
+        sp = compat_urllib_parse_urlparse(url).path.split('/')
+        token = self._download_json(
+            'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]),
+            video_id, 'Downloading %s token' % format_id, fatal=False) or {}
+        auth_params = token.get('token', {}).get('authparams')
+        if auth_params:
+            url += '?' + auth_params
+        return url
+
+    def get_media_data(self, bu, media_type, media_id):
+        media_data = self._download_json(
+            'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id),
+            media_id)[media_type.capitalize()]
+
+        if media_data.get('block') and media_data['block'] in self._ERRORS:
+            message = self._ERRORS[media_data['block']]
+            if media_data['block'] == 'GEOBLOCK':
+                self.raise_geo_restricted(
+                    msg=message, countries=self._GEO_COUNTRIES)
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, message), expected=True)
+
+        return media_data
+
+    def _real_extract(self, url):
+        bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
+
+        media_data = self.get_media_data(bu, media_type, media_id)
+
+        metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
+        title = metadata['title']
+        description = metadata.get('description')
+        created_date = media_data.get('createdDate') or metadata.get('createdDate')
+        timestamp = parse_iso8601(created_date)
+
+        thumbnails = [{
+            'id': image.get('id'),
+            'url': image['url'],
+        } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])]
+
+        preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
+        formats = []
+        for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
+            protocol = source.get('@protocol')
+            for asset in source['url']:
+                asset_url = asset['text']
+                quality = asset['@quality']
+                format_id = '%s-%s' % (protocol, quality)
+                if protocol and protocol.startswith(('HTTP-HDS', 'HTTP-HLS')):
+                    asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
+                    if protocol.startswith('HTTP-HDS'):
+                        formats.extend(self._extract_f4m_formats(
+                            asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0',
+                            media_id, f4m_id=format_id, fatal=False))
+                    elif protocol.startswith('HTTP-HLS'):
+                        formats.extend(self._extract_m3u8_formats(
+                            asset_url, media_id, 'mp4', 'm3u8_native',
+                            m3u8_id=format_id, fatal=False))
+                else:
+                    formats.append({
+                        'format_id': format_id,
+                        'url': asset_url,
+                        'preference': preference(quality),
+                        'ext': 'flv' if protocol == 'RTMP' else None,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
+
+
+class SRGSSRPlayIE(InfoExtractor):
+    IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:(?:www|play)\.)?
+                        (?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/
+                        (?:
+                            [^/]+/(?P<type>video|audio)/[^?]+|
+                            popup(?P<type_2>video|audio)player
+                        )
+                        \?id=(?P<id>[0-9a-f\-]{36}|\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'md5': 'da6b5b3ac9fa4761a942331cef20fcb3',
+        'info_dict': {
+            'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+            'ext': 'mp4',
+            'upload_date': '20130701',
+            'title': 'Snowden beantragt Asyl in Russland',
+            'timestamp': 1372713995,
+        }
+    }, {
+        # No Speichern (Save) button
+        'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
+        'md5': '0a274ce38fda48c53c01890651985bc6',
+        'info_dict': {
+            'id': '677f5829-e473-4823-ac83-a1087fe97faa',
+            'ext': 'flv',
+            'upload_date': '20130710',
+            'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
+            'description': 'md5:88604432b60d5a38787f152dec89cd56',
+            'timestamp': 1373493600,
+        },
+    }, {
+        'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc',
+        'info_dict': {
+            'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc',
+            'ext': 'mp3',
+            'upload_date': '20151013',
+            'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem',
+            'timestamp': 1444750398,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260',
+        'md5': '67a2a9ae4e8e62a68d0e9820cc9782df',
+        'info_dict': {
+            'id': '6348260',
+            'display_id': '6348260',
+            'ext': 'mp4',
+            'duration': 1796,
+            'title': 'Le 19h30',
+            'description': '',
+            'uploader': '19h30',
+            'upload_date': '20141201',
+            'timestamp': 1417458600,
+            'thumbnail': r're:^https?://.*\.image',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        bu = mobj.group('bu')
+        media_type = mobj.group('type') or mobj.group('type_2')
+        media_id = mobj.group('id')
+        # other info can be extracted from url + '&layout=json'
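+        # truncate the business unit ('swissinfo' becomes 'swi') to match SRGSSRIE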
+        return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR')
diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py
new file mode 100644 (file)
index 0000000..359dada
--- /dev/null
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .ard import ARDMediathekBaseIE
+from ..utils import (
+    ExtractorError,
+    get_element_by_attribute,
+)
+
+
+class SRMediathekIE(ARDMediathekBaseIE):
+    IE_NAME = 'sr:mediathek'
+    IE_DESC = 'Saarländischer Rundfunk'
+    _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
+        'info_dict': {
+            'id': '28455',
+            'ext': 'mp4',
+            'title': 'sportarena (26.10.2014)',
+            'description': 'Ringen: KSV Köllerbach gegen Aachen-Walheim; Frauen-Fußball: 1. FC Saarbrücken gegen Sindelfingen; Motorsport: Rallye in Losheim; dazu: Interview mit Timo Bernhard; Turnen: TG Saar; Reitsport: Deutscher Voltigier-Pokal; Badminton: Interview mit Michael Fuchs ',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'skip': 'no longer available',
+    }, {
+        'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=37682',
+        'info_dict': {
+            'id': '37682',
+            'ext': 'mp4',
+            'title': 'Love, Cakes and Rock\'n\'Roll',
+            'description': 'md5:18bf9763631c7d326c22603681e1123d',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        if '>Der gew&uuml;nschte Beitrag ist leider nicht mehr verf&uuml;gbar.<' in webpage:
+            raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
+
+        media_collection_url = self._search_regex(
+            r'data-mediacollection-ardplayer="([^"]+)"', webpage, 'media collection url')
+        info = self._extract_media_info(media_collection_url, webpage, video_id)
+        info.update({
+            'id': video_id,
+            'title': get_element_by_attribute('class', 'ardplayer-title', webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        })
+        return info
diff --git a/youtube_dl/extractor/stanfordoc.py b/youtube_dl/extractor/stanfordoc.py
new file mode 100644 (file)
index 0000000..ae3dd13
--- /dev/null
@@ -0,0 +1,91 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    orderedSet,
+    unescapeHTML,
+)
+
+
+class StanfordOpenClassroomIE(InfoExtractor):
+    IE_NAME = 'stanfordoc'
+    IE_DESC = 'Stanford Open ClassRoom'
+    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
+    _TEST = {
+        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
+        'md5': '544a9468546059d4e80d76265b0443b8',
+        'info_dict': {
+            'id': 'PracticalUnix_intro-environment',
+            'ext': 'mp4',
+            'title': 'Intro Environment',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        if mobj.group('course') and mobj.group('video'):  # A specific video
+            course = mobj.group('course')
+            video = mobj.group('video')
+            info = {
+                'id': course + '_' + video,
+                'uploader': None,
+                'upload_date': None,
+            }
+
+            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
+            xmlUrl = baseUrl + video + '.xml'
+            mdoc = self._download_xml(xmlUrl, info['id'])
+            try:
+                info['title'] = mdoc.findall('./title')[0].text
+                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
+            except IndexError:
+                raise ExtractorError('Invalid metadata XML file')
+            return info
+        elif mobj.group('course'):  # A course page
+            course = mobj.group('course')
+            info = {
+                'id': course,
+                '_type': 'playlist',
+                'uploader': None,
+                'upload_date': None,
+            }
+
+            coursepage = self._download_webpage(
+                url, info['id'],
+                note='Downloading course info page',
+                errnote='Unable to download course info page')
+
+            info['title'] = self._html_search_regex(
+                r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
+
+            info['description'] = self._html_search_regex(
+                r'(?s)<description>([^<]+)</description>',
+                coursepage, 'description', fatal=False)
+
+            links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
+            info['entries'] = [self.url_result(
+                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
+            ) for l in links]
+            return info
+        else:  # Root page
+            info = {
+                'id': 'Stanford OpenClassroom',
+                '_type': 'playlist',
+                'uploader': None,
+                'upload_date': None,
+            }
+            info['title'] = info['id']
+
+            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
+            rootpage = self._download_webpage(rootURL, info['id'],
+                                              errnote='Unable to download course info page')
+
+            links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
+            info['entries'] = [self.url_result(
+                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)
+            ) for l in links]
+            return info
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py
new file mode 100644 (file)
index 0000000..a6a191c
--- /dev/null
@@ -0,0 +1,149 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    ExtractorError,
+    get_element_by_class,
+    js_to_json,
+)
+
+
+class SteamIE(InfoExtractor):
+    _VALID_URL = r"""(?x)
+        https?://store\.steampowered\.com/
+            (agecheck/)?
+            (?P<urltype>video|app)/  # if the page is only for videos or for a game
+            (?P<gameID>\d+)/?
+            (?P<videoID>\d*)(?P<extra>\??)  # for urltype == video we sometimes get the videoID
+        |
+        https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+)
+    """
+    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
+    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
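+    # pre-filled birth date (1 January 1970) to get past the age gate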
+    _TESTS = [{
+        'url': 'http://store.steampowered.com/video/105600/',
+        'playlist': [
+            {
+                'md5': '6a294ee0c4b1f47f5bb76a65e31e3592',
+                'info_dict': {
+                    'id': '2040428',
+                    'ext': 'mp4',
+                    'title': 'Terraria 1.3 Trailer',
+                    'playlist_index': 1,
+                }
+            },
+            {
+                'md5': '911672b20064ca3263fa89650ba5a7aa',
+                'info_dict': {
+                    'id': '2029566',
+                    'ext': 'mp4',
+                    'title': 'Terraria 1.2 Trailer',
+                    'playlist_index': 2,
+                }
+            }
+        ],
+        'info_dict': {
+            'id': '105600',
+            'title': 'Terraria',
+        },
+        'params': {
+            'playlistend': 2,
+        }
+    }, {
+        'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205',
+        'info_dict': {
+            'id': 'X8kpJBlzD2E',
+            'ext': 'mp4',
+            'upload_date': '20140617',
+            'title': 'FRONTIERS - Trapping',
+            'description': 'md5:bf6f7f773def614054089e5769c12a6e',
+            'uploader': 'AAD Productions',
+            'uploader_id': 'AtomicAgeDogGames',
+        }
+    }]
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        fileID = m.group('fileID')
+        if fileID:
+            videourl = url
+            playlist_id = fileID
+        else:
+            gameID = m.group('gameID')
+            playlist_id = gameID
+            videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id
+
+        self._set_cookie('steampowered.com', 'mature_content', '1')
+
+        webpage = self._download_webpage(videourl, playlist_id)
+
+        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
+            videourl = self._AGECHECK_TEMPLATE % playlist_id
+            self.report_age_confirmation()
+            webpage = self._download_webpage(videourl, playlist_id)
+
+        flash_vars = self._parse_json(self._search_regex(
+            r'(?s)rgMovieFlashvars\s*=\s*({.+?});', webpage,
+            'flash vars'), playlist_id, js_to_json)
+
+        playlist_title = None
+        entries = []
+        if fileID:
+            playlist_title = get_element_by_class('workshopItemTitle', webpage)
+            for movie in flash_vars.values():
+                if not movie:
+                    continue
+                youtube_id = movie.get('YOUTUBE_VIDEO_ID')
+                if not youtube_id:
+                    continue
+                entries.append({
+                    '_type': 'url',
+                    'url': youtube_id,
+                    'ie_key': 'Youtube',
+                })
+        else:
+            playlist_title = get_element_by_class('apphub_AppName', webpage)
+            for movie_id, movie in flash_vars.items():
+                if not movie:
+                    continue
+                video_id = self._search_regex(r'movie_(\d+)', movie_id, 'video id', fatal=False)
+                title = movie.get('MOVIE_NAME')
+                if not title or not video_id:
+                    continue
+                entry = {
+                    'id': video_id,
+                    'title': title.replace('+', ' '),
+                }
+                formats = []
+                flv_url = movie.get('FILENAME')
+                if flv_url:
+                    formats.append({
+                        'format_id': 'flv',
+                        'url': flv_url,
+                    })
+                highlight_element = self._search_regex(
+                    r'(<div[^>]+id="highlight_movie_%s"[^>]+>)' % video_id,
+                    webpage, 'highlight element', fatal=False)
+                if highlight_element:
+                    highlight_attribs = extract_attributes(highlight_element)
+                    if highlight_attribs:
+                        entry['thumbnail'] = highlight_attribs.get('data-poster')
+                        for quality in ('', '-hd'):
+                            for ext in ('webm', 'mp4'):
+                                video_url = highlight_attribs.get('data-%s%s-source' % (ext, quality))
+                                if video_url:
+                                    formats.append({
+                                        'format_id': ext + quality,
+                                        'url': video_url,
+                                    })
+                if not formats:
+                    continue
+                entry['formats'] = formats
+                entries.append(entry)
+        if not entries:
+            raise ExtractorError('Could not find any videos')
+
+        return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py
new file mode 100644 (file)
index 0000000..97d1ff6
--- /dev/null
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+    unescapeHTML,
+)
+
+
+class StitcherIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
+    _TESTS = [{
+        'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
+        'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
+        'info_dict': {
+            'id': '40789481',
+            'ext': 'mp3',
+            'title': 'Machine Learning Mastery and Cancer Clusters',
+            'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
+            'duration': 1604,
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+    }, {
+        'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
+        'info_dict': {
+            'id': '40846275',
+            'display_id': 'the-rare-hourlong-comedy-plus',
+            'ext': 'mp3',
+            'title': "The CW's 'Crazy Ex-Girlfriend'",
+            'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
+            'duration': 2235,
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # escaped title
+        'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        audio_id = mobj.group('id')
+        display_id = mobj.group('display_id') or audio_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        episode = self._parse_json(
+            js_to_json(self._search_regex(
+                r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),
+            display_id)['config']['episode']
+
+        title = unescapeHTML(episode['title'])
+        formats = [{
+            'url': episode[episode_key],
+            # pass 'mp3' as the default: determine_ext() already returns a
+            # truthy fallback ('unknown_video'), so `or 'mp3'` never applied
+            'ext': determine_ext(episode[episode_key], 'mp3'),
+            'vcodec': 'none',
+        } for episode_key in ('episodeURL',) if episode.get(episode_key)]
+        description = self._search_regex(
+            r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)
+        duration = int_or_none(episode.get('duration'))
+        thumbnail = episode.get('episodeImage')
+
+        return {
+            'id': audio_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py
new file mode 100644 (file)
index 0000000..67457cc
--- /dev/null
@@ -0,0 +1,255 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+from .common import InfoExtractor
+
+
+class StoryFireIE(InfoExtractor):
+    _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire\.app\.link))/(?P<id>[^/\s]+)'
+    _TESTS = [{
+        'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
+        'md5': '560953bfca81a69003cfa5e53ac8a920',
+        'info_dict': {
+            'id': '5df1d132b6378700117f9181',
+            'ext': 'mp4',
+            'title': 'Buzzfeed Teaches You About Memes',
+            'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+            'timestamp': 1576129028,
+            'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies',
+            'uploader': 'whang!',
+            'upload_date': '20191212',
+        },
+        'params': {'format': 'bestvideo'}  # There are no merged formats in the playlist.
+    }, {
+        'url': 'https://storyfire.app.link/5GxAvWOQr8',  # Alternate URL format, with unrelated short ID
+        'md5': '7a2dc6d60c4889edfed459c620fe690d',
+        'info_dict': {
+            'id': '5f1e11ecd78a57b6c702001d',
+            'ext': 'm4a',
+            'title': 'Weird Nintendo Prototype Leaks',
+            'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis',
+            'timestamp': 1595808576,
+            'upload_date': '20200727',
+            'uploader': 'whang!',
+            'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+        },
+        'params': {'format': 'bestaudio'}  # Verifying audio extraction
+
+    }]
+
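+    # audio-only HLS renditions carry no codec/bitrate metadata in the
+    # playlist; fill in values observed from sample streams so the format
+    # selector can rank them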
+    _aformats = {
+        'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10},
+        'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1},
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # Extracting the json blob is mandatory to proceed with extraction.
+        jsontext = self._html_search_regex(
+            r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
+            webpage, 'json_data')
+
+        json = self._parse_json(jsontext, video_id)
+
+        # The currentVideo field in the json is mandatory
+        # because it contains the only link to the m3u playlist
+        video = json['props']['initialState']['video']['currentVideo']
+        videourl = video['vimeoVideoURL']  # Video URL is mandatory
+
+        # Extract other fields from the json in an error tolerant fashion
+        # ID may be incorrect (on short URL format), correct it.
+        parsed_id = video.get('_id')
+        if parsed_id:
+            video_id = parsed_id
+
+        title = video.get('title')
+        description = video.get('description')
+
+        thumbnail = video.get('storyImage')
+        views = video.get('views')
+        likes = video.get('likesCount')
+        comments = video.get('commentsCount')
+        duration = video.get('videoDuration')
+        publishdate = video.get('publishDate')  # Apparently epoch time, day only
+
+        uploader = video.get('username')
+        uploader_id = video.get('hostID')
+        # Construct an uploader URL
+        uploader_url = None
+        if uploader_id:
+            uploader_url = "https://storyfire.com/user/%s/video" % uploader_id
+
+        # Collect root playlist to determine formats
+        formats = self._extract_m3u8_formats(
+            videourl, video_id, 'mp4', 'm3u8_native')
+
+        # Modify formats to fill in missing information about audio codecs
+        for f in formats:  # avoid shadowing the builtin `format`
+            aformat = self._aformats.get(f['format_id'])
+            if aformat:
+                f.update(aformat)
+                f['ext'] = 'm4a'
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'ext': "mp4",
+            'url': videourl,
+            'formats': formats,
+
+            'thumbnail': thumbnail,
+            'view_count': views,
+            'like_count': likes,
+            'comment_count': comments,
+            'duration': duration,
+            'timestamp': publishdate,
+
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+
+        }
+
+
+class StoryFireUserIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video'
+    _TESTS = [{
+        'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video',
+        'info_dict': {
+            'id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+            'title': 'whang!',
+        },
+        'playlist_mincount': 18
+    }, {
+        'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
+        'info_dict': {
+            'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
+            'title': 'McJuggerNuggets',
+        },
+        'playlist_mincount': 143
+
+    }]
+
+    # Generator for fetching playlist items
+    def _enum_videos(self, baseurl, user_id, firstjson):
+        totalVideos = int(firstjson['videosCount'])
+        haveVideos = 0
+        json = firstjson
+
+        for page in itertools.count(1):
+            for video in json['videos']:
+                video_id = video['_id']  # don't shadow the builtin `id`
+                url = "https://storyfire.com/video-details/%s" % video_id
+                haveVideos += 1
+                yield {
+                    '_type': 'url',
+                    'id': video_id,
+                    'url': url,
+                    'ie_key': 'StoryFire',
+
+                    'title': video.get('title'),
+                    'description': video.get('description'),
+                    'view_count': video.get('views'),
+                    'comment_count': video.get('commentsCount'),
+                    'duration': video.get('videoDuration'),
+                    'timestamp': video.get('publishDate'),
+                }
+            # Are there more pages we could fetch?
+            if haveVideos < totalVideos:
+                pageurl = baseurl + ("%i" % haveVideos)
+                json = self._download_json(pageurl, user_id,
+                                           note='Downloading page %s' % page)
+
+                # Are there any videos in the new json?
+                videos = json.get('videos')
+                if not videos:
+                    break  # no videos
+
+            else:
+                break  # We have fetched all the videos, stop
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id
+
+        # Download first page to ensure it can be downloaded, and get user information if available.
+        firstpage = baseurl + "0"
+        firstjson = self._download_json(firstpage, user_id)
+
+        title = None
+        videos = firstjson.get('videos')
+        if videos:
+            # every video carries the uploader's name; take it from the first
+            title = videos[0].get('username')
+
+        return {
+            '_type': 'playlist',
+            'entries': self._enum_videos(baseurl, user_id, firstjson),
+            'id': user_id,
+            'title': title,
+        }
+
+
+class StoryFireSeriesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)'
+    _TESTS = [{
+        'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
+        'info_dict': {
+            'id': '-Lq6MsuIHLODO6d2dDkr',
+        },
+        'playlist_mincount': 13
+    }, {
+        'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
+        'info_dict': {
+            'id': 'the_mortal_one',
+        },
+        'playlist_count': 0  # This playlist has entries, but no videos.
+    }, {
+        'url': 'https://storyfire.com/write/series/stories/story_time',
+        'info_dict': {
+            'id': 'story_time',
+        },
+        'playlist_mincount': 10
+    }]
+
+    # Generator for returning playlist items
+    # This object is substantially different than the one in the user videos page above
+    def _enum_videos(self, jsonlist):
+        for video in jsonlist:
+            video_id = video['_id']  # don't shadow the builtin `id`
+            if video.get('hasVideo'):  # Boolean element
+                url = "https://storyfire.com/video-details/%s" % video_id
+                yield {
+                    '_type': 'url',
+                    'id': video_id,
+                    'url': url,
+                    'ie_key': 'StoryFire',
+
+                    'title': video.get('title'),
+                    'description': video.get('description'),
+                    'view_count': video.get('views'),
+                    'like_count': video.get('likesCount'),
+                    'comment_count': video.get('commentsCount'),
+                    'duration': video.get('videoDuration'),
+                    'timestamp': video.get('publishDate'),
+                }
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+
+        listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id
+        json = self._download_json(listurl, list_id)
+
+        return {
+            '_type': 'playlist',
+            'entries': self._enum_videos(json),
+            'id': list_id
+        }
diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py
new file mode 100644 (file)
index 0000000..3472527
--- /dev/null
@@ -0,0 +1,112 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+)
+
+
+class StreamableIE(InfoExtractor):
+    _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
+    _TESTS = [
+        {
+            'url': 'https://streamable.com/dnd1',
+            'md5': '3e3bc5ca088b48c2d436529b64397fef',
+            'info_dict': {
+                'id': 'dnd1',
+                'ext': 'mp4',
+                'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol',
+                'thumbnail': r're:https?://.*\.jpg$',
+                'uploader': 'teabaker',
+                'timestamp': 1454964157.35115,
+                'upload_date': '20160208',
+                'duration': 61.516,
+                'view_count': int,
+            }
+        },
+        # older video without bitrate, width/height, etc. info
+        {
+            'url': 'https://streamable.com/moo',
+            'md5': '2cf6923639b87fba3279ad0df3a64e73',
+            'info_dict': {
+                'id': 'moo',
+                'ext': 'mp4',
+                'title': '"Please don\'t eat me!"',
+                'thumbnail': r're:https?://.*\.jpg$',
+                'timestamp': 1426115495,
+                'upload_date': '20150311',
+                'duration': 12,
+                'view_count': int,
+            }
+        },
+        {
+            'url': 'https://streamable.com/e/dnd1',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://streamable.com/s/okkqk/drxjds',
+            'only_matching': True,
+        }
+    ]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)',
+            webpage)
+        if mobj:
+            return mobj.group('src')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Note: Using the ajax API, as the public Streamable API doesn't seem
+        # to return video info like the title properly sometimes, and doesn't
+        # include info like the video duration
+        video = self._download_json(
+            'https://ajax.streamable.com/videos/%s' % video_id, video_id)
+
+        # Format IDs:
+        # 0 The video is being uploaded
+        # 1 The video is being processed
+        # 2 The video has at least one file ready
+        # 3 The video is unavailable due to an error
+        status = video.get('status')
+        if status != 2:
+            raise ExtractorError(
+                'This video is currently unavailable. It may still be uploading or processing.',
+                expected=True)
+
+        title = video.get('reddit_title') or video['title']
+
+        formats = []
+        for key, info in video['files'].items():
+            if not info.get('url'):
+                continue
+            formats.append({
+                'format_id': key,
+                'url': self._proto_relative_url(info['url']),
+                'width': int_or_none(info.get('width')),
+                'height': int_or_none(info.get('height')),
+                'filesize': int_or_none(info.get('size')),
+                'fps': int_or_none(info.get('framerate')),
+                'vbr': float_or_none(info.get('bitrate'), 1000)
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': self._proto_relative_url(video.get('thumbnail_url')),
+            'uploader': (video.get('owner') or {}).get('user_name'),
+            'timestamp': float_or_none(video.get('date_added')),
+            'duration': float_or_none(video.get('duration')),
+            'view_count': int_or_none(video.get('plays')),
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
new file mode 100644 (file)
index 0000000..32eb2b9
--- /dev/null
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    urlencode_postdata,
+)
+
+
+class StreamcloudIE(InfoExtractor):
+    IE_NAME = 'streamcloud.eu'
+    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
+
+    _TESTS = [{
+        'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dlc_test_video_____________-BaW_jenozKc.mp4.html',
+        'md5': '6bea4c7fa5daaacc2a946b7146286686',
+        'info_dict': {
+            'id': 'skp9j99s4bpz',
+            'ext': 'mp4',
+            'title': 'youtube-dlc test video  \'/\\ ä ↭',
+        },
+        'skip': 'Only available from the EU'
+    }, {
+        'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        url = 'http://streamcloud.eu/%s' % video_id
+
+        orig_webpage = self._download_webpage(url, video_id)
+
+        if '>File Not Found<' in orig_webpage:
+            raise ExtractorError(
+                'Video %s does not exist' % video_id, expected=True)
+
+        fields = re.findall(r'''(?x)<input\s+
+            type="(?:hidden|submit)"\s+
+            name="([^"]+)"\s+
+            (?:id="[^"]+"\s+)?
+            value="([^"]*)"
+            ''', orig_webpage)
+
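+        # the site appears to enforce a countdown before the form may be
+        # submitted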
+        self._sleep(6, video_id)
+
+        webpage = self._download_webpage(
+            url, video_id, data=urlencode_postdata(fields), headers={
+                b'Content-Type': b'application/x-www-form-urlencoded',
+            })
+
+        try:
+            title = self._html_search_regex(
+                r'<h1[^>]*>([^<]+)<', webpage, 'title')
+            video_url = self._search_regex(
+                r'file:\s*"([^"]+)"', webpage, 'video URL')
+        except ExtractorError:
+            message = self._html_search_regex(
+                r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>',
+                webpage, 'message', default=None, group='message')
+            if message:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+            raise
+        thumbnail = self._search_regex(
+            r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'http_headers': {
+                'Referer': url,
+            },
+        }
diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py
new file mode 100644 (file)
index 0000000..58e0b4c
--- /dev/null
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    sanitized_Request,
+)
+
+
+def _get_api_key(api_path):
+    if api_path.endswith('?'):
+        api_path = api_path[:-1]
+
+    api_key = 'fb5f58a820353bd7095de526253c14fd'
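+    # the key rotates daily: md5 of static key + API path + days since epoch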
+    a = '{0:}{1:}{2:}'.format(api_key, api_path, int(round(time.time() / 24 / 3600)))
+    return hashlib.md5(a.encode('ascii')).hexdigest()
+
+
+class StreamCZIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<id>[0-9]+)'
+    _API_URL = 'http://www.stream.cz/API'
+
+    _TESTS = [{
+        'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
+        'md5': '934bb6a6d220d99c010783c9719960d5',
+        'info_dict': {
+            'id': '765767',
+            'ext': 'mp4',
+            'title': 'Peklo na talíři: Éčka pro děti',
+            'description': 'Taška s grónskou pomazánkou a další pekelnosti ZDE',
+            'thumbnail': 're:^http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
+            'duration': 256,
+        },
+    }, {
+        'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
+        'md5': '849a88c1e1ca47d41403c2ba5e59e261',
+        'info_dict': {
+            'id': '10002447',
+            'ext': 'mp4',
+            'title': 'Kancelář Blaník: Tři roky pro Mazánka',
+            'description': 'md5:3862a00ba7bf0b3e44806b544032c859',
+            'thumbnail': 're:^http://im.stream.cz/episode/537f838c50c11f8d21320000',
+            'duration': 368,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        api_path = '/episode/%s' % video_id
+
+        req = sanitized_Request(self._API_URL + api_path)
+        req.add_header('Api-Password', _get_api_key(api_path))
+        data = self._download_json(req, video_id)
+
+        formats = []
+        for quality, video in enumerate(data['video_qualities']):
+            for f in video['formats']:
+                typ = f['type'].partition('/')[2]
+                qlabel = video.get('quality_label')
+                formats.append({
+                    'format_note': '%s-%s' % (qlabel, typ) if qlabel else typ,
+                    'format_id': '%s-%s' % (typ, f['quality']),
+                    'url': f['source'],
+                    'height': int_or_none(f['quality'].rstrip('p')),
+                    'quality': quality,
+                })
+        self._sort_formats(formats)
+
+        image = data.get('image')
+        if image:
+            thumbnail = self._proto_relative_url(
+                image.replace('{width}', '1240').replace('{height}', '697'),
+                scheme='http:',
+            )
+        else:
+            thumbnail = None
+
+        stream = data.get('_embedded', {}).get('stream:show', {}).get('name')
+        if stream:
+            title = '%s: %s' % (stream, data['name'])
+        else:
+            title = data['name']
+
+        subtitles = {}
+        srt_url = data.get('subtitles_srt')
+        if srt_url:
+            subtitles['cs'] = [{
+                'ext': 'srt',
+                'url': srt_url,
+            }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+            'description': data.get('web_site_text'),
+            'duration': int_or_none(data.get('duration')),
+            'view_count': int_or_none(data.get('views')),
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py
new file mode 100644 (file)
index 0000000..91612c7
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import unified_strdate
+
+
+class StreetVoiceIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://streetvoice.com/skippylu/songs/94440/',
+        'md5': '15974627fc01a29e492c98593c2fd472',
+        'info_dict': {
+            'id': '94440',
+            'ext': 'mp3',
+            'title': '輸',
+            'description': 'Crispy脆樂團 - 輸',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 260,
+            'upload_date': '20091018',
+            'uploader': 'Crispy脆樂團',
+            'uploader_id': '627810',
+        }
+    }, {
+        'url': 'http://tw.streetvoice.com/skippylu/songs/94440/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        song_id = self._match_id(url)
+
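+        # data=b'' forces the request to be sent as a POST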
+        song = self._download_json(
+            'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'')
+
+        title = song['name']
+        author = song['user']['nickname']
+
+        return {
+            'id': song_id,
+            'url': song['file'],
+            'title': title,
+            'description': '%s - %s' % (author, title),
+            'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
+            'duration': song.get('length'),
+            'upload_date': unified_strdate(song.get('created_at')),
+            'uploader': author,
+            'uploader_id': compat_str(song['user']['id']),
+        }
diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py
new file mode 100644 (file)
index 0000000..4dbead2
--- /dev/null
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class StretchInternetIE(InfoExtractor):
+    _VALID_URL = r'https?://portal\.stretchinternet\.com/[^/]+/(?:portal|full)\.htm\?.*?\beventId=(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://portal.stretchinternet.com/umary/portal.htm?eventId=573272&streamType=video',
+        'info_dict': {
+            'id': '573272',
+            'ext': 'mp4',
+            'title': 'University of Mary Wrestling vs. Upper Iowa',
+            'timestamp': 1575668361,
+            'upload_date': '20191206',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        event = self._download_json(
+            'https://api.stretchinternet.com/trinity/event/tcg/' + video_id,
+            video_id)[0]
+
+        return {
+            'id': video_id,
+            'title': event['title'],
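+            # dateCreated is in milliseconds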
+            'timestamp': int_or_none(event.get('dateCreated'), 1000),
+            'url': 'https://' + event['media'][0]['url'],
+        }
diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py
new file mode 100644 (file)
index 0000000..bae8b71
--- /dev/null
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    float_or_none,
+    int_or_none,
+)
+
+
+class STVPlayerIE(InfoExtractor):
+    IE_NAME = 'stv:player'
+    _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})'
+    _TEST = {
+        'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/',
+        'md5': '5adf9439c31d554f8be0707c7abe7e0a',
+        'info_dict': {
+            'id': '5333973339001',
+            'ext': 'mp4',
+            'upload_date': '20170301',
+            'title': '60 seconds on set with Laura Norton',
+            'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!",
+            'timestamp': 1488388054,
+            'uploader_id': '1486976045',
+        },
+        'skip': 'this resource is unavailable outside of the UK',
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s'
+    _PTYPE_MAP = {
+        'episode': 'episodes',
+        'video': 'shortform',
+    }
+
+    def _real_extract(self, url):
+        ptype, video_id = re.match(self._VALID_URL, url).groups()
+        resp = self._download_json(
+            'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id),
+            video_id)
+
+        result = resp['results']
+        video = result['video']
+        video_id = compat_str(video['id'])
+
+        subtitles = {}
+        _subtitles = result.get('_subtitles') or {}
+        for ext, sub_url in _subtitles.items():
+            subtitles.setdefault('en', []).append({
+                'ext': 'vtt' if ext == 'webvtt' else ext,
+                'url': sub_url,
+            })
+
+        programme = result.get('programme') or {}
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id,
+            'description': result.get('summary'),
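+            # the API reports length in milliseconds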
+            'duration': float_or_none(video.get('length'), 1000),
+            'subtitles': subtitles,
+            'view_count': int_or_none(result.get('views')),
+            'series': programme.get('name') or programme.get('shortName'),
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py
new file mode 100644 (file)
index 0000000..6805116
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+    qualities,
+    determine_ext,
+)
+
+
+class SunPornoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.sunporno.com/videos/807778/',
+        'md5': '507887e29033502f29dba69affeebfc9',
+        'info_dict': {
+            'id': '807778',
+            'ext': 'mp4',
+            'title': 'md5:0a400058e8105d39e35c35e7c5184164',
+            'description': 'md5:a31241990e1bd3a64e72ae99afb325fb',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 302,
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://embeds.sunporno.com/embed/807778',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.sunporno.com/videos/%s' % video_id, video_id)
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        description = self._html_search_meta(
+            'description', webpage, 'description')
+        thumbnail = self._html_search_regex(
+            r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+
+        duration = parse_duration(self._search_regex(
+            (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<',
+             r'>Duration:\s*<span[^>]+>\s*(\d+:\d+)\s*<'),
+            webpage, 'duration', fatal=False))
+
+        view_count = int_or_none(self._html_search_regex(
+            r'class="views">(?:<noscript>)?\s*(\d+)\s*<',
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._html_search_regex(
+            r'(\d+)</b> Comments?',
+            webpage, 'comment count', fatal=False, default=None))
+
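+        # qualities() ranks by list position, so later entries ('flv') outrank earlier ones ('mp4').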
+        formats = []
+        quality = qualities(['mp4', 'flv'])
+        for video_url in re.findall(r'<(?:source|video) src="([^"]+)"', webpage):
+            video_ext = determine_ext(video_url)
+            formats.append({
+                'url': video_url,
+                'format_id': video_ext,
+                'quality': quality(video_ext),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/sverigesradio.py b/youtube_dl/extractor/sverigesradio.py
new file mode 100644 (file)
index 0000000..aa0691f
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    str_or_none,
+)
+
+
+class SverigesRadioBaseIE(InfoExtractor):
+    _BASE_URL = 'https://sverigesradio.se/sida/playerajax/'
+    _QUALITIES = ['low', 'medium', 'high']
+    _EXT_TO_CODEC_MAP = {
+        'mp3': 'mp3',
+        'm4a': 'aac',
+    }
+    _CODING_FORMAT_TO_ABR_MAP = {
+        5: 128,
+        11: 192,
+        12: 32,
+        13: 96,
+    }
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+        query = {
+            'id': audio_id,
+            'type': self._AUDIO_TYPE,
+        }
+
+        item = self._download_json(
+            self._BASE_URL + 'audiometadata', audio_id,
+            'Downloading audio JSON metadata', query=query)['items'][0]
+        title = item['subtitle']
+
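+        # Request each quality tier separately, skipping duplicate audio URLs.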
+        query['format'] = 'iis'
+        urls = []
+        formats = []
+        for quality in self._QUALITIES:
+            query['quality'] = quality
+            audio_url_data = self._download_json(
+                self._BASE_URL + 'getaudiourl', audio_id,
+                'Downloading %s format JSON metadata' % quality,
+                fatal=False, query=query) or {}
+            audio_url = audio_url_data.get('audioUrl')
+            if not audio_url or audio_url in urls:
+                continue
+            urls.append(audio_url)
+            ext = determine_ext(audio_url)
+            coding_format = audio_url_data.get('codingFormat')
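+            # The bitrate is embedded in m4a filenames (e.g. _a192.m4a); otherwise fall back to the codingFormat map.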
+            abr = int_or_none(self._search_regex(
+                r'_a(\d+)\.m4a', audio_url, 'audio bitrate',
+                default=None)) or self._CODING_FORMAT_TO_ABR_MAP.get(coding_format)
+            formats.append({
+                'abr': abr,
+                'acodec': self._EXT_TO_CODEC_MAP.get(ext),
+                'ext': ext,
+                'format_id': str_or_none(coding_format),
+                'vcodec': 'none',
+                'url': audio_url,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': audio_id,
+            'title': title,
+            'formats': formats,
+            'series': item.get('title'),
+            'duration': int_or_none(item.get('duration')),
+            'thumbnail': item.get('displayimageurl'),
+            'description': item.get('description'),
+        }
+
+
+class SverigesRadioPublicationIE(SverigesRadioBaseIE):
+    IE_NAME = 'sverigesradio:publication'
+    _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/sida/(?:artikel|gruppsida)\.aspx\?.*?\bartikel=(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546',
+        'md5': '6a4917e1923fccb080e5a206a5afa542',
+        'info_dict': {
+            'id': '7038546',
+            'ext': 'm4a',
+            'duration': 132,
+            'series': 'Nyheter (Ekot)',
+            'title': 'Esa Teittinen: Sanningen har inte kommit fram',
+            'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df',
+            'thumbnail': r're:^https?://.*\.jpg',
+        },
+    }, {
+        'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887',
+        'only_matching': True,
+    }]
+    _AUDIO_TYPE = 'publication'
+
+
+class SverigesRadioEpisodeIE(SverigesRadioBaseIE):
+    IE_NAME = 'sverigesradio:episode'
+    _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300',
+        'md5': '20dc4d8db24228f846be390b0c59a07c',
+        'info_dict': {
+            'id': '1140922',
+            'ext': 'mp3',
+            'duration': 3307,
+            'series': 'Konflikt',
+            'title': 'Metoo och valen',
+            'description': 'md5:fcb5c1f667f00badcc702b196f10a27e',
+            'thumbnail': r're:^https?://.*\.jpg',
+        }
+    }
+    _AUDIO_TYPE = 'episode'
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
new file mode 100644 (file)
index 0000000..e12389c
--- /dev/null
@@ -0,0 +1,384 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    dict_get,
+    int_or_none,
+    str_or_none,
+    strip_or_none,
+    try_get,
+)
+
+
+class SVTBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['SE']
+
+    def _extract_video(self, video_info, video_id):
+        is_live = dict_get(video_info, ('live', 'simulcast'), default=False)
+        m3u8_protocol = 'm3u8' if is_live else 'm3u8_native'
+        formats = []
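+        # Each video reference declares its own delivery format (HLS, HDS, DASH or progressive).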
+        for vr in video_info['videoReferences']:
+            player_type = vr.get('playerType') or vr.get('format')
+            vurl = vr['url']
+            ext = determine_ext(vurl)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    vurl, video_id,
+                    ext='mp4', entry_protocol=m3u8_protocol,
+                    m3u8_id=player_type, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    vurl + '?hdcore=3.3.0', video_id,
+                    f4m_id=player_type, fatal=False))
+            elif ext == 'mpd':
+                if player_type == 'dashhbbtv':
+                    formats.extend(self._extract_mpd_formats(
+                        vurl, video_id, mpd_id=player_type, fatal=False))
+            else:
+                formats.append({
+                    'format_id': player_type,
+                    'url': vurl,
+                })
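+        # No playable formats plus a geoBlockedSweden flag means the video is geo-restricted.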
+        if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
+            self.raise_geo_restricted(
+                'This video is only available in Sweden',
+                countries=self._GEO_COUNTRIES)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
+        if isinstance(subtitle_references, list):
+            for sr in subtitle_references:
+                subtitle_url = sr.get('url')
+                subtitle_lang = sr.get('language', 'sv')
+                if subtitle_url:
+                    if determine_ext(subtitle_url) == 'm3u8':
+                        # TODO(yan12125): handle WebVTT in m3u8 manifests
+                        continue
+
+                    subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
+
+        title = video_info.get('title')
+
+        series = video_info.get('programTitle')
+        season_number = int_or_none(video_info.get('season'))
+        episode = video_info.get('episodeTitle')
+        episode_number = int_or_none(video_info.get('episodeNumber'))
+
+        duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
+        age_limit = None
+        adult = dict_get(
+            video_info, ('inappropriateForChildren', 'blockedForChildren'),
+            skip_false_values=False)
+        if adult is not None:
+            age_limit = 18 if adult else 0
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'duration': duration,
+            'age_limit': age_limit,
+            'series': series,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'is_live': is_live,
+        }
+
+
+class SVTIE(SVTBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
+        'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
+        'info_dict': {
+            'id': '2900353',
+            'ext': 'mp4',
+            'title': 'Stjärnorna skojar till det - under SVT-intervjun',
+            'duration': 27,
+            'age_limit': 0,
+        },
+    }
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        widget_id = mobj.group('widget_id')
+        article_id = mobj.group('id')
+
+        info = self._download_json(
+            'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
+            article_id)
+
+        info_dict = self._extract_video(info['video'], article_id)
+        info_dict['title'] = info['context']['title']
+        return info_dict
+
+
+class SVTPlayBaseIE(SVTBaseIE):
+    _SVTPLAY_RE = r'root\s*\[\s*(["\'])_*svtplay\1\s*\]\s*=\s*(?P<json>{.+?})\s*;\s*\n'
+
+
+class SVTPlayIE(SVTPlayBaseIE):
+    IE_DESC = 'SVT Play and Öppet arkiv'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        svt:(?P<svt_id>[^/?#&]+)|
+                        https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
+                    )
+                    '''
+    _TESTS = [{
+        'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
+        'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
+        'info_dict': {
+            'id': '5996901',
+            'ext': 'mp4',
+            'title': 'Flygplan till Haile Selassie',
+            'duration': 3527,
+            'thumbnail': r're:^https?://.*[\.-]jpg$',
+            'age_limit': 0,
+            'subtitles': {
+                'sv': [{
+                    'ext': 'wsrt',
+                }]
+            },
+        },
+    }, {
+        # geo restricted to Sweden
+        'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.svtplay.se/kanaler/svt1',
+        'only_matching': True,
+    }, {
+        'url': 'svt:1376446-003A',
+        'only_matching': True,
+    }, {
+        'url': 'svt:14278044',
+        'only_matching': True,
+    }]
+
+    def _adjust_title(self, info):
+        if info['is_live']:
+            info['title'] = self._live_title(info['title'])
+
+    def _extract_by_video_id(self, video_id, webpage=None):
+        data = self._download_json(
+            'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+            video_id, headers=self.geo_verification_headers())
+        info_dict = self._extract_video(data, video_id)
+        if not info_dict.get('title'):
+            title = dict_get(info_dict, ('episode', 'series'))
+            if not title and webpage:
+                title = re.sub(
+                    r'\s*\|\s*.+?$', '', self._og_search_title(webpage))
+            if not title:
+                title = video_id
+            info_dict['title'] = title
+        self._adjust_title(info_dict)
+        return info_dict
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, svt_id = mobj.group('id', 'svt_id')
+
+        if svt_id:
+            return self._extract_by_video_id(svt_id)
+
+        webpage = self._download_webpage(url, video_id)
+
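+        # Prefer the embedded svtplay JSON blob; fall back to the data-video-id attribute below.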
+        data = self._parse_json(
+            self._search_regex(
+                self._SVTPLAY_RE, webpage, 'embedded data', default='{}',
+                group='json'),
+            video_id, fatal=False)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        if data:
+            video_info = try_get(
+                data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
+                dict)
+            if video_info:
+                info_dict = self._extract_video(video_info, video_id)
+                info_dict.update({
+                    'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
+                    'thumbnail': thumbnail,
+                })
+                self._adjust_title(info_dict)
+                return info_dict
+
+        svt_id = self._search_regex(
+            r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
+            webpage, 'video id')
+
+        return self._extract_by_video_id(svt_id, webpage)
+
+
+class SVTSeriesIE(SVTPlayBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?'
+    _TESTS = [{
+        'url': 'https://www.svtplay.se/rederiet',
+        'info_dict': {
+            'id': '14445680',
+            'title': 'Rederiet',
+            'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
+        },
+        'playlist_mincount': 318,
+    }, {
+        'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680',
+        'info_dict': {
+            'id': 'season-2-14445680',
+            'title': 'Rederiet - Säsong 2',
+            'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
+        },
+        'playlist_mincount': 12,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        series_slug, season_id = re.match(self._VALID_URL, url).groups()
+
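+        # A single GraphQL query returns the series metadata together with every episode's videoSvtId.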
+        series = self._download_json(
+            'https://api.svt.se/contento/graphql', series_slug,
+            'Downloading series page', query={
+                'query': '''{
+  listablesBySlug(slugs: ["%s"]) {
+    associatedContent(include: [productionPeriod, season]) {
+      items {
+        item {
+          ... on Episode {
+            videoSvtId
+          }
+        }
+      }
+      id
+      name
+    }
+    id
+    longDescription
+    name
+    shortDescription
+  }
+}''' % series_slug,
+            })['data']['listablesBySlug'][0]
+
+        season_name = None
+
+        entries = []
+        for season in series['associatedContent']:
+            if not isinstance(season, dict):
+                continue
+            if season_id:
+                if season.get('id') != season_id:
+                    continue
+                season_name = season.get('name')
+            items = season.get('items')
+            if not isinstance(items, list):
+                continue
+            for item in items:
+                video = item.get('item') or {}
+                content_id = video.get('videoSvtId')
+                if not content_id or not isinstance(content_id, compat_str):
+                    continue
+                entries.append(self.url_result(
+                    'svt:' + content_id, SVTPlayIE.ie_key(), content_id))
+
+        title = series.get('name')
+        season_name = season_name or season_id
+
+        if title and season_name:
+            title = '%s - %s' % (title, season_name)
+        elif season_id:
+            title = season_id
+
+        return self.playlist_result(
+            entries, season_id or series.get('id'), title,
+            dict_get(series, ('longDescription', 'shortDescription')))
+
+
+class SVTPageIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))'
+    _TESTS = [{
+        'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa',
+        'info_dict': {
+            'id': '25298267',
+            'title': 'Bakom masken – Lehners kamp mot mental ohälsa',
+        },
+        'playlist_count': 4,
+    }, {
+        'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien',
+        'info_dict': {
+            'id': '24243746',
+            'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien',
+        },
+        'playlist_count': 2,
+    }, {
+        # only programTitle
+        'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
+        'info_dict': {
+            'id': '8439V2K',
+            'ext': 'mp4',
+            'title': 'Stjärnorna skojar till det - under SVT-intervjun',
+            'duration': 27,
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.svt.se/vader/manadskronikor/maj2018',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        path, display_id = re.match(self._VALID_URL, url).groups()
+
+        article = self._download_json(
+            'https://api.svt.se/nss-api/page/' + path, display_id,
+            query={'q': 'articles'})['articles']['content'][0]
+
+        entries = []
+
+        def _process_content(content):
+            if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'):
+                video_id = compat_str(content['image']['svtId'])
+                entries.append(self.url_result(
+                    'svt:' + video_id, SVTPlayIE.ie_key(), video_id))
+
+        for media in article.get('media', []):
+            _process_content(media)
+
+        for obj in article.get('structuredBody', []):
+            _process_content(obj.get('content') or {})
+
+        return self.playlist_result(
+            entries, str_or_none(article.get('id')),
+            strip_or_none(article.get('title')))
diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py
new file mode 100644 (file)
index 0000000..0f61597
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+    determine_protocol,
+)
+
+
+class SWRMediathekIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/(?:content/)?player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+    _TESTS = [{
+        'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6',
+        'md5': '8c5f6f0172753368547ca8413a7768ac',
+        'info_dict': {
+            'id': '849790d0-dab8-11e3-a953-0026b975f2e6',
+            'ext': 'mp4',
+            'title': 'SWR odysso',
+            'description': 'md5:2012e31baad36162e97ce9eb3f157b8a',
+            'thumbnail': r're:^http:.*\.jpg$',
+            'duration': 2602,
+            'upload_date': '20140515',
+            'uploader': 'SWR Fernsehen',
+            'uploader_id': '990030',
+        },
+    }, {
+        'url': 'http://swrmediathek.de/player.htm?show=0e1a8510-ddf2-11e3-9be3-0026b975f2e6',
+        'md5': 'b10ab854f912eecc5a6b55cd6fc1f545',
+        'info_dict': {
+            'id': '0e1a8510-ddf2-11e3-9be3-0026b975f2e6',
+            'ext': 'mp4',
+            'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen',
+            'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2',
+            'thumbnail': r're:http://.*\.jpg',
+            'duration': 5305,
+            'upload_date': '20140516',
+            'uploader': 'SWR Fernsehen',
+            'uploader_id': '990030',
+        },
+        'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink',
+    }, {
+        'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6',
+        'md5': '4382e4ef2c9d7ce6852535fa867a0dd3',
+        'info_dict': {
+            'id': 'bba23e10-cb93-11e3-bf7f-0026b975f2e6',
+            'ext': 'mp3',
+            'title': 'Saša Stanišic: Vor dem Fest',
+            'description': 'md5:5b792387dc3fbb171eb709060654e8c9',
+            'thumbnail': r're:http://.*\.jpg',
+            'duration': 3366,
+            'upload_date': '20140520',
+            'uploader': 'SWR 2',
+            'uploader_id': '284670',
+        },
+        'skip': 'redirect to http://swrmediathek.de/index.htm?hinweis=swrlink',
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id,
+            video_id, 'Downloading video JSON')
+
+        attr = video['attr']
+        title = attr['entry_title']
+        media_type = attr.get('entry_etype')
+
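+        # Media variants live in the 'sub' list: val0 holds the codec/protocol, val2 the URL, val1 the quality.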
+        formats = []
+        for entry in video.get('sub', []):
+            if entry.get('name') != 'entry_media':
+                continue
+
+            entry_attr = entry.get('attr', {})
+            f_url = entry_attr.get('val2')
+            if not f_url:
+                continue
+            codec = entry_attr.get('val0')
+            if codec == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    f_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif codec == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    f_url + '?hdcore=3.7.0', video_id,
+                    f4m_id='hds', fatal=False))
+            else:
+                formats.append({
+                    'format_id': determine_protocol({'url': f_url}),
+                    'url': f_url,
+                    'quality': int_or_none(entry_attr.get('val1')),
+                    'vcodec': codec if media_type == 'Video' else 'none',
+                    'acodec': codec if media_type == 'Audio' else None,
+                })
+        self._sort_formats(formats)
+
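+        # entry_pdatet ends with a 4-digit time component; stripping it leaves a YYYYMMDD upload date.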
+        upload_date = None
+        entry_pdatet = attr.get('entry_pdatet')
+        if entry_pdatet:
+            upload_date = entry_pdatet[:-4]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': attr.get('entry_descl'),
+            'thumbnail': attr.get('entry_image_16_9'),
+            'duration': parse_duration(attr.get('entry_durat')),
+            'upload_date': upload_date,
+            'uploader': attr.get('channel_title'),
+            'uploader_id': attr.get('channel_idkey'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py
new file mode 100644 (file)
index 0000000..def7e5a
--- /dev/null
@@ -0,0 +1,59 @@
+from __future__ import unicode_literals
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    update_url_query,
+    smuggle_url,
+)
+
+
+class SyfyIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer',
+        'info_dict': {
+            'id': '2968097',
+            'ext': 'mp4',
+            'title': 'The Internet Ruined My Life: Season 1 Trailer',
+            'description': 'One tweet, one post, one click, can destroy everything.',
+            'uploader': 'NBCU-MPAT',
+            'upload_date': '20170113',
+            'timestamp': 1484345640,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        syfy_mpx = list(self._parse_json(self._search_regex(
+            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'),
+            display_id)['syfy']['syfy_mpx'].values())[0]
+        video_id = syfy_mpx['mpxGUID']
+        title = syfy_mpx['episodeTitle']
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
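+        # Provider-entitled content needs an Adobe Pass auth token appended to the ThePlatform query.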
+        if syfy_mpx.get('entitlement') == 'auth':
+            resource = self._get_mvpd_resource(
+                'syfy', title, video_id,
+                syfy_mpx.get('mpxRating', 'TV-14'))
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, 'syfy', resource)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(update_url_query(
+                self._proto_relative_url(syfy_mpx['releaseURL']), query),
+                {'force_smil_url': True}),
+            'title': title,
+            'id': video_id,
+            'display_id': display_id,
+        }
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
new file mode 100644 (file)
index 0000000..cfad331
--- /dev/null
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SztvHuIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
+        'md5': 'a6df607b11fb07d0e9f2ad94613375cb',
+        'info_dict': {
+            'id': '20130909',
+            'ext': 'mp4',
+            'title': 'Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren',
+            'description': 'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
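+        # The player config stores the path as file: "<3 chars>:<path>"; only the path part is captured.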
+        video_file = self._search_regex(
+            r'file: "...:(.*?)",', webpage, 'video file')
+        title = self._html_search_regex(
+            r'<meta name="title" content="([^"]*?) - [^-]*? - [^-]*?"',
+            webpage, 'video title')
+        description = self._html_search_regex(
+            r'<meta name="description" content="([^"]*)"/>',
+            webpage, 'video description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        video_url = 'http://media.sztv.hu/vod/' + video_file
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
new file mode 100644 (file)
index 0000000..c351b75
--- /dev/null
@@ -0,0 +1,314 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    parse_iso8601,
+    parse_filesize,
+)
+
+
+class TagesschauPlayerIE(InfoExtractor):
+    IE_NAME = 'tagesschau:player'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
+        'md5': '8d09548d5c15debad38bee3a4d15ca21',
+        'info_dict': {
+            'id': '179517',
+            'ext': 'mp4',
+            'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
+            'thumbnail': r're:^https?:.*\.jpg$',
+            'formats': 'mincount:6',
+        },
+    }, {
+        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
+        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+        'info_dict': {
+            'id': '29417',
+            'ext': 'mp3',
+            'title': 'Trabi - Bye, bye Rennpappe',
+            'thumbnail': r're:^https?:.*\.jpg$',
+            'formats': 'mincount:2',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
+        'only_matching': True,
+    }]
+
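+    # Quality labels used by the player, mapped to approximate dimensions and a sort rank.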
+    _FORMATS = {
+        'xs': {'quality': 0},
+        's': {'width': 320, 'height': 180, 'quality': 1},
+        'm': {'width': 512, 'height': 288, 'quality': 2},
+        'l': {'width': 960, 'height': 540, 'quality': 3},
+        'xl': {'width': 1280, 'height': 720, 'quality': 4},
+        'xxl': {'quality': 5},
+    }
+
+    def _extract_via_api(self, kind, video_id):
+        info = self._download_json(
+            'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
+            video_id)
+        title = info['headline']
+        formats = []
+        for media in info['mediadata']:
+            for format_id, format_url in media.items():
+                if determine_ext(format_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls'))
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'vcodec': 'none' if kind == 'audio' else None,
+                    })
+        self._sort_formats(formats)
+        timestamp = parse_iso8601(info.get('date'))
+        return {
+            'id': video_id,
+            'title': title,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        # kind = mobj.group('kind').lower()
+        # if kind == 'video':
+        #     return self._extract_via_api(kind, video_id)
+
+        # The JSON API does not provide some audio formats (e.g. ogg),
+        # so audio is extracted from the webpage instead
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage).strip()
+        formats = []
+
+        for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
+            media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
+            if not media:
+                continue
+            src = media.get('src')
+            if not src:
+                continue
+            quality = media.get('quality')
+            kind = media.get('type', '').split('/')[0]
+            ext = determine_ext(src)
+            f = {
+                'url': src,
+                'format_id': '%s_%s' % (quality, ext) if quality else ext,
+                'ext': ext,
+                'vcodec': 'none' if kind == 'audio' else None,
+            }
+            f.update(self._FORMATS.get(quality, {}))
+            formats.append(f)
+
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
+
+
+class TagesschauIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
+
+    _TESTS = [{
+        'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
+        'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
+        'info_dict': {
+            'id': 'video-102143',
+            'ext': 'mp4',
+            'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
+            'description': '18.07.2015 20:10 Uhr',
+            'thumbnail': r're:^https?:.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
+        'md5': '3c54c1f6243d279b706bde660ceec633',
+        'info_dict': {
+            'id': 'ts-5727',
+            'ext': 'mp4',
+            'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
+            'description': 'md5:695c01bfd98b7e313c501386327aea59',
+            'thumbnail': r're:^https?:.*\.jpg$',
+        },
+    }, {
+        # exclusive audio
+        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
+        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+        'info_dict': {
+            'id': 'audio-29417',
+            'ext': 'mp3',
+            'title': 'Trabi - Bye, bye Rennpappe',
+            'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
+            'thumbnail': r're:^https?:.*\.jpg$',
+        },
+    }, {
+        # audio in article
+        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
+        'md5': 'e0916c623e85fc1d2b26b78f299d3958',
+        'info_dict': {
+            'id': 'bnd-303',
+            'ext': 'mp3',
+            'title': 'Viele Baustellen für neuen BND-Chef',
+            'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
+            'thumbnail': r're:^https?:.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
+        'info_dict': {
+            'id': 'afd-parteitag-135',
+            'title': 'Möchtegern-Underdog mit Machtanspruch',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/100sekunden/index.html',
+        'only_matching': True,
+    }, {
+        # playlist article with collapsing sections
+        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
+
+    def _extract_formats(self, download_text, media_kind):
+        links = re.finditer(
+            r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
+            download_text)
+        formats = []
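+        # Each download button's title attribute encodes codec, resolution and bitrate details.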
+        for l in links:
+            link_url = l.group('url')
+            if not link_url:
+                continue
+            format_id = self._search_regex(
+                r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
+                default=determine_ext(link_url))
+            format = {
+                'format_id': format_id,
+                'url': l.group('url'),
+                'format_name': l.group('name'),
+            }
+            title = l.group('title')
+            if title:
+                if media_kind.lower() == 'video':
+                    m = re.match(
+                        r'''(?x)
+                            Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
+                            (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
+                            (?P<vbr>[0-9]+)kbps&\#10;
+                            Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
+                            Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
+                        title)
+                    if m:
+                        format.update({
+                            'format_note': m.group('audio_desc'),
+                            'vcodec': m.group('vcodec'),
+                            'width': int(m.group('width')),
+                            'height': int(m.group('height')),
+                            'abr': int(m.group('abr')),
+                            'vbr': int(m.group('vbr')),
+                            'filesize_approx': parse_filesize(m.group('filesize_approx')),
+                        })
+                else:
+                    m = re.match(
+                        r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
+                        title)
+                    if m:
+                        format.update({
+                            'format_note': '%s, %s' % (m.group('format'), m.group('note')),
+                            'vcodec': 'none',
+                            'abr': int(m.group('abr')),
+                        })
+            formats.append(format)
+        self._sort_formats(formats)
+        return formats
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
+        display_id = video_id.lstrip('-')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._html_search_regex(
+            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
+            webpage, 'title', default=None) or self._og_search_title(webpage)
+
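+        # Download links sit in "Wir bieten dieses Video/Audio ... zum Download an" blocks.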
+        DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
+
+        webpage_type = self._og_search_property('type', webpage, default=None)
+        if webpage_type == 'website':  # Article
+            entries = []
+            for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
+                    r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
+                    webpage), 1):
+                entries.append({
+                    'id': '%s-%d' % (display_id, num),
+                    'title': entry_title,
+                    'formats': self._extract_formats(download_text, media_kind),
+                })
+            if len(entries) > 1:
+                return self.playlist_result(entries, display_id, title)
+            formats = entries[0]['formats']
+        else:  # Assume single video
+            download_text = self._search_regex(
+                DOWNLOAD_REGEX, webpage, 'download links', group='links')
+            media_kind = self._search_regex(
+                DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
+            formats = self._extract_formats(download_text, media_kind)
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'(?s)<p class="teasertext">(.*?)</p>',
+            webpage, 'description', default=None)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': display_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/tass.py b/youtube_dl/extractor/tass.py
new file mode 100644 (file)
index 0000000..6d336da
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    js_to_json,
+    qualities,
+)
+
+
+class TassIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:tass\.ru|itar-tass\.com)/[^/]+/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'http://tass.ru/obschestvo/1586870',
+            'md5': '3b4cdd011bc59174596b6145cda474a4',
+            'info_dict': {
+                'id': '1586870',
+                'ext': 'mp4',
+                'title': 'Посетителям московского зоопарка показали красную панду',
+                'description': 'Приехавшую из Дублина Зейну можно увидеть в павильоне "Кошки тропиков"',
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+        },
+        {
+            'url': 'http://itar-tass.com/obschestvo/1600009',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
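+        # The player sources are a JavaScript literal; js_to_json() turns it into parseable JSON first.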
+        sources = json.loads(js_to_json(self._search_regex(
+            r'(?s)sources\s*:\s*(\[.+?\])', webpage, 'sources')))
+
+        quality = qualities(['sd', 'hd'])
+
+        formats = []
+        for source in sources:
+            video_url = source.get('file')
+            if not video_url or not video_url.startswith('http') or not video_url.endswith('.mp4'):
+                continue
+            label = source.get('label')
+            formats.append({
+                'url': video_url,
+                'format_id': label,
+                'quality': quality(label),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py
new file mode 100644 (file)
index 0000000..7fe96bd
--- /dev/null
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TastyTradeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017',
+        'info_dict': {
+            'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
+            'ext': 'mp4',
+            'title': 'A History of Teaming',
+            'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
+            'duration': 422.255,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        ooyala_code = self._search_regex(
+            r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1',
+            webpage, 'ooyala code', group='code')
+
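+        # JSON-LD supplies title/description/duration; playback itself is delegated to Ooyala.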
+        info = self._search_json_ld(webpage, display_id, fatal=False)
+        info.update({
+            '_type': 'url_transparent',
+            'ie_key': OoyalaIE.ie_key(),
+            'url': 'ooyala:%s' % ooyala_code,
+            'display_id': display_id,
+        })
+        return info
diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py
new file mode 100644 (file)
index 0000000..e8a7c65
--- /dev/null
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .turner import TurnerBaseIE
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_parse_qs,
+)
+from ..utils import (
+    float_or_none,
+    int_or_none,
+    strip_or_none,
+)
+
+
+class TBSIE(TurnerBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))'
+    _TESTS = [{
+        'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster',
+        'info_dict': {
+            'id': '8d384cde33b89f3a43ce5329de42903ed5099887',
+            'ext': 'mp4',
+            'title': 'Monster',
+            'description': 'Get a first look at the theatrical trailer for TNT’s highly anticipated new psychological thriller The Alienist, which premieres January 22 on TNT.',
+            'timestamp': 1508175329,
+            'upload_date': '20171016',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.tbs.com/shows/search-party/season-1/episode-1/explicit-the-mysterious-disappearance-of-the-girl-no-one-knew',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tntdrama.com/movies/star-wars-a-new-hope',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        site, path, display_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, display_id)
+        drupal_settings = self._parse_json(self._search_regex(
+            r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
+            webpage, 'drupal setting'), display_id)
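+        # Pick the playlist entry whose URL matches the requested page path.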
+        video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path)
+
+        media_id = video_data['mediaID']
+        title = video_data['title']
+        tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse(
+            drupal_settings['ngtv_token_url']).query)
+
+        info = self._extract_ngtv_info(
+            media_id, tokenizer_query, {
+                'url': url,
+                'site_name': site[:3].upper(),
+                'auth_required': video_data.get('authRequired') == '1',
+            })
+
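+        # Thumbnail dimensions are encoded in the image URL (e.g. 1920x1080).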
+        thumbnails = []
+        for image_id, image in video_data.get('images', {}).items():
+            image_url = image.get('url')
+            if not image_url or image.get('type') != 'video':
+                continue
+            i = {
+                'id': image_id,
+                'url': image_url,
+            }
+            mobj = re.search(r'(\d+)x(\d+)', image_url)
+            if mobj:
+                i.update({
+                    'width': int(mobj.group(1)),
+                    'height': int(mobj.group(2)),
+                })
+            thumbnails.append(i)
+
+        info.update({
+            'id': media_id,
+            'title': title,
+            'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')),
+            'duration': float_or_none(video_data.get('duration')) or info.get('duration'),
+            'timestamp': int_or_none(video_data.get('created')),
+            'season_number': int_or_none(video_data.get('season')),
+            'episode_number': int_or_none(video_data.get('episode')),
+            'thumbnails': thumbnails,
+        })
+        return info
diff --git a/youtube_dl/extractor/tdslifeway.py b/youtube_dl/extractor/tdslifeway.py
new file mode 100644 (file)
index 0000000..101c6ee
--- /dev/null
@@ -0,0 +1,34 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TDSLifewayIE(InfoExtractor):
+    _VALID_URL = r'https?://tds\.lifeway\.com/v1/trainingdeliverysystem/courses/(?P<id>\d+)/index\.html'
+
+    _TEST = {
+        # From http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers
+        'url': 'http://tds.lifeway.com/v1/trainingdeliverysystem/courses/3453494717001/index.html?externalRegistration=AssetId%7C34F466F1-78F3-4619-B2AB-A8EFFA55E9E9%21InstanceId%7C0%21UserId%7Caaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa&grouping=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&activity_id=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&content_endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2Fcontent%2F&actor=%7B%22name%22%3A%5B%22Guest%20Guest%22%5D%2C%22account%22%3A%5B%7B%22accountServiceHomePage%22%3A%22http%3A%2F%2Fscorm.lifeway.com%2F%22%2C%22accountName%22%3A%22aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa%22%7D%5D%2C%22objectType%22%3A%22Agent%22%7D&content_token=462a50b2-b6f9-4970-99b1-930882c499fb&registration=93d6ec8e-7f7b-4ed3-bbc8-a857913c0b2a&externalConfiguration=access%7CFREE%21adLength%7C-1%21assignOrgId%7C4AE36F78-299A-425D-91EF-E14A899B725F%21assignOrgParentId%7C%21courseId%7C%21isAnonymous%7Cfalse%21previewAsset%7Cfalse%21previewLength%7C-1%21previewMode%7Cfalse%21royalty%7CFREE%21sessionId%7C671422F9-8E79-48D4-9C2C-4EE6111EA1CD%21trackId%7C&auth=Basic%20OjhmZjk5MDBmLTBlYTMtNDJhYS04YjFlLWE4MWQ3NGNkOGRjYw%3D%3D&endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2F',
+        'info_dict': {
+            'id': '3453494717001',
+            'ext': 'mp4',
+            'title': 'The Gospel by Numbers',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'upload_date': '20140410',
+            'description': 'Coming soon from T4G 2014!',
+            'uploader_id': '2034960640001',
+            'timestamp': 1397145591,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['BrightcoveNew'],
+    }
+
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2034960640001/default_default/index.html?videoId=%s'
+
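+    # The course ID is a Brightcove video ID; hand it straight to the BrightcoveNew extractor.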
+    def _real_extract(self, url):
+        brightcove_id = self._match_id(url)
+        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/teachable.py b/youtube_dl/extractor/teachable.py
new file mode 100644 (file)
index 0000000..a75369d
--- /dev/null
@@ -0,0 +1,300 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .wistia import WistiaIE
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    get_element_by_class,
+    strip_or_none,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class TeachableBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'teachable'
+    _URL_PREFIX = 'teachable:'
+
+    _SITES = {
+        # Only notable ones here
+        'v1.upskillcourses.com': 'upskill',
+        'gns3.teachable.com': 'gns3',
+        'academyhacker.com': 'academyhacker',
+        'stackskills.com': 'stackskills',
+        'market.saleshacker.com': 'saleshacker',
+        'learnability.org': 'learnability',
+        'edurila.com': 'edurila',
+        'courses.workitdaily.com': 'workitdaily',
+    }
+
+    _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys()))
+
+    def _real_initialize(self):
+        self._logged_in = False
+
+    def _login(self, site):
+        if self._logged_in:
+            return
+
+        username, password = self._get_login_info(
+            netrc_machine=self._SITES.get(site, site))
+        if username is None:
+            return
+
+        login_page, urlh = self._download_webpage_handle(
+            'https://%s/sign_in' % site, None,
+            'Downloading %s login page' % site)
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'class=["\']user-signout',
+                r'<a[^>]+\bhref=["\']/sign_out',
+                r'Log\s+[Oo]ut\s*<'))
+
+        if is_logged(login_page):
+            self._logged_in = True
+            return
+
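+        # Not logged in yet: fill the hidden sign-in form and POST it with the stored credentials.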
+        login_url = urlh.geturl()
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'user[email]': username,
+            'user[password]': password,
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page,
+            'post url', default=login_url, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = urljoin(login_url, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in to %s' % site,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': login_url,
+            })
+
+        if '>I accept the new Privacy Policy<' in response:
+            raise ExtractorError(
+                'Unable to login: %s asks you to accept new Privacy Policy. '
+                'Go to https://%s/ and accept.' % (site, site), expected=True)
+
+        # Successful login
+        if is_logged(response):
+            self._logged_in = True
+            return
+
+        message = get_element_by_class('alert', response)
+        if message is not None:
+            raise ExtractorError(
+                'Unable to login: %s' % clean_html(message), expected=True)
+
+        raise ExtractorError('Unable to log in')
+
+
+class TeachableIE(TeachableBaseIE):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        %shttps?://(?P<site_t>[^/]+)|
+                        https?://(?:www\.)?(?P<site>%s)
+                    )
+                    /courses/[^/]+/lectures/(?P<id>\d+)
+                    ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
+
+    _TESTS = [{
+        'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364',
+        'info_dict': {
+            'id': 'untlgzk1v7',
+            'ext': 'bin',
+            'title': 'Overview',
+            'description': 'md5:071463ff08b86c208811130ea1c2464c',
+            'duration': 736.4,
+            'timestamp': 1542315762,
+            'upload_date': '20181115',
+            'chapter': 'Welcome',
+            'chapter_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100',
+        'only_matching': True,
+    }, {
+        'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939',
+        'only_matching': True,
+    }, {
+        'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+        'only_matching': True,
+    }]
+
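+    # Heuristic used by the generic extractor: Teachable-hosted pages load
+    # the teachableTracker linker snippet and reference assets served from
+    # process.fs.teachablecdn.com.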
+    @staticmethod
+    def _is_teachable(webpage):
+        return 'teachableTracker.linker:autoLink' in webpage and re.search(
+            r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com',
+            webpage)
+
+    @staticmethod
+    def _extract_url(webpage, source_url):
+        if not TeachableIE._is_teachable(webpage):
+            return
+        if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
+            return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site = mobj.group('site') or mobj.group('site_t')
+        video_id = mobj.group('id')
+
+        self._login(site)
+
+        prefixed = url.startswith(self._URL_PREFIX)
+        if prefixed:
+            url = url[len(self._URL_PREFIX):]
+
+        webpage = self._download_webpage(url, video_id)
+
+        wistia_urls = WistiaIE._extract_urls(webpage)
+        if not wistia_urls:
+            if any(re.search(p, webpage) for p in (
+                    r'class=["\']lecture-contents-locked',
+                    r'>\s*Lecture contents locked',
+                    r'id=["\']lecture-locked',
+                    # https://academy.tailoredtutors.co.uk/courses/108779/lectures/1955313
+                    r'class=["\'](?:inner-)?lesson-locked',
+                    r'>LESSON LOCKED<')):
+                self.raise_login_required('Lecture contents locked')
+            raise ExtractorError('Unable to find video URL')
+
+        title = self._og_search_title(webpage, default=None)
+
+        chapter = None
+        chapter_number = None
+        section_item = self._search_regex(
+            r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id,
+            webpage, 'section item', default=None, group='li')
+        if section_item:
+            chapter_number = int_or_none(self._search_regex(
+                r'data-ss-position=["\'](\d+)', section_item, 'section id',
+                default=None))
+            if chapter_number is not None:
+                sections = []
+                for s in re.findall(
+                        r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage):
+                    section = strip_or_none(clean_html(s))
+                    if not section:
+                        sections = []
+                        break
+                    sections.append(section)
+                if chapter_number <= len(sections):
+                    chapter = sections[chapter_number - 1]
+
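+        # Delegate each embed to the Wistia extractor via url_transparent so
+        # the lecture title and chapter metadata scraped here take precedence
+        # over whatever Wistia reports.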
+        entries = [{
+            '_type': 'url_transparent',
+            'url': wistia_url,
+            'ie_key': WistiaIE.ie_key(),
+            'title': title,
+            'chapter': chapter,
+            'chapter_number': chapter_number,
+        } for wistia_url in wistia_urls]
+
+        return self.playlist_result(entries, video_id, title)
+
+
+class TeachableCourseIE(TeachableBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            %shttps?://(?P<site_t>[^/]+)|
+                            https?://(?:www\.)?(?P<site>%s)
+                        )
+                        /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+)
+                    ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
+    _TESTS = [{
+        'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/',
+        'info_dict': {
+            'id': 'essential-web-developer-course',
+            'title': 'The Essential Web Developer Course (Free)',
+        },
+        'playlist_count': 192,
+    }, {
+        'url': 'http://v1.upskillcourses.com/courses/119763/',
+        'only_matching': True,
+    }, {
+        'url': 'http://v1.upskillcourses.com/courses/enrolled/119763',
+        'only_matching': True,
+    }, {
+        'url': 'https://gns3.teachable.com/courses/enrolled/423415',
+        'only_matching': True,
+    }, {
+        'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini',
+        'only_matching': True,
+    }, {
+        'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course',
+        'only_matching': True,
+    }]
+
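+    # Lecture URLs also match this course pattern (the match is unanchored),
+    # so defer to TeachableIE whenever it claims the URL.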
+    @classmethod
+    def suitable(cls, url):
+        return False if TeachableIE.suitable(url) else super(
+            TeachableCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        site = mobj.group('site') or mobj.group('site_t')
+        course_id = mobj.group('id')
+
+        self._login(site)
+
+        prefixed = url.startswith(self._URL_PREFIX)
+        if prefixed:
+            prefix = self._URL_PREFIX
+            url = url[len(prefix):]
+
+        webpage = self._download_webpage(url, course_id)
+
+        url_base = 'https://%s/' % site
+
+        entries = []
+
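+        # Walk the curriculum list items; only entries carrying the
+        # fa-youtube-play icon are video lectures, anything else is skipped.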
+        for mobj in re.finditer(
+                r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
+                webpage):
+            li = mobj.group('li')
+            if 'fa-youtube-play' not in li:
+                continue
+            lecture_url = self._search_regex(
+                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
+                'lecture url', default=None, group='url')
+            if not lecture_url:
+                continue
+            lecture_id = self._search_regex(
+                r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
+            title = self._html_search_regex(
+                r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
+                'title', default=None)
+            entry_url = urljoin(url_base, lecture_url)
+            if prefixed:
+                entry_url = self._URL_PREFIX + entry_url
+            entries.append(
+                self.url_result(
+                    entry_url,
+                    ie=TeachableIE.ie_key(), video_id=lecture_id,
+                    video_title=clean_html(title)))
+
+        course_title = self._html_search_regex(
+            (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
+             r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'),
+            webpage, 'course title', fatal=False)
+
+        return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
new file mode 100644
index 0000000..1272078
--- /dev/null
+++ b/youtube_dl/extractor/teachertube.py
@@ -0,0 +1,129 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    qualities,
+)
+
+
+class TeacherTubeIE(InfoExtractor):
+    IE_NAME = 'teachertube'
+    IE_DESC = 'teachertube.com videos'
+
+    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=|video/(?:[\da-z-]+-)?|audio/)(?P<id>\d+)'
+
+    _TESTS = [{
+        # flowplayer
+        'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997',
+        'md5': 'f9434ef992fd65936d72999951ee254c',
+        'info_dict': {
+            'id': '339997',
+            'ext': 'mp4',
+            'title': 'Measures of dispersion from a frequency table',
+            'description': 'Measures of dispersion from a frequency table',
+            'thumbnail': r're:https?://.*\.(?:jpg|png)',
+        },
+    }, {
+        # jwplayer
+        'url': 'http://www.teachertube.com/music.php?music_id=8805',
+        'md5': '01e8352006c65757caf7b961f6050e21',
+        'info_dict': {
+            'id': '8805',
+            'ext': 'mp3',
+            'title': 'PER ASPERA AD ASTRA',
+            'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNIČKE ŠKOLE P',
+        },
+    }, {
+        # unavailable video
+        'url': 'http://www.teachertube.com/video/intro-video-schleicher-297790',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        error = self._search_regex(
+            r'<div\b[^>]+\bclass=["\']msgBox error[^>]+>([^<]+)', webpage,
+            'error', default=None)
+        if error:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        title = self._html_search_meta('title', webpage, 'title', fatal=True)
+        TITLE_SUFFIX = ' - TeacherTube'
+        if title.endswith(TITLE_SUFFIX):
+            title = title[:-len(TITLE_SUFFIX)].strip()
+
+        description = self._html_search_meta('description', webpage, 'description')
+        if description:
+            description = description.strip()
+
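+        # qualities() ranks later entries higher: mp4 is preferred over flv,
+        # which in turn beats audio-only mp3.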
+        quality = qualities(['mp3', 'flv', 'mp4'])
+
+        media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage)
+        media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage))
+        media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))
+
+        formats = [
+            {
+                'url': media_url,
+                'quality': quality(determine_ext(media_url))
+            } for media_url in set(media_urls)
+        ]
+
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'thumbnail', webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
+
+
+class TeacherTubeUserIE(InfoExtractor):
+    IE_NAME = 'teachertube:user:collection'
+    IE_DESC = 'teachertube.com user and collection videos'
+
+    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?'
+
+    _MEDIA_RE = r'''(?sx)
+        class="?sidebar_thumb_time"?>[0-9:]+</div>
+        \s*
+        <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)"
+    '''
+    _TEST = {
+        'url': 'http://www.teachertube.com/user/profile/rbhagwati2',
+        'info_dict': {
+            'id': 'rbhagwati2'
+        },
+        'playlist_mincount': 179,
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group('user')
+
+        urls = []
+        webpage = self._download_webpage(url, user_id)
+        urls.extend(re.findall(self._MEDIA_RE, webpage))
+
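+        # Remaining pages are fetched through the site's AJAX listing; the
+        # final match is dropped, presumably a duplicated paginator link.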
+        pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1]
+        for p in pages:
+            more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p)
+            webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages)))
+            video_urls = re.findall(self._MEDIA_RE, webpage)
+            urls.extend(video_urls)
+
+        entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls]
+        return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py
new file mode 100644
index 0000000..624cdb3
--- /dev/null
+++ b/youtube_dl/extractor/teachingchannel.py
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TeachingChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos?/(?P<id>[^/?&#]+)'
+
+    _TEST = {
+        'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
+        'info_dict': {
+            'id': '3swwlzkT',
+            'ext': 'mp4',
+            'title': 'A History of Teaming',
+            'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
+            'duration': 422,
+            'upload_date': '20170316',
+            'timestamp': 1489691297,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['JWPlatform'],
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        mid = self._search_regex(
+            r'(?:data-mid=["\']|id=["\']jw-video-player-)([a-zA-Z0-9]{8})',
+            webpage, 'media id')
+
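+        # Hand off to the JWPlatform extractor via its jwplatform:<id> URL
+        # scheme.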
+        return self.url_result('jwplatform:' + mid, 'JWPlatform', mid)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
new file mode 100644
index 0000000..5793b71
--- /dev/null
+++ b/youtube_dl/extractor/teamcoco.py
@@ -0,0 +1,205 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .turner import TurnerBaseIE
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    parse_duration,
+    parse_iso8601,
+    qualities,
+)
+
+
+class TeamcocoIE(TurnerBaseIE):
+    _VALID_URL = r'https?://(?:\w+\.)?teamcoco\.com/(?P<id>([^/]+/)*[^/?#]+)'
+    _TESTS = [
+        {
+            'url': 'http://teamcoco.com/video/mary-kay-remote',
+            'md5': '55d532f81992f5c92046ad02fec34d7d',
+            'info_dict': {
+                'id': '80187',
+                'ext': 'mp4',
+                'title': 'Conan Becomes A Mary Kay Beauty Consultant',
+                'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.',
+                'duration': 495.0,
+                'upload_date': '20140402',
+                'timestamp': 1396407600,
+            }
+        }, {
+            'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
+            'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
+            'info_dict': {
+                'id': '19705',
+                'ext': 'mp4',
+                'description': 'Louis C.K. got starstruck by George W. Bush, so what? Part one.',
+                'title': 'Louis C.K. Interview Pt. 1 11/3/11',
+                'duration': 288,
+                'upload_date': '20111104',
+                'timestamp': 1320405840,
+            }
+        }, {
+            'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey',
+            'info_dict': {
+                'id': '88748',
+                'ext': 'mp4',
+                'title': 'Timothy Olyphant Raises A Toast To “Justified”',
+                'description': 'md5:15501f23f020e793aeca761205e42c24',
+                'upload_date': '20150415',
+                'timestamp': 1429088400,
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            }
+        }, {
+            'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9',
+            'info_dict': {
+                'id': '89341',
+                'ext': 'mp4',
+                'title': 'Full Episode - Mon. 6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+                'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
+            },
+            'skip': 'This video is no longer available.',
+        }, {
+            'url': 'http://teamcoco.com/video/the-conan-audiencey-awards-for-04/25/18',
+            'only_matching': True,
+        }, {
+            'url': 'http://teamcoco.com/italy/conan-jordan-schlansky-hit-the-streets-of-florence',
+            'only_matching': True,
+        }, {
+            'url': 'http://teamcoco.com/haiti/conan-s-haitian-history-lesson',
+            'only_matching': True,
+        }, {
+            'url': 'http://teamcoco.com/israel/conan-hits-the-streets-beaches-of-tel-aviv',
+            'only_matching': True,
+        }, {
+            'url': 'https://conan25.teamcoco.com/video/ice-cube-kevin-hart-conan-share-lyft',
+            'only_matching': True,
+        }
+    ]
+    _RECORD_TEMPL = '''id
+        title
+        teaser
+        publishOn
+        thumb {
+          preview
+        }
+        tags {
+          name
+        }
+        duration
+        turnerMediaId
+        turnerMediaAuthToken'''
+
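+    # The GraphQL query templates below are %-formatted twice: first the
+    # shared record field list above is substituted (hence the %%s escapes),
+    # then _graphql_call fills in the find-object name and the object id.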
+    def _graphql_call(self, query_template, object_type, object_id):
+        find_object = 'find' + object_type
+        return self._download_json(
+            'https://teamcoco.com/graphql', object_id, data=json.dumps({
+                'query': query_template % (find_object, object_id)
+            }).encode(), headers={
+                'Content-Type': 'application/json',
+            })['data'][find_object]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        response = self._graphql_call('''{
+  %%s(slug: "%%s") {
+    ... on RecordSlug {
+      record {
+        %s
+      }
+    }
+    ... on PageSlug {
+      child {
+        id
+      }
+    }
+    ... on NotFoundSlug {
+      status
+    }
+  }
+}''' % self._RECORD_TEMPL, 'Slug', display_id)
+        if response.get('status'):
+            raise ExtractorError('This video is no longer available.', expected=True)
+
+        child = response.get('child')
+        if child:
+            record = self._graphql_call('''{
+  %%s(id: "%%s") {
+    ... on Video {
+      %s
+    }
+  }
+}''' % self._RECORD_TEMPL, 'Record', child['id'])
+        else:
+            record = response['record']
+        video_id = record['id']
+
+        info = {
+            'id': video_id,
+            'display_id': display_id,
+            'title': record['title'],
+            'thumbnail': record.get('thumb', {}).get('preview'),
+            'description': record.get('teaser'),
+            'duration': parse_duration(record.get('duration')),
+            'timestamp': parse_iso8601(record.get('publishOn')),
+        }
+
+        media_id = record.get('turnerMediaId')
+        if media_id:
+            self._initialize_geo_bypass({
+                'countries': ['US'],
+            })
+            info.update(self._extract_ngtv_info(media_id, {
+                'accessToken': record['turnerMediaAuthToken'],
+                'accessTokenType': 'jws',
+            }))
+        else:
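+            # No Turner media id: fall back to the site's (apparently
+            # undocumented) /_truman/d/<id> JSON, whose meta.src may be
+            # either a dict or a list of source dicts.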
+            video_sources = self._download_json(
+                'https://teamcoco.com/_truman/d/' + video_id,
+                video_id)['meta']['src']
+            if isinstance(video_sources, dict):
+                video_sources = video_sources.values()
+
+            formats = []
+            get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
+            for src in video_sources:
+                if not isinstance(src, dict):
+                    continue
+                src_url = src.get('src')
+                if not src_url:
+                    continue
+                format_id = src.get('label')
+                ext = determine_ext(src_url, mimetype2ext(src.get('type')))
+                if format_id == 'hls' or ext == 'm3u8':
+                    # compat_urllib_parse.urljoin does not work here
+                    if src_url.startswith('/'):
+                        src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url
+                    formats.extend(self._extract_m3u8_formats(
+                        src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
+                else:
+                    if src_url.startswith('/mp4:protected/'):
+                        # TODO Correct extraction for these files
+                        continue
+                    tbr = int_or_none(self._search_regex(
+                        r'(\d+)k\.mp4', src_url, 'tbr', default=None))
+
+                    formats.append({
+                        'url': src_url,
+                        'ext': ext,
+                        'tbr': tbr,
+                        'format_id': format_id,
+                        'quality': get_quality(format_id),
+                    })
+            self._sort_formats(formats)
+            info['formats'] = formats
+
+        return info
diff --git a/youtube_dl/extractor/teamtreehouse.py b/youtube_dl/extractor/teamtreehouse.py
new file mode 100644
index 0000000..d347e97
--- /dev/null
+++ b/youtube_dl/extractor/teamtreehouse.py
@@ -0,0 +1,140 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    get_element_by_id,
+    parse_duration,
+    remove_end,
+    urlencode_postdata,
+    urljoin,
+)
+
+
+class TeamTreeHouseIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?teamtreehouse\.com/library/(?P<id>[^/]+)'
+    _TESTS = [{
+        # Course
+        'url': 'https://teamtreehouse.com/library/introduction-to-user-authentication-in-php',
+        'info_dict': {
+            'id': 'introduction-to-user-authentication-in-php',
+            'title': 'Introduction to User Authentication in PHP',
+            'description': 'md5:405d7b4287a159b27ddf30ca72b5b053',
+        },
+        'playlist_mincount': 24,
+    }, {
+        # WorkShop
+        'url': 'https://teamtreehouse.com/library/deploying-a-react-app',
+        'info_dict': {
+            'id': 'deploying-a-react-app',
+            'title': 'Deploying a React App',
+            'description': 'md5:10a82e3ddff18c14ac13581c9b8e5921',
+        },
+        'playlist_mincount': 4,
+    }, {
+        # Video
+        'url': 'https://teamtreehouse.com/library/application-overview-2',
+        'info_dict': {
+            'id': 'application-overview-2',
+            'ext': 'mp4',
+            'title': 'Application Overview',
+            'description': 'md5:4b0a234385c27140a4378de5f1e15127',
+        },
+        'expected_warnings': ['This is just a preview'],
+    }]
+    _NETRC_MACHINE = 'teamtreehouse'
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        signin_page = self._download_webpage(
+            'https://teamtreehouse.com/signin',
+            None, 'Downloading signin page')
+        data = self._form_hidden_inputs('new_user_session', signin_page)
+        data.update({
+            'user_session[email]': email,
+            'user_session[password]': password,
+        })
+        error_message = get_element_by_class('error-message', self._download_webpage(
+            'https://teamtreehouse.com/person_session',
+            None, 'Logging in', data=urlencode_postdata(data)))
+        if error_message:
+            raise ExtractorError(clean_html(error_message), expected=True)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
+        description = self._html_search_meta(
+            ['description', 'og:description', 'twitter:description'], webpage)
+        entries = self._parse_html5_media_entries(url, webpage, display_id)
+        if entries:
+            info = entries[0]
+
+            for subtitles in info.get('subtitles', {}).values():
+                for subtitle in subtitles:
+                    subtitle['ext'] = determine_ext(subtitle['url'], 'srt')
+
+            is_preview = 'data-preview="true"' in webpage
+            if is_preview:
+                self.report_warning(
+                    'This is just a preview. You need to be signed in with a Basic account to download the entire video.', display_id)
+                duration = 30
+            else:
+                duration = float_or_none(self._search_regex(
+                    r'data-duration="(\d+)"', webpage, 'duration'), 1000)
+                if not duration:
+                    duration = parse_duration(get_element_by_id(
+                        'video-duration', webpage))
+
+            info.update({
+                'id': display_id,
+                'title': title,
+                'description': description,
+                'duration': duration,
+            })
+            return info
+        else:
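+            # Course/workshop pages: collect lesson links as url_transparent
+            # entries that re-enter this extractor for each lesson page.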
+            def extract_urls(html, extract_info=None):
+                for path in re.findall(r'<a[^>]+href="([^"]+)"', html):
+                    page_url = urljoin(url, path)
+                    entry = {
+                        '_type': 'url_transparent',
+                        'id': self._match_id(page_url),
+                        'url': page_url,
+                        'ie_key': self.ie_key(),
+                    }
+                    if extract_info:
+                        entry.update(extract_info)
+                    entries.append(entry)
+
+            workshop_videos = self._search_regex(
+                r'(?s)<ul[^>]+id="workshop-videos"[^>]*>(.+?)</ul>',
+                webpage, 'workshop videos', default=None)
+            if workshop_videos:
+                extract_urls(workshop_videos)
+            else:
+                stages_path = self._search_regex(
+                    r'(?s)<div[^>]+id="syllabus-stages"[^>]+data-url="([^"]+)"',
+                    webpage, 'stages path')
+                if stages_path:
+                    stages_page = self._download_webpage(
+                        urljoin(url, stages_path), display_id, 'Downloading stages page')
+                    for chapter_number, (chapter, steps_list) in enumerate(re.findall(r'(?s)<h2[^>]*>\s*(.+?)\s*</h2>.+?<ul[^>]*>(.+?)</ul>', stages_page), 1):
+                        extract_urls(steps_list, {
+                            'chapter': chapter,
+                            'chapter_number': chapter_number,
+                        })
+                    title = remove_end(title, ' Course')
+
+            return self.playlist_result(
+                entries, display_id, title, description)
diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py
new file mode 100644
index 0000000..a5b62c7
--- /dev/null
+++ b/youtube_dl/extractor/techtalks.py
@@ -0,0 +1,82 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_attribute,
+    clean_html,
+)
+
+
+class TechTalksIE(InfoExtractor):
+    _VALID_URL = r'https?://techtalks\.tv/talks/(?:[^/]+/)?(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
+        'info_dict': {
+            'id': '57758',
+            'title': 'Learning Topic Models --- Going beyond SVD',
+        },
+        'playlist': [
+            {
+                'info_dict': {
+                    'id': '57758',
+                    'ext': 'flv',
+                    'title': 'Learning Topic Models --- Going beyond SVD',
+                },
+            },
+            {
+                'info_dict': {
+                    'id': '57758-slides',
+                    'ext': 'flv',
+                    'title': 'Learning Topic Models --- Going beyond SVD',
+                },
+            },
+        ],
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://techtalks.tv/talks/57758',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        talk_id = mobj.group('id')
+        webpage = self._download_webpage(url, talk_id)
+        rtmp_url = self._search_regex(
+            r'netConnectionUrl: \'(.*?)\'', webpage, 'rtmp url')
+        play_path = self._search_regex(
+            r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
+            webpage, 'presenter play path')
+        title = clean_html(get_element_by_attribute('class', 'title', webpage))
+        video_info = {
+            'id': talk_id,
+            'title': title,
+            'url': rtmp_url,
+            'play_path': play_path,
+            'ext': 'flv',
+        }
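+        # A talk may carry a second RTMP stream with the slide capture; when
+        # present, presenter video and slides are returned as a playlist.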
+        m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
+        if m_slides is None:
+            return video_info
+        else:
+            return {
+                '_type': 'playlist',
+                'id': talk_id,
+                'title': title,
+                'entries': [
+                    video_info,
+                    # The slides video
+                    {
+                        'id': talk_id + '-slides',
+                        'title': title,
+                        'url': rtmp_url,
+                        'play_path': m_slides.group(1),
+                        'ext': 'flv',
+                    },
+                ],
+            }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
new file mode 100644
index 0000000..63e2455
--- /dev/null
+++ b/youtube_dl/extractor/ted.py
@@ -0,0 +1,363 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+
+from ..compat import (
+    compat_str,
+    compat_urlparse
+)
+from ..utils import (
+    extract_attributes,
+    float_or_none,
+    int_or_none,
+    try_get,
+    url_or_none,
+)
+
+
+class TEDIE(InfoExtractor):
+    IE_NAME = 'ted'
+    _VALID_URL = r'''(?x)
+        (?P<proto>https?://)
+        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
+        (
+            (?P<type_playlist>playlists(?:/(?P<playlist_id>\d+))?) # We have a playlist
+            |
+            ((?P<type_talk>talks)) # We have a simple talk
+            |
+            (?P<type_watch>watch)/[^/]+/[^/]+
+        )
+        (/lang/(.*?))? # The url may contain the language
+        /(?P<name>[\w-]+) # Here goes the name and then ".html"
+        .*)$
+        '''
+    _TESTS = [{
+        'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
+        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
+        'info_dict': {
+            'id': '102',
+            'ext': 'mp4',
+            'title': 'The illusion of consciousness',
+            'description': ('Philosopher Dan Dennett makes a compelling '
+                            'argument that not only don\'t we understand our own '
+                            'consciousness, but that half the time our brains are '
+                            'actively fooling us.'),
+            'uploader': 'Dan Dennett',
+            'width': 853,
+            'duration': 1308,
+            'view_count': int,
+            'comment_count': int,
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # missing HTTP bitrates
+        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
+        'info_dict': {
+            'id': '6069',
+            'ext': 'mp4',
+            'title': 'The beauty and power of algorithms',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
+            'uploader': 'Vishal Sikka',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
+        'info_dict': {
+            'id': '1972',
+            'ext': 'mp4',
+            'title': 'Be passionate. Be courageous. Be your best.',
+            'uploader': 'Gabby Giffords and Mark Kelly',
+            'description': 'md5:5174aed4d0f16021b704120360f72b92',
+            'duration': 1128,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.ted.com/playlists/who_are_the_hackers',
+        'info_dict': {
+            'id': '10',
+            'title': 'Who are the hackers?',
+            'description': 'md5:49a0dbe8fb76d81a0e64b4a80af7f15a'
+        },
+        'playlist_mincount': 6,
+    }, {
+        # contains a youtube video
+        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': '_ZG8HBuDjgc',
+            'ext': 'webm',
+            'title': 'Douglas Adams: Parrots the Universe and Everything',
+            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
+            'uploader': 'University of California Television (UCTV)',
+            'uploader_id': 'UCtelevision',
+            'upload_date': '20080522',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # no nativeDownloads
+        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
+        'info_dict': {
+            'id': '1792',
+            'ext': 'mp4',
+            'title': 'The orchestra in my mouth',
+            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
+            'uploader': 'Tom Thum',
+            'view_count': int,
+            'comment_count': int,
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
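+    # Pixel dimensions of TED's named download variants; merged into any
+    # format whose format_id (before a '-lang' suffix) matches one of these
+    # names in _talk_info.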
+    _NATIVE_FORMATS = {
+        'low': {'width': 320, 'height': 180},
+        'medium': {'width': 512, 'height': 288},
+        'high': {'width': 854, 'height': 480},
+    }
+
+    def _extract_info(self, webpage):
+        info_json = self._search_regex(
+            r'(?s)q\(\s*"\w+.init"\s*,\s*({.+?})\)\s*</script>',
+            webpage, 'info json')
+        return json.loads(info_json)
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url, re.VERBOSE)
+        if m.group('type').startswith('embed'):
+            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
+            return self.url_result(desktop_url, 'TED')
+        name = m.group('name')
+        if m.group('type_talk'):
+            return self._talk_info(url, name)
+        elif m.group('type_watch'):
+            return self._watch_info(url, name)
+        else:
+            return self._playlist_videos_info(url, name)
+
+    def _playlist_videos_info(self, url, name):
+        '''Returns the videos of the playlist'''
+
+        webpage = self._download_webpage(url, name,
+                                         'Downloading playlist webpage')
+
+        playlist_entries = []
+        for entry in re.findall(r'(?s)<[^>]+data-ga-context=["\']playlist["\'][^>]*>', webpage):
+            attrs = extract_attributes(entry)
+            entry_url = compat_urlparse.urljoin(url, attrs['href'])
+            playlist_entries.append(self.url_result(entry_url, self.ie_key()))
+
+        final_url = self._og_search_url(webpage, fatal=False)
+        playlist_id = (
+            re.match(self._VALID_URL, final_url).group('playlist_id')
+            if final_url else None)
+
+        return self.playlist_result(
+            playlist_entries, playlist_id=playlist_id,
+            playlist_title=self._og_search_title(webpage, fatal=False),
+            playlist_description=self._og_search_description(webpage))
+
+    def _talk_info(self, url, video_name):
+        webpage = self._download_webpage(url, video_name)
+
+        info = self._extract_info(webpage)
+
+        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
+        talk_info = data['talks'][0]
+
+        title = talk_info['title'].strip()
+
+        downloads = talk_info.get('downloads') or {}
+        native_downloads = downloads.get('nativeDownloads') or talk_info.get('nativeDownloads') or {}
+
+        formats = [{
+            'url': format_url,
+            'format_id': format_id,
+        } for (format_id, format_url) in native_downloads.items() if format_url is not None]
+
+        subtitled_downloads = downloads.get('subtitledDownloads') or {}
+        for lang, subtitled_download in subtitled_downloads.items():
+            for q in self._NATIVE_FORMATS:
+                q_url = subtitled_download.get(q)
+                if not q_url:
+                    continue
+                formats.append({
+                    'url': q_url,
+                    'format_id': '%s-%s' % (q, lang),
+                    'language': lang,
+                })
+
+        if formats:
+            for f in formats:
+                finfo = self._NATIVE_FORMATS.get(f['format_id'].split('-')[0])
+                if finfo:
+                    f.update(finfo)
+
+        player_talk = talk_info['player_talks'][0]
+
+        external = player_talk.get('external')
+        if isinstance(external, dict):
+            service = external.get('service')
+            if isinstance(service, compat_str):
+                ext_url = None
+                if service.lower() == 'youtube':
+                    ext_url = external.get('code')
+
+                return self.url_result(ext_url or external['uri'])
+
+        resources_ = player_talk.get('resources') or talk_info.get('resources')
+
+        http_url = None
+        for format_id, resources in resources_.items():
+            if format_id == 'hls':
+                if not isinstance(resources, dict):
+                    continue
+                stream_url = url_or_none(resources.get('stream'))
+                if not stream_url:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, video_name, 'mp4', m3u8_id=format_id,
+                    fatal=False))
+            else:
+                if not isinstance(resources, list):
+                    continue
+                if format_id == 'h264':
+                    for resource in resources:
+                        h264_url = resource.get('file')
+                        if not h264_url:
+                            continue
+                        bitrate = int_or_none(resource.get('bitrate'))
+                        formats.append({
+                            'url': h264_url,
+                            'format_id': '%s-%sk' % (format_id, bitrate),
+                            'tbr': bitrate,
+                        })
+                        if re.search(r'\d+k', h264_url):
+                            http_url = h264_url
+                elif format_id == 'rtmp':
+                    streamer = talk_info.get('streamer')
+                    if not streamer:
+                        continue
+                    for resource in resources:
+                        formats.append({
+                            'format_id': '%s-%s' % (format_id, resource.get('name')),
+                            'url': streamer,
+                            'play_path': resource['file'],
+                            'ext': 'flv',
+                            'width': int_or_none(resource.get('width')),
+                            'height': int_or_none(resource.get('height')),
+                            'tbr': int_or_none(resource.get('bitrate')),
+                        })
+
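+        # If an h264 progressive URL with a "<N>k" bitrate marker was seen,
+        # synthesize direct HTTP counterparts of the HLS variants by swapping
+        # each variant's bitrate into that URL and checking it resolves.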
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+                if not bitrate:
+                    continue
+                bitrate_url = re.sub(r'\d+k', bitrate, http_url)
+                if not self._is_valid_url(
+                        bitrate_url, video_name, '%s bitrate' % bitrate):
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': bitrate_url,
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                if f.get('acodec') == 'none':
+                    del f['acodec']
+                formats.append(f)
+
+        audio_download = talk_info.get('audioDownload')
+        if audio_download:
+            formats.append({
+                'url': audio_download,
+                'format_id': 'audio',
+                'vcodec': 'none',
+            })
+
+        self._sort_formats(formats)
+
+        video_id = compat_str(talk_info['id'])
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
+            'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
+            'description': self._og_search_description(webpage),
+            'subtitles': self._get_subtitles(video_id, talk_info),
+            'formats': formats,
+            'duration': float_or_none(talk_info.get('duration')),
+            'view_count': int_or_none(data.get('viewed_count')),
+            'comment_count': int_or_none(
+                try_get(data, lambda x: x['comments']['count'])),
+            'tags': try_get(talk_info, lambda x: x['tags'], list),
+        }
+
+    def _get_subtitles(self, video_id, talk_info):
+        sub_lang_list = {}
+        for language in try_get(
+                talk_info,
+                (lambda x: x['downloads']['languages'],
+                 lambda x: x['languages']), list) or []:
+            lang_code = language.get('languageCode') or language.get('ianaCode')
+            if not lang_code:
+                continue
+            sub_lang_list[lang_code] = [
+                {
+                    'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
+                    'ext': ext,
+                }
+                for ext in ['ted', 'srt']
+            ]
+        return sub_lang_list
+
+    def _watch_info(self, url, name):
+        webpage = self._download_webpage(url, name)
+
+        config_json = self._html_search_regex(
+            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
+            webpage, 'config', default=None)
+        if not config_json:
+            embed_url = self._search_regex(
+                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
+            return self.url_result(self._proto_relative_url(embed_url))
+        config = json.loads(config_json)['config']
+        video_url = config['video']['url']
+        thumbnail = config.get('image', {}).get('url')
+
+        title = self._html_search_regex(
+            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
+        description = self._html_search_regex(
+            [
+                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
+                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
+            ],
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': name,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py
new file mode 100644
index 0000000..a29a64b
--- /dev/null
+++ b/youtube_dl/extractor/tele13.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..utils import (
+    js_to_json,
+    qualities,
+    determine_ext,
+)
+
+
+class Tele13IE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)'
+    _TESTS = [
+        {
+            'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+            'md5': '4cb1fa38adcad8fea88487a078831755',
+            'info_dict': {
+                'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+                'ext': 'mp4',
+                'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda',
+            },
+            'params': {
+                # HTTP Error 404: Not Found
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok',
+            'md5': '867adf6a3b3fef932c68a71d70b70946',
+            'info_dict': {
+                'id': 'rOoKv2OMpOw',
+                'ext': 'mp4',
+                'title': 'Shooting star seen on 7-Sep-2015',
+                'description': 'md5:7292ff2a34b2f673da77da222ae77e1e',
+                'uploader': 'Porjai Jaturongkhakun',
+                'upload_date': '20150906',
+                'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw',
+            },
+            'add_ie': ['Youtube'],
+        }
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        setup_js = self._search_regex(
+            r"(?s)jwplayer\('player-vivo'\).setup\((\{.*?\})\)",
+            webpage, 'setup code')
+        sources = self._parse_json(self._search_regex(
+            r'sources\s*:\s*(\[[^\]]+\])', setup_js, 'sources'),
+            display_id, js_to_json)
+
+        preference = qualities(['Móvil', 'SD', 'HD'])
+        formats = []
+        urls = []
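+        # Entries in the jwplayer setup may be plain YouTube links, in which
+        # case extraction is delegated wholesale to the YouTube extractor.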
+        for f in sources:
+            format_url = f['file']
+            if format_url and format_url not in urls:
+                ext = determine_ext(format_url)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, display_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                elif YoutubeIE.suitable(format_url):
+                    return self.url_result(format_url, 'Youtube')
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': f.get('label'),
+                        'preference': preference(f.get('label')),
+                        'ext': ext,
+                    })
+                urls.append(format_url)
+        self._sort_formats(formats)
+
+        return {
+            'id': display_id,
+            'title': self._search_regex(
+                r'title\s*:\s*"([^"]+)"', setup_js, 'title'),
+            'description': self._html_search_meta(
+                'description', webpage, 'description'),
+            'thumbnail': self._search_regex(
+                r'image\s*:\s*"([^"]+)"', setup_js, 'thumbnail', default=None),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tele5.py b/youtube_dl/extractor/tele5.py
new file mode 100644
index 0000000..3e1a7a9
--- /dev/null
+++ b/youtube_dl/extractor/tele5.py
@@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .jwplatform import JWPlatformIE
+from .nexx import NexxIE
+from ..compat import compat_urlparse
+from ..utils import (
+    NO_DEFAULT,
+    smuggle_url,
+)
+
+
+class Tele5IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _GEO_COUNTRIES = ['DE']
+    _TESTS = [{
+        'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416',
+        'info_dict': {
+            'id': '1549416',
+            'ext': 'mp4',
+            'upload_date': '20180814',
+            'timestamp': 1534290623,
+            'title': 'Pandorum',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # jwplatform, nexx unavailable
+        'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
+        'info_dict': {
+            'id': 'WJuiOlUp',
+            'ext': 'mp4',
+            'upload_date': '20200603',
+            'timestamp': 1591214400,
+            'title': 'Ghoul - Das Geheimnis des Friedhofmonsters',
+            'description': 'md5:42002af1d887ff3d5b2b3ca1f8137d97',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [JWPlatformIE.ie_key()],
+    }, {
+        'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tele5.de/video-clip/?ve_id=1609440',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tele5.de/filme/schlefaz-dragon-crusaders/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tele5.de/filme/making-of/avengers-endgame/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tele5.de/star-trek/raumschiff-voyager/ganze-folge/das-vinculum/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tele5.de/anders-ist-sevda/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]
+
+        NEXX_ID_RE = r'\d{6,}'
+        JWPLATFORM_ID_RE = r'[a-zA-Z0-9]{8}'
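+        # Nexx ids are purely numeric (6+ digits) and are tried first, since
+        # an 8-digit numeric id would also satisfy the JWPlatform pattern.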
+
+        def nexx_result(nexx_id):
+            return self.url_result(
+                'https://api.nexx.cloud/v3/759/videos/byid/%s' % nexx_id,
+                ie=NexxIE.ie_key(), video_id=nexx_id)
+
+        nexx_id = jwplatform_id = None
+
+        if video_id:
+            if re.match(NEXX_ID_RE, video_id):
+                return nexx_result(video_id)
+            elif re.match(JWPLATFORM_ID_RE, video_id):
+                jwplatform_id = video_id
+
+        if not nexx_id:
+            display_id = self._match_id(url)
+            webpage = self._download_webpage(url, display_id)
+
+            def extract_id(pattern, name, default=NO_DEFAULT):
+                return self._html_search_regex(
+                    (r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](%s)' % pattern,
+                     r'\s+id\s*=\s*["\']player_(%s)' % pattern,
+                     r'\bdata-id\s*=\s*["\'](%s)' % pattern), webpage, name,
+                    default=default)
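+            # With the NO_DEFAULT sentinel the lookup is fatal; passing
+            # default=None (as for the nexx id below) makes it best-effort.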
+
+            nexx_id = extract_id(NEXX_ID_RE, 'nexx id', default=None)
+            if nexx_id:
+                return nexx_result(nexx_id)
+
+            if not jwplatform_id:
+                jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id')
+
+        return self.url_result(
+            smuggle_url(
+                'jwplatform:%s' % jwplatform_id,
+                {'geo_countries': self._GEO_COUNTRIES}),
+            ie=JWPlatformIE.ie_key(), video_id=jwplatform_id)
diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py
new file mode 100644
index 0000000..a0353fe
--- /dev/null
+++ b/youtube_dl/extractor/telebruxelles.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TeleBruxellesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(?:[^/]+/)*(?P<id>[^/#?]+)'
+    _TESTS = [{
+        'url': 'http://bx1.be/news/que-risque-lauteur-dune-fausse-alerte-a-la-bombe/',
+        'md5': 'a2a67a5b1c3e8c9d33109b902f474fd9',
+        'info_dict': {
+            'id': '158856',
+            'display_id': 'que-risque-lauteur-dune-fausse-alerte-a-la-bombe',
+            'ext': 'mp4',
+            'title': 'Que risque l’auteur d’une fausse alerte à la bombe ?',
+            'description': 'md5:3cf8df235d44ebc5426373050840e466',
+        },
+    }, {
+        'url': 'http://bx1.be/sport/futsal-schaerbeek-sincline-5-3-a-thulin/',
+        'md5': 'dfe07ecc9c153ceba8582ac912687675',
+        'info_dict': {
+            'id': '158433',
+            'display_id': 'futsal-schaerbeek-sincline-5-3-a-thulin',
+            'ext': 'mp4',
+            'title': 'Futsal : Schaerbeek s’incline 5-3 à Thulin',
+            'description': 'md5:fd013f1488d5e2dceb9cebe39e2d569b',
+        },
+    }, {
+        'url': 'http://bx1.be/emission/bxenf1-gastronomie/',
+        'only_matching': True,
+    }, {
+        'url': 'https://bx1.be/berchem-sainte-agathe/personnel-carrefour-de-berchem-sainte-agathe-inquiet/',
+        'only_matching': True,
+    }, {
+        'url': 'https://bx1.be/dernier-jt/',
+        'only_matching': True,
+    }, {
+        # live stream
+        'url': 'https://bx1.be/lives/direct-tv/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        article_id = self._html_search_regex(
+            r'<article[^>]+\bid=["\']post-(\d+)', webpage, 'article ID', default=None)
+        title = self._html_search_regex(
+            r'<h1[^>]*>(.+?)</h1>', webpage, 'title',
+            default=None) or self._og_search_title(webpage)
+        description = self._og_search_description(webpage, default=None)
+
+        rtmp_url = self._html_search_regex(
+            r'file["\']?\s*:\s*"(r(?:tm|mt)ps?://[^/]+/(?:vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*"\.mp4|stream/live))"',
+            webpage, 'RTMP url')
+        # Yes, they have a typo in scheme name for live stream URLs (e.g.
+        # https://bx1.be/lives/direct-tv/)
+        rtmp_url = re.sub(r'^rmtp', 'rtmp', rtmp_url)
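+        # The VOD URL is scraped from JS string concatenation; drop the
+        # "..." + "..." glue captured by the regex.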
+        rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url)
+        formats = self._extract_wowza_formats(rtmp_url, article_id or display_id)
+        self._sort_formats(formats)
+
+        is_live = 'stream/live' in rtmp_url
+
+        return {
+            'id': article_id or display_id,
+            'display_id': display_id,
+            'title': self._live_title(title) if is_live else title,
+            'description': description,
+            'formats': formats,
+            'is_live': is_live,
+        }
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
new file mode 100644
index 0000000..9ba3da3
--- /dev/null
+++ b/youtube_dl/extractor/telecinco.py
@@ -0,0 +1,188 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import (
+    clean_html,
+    determine_ext,
+    int_or_none,
+    str_or_none,
+    try_get,
+    urljoin,
+)
+
+
+class TelecincoIE(InfoExtractor):
+    IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
+    _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
+        'info_dict': {
+            'id': '1876350223',
+            'title': 'Bacalao con kokotxas al pil-pil',
+            'description': 'md5:716caf5601e25c3c5ab6605b1ae71529',
+        },
+        'playlist': [{
+            'md5': 'adb28c37238b675dad0f042292f209a7',
+            'info_dict': {
+                'id': 'JEA5ijCnF6p5W08A1rNKn7',
+                'ext': 'mp4',
+                'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido',
+                'duration': 662,
+            },
+        }]
+    }, {
+        'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
+        'md5': '9468140ebc300fbb8b9d65dc6e5c4b43',
+        'info_dict': {
+            'id': 'jn24Od1zGLG4XUZcnUnZB6',
+            'ext': 'mp4',
+            'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?',
+            'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',
+            'duration': 79,
+        },
+    }, {
+        'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
+        'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6',
+        'info_dict': {
+            'id': 'aywerkD2Sv1vGNqq9b85Q2',
+            'ext': 'mp4',
+            'title': '#DOYLACARA. Con la trata no hay trato',
+            'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
+            'duration': 50,
+        },
+    }, {
+        # video in opening's content
+        'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html',
+        'info_dict': {
+            'id': '2907195140',
+            'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
+            'description': 'md5:73f340a7320143d37ab895375b2bf13a',
+        },
+        'playlist': [{
+            'md5': 'adb28c37238b675dad0f042292f209a7',
+            'info_dict': {
+                'id': 'TpI2EttSDAReWpJ1o0NVh2',
+                'ext': 'mp4',
+                'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
+                'duration': 1015,
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',
+        'only_matching': True,
+    }, {
+        # ooyala video
+        'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html',
+        'only_matching': True,
+    }]
+
+    def _parse_content(self, content, url):
+        video_id = content['dataMediaId']
+        if content.get('dataCmsId') == 'ooyala':
+            return self.url_result(
+                'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id)
+        config_url = urljoin(url, content['dataConfig'])
+        config = self._download_json(
+            config_url, video_id, 'Downloading config JSON')
+        title = config['info']['title']
+
+        def mmc_url(mmc_type):
+            return re.sub(
+                r'/(?:flash|html5)\.json', '/%s.json' % mmc_type,
+                config['services']['mmc'])
+
+        duration = None
+        formats = []
+        for mmc_type in ('flash', 'html5'):
+            mmc = self._download_json(
+                mmc_url(mmc_type), video_id,
+                'Downloading %s mmc JSON' % mmc_type, fatal=False)
+            if not mmc:
+                continue
+            if not duration:
+                duration = int_or_none(mmc.get('duration'))
+            for location in mmc['locations']:
+                gat = self._proto_relative_url(location.get('gat'), 'http:')
+                gcp = location.get('gcp')
+                ogn = location.get('ogn')
+                if None in (gat, gcp, ogn):
+                    continue
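+                # Each location pairs a token gateway URL (gat) with gcp/ogn ids;
+                # POSTing them back yields the actual stream location.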
+                token_data = {
+                    'gcp': gcp,
+                    'ogn': ogn,
+                    'sta': 0,
+                }
+                media = self._download_json(
+                    gat, video_id, data=json.dumps(token_data).encode('utf-8'),
+                    headers={
+                        'Content-Type': 'application/json;charset=utf-8',
+                        'Referer': url,
+                    }, fatal=False) or {}
+                stream = media.get('stream') or media.get('file')
+                if not stream:
+                    continue
+                ext = determine_ext(stream)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                        video_id, f4m_id='hds', fatal=False))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        stream, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
+            'duration': duration,
+        }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        article = self._parse_json(self._search_regex(
+            r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})',
+            webpage, 'article'), display_id)['article']
+        title = article.get('title')
+        description = clean_html(article.get('leadParagraph')) or ''
+        if article.get('editorialType') != 'VID':
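+            # Non-video articles can still embed clips in the opening and body blocks; collect them as a playlist.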
+            entries = []
+            body = [article.get('opening')]
+            body.extend(try_get(article, lambda x: x['body'], list) or [])
+            for p in body:
+                if not isinstance(p, dict):
+                    continue
+                content = p.get('content')
+                if not content:
+                    continue
+                type_ = p.get('type')
+                if type_ == 'paragraph':
+                    content_str = str_or_none(content)
+                    if content_str:
+                        description += content_str
+                    continue
+                if type_ == 'video' and isinstance(content, dict):
+                    entries.append(self._parse_content(content, url))
+            return self.playlist_result(
+                entries, str_or_none(article.get('id')), title, description)
+        content = article['opening']['content']
+        info = self._parse_content(content, url)
+        info.update({
+            'description': description,
+        })
+        return info
diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py
new file mode 100644 (file)
index 0000000..2dc0205
--- /dev/null
@@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
+    try_get,
+)
+
+
+class TelegraafIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.telegraaf.nl/video/734366489/historisch-scheepswrak-slaat-na-100-jaar-los',
+        'info_dict': {
+            'id': 'gaMItuoSeUg2',
+            'ext': 'mp4',
+            'title': 'Historisch scheepswrak slaat na 100 jaar los',
+            'description': 'md5:6f53b7c4f55596722ac24d6c0ec00cfb',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 55,
+            'timestamp': 1572805527,
+            'upload_date': '20191103',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+
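+        # Resolve the article id to its first video id via the site's GraphQL endpoint.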
+        video_id = self._download_json(
+            'https://www.telegraaf.nl/graphql', article_id, query={
+                'query': '''{
+  article(uid: %s) {
+    videos {
+      videoId
+    }
+  }
+}''' % article_id,
+            })['data']['article']['videos'][0]['videoId']
+
+        item = self._download_json(
+            'https://content.tmgvideo.nl/playlist/item=%s/playlist.json' % video_id,
+            video_id)['items'][0]
+        title = item['title']
+
+        formats = []
+        locations = item.get('locations') or {}
+        for location in locations.get('adaptive', []):
+            manifest_url = location.get('src')
+            if not manifest_url:
+                continue
+            ext = determine_ext(manifest_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    manifest_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                self.report_warning('Unknown adaptive format %s' % ext)
+        for location in locations.get('progressive', []):
+            src = try_get(location, lambda x: x['sources'][0]['src'])
+            if not src:
+                continue
+            label = location.get('label')
+            formats.append({
+                'url': src,
+                'width': int_or_none(location.get('width')),
+                'height': int_or_none(location.get('height')),
+                'format_id': 'http' + ('-%s' % label if label else ''),
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': item.get('description'),
+            'formats': formats,
+            'duration': int_or_none(item.get('duration')),
+            'thumbnail': item.get('poster'),
+            'timestamp': parse_iso8601(item.get('datecreated'), ' '),
+        }
diff --git a/youtube_dl/extractor/telemb.py b/youtube_dl/extractor/telemb.py
new file mode 100644 (file)
index 0000000..9bcac4e
--- /dev/null
@@ -0,0 +1,78 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class TeleMBIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?telemb\.be/(?P<display_id>.+?)_d_(?P<id>\d+)\.html'
+    _TESTS = [
+        {
+            'url': 'http://www.telemb.be/mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-_d_13466.html',
+            'md5': 'f45ea69878516ba039835794e0f8f783',
+            'info_dict': {
+                'id': '13466',
+                'display_id': 'mons-cook-with-danielle-des-cours-de-cuisine-en-anglais-',
+                'ext': 'mp4',
+                'title': 'Mons - Cook with Danielle : des cours de cuisine en anglais ! - Les reportages',
+                'description': 'md5:bc5225f47b17c309761c856ad4776265',
+                'thumbnail': r're:^http://.*\.(?:jpg|png)$',
+            }
+        },
+        {
+            # non-ASCII characters in download URL
+            'url': 'http://telemb.be/les-reportages-havre-incendie-mortel_d_13514.html',
+            'md5': '6e9682736e5ccd4eab7f21e855350733',
+            'info_dict': {
+                'id': '13514',
+                'display_id': 'les-reportages-havre-incendie-mortel',
+                'ext': 'mp4',
+                'title': 'Havré - Incendie mortel - Les reportages',
+                'description': 'md5:5e54cb449acb029c2b7734e2d946bd4a',
+                'thumbnail': r're:^http://.*\.(?:jpg|png)$',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        formats = []
+        for video_url in re.findall(r'file\s*:\s*"([^"]+)"', webpage):
+            fmt = {
+                'url': video_url,
+                'format_id': video_url.split(':')[0]
+            }
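+            # RTMP URLs embed the app and playpath; split them so the RTMP downloader receives each part separately.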
+            rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url)
+            if rtmp:
+                fmt.update({
+                    'play_path': rtmp.group('playpath'),
+                    'app': rtmp.group('app'),
+                    'player_url': 'http://p.jwpcdn.com/6/10/jwplayer.flash.swf',
+                    'page_url': 'http://www.telemb.be',
+                    'preference': -1,
+                })
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        title = remove_start(self._og_search_title(webpage), 'TéléMB : ')
+        description = self._html_search_regex(
+            r'<meta property="og:description" content="(.+?)" />',
+            webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py
new file mode 100644 (file)
index 0000000..c82c94b
--- /dev/null
@@ -0,0 +1,205 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    smuggle_url,
+    try_get,
+    unified_timestamp,
+)
+
+
+class TeleQuebecBaseIE(InfoExtractor):
+    @staticmethod
+    def _limelight_result(media_id):
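+        # Defer to the Limelight extractor, smuggling the Canadian geo-bypass hint along with the media id.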
+        return {
+            '_type': 'url_transparent',
+            'url': smuggle_url(
+                'limelight:media:' + media_id, {'geo_countries': ['CA']}),
+            'ie_key': 'LimelightMedia',
+        }
+
+
+class TeleQuebecIE(TeleQuebecBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            zonevideo\.telequebec\.tv/media|
+                            coucou\.telequebec\.tv/videos
+                        )/(?P<id>\d+)
+                    '''
+    _TESTS = [{
+        # available till 01.01.2023
+        'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane',
+        'info_dict': {
+            'id': '577116881b4b439084e6b1cf4ef8b1b3',
+            'ext': 'mp4',
+            'title': 'Un petit choc et puis repart!',
+            'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # no description
+        'url': 'http://zonevideo.telequebec.tv/media/30261',
+        'only_matching': True,
+    }, {
+        'url': 'https://coucou.telequebec.tv/videos/41788/idee-de-genie/l-heure-du-bain',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+
+        media_data = self._download_json(
+            'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id,
+            media_id)['media']
+
+        info = self._limelight_result(media_data['streamInfo']['sourceId'])
+        info.update({
+            'title': media_data.get('title'),
+            'description': try_get(
+                media_data, lambda x: x['descriptions'][0]['text'], compat_str),
+            'duration': int_or_none(
+                media_data.get('durationInMilliseconds'), 1000),
+        })
+        return info
+
+
+class TeleQuebecSquatIE(InfoExtractor):
+    _VALID_URL = r'https://squat\.telequebec\.tv/videos/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://squat.telequebec.tv/videos/9314',
+        'info_dict': {
+            'id': 'd59ae78112d542e793d83cc9d3a5b530',
+            'ext': 'mp4',
+            'title': 'Poupeflekta',
+            'description': 'md5:2f0718f8d2f8fece1646ee25fb7bce75',
+            'duration': 1351,
+            'timestamp': 1569057600,
+            'upload_date': '20190921',
+            'series': 'Miraculous : Les Aventures de Ladybug et Chat Noir',
+            'season': 'Saison 3',
+            'season_number': 3,
+            'episode_number': 57,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://squat.api.telequebec.tv/v1/videos/%s' % video_id,
+            video_id)
+
+        media_id = video['sourceId']
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://zonevideo.telequebec.tv/media/%s' % media_id,
+            'ie_key': TeleQuebecIE.ie_key(),
+            'id': media_id,
+            'title': video.get('titre'),
+            'description': video.get('description'),
+            'timestamp': unified_timestamp(video.get('datePublication')),
+            'series': video.get('container'),
+            'season': video.get('saison'),
+            'season_number': int_or_none(video.get('noSaison')),
+            'episode_number': int_or_none(video.get('episode')),
+        }
+
+
+class TeleQuebecEmissionIE(TeleQuebecBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            [^/]+\.telequebec\.tv/emissions/|
+                            (?:www\.)?telequebec\.tv/
+                        )
+                        (?P<id>[^?#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente',
+        'info_dict': {
+            'id': '66648a6aef914fe3badda25e81a4d50a',
+            'ext': 'mp4',
+            'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?",
+            'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014',
+            'upload_date': '20171024',
+            'timestamp': 1508862118,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.telequebec.tv/masha-et-michka/epi059masha-et-michka-3-053-078',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.telequebec.tv/documentaire/bebes-sur-mesure/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        media_id = self._search_regex(
+            r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage,
+            'limelight id')
+
+        info = self._limelight_result(media_id)
+        info.update({
+            'title': self._og_search_title(webpage, default=None),
+            'description': self._og_search_description(webpage, default=None),
+        })
+        return info
+
+
+class TeleQuebecLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)'
+    _TEST = {
+        'url': 'http://zonevideo.telequebec.tv/endirect/',
+        'info_dict': {
+            'id': 'endirect',
+            'ext': 'mp4',
+            'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        m3u8_url = None
+        webpage = self._download_webpage(
+            'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id,
+            fatal=False)
+        if webpage:
+            m3u8_url = self._search_regex(
+                r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+                'm3u8 url', default=None, group='url')
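+        # Fall back to a known live manifest when the player script cannot be fetched or parsed.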
+        if not m3u8_url:
+            m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8'
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._live_title('Télé-Québec - En direct'),
+            'is_live': True,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/teletask.py b/youtube_dl/extractor/teletask.py
new file mode 100644 (file)
index 0000000..b9e2ef8
--- /dev/null
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class TeleTaskIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.tele-task.de/archive/video/html5/26168/',
+        'info_dict': {
+            'id': '26168',
+            'title': 'Duplicate Detection',
+        },
+        'playlist': [{
+            'md5': '290ef69fb2792e481169c3958dbfbd57',
+            'info_dict': {
+                'id': '26168-speaker',
+                'ext': 'mp4',
+                'title': 'Duplicate Detection',
+                'upload_date': '20141218',
+            }
+        }, {
+            'md5': 'e1e7218c5f0e4790015a437fcf6c71b4',
+            'info_dict': {
+                'id': '26168-slides',
+                'ext': 'mp4',
+                'title': 'Duplicate Detection',
+                'upload_date': '20141218',
+            }
+        }]
+    }
+
+    def _real_extract(self, url):
+        lecture_id = self._match_id(url)
+        webpage = self._download_webpage(url, lecture_id)
+
+        title = self._html_search_regex(
+            r'itemprop="name">([^<]+)</a>', webpage, 'title')
+        upload_date = unified_strdate(self._html_search_regex(
+            r'Date:</td><td>([^<]+)</td>', webpage, 'date', fatal=False))
+
+        entries = [{
+            'id': '%s-%s' % (lecture_id, format_id),
+            'url': video_url,
+            'title': title,
+            'upload_date': upload_date,
+        } for format_id, video_url in re.findall(
+            r'<video class="([^"]+)"[^>]*>\s*<source src="([^"]+)"', webpage)]
+
+        return self.playlist_result(entries, lecture_id, title)
diff --git a/youtube_dl/extractor/telewebion.py b/youtube_dl/extractor/telewebion.py
new file mode 100644 (file)
index 0000000..1207b1a
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TelewebionIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.telewebion.com/#!/episode/1263668/',
+        'info_dict': {
+            'id': '1263668',
+            'ext': 'mp4',
+            'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
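+        # Two API calls: one issues a security token, the other returns episode metadata; both feed into the HLS manifest URL below.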
+        secure_token = self._download_webpage(
+            'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id)
+        episode_details = self._download_json(
+            'http://m.s2.telewebion.com/op/op', video_id,
+            query={'action': 'getEpisodeDetails', 'episode_id': video_id})
+
+        m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % (
+            video_id, episode_details['file_path'], secure_token)
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', m3u8_id='hls')
+
+        picture_paths = [
+            episode_details.get('picture_path'),
+            episode_details.get('large_picture_path'),
+        ]
+
+        thumbnails = [{
+            'url': picture_path,
+            'preference': idx,
+        } for idx, picture_path in enumerate(picture_paths) if picture_path is not None]
+
+        return {
+            'id': video_id,
+            'title': episode_details['title'],
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'view_count': episode_details.get('view_count'),
+        }
diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py
new file mode 100644 (file)
index 0000000..a586f30
--- /dev/null
@@ -0,0 +1,112 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    unified_timestamp,
+)
+
+
+class TennisTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)'
+    _TEST = {
+        'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz',
+        'info_dict': {
+            'id': 'indian-wells-2018-verdasco-fritz',
+            'ext': 'mp4',
+            'title': 'Fernando Verdasco v Taylor Fritz',
+            'description': 're:^After his stunning victory.{174}$',
+            'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0',
+            'timestamp': 1521017381,
+            'upload_date': '20180314',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires email and password of a subscribed account',
+    }
+    _NETRC_MACHINE = 'tennistv'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if not username or not password:
+            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+
+        login_form = {
+            'Email': username,
+            'Password': password,
+        }
+        login_json = json.dumps(login_form).encode('utf-8')
+        headers = {
+            'content-type': 'application/json',
+            'Referer': 'https://www.tennistv.com/login',
+            'Origin': 'https://www.tennistv.com',
+        }
+
+        login_result = self._download_json(
+            'https://www.tennistv.com/api/users/v1/login', None,
+            note='Logging in',
+            errnote='Login failed (wrong password?)',
+            headers=headers,
+            data=login_json)
+
+        if login_result['error']['errorCode']:
+            raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage']))
+
+        if login_result['entitlement'] != 'SUBSCRIBED':
+            self.report_warning('%s may not be subscribed to %s.' % (username, self.IE_NAME))
+
+        self._session_token = login_result['sessionToken']
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id')
+
+        headers = {
+            'Origin': 'https://www.tennistv.com',
+            'authorization': 'ATP %s' % self._session_token,
+            'content-type': 'application/json',
+            'Referer': url,
+        }
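+        # The entitlement check exchanges the internal id for the actual HLS manifest URL.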
+        check_data = {
+            'videoID': internal_id,
+            'VideoUrlType': 'HLSV3',
+        }
+        check_json = json.dumps(check_data).encode('utf-8')
+        check_result = self._download_json(
+            'https://www.tennistv.com/api/users/v1/entitlementchecknondiva',
+            video_id, note='Checking video authorization', headers=headers, data=check_json)
+        formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4')
+
+        vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id
+        vdata = self._download_json(vdata_url, video_id)
+
+        timestamp = unified_timestamp(vdata['timestamp'])
+        thumbnail = vdata['video']['thumbnailUrl']
+        description = vdata['displayText']['description']
+        title = vdata['video']['title']
+
+        series = vdata['tour']
+        venue = vdata['displayText']['venue']
+        round_str = vdata['seo']['round']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'series': series,
+            'season': venue,
+            'episode': round_str,
+        }
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
new file mode 100644 (file)
index 0000000..af325fe
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_age_limit,
+    parse_iso8601,
+    smuggle_url,
+)
+
+
+class TenPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/(?:[^/]+/)+(?P<id>tpv\d{6}[a-z]{5})'
+    _TESTS = [{
+        'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga',
+        'info_dict': {
+            'id': '6060533435001',
+            'ext': 'mp4',
+            'title': 'MasterChef - S1 Ep. 1',
+            'description': 'md5:4fe7b78e28af8f2d900cd20d900ef95c',
+            'age_limit': 10,
+            'timestamp': 1240828200,
+            'upload_date': '20090427',
+            'uploader_id': '2199827728001',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        content_id = self._match_id(url)
+        data = self._download_json(
+            'https://10play.com.au/api/video/' + content_id, content_id)
+        video = data.get('video') or {}
+        metadata = data.get('metaData') or {}
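+        # Resolve the Brightcove id and hand the extraction off to BrightcoveNew, smuggling the AU geo hint.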
+        brightcove_id = video.get('videoId') or metadata['showContentVideoId']
+        brightcove_url = smuggle_url(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            {'geo_countries': ['AU']})
+
+        return {
+            '_type': 'url_transparent',
+            'url': brightcove_url,
+            'id': content_id,
+            'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'),
+            'description': video.get('description'),
+            'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')),
+            'series': metadata.get('showName'),
+            'season': metadata.get('showContentSeason'),
+            'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')),
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py
new file mode 100644 (file)
index 0000000..84a14a0
--- /dev/null
@@ -0,0 +1,64 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TestURLIE(InfoExtractor):
+    """ Allows addressing of the test cases as test:yout.*be_1 """
+
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
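+    # e.g. "test:telegraaf" resolves to TelegraafIE's first test case, while
+    # "test:yout.*be_1" matches IE_NAME by regex and picks the test case at index 1.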
+
+    def _real_extract(self, url):
+        from ..extractor import gen_extractors
+
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        extractor_id = mobj.group('extractor')
+        all_extractors = gen_extractors()
+
+        rex = re.compile(extractor_id, flags=re.IGNORECASE)
+        matching_extractors = [
+            e for e in all_extractors if rex.search(e.IE_NAME)]
+
+        if len(matching_extractors) == 0:
+            raise ExtractorError(
+                'No extractors matching %r found' % extractor_id,
+                expected=True)
+        elif len(matching_extractors) > 1:
+            # If several extractors match, prefer an exact (case-insensitive) IE_NAME match
+            try:
+                extractor = next(
+                    ie for ie in matching_extractors
+                    if ie.IE_NAME.lower() == extractor_id.lower())
+            except StopIteration:
+                raise ExtractorError(
+                    ('Found multiple matching extractors: %s' %
+                        ' '.join(ie.IE_NAME for ie in matching_extractors)),
+                    expected=True)
+        else:
+            extractor = matching_extractors[0]
+
+        num_str = mobj.group('num')
+        num = int(num_str) if num_str else 0
+
+        testcases = []
+        t = getattr(extractor, '_TEST', None)
+        if t:
+            testcases.append(t)
+        testcases.extend(getattr(extractor, '_TESTS', []))
+
+        try:
+            tc = testcases[num]
+        except IndexError:
+            raise ExtractorError(
+                ('Test case %d not found, got only %d tests' %
+                    (num, len(testcases))),
+                expected=True)
+
+        self.to_screen('Test URL: %s' % tc['url'])
+
+        return self.url_result(tc['url'], video_id=video_id)
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
new file mode 100644 (file)
index 0000000..55e2a07
--- /dev/null
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+
+
+class TF1IE(InfoExtractor):
+    """TF1 uses the wat.tv player."""
+    _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'
+    _TESTS = [{
+        'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
+        'info_dict': {
+            'id': '10635995',
+            'ext': 'mp4',
+            'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
+            'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+        },
+        'params': {
+            # Sometimes wat serves the whole file with the --test option
+            'skip_download': True,
+        },
+        'expected_warnings': ['HTTP Error 404'],
+    }, {
+        'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
+        'info_dict': {
+            'id': 'le-grand-mysterioso-chuggington-7085291-739',
+            'ext': 'mp4',
+            'title': 'Le grand Mystérioso - Chuggington',
+            'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.',
+            'upload_date': '20150103',
+        },
+        'params': {
+            # Sometimes wat serves the whole file with the --test option
+            'skip_download': True,
+        },
+        'skip': 'HTTP Error 410: Gone',
+    }, {
+        'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html',
+        'info_dict': {
+            'id': '13641379',
+            'ext': 'mp4',
+            'title': 'md5:f392bc52245dc5ad43771650c96fb620',
+            'description': 'md5:44bc54f0a21322f5b91d68e76a544eae',
+            'upload_date': '20190611',
+        },
+        'params': {
+            # Sometimes wat serves the whole file with the --test option
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        wat_id = None
+
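+        # Prefer the stream id from the serialized Apollo state; fall back to scraping the wat.tv embed or inline streamId below.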
+        data = self._parse_json(
+            self._search_regex(
+                r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|</script>)', webpage,
+                'data', default='{}'), video_id, fatal=False)
+
+        if data:
+            try:
+                wat_id = next(
+                    video.get('streamId')
+                    for key, video in data.items()
+                    if isinstance(video, dict)
+                    and video.get('slug') == video_id)
+                if not isinstance(wat_id, compat_str) or not wat_id.isdigit():
+                    wat_id = None
+            except StopIteration:
+                pass
+
+        if not wat_id:
+            wat_id = self._html_search_regex(
+                (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
+                 r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2'),
+                webpage, 'wat id', group='id')
+
+        return self.url_result('wat:%s' % wat_id, 'Wat')
diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py
new file mode 100644 (file)
index 0000000..0631cb7
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    HEADRequest,
+    ExtractorError,
+    int_or_none,
+    clean_html,
+)
+
+
+class TFOIE(InfoExtractor):
+    _GEO_COUNTRIES = ['CA']
+    _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon',
+        'md5': 'cafbe4f47a8dae0ca0159937878100d6',
+        'info_dict': {
+            'id': '7da3d50e495c406b8fc0b997659cc075',
+            'ext': 'mp4',
+            'title': 'Video Game Hackathon',
+            'description': 'md5:558afeba217c6c8d96c60e5421795c07',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
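+        # A HEAD request to the homepage sets the tfo-session cookie required by the API call below.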
+        self._request_webpage(HEADRequest('http://www.tfo.org/'), video_id)
+        infos = self._download_json(
+            'http://www.tfo.org/api/web/video/get_infos', video_id, data=json.dumps({
+                'product_id': video_id,
+            }).encode(), headers={
+                'X-tfo-session': self._get_cookies('http://www.tfo.org/')['tfo-session'].value,
+            })
+        if infos.get('success') == 0:
+            if infos.get('code') == 'ErrGeoBlocked':
+                self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(infos['msg'])), expected=True)
+        video_data = infos['data']
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': 'limelight:media:' + video_data['llid'],
+            'title': video_data['title'],
+            'description': video_data.get('description'),
+            'series': video_data.get('collection'),
+            'season_number': int_or_none(video_data.get('season')),
+            'episode_number': int_or_none(video_data.get('episode')),
+            'duration': int_or_none(video_data.get('duration')),
+            'ie_key': 'LimelightMedia',
+        }
diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py
new file mode 100644 (file)
index 0000000..f23b587
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    parse_iso8601,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class TheInterceptIE(InfoExtractor):
+    _VALID_URL = r'https?://theintercept\.com/fieldofvision/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/',
+        'md5': '145f28b41d44aab2f87c0a4ac8ec95bd',
+        'info_dict': {
+            'id': '46214',
+            'ext': 'mp4',
+            'title': '#ThisIsACoup – Episode Four: Surrender or Die',
+            'description': 'md5:74dd27f0e2fbd50817829f97eaa33140',
+            'timestamp': 1450429239,
+            'upload_date': '20151218',
+            'comment_count': int,
+        }
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        json_data = self._parse_json(self._search_regex(
+            r'initialStoreTree\s*=\s*(?P<json_data>{.+})', webpage,
+            'initialStoreTree'), display_id)
+
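+        # The serialized store lists every post on the page; match ours by its URL slug.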
+        for post in json_data['resources']['posts'].values():
+            if post['slug'] == display_id:
+                return {
+                    '_type': 'url_transparent',
+                    'url': 'jwplatform:%s' % post['fov_videoid'],
+                    'id': compat_str(post['ID']),
+                    'display_id': display_id,
+                    'title': post['title'],
+                    'description': post.get('excerpt'),
+                    'timestamp': parse_iso8601(post.get('date')),
+                    'comment_count': int_or_none(post.get('comments_number')),
+                }
+        raise ExtractorError('Unable to find the current post')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
new file mode 100644 (file)
index 0000000..0705551
--- /dev/null
@@ -0,0 +1,411 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+import hmac
+import binascii
+import hashlib
+
+
+from .once import OnceIE
+from .adobepass import AdobePassIE
+from ..compat import (
+    compat_parse_qs,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    sanitized_Request,
+    unsmuggle_url,
+    update_url_query,
+    xpath_with_ns,
+    mimetype2ext,
+    find_xpath_attr,
+)
+
+default_ns = 'http://www.w3.org/2005/SMIL21/Language'
+_x = lambda p: xpath_with_ns(p, {'smil': default_ns})
+
+
+class ThePlatformBaseIE(OnceIE):
+    _TP_TLD = 'com'
+
+    def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
+        meta = self._download_xml(
+            smil_url, video_id, note=note, query={'format': 'SMIL'},
+            headers=self.geo_verification_headers())
+        error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
+        if error_element is not None:
+            exception = find_xpath_attr(
+                error_element, _x('.//smil:param'), 'name', 'exception')
+            if exception is not None:
+                if exception.get('value') == 'GeoLocationBlocked':
+                    self.raise_geo_restricted(error_element.attrib['abstract'])
+                elif error_element.attrib['src'].startswith(
+                        'http://link.theplatform.%s/s/errorFiles/Unavailable.'
+                        % self._TP_TLD):
+                    raise ExtractorError(
+                        error_element.attrib['abstract'], expected=True)
+
+        smil_formats = self._parse_smil_formats(
+            meta, smil_url, video_id, namespace=default_ns,
+            # These parameters come from syfy.com; other sites may use
+            # different ones, but these also work for nbc.com
+            f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
+            transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))
+
+        formats = []
+        for _format in smil_formats:
+            if OnceIE.suitable(_format['url']):
+                formats.extend(self._extract_once_formats(_format['url']))
+            else:
+                media_url = _format['url']
+                if determine_ext(media_url) == 'm3u8':
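+                    # Propagate the hdnea2 auth cookie into the manifest URL (as hdnea3) for HLS streams.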
+                    hdnea2 = self._get_cookies(media_url).get('hdnea2')
+                    if hdnea2:
+                        _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})
+
+                formats.append(_format)
+
+        subtitles = self._parse_smil_subtitles(meta, default_ns)
+
+        return formats, subtitles
+
+    def _download_theplatform_metadata(self, path, video_id):
+        info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path)
+        return self._download_json(info_url, video_id)
+
+    def _parse_theplatform_metadata(self, info):
+        subtitles = {}
+        captions = info.get('captions')
+        if isinstance(captions, list):
+            for caption in captions:
+                lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
+                subtitles.setdefault(lang, []).append({
+                    'ext': mimetype2ext(mime),
+                    'url': src,
+                })
+
+        duration = info.get('duration')
+        tp_chapters = info.get('chapters', [])
+        chapters = []
+        if tp_chapters:
+            def _add_chapter(start_time, end_time):
+                start_time = float_or_none(start_time, 1000)
+                end_time = float_or_none(end_time, 1000)
+                if start_time is None or end_time is None:
+                    return
+                chapters.append({
+                    'start_time': start_time,
+                    'end_time': end_time,
+                })
+
+            for chapter in tp_chapters[:-1]:
+                _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
+            _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+
+        return {
+            'title': info['title'],
+            'subtitles': subtitles,
+            'description': info['description'],
+            'thumbnail': info['defaultThumbnailUrl'],
+            'duration': float_or_none(duration, 1000),
+            'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
+            'uploader': info.get('billingCode'),
+            'chapters': chapters,
+        }
+
+    def _extract_theplatform_metadata(self, path, video_id):
+        info = self._download_theplatform_metadata(path, video_id)
+        return self._parse_theplatform_metadata(info)
+
+
+class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
+    _VALID_URL = r'''(?x)
+        (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
+           (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
+         |theplatform:)(?P<id>[^/\?&]+)'''
+
+    _TESTS = [{
+        # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
+        'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
+        'info_dict': {
+            'id': 'e9I_cZgTgIPd',
+            'ext': 'flv',
+            'title': 'Blackberry\'s big, bold Z30',
+            'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
+            'duration': 247,
+            'timestamp': 1383239700,
+            'upload_date': '20131031',
+            'uploader': 'CBSI-NEW',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+        'skip': '404 Not Found',
+    }, {
+        # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/
+        'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT',
+        'info_dict': {
+            'id': '22d_qsQ6MIRT',
+            'ext': 'flv',
+            'description': 'md5:ac330c9258c04f9d7512cf26b9595409',
+            'title': 'Tesla Model S: A second step towards a cleaner motoring future',
+            'timestamp': 1426176191,
+            'upload_date': '20150312',
+            'uploader': 'CBSI-NEW',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
+        'info_dict': {
+            'id': 'yMBg9E8KFxZD',
+            'ext': 'mp4',
+            'description': 'md5:644ad9188d655b742f942bf2e06b002d',
+            'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+            'uploader': 'EGSM',
+        }
+    }, {
+        'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
+        'only_matching': True,
+    }, {
+        'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
+        'md5': 'fb96bb3d85118930a5b055783a3bd992',
+        'info_dict': {
+            'id': 'tdy_or_siri_150701',
+            'ext': 'mp4',
+            'title': 'iPhone Siri’s sassy response to a math question has people talking',
+            'description': 'md5:a565d1deadd5086f3331d57298ec6333',
+            'duration': 83.0,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1435752600,
+            'upload_date': '20150701',
+            'uploader': 'NBCU-NEWS',
+        },
+    }, {
+        # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
+        # geo-restricted (US), HLS encrypted with AES-128
+        'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        m = re.search(
+            r'''(?x)
+                    <meta\s+
+                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
+            ''', webpage)
+        if m:
+            return [m.group('url')]
+
+        # Are whitespaces ignored in URLs?
+        # https://github.com/ytdl-org/youtube-dl/issues/12044
+        matches = re.findall(
+            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+        if matches:
+            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
+
+    @staticmethod
+    def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
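+        # The signed URL appends sig=<flags><hex expiry><HMAC-SHA1 digest><hex-encoded secret>,
+        # where the digest is keyed by sig_key and covers the flags, expiry and relative path.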
+        flags = '10' if include_qs else '00'
+        expiration_date = '%x' % (int(time.time()) + life)
+
+        def str_to_hex(s):
+            return binascii.b2a_hex(s.encode('ascii')).decode('ascii')
+
+        def hex_to_bytes(hexstr):
+            return binascii.a2b_hex(hexstr.encode('ascii'))
+
+        relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1)
+        clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
+        checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
+        sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
+        return '%s&sig=%s' % (url, sig)
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        mobj = re.match(self._VALID_URL, url)
+        provider_id = mobj.group('provider_id')
+        video_id = mobj.group('id')
+
+        if not provider_id:
+            provider_id = 'dJ5BDC'
+
+        path = provider_id + '/'
+        if mobj.group('media'):
+            path += mobj.group('media')
+        path += video_id
+
+        qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        if 'guid' in qs_dict:
+            webpage = self._download_webpage(url, video_id)
+            scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
+            feed_id = None
+            # The feed id is usually located in the last script. There is no
+            # obvious pattern to the relevant script's filename, so try them
+            # one by one, starting from the last.
+            for script in reversed(scripts):
+                feed_script = self._download_webpage(
+                    self._proto_relative_url(script, 'http:'),
+                    video_id, 'Downloading feed script')
+                feed_id = self._search_regex(
+                    r'defaultFeedId\s*:\s*"([^"]+)"', feed_script,
+                    'default feed id', default=None)
+                if feed_id is not None:
+                    break
+            if feed_id is None:
+                raise ExtractorError('Unable to find feed id')
+            return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % (
+                provider_id, feed_id, qs_dict['guid'][0]))
+
+        if smuggled_data.get('force_smil_url', False):
+            smil_url = url
+        # Explicitly specified SMIL (see https://github.com/ytdl-org/youtube-dl/issues/7385)
+        elif '/guid/' in url:
+            headers = {}
+            source_url = smuggled_data.get('source_url')
+            if source_url:
+                headers['Referer'] = source_url
+            request = sanitized_Request(url, headers=headers)
+            webpage = self._download_webpage(request, video_id)
+            smil_url = self._search_regex(
+                r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
+                webpage, 'smil url', group='url')
+            path = self._search_regex(
+                r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
+            smil_url += ('?' if '?' not in smil_url else '&') + 'formats=m3u,mpeg4'
+        elif mobj.group('config'):
+            config_url = url + '&form=json'
+            config_url = config_url.replace('swf/', 'config/')
+            config_url = config_url.replace('onsite/', 'onsite/config/')
+            config = self._download_json(config_url, video_id, 'Downloading config')
+            if 'releaseUrl' in config:
+                release_url = config['releaseUrl']
+            else:
+                release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+            smil_url = release_url + '&formats=MPEG4&manifest=f4m'
+        else:
+            smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+
+        sig = smuggled_data.get('sig')
+        if sig:
+            smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
+
+        formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
+        self._sort_formats(formats)
+
+        ret = self._extract_theplatform_metadata(path, video_id)
+        combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
+        ret.update({
+            'id': video_id,
+            'formats': formats,
+            'subtitles': combined_subtitles,
+        })
+
+        return ret
+
+
+class ThePlatformFeedIE(ThePlatformBaseIE):
+    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
+    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))'
+    _TESTS = [{
+        # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
+        'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
+        'md5': '6e32495b5073ab414471b615c5ded394',
+        'info_dict': {
+            'id': 'n_hardball_5biden_140207',
+            'ext': 'mp4',
+            'title': 'The Biden factor: will Joe run in 2016?',
+            'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20140208',
+            'timestamp': 1391824260,
+            'duration': 467.0,
+            'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+            'uploader': 'NBCU-NEWS',
+        },
+    }, {
+        'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01',
+        'only_matching': True,
+    }]
+
+    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
+        real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
+        entry = self._download_json(real_url, video_id)['entries'][0]
+        main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl')
+
+        formats = []
+        subtitles = {}
+        first_video_id = None
+        duration = None
+        asset_types = []
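+        # Each media file advertises one or more asset types; fetch a SMIL per distinct type and merge the resulting formats.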
+        for item in entry['media$content']:
+            smil_url = item['plfile$url']
+            cur_video_id = ThePlatformIE._match_id(smil_url)
+            if first_video_id is None:
+                first_video_id = cur_video_id
+                duration = float_or_none(item.get('plfile$duration'))
+            file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes']
+            for asset_type in file_asset_types:
+                if asset_type in asset_types:
+                    continue
+                asset_types.append(asset_type)
+                query = {
+                    'mbr': 'true',
+                    'formats': item['plfile$format'],
+                    'assetTypes': asset_type,
+                }
+                if asset_type in asset_types_query:
+                    query.update(asset_types_query[asset_type])
+                cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
+                    main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+                formats.extend(cur_formats)
+                subtitles = self._merge_subtitles(subtitles, cur_subtitles)
+
+        self._sort_formats(formats)
+
+        thumbnails = [{
+            'url': thumbnail['plfile$url'],
+            'width': int_or_none(thumbnail.get('plfile$width')),
+            'height': int_or_none(thumbnail.get('plfile$height')),
+        } for thumbnail in entry.get('media$thumbnails', [])]
+
+        timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
+        categories = [item['media$name'] for item in entry.get('media$categories', [])]
+
+        ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+        subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
+        ret.update({
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'timestamp': timestamp,
+            'categories': categories,
+        })
+        if custom_fields:
+            ret.update(custom_fields(entry))
+
+        return ret
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        provider_id = mobj.group('provider_id')
+        feed_id = mobj.group('feed_id')
+        filter_query = mobj.group('filter')
+
+        return self._extract_feed_info(provider_id, feed_id, filter_query, video_id)
diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py
new file mode 100644
index 0000000..cd64235
--- /dev/null
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..compat import compat_urlparse
+
+
+class TheSceneIE(InfoExtractor):
+    _VALID_URL = r'https?://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)'
+
+    _TEST = {
+        'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear',
+        'info_dict': {
+            'id': '520e8faac2b4c00e3c6e5f43',
+            'ext': 'mp4',
+            'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear',
+            'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear',
+            'duration': 127,
+            'series': 'Style.com Fashion Shows',
+            'season': 'Ready To Wear Spring 2013',
+            'tags': list,
+            'categories': list,
+            'upload_date': '20120913',
+            'timestamp': 1347512400,
+            'uploader': 'vogue',
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        player_url = compat_urlparse.urljoin(
+            url,
+            self._html_search_regex(
+                r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url'))
+
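+        # Playback is handled by the CondeNast extractor, which powers The
+        # Scene's player; url_transparent keeps our display_id in the result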
+        return {
+            '_type': 'url_transparent',
+            'display_id': display_id,
+            'url': player_url,
+            'ie_key': 'CondeNast',
+        }
diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py
new file mode 100644
index 0000000..c3f1188
--- /dev/null
@@ -0,0 +1,36 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TheStarIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P<id>.+)\.html'
+    _TEST = {
+        'url': 'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html',
+        'md5': '2c62dd4db2027e35579fefb97a8b6554',
+        'info_dict': {
+            'id': '4732393888001',
+            'ext': 'mp4',
+            'title': 'Mankind: Why this woman started a men\'s skin care line',
+            'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.',
+            'uploader_id': '794267642001',
+            'timestamp': 1454353482,
+            'upload_date': '20160201',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        brightcove_id = self._search_regex(
+            r'mainartBrightcoveVideoId["\']?\s*:\s*["\']?(\d+)',
+            webpage, 'brightcove id')
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py
new file mode 100644
index 0000000..15d4a69
--- /dev/null
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import extract_attributes
+
+
+class TheSunIE(InfoExtractor):
+    _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/',
+        'info_dict': {
+            'id': '2261604',
+            'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf',
+        },
+        'playlist_count': 2,
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, article_id)
+
+        entries = []
+        for video in re.findall(
+                r'<video[^>]+data-video-id-pending=[^>]+>',
+                webpage):
+            attrs = extract_attributes(video)
+            video_id = attrs['data-video-id-pending']
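+            # fall back to The Sun's default Brightcove account id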
+            account_id = attrs.get('data-account', '5067014667001')
+            entries.append(self.url_result(
+                self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id),
+                'BrightcoveNew', video_id))
+
+        return self.playlist_result(
+            entries, article_id, self._og_search_title(webpage, fatal=False))
diff --git a/youtube_dl/extractor/theweatherchannel.py b/youtube_dl/extractor/theweatherchannel.py
new file mode 100644
index 0000000..c34a49d
--- /dev/null
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .theplatform import ThePlatformIE
+from ..utils import (
+    determine_ext,
+    parse_duration,
+)
+
+
+class TheWeatherChannelIE(ThePlatformIE):
+    _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock',
+        'md5': 'ab924ac9574e79689c24c6b95e957def',
+        'info_dict': {
+            'id': 'cc82397e-cc3f-4d11-9390-a785add090e8',
+            'ext': 'mp4',
+            'title': 'Ice Climber Is In For A Shock',
+            'description': 'md5:55606ce1378d4c72e6545e160c9d9695',
+            'uploader': 'TWC - Digital (No Distro)',
+            'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c',
+        }
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        drupal_settings = self._parse_json(self._search_regex(
+            r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+            webpage, 'drupal settings'), display_id)
+        video_id = drupal_settings['twc']['contexts']['node']['uuid']
+        video_data = self._download_json(
+            'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id)
+        seo_meta = video_data.get('seometa', {})
+        title = video_data.get('title') or seo_meta['title']
+
+        urls = []
+        thumbnails = []
+        formats = []
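+        # Dispatch each variant by URL extension: jpg becomes a thumbnail,
+        # ThePlatform/HLS/HDS URLs are expanded into formats and anything
+        # else is kept as a direct format URL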
+        for variant_id, variant_url in video_data.get('variants', {}).items():
+            variant_url = variant_url.strip()
+            if not variant_url or variant_url in urls:
+                continue
+            urls.append(variant_url)
+            ext = determine_ext(variant_url)
+            if ext == 'jpg':
+                thumbnails.append({
+                    'url': variant_url,
+                    'id': variant_id,
+                })
+            elif ThePlatformIE.suitable(variant_url):
+                tp_formats, _ = self._extract_theplatform_smil(variant_url, video_id)
+                formats.extend(tp_formats)
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    variant_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=variant_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    variant_url, video_id, f4m_id=variant_id, fatal=False))
+            else:
+                formats.append({
+                    'url': variant_url,
+                    'format_id': variant_id,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': video_data.get('description') or seo_meta.get('description') or seo_meta.get('og:description'),
+            'duration': parse_duration(video_data.get('duration')),
+            'uploader': video_data.get('providername'),
+            'uploader_id': video_data.get('providerid'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/thisamericanlife.py b/youtube_dl/extractor/thisamericanlife.py
new file mode 100644
index 0000000..91e45f2
--- /dev/null
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisAmericanLifeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisamericanlife\.org/(?:radio-archives/episode/|play_full\.php\?play=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.thisamericanlife.org/radio-archives/episode/487/harper-high-school-part-one',
+        'md5': '8f7d2da8926298fdfca2ee37764c11ce',
+        'info_dict': {
+            'id': '487',
+            'ext': 'm4a',
+            'title': '487: Harper High School, Part One',
+            'description': 'md5:ee40bdf3fb96174a9027f76dbecea655',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.thisamericanlife.org/play_full.php?play=487',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.thisamericanlife.org/radio-archives/episode/%s' % video_id, video_id)
+
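+        # Each episode is served as a fixed 64 kbps AAC HLS stream keyed by
+        # the episode id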
+        return {
+            'id': video_id,
+            'url': 'http://stream.thisamericanlife.org/{0}/stream/{0}_64k.m3u8'.format(video_id),
+            'protocol': 'm3u8_native',
+            'ext': 'm4a',
+            'acodec': 'aac',
+            'vcodec': 'none',
+            'abr': 64,
+            'title': self._html_search_meta(r'twitter:title', webpage, 'title', fatal=True),
+            'description': self._html_search_meta(r'description', webpage, 'description'),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py
new file mode 100644
index 0000000..dc3dd03
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class ThisAVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
+    _TESTS = [{
+        # jwplayer
+        'url': 'http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html',
+        'md5': '0480f1ef3932d901f0e0e719f188f19b',
+        'info_dict': {
+            'id': '47734',
+            'ext': 'flv',
+            'title': '高樹マリア - Just fit',
+            'uploader': 'dj7970',
+            'uploader_id': 'dj7970'
+        }
+    }, {
+        # html5 media
+        'url': 'http://www.thisav.com/video/242352/nerdy-18yo-big-ass-tattoos-and-glasses.html',
+        'md5': 'ba90c076bd0f80203679e5b60bf523ee',
+        'info_dict': {
+            'id': '242352',
+            'ext': 'mp4',
+            'title': 'Nerdy 18yo Big Ass Tattoos and Glasses',
+            'uploader': 'cybersluts',
+            'uploader_id': 'cybersluts',
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        title = remove_end(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'),
+            ' - 視頻 - ThisAV.com-世界第一中文成人娛樂網站')
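+        # Legacy pages expose the file through a Flash addVariable call;
+        # fall back to HTML5 media entries, then to JWPlayer data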
+        video_url = self._html_search_regex(
+            r"addVariable\('file','([^']+)'\);", webpage, 'video url', default=None)
+        if video_url:
+            info_dict = {
+                'formats': [{
+                    'url': video_url,
+                }],
+            }
+        else:
+            entries = self._parse_html5_media_entries(url, webpage, video_id)
+            if entries:
+                info_dict = entries[0]
+            else:
+                info_dict = self._extract_jwplayer_data(
+                    webpage, video_id, require_title=False)
+        uploader = self._html_search_regex(
+            r': <a href="http://www\.thisav\.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
+            webpage, 'uploader name', fatal=False)
+        uploader_id = self._html_search_regex(
+            r': <a href="http://www\.thisav\.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
+            webpage, 'uploader id', fatal=False)
+
+        info_dict.update({
+            'id': video_id,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'title': title,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py
new file mode 100644
index 0000000..a3d9b40
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ThisOldHouseIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
+        'info_dict': {
+            'id': '5dcdddf673c3f956ef5db202',
+            'ext': 'mp4',
+            'title': 'How to Build a Storage Bench',
+            'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
+            'timestamp': 1442548800,
+            'upload_date': '20150918',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost',
+        'only_matching': True,
+    }, {
+        # iframe www.thisoldhouse.com
+        'url': 'https://www.thisoldhouse.com/21083431/seaside-transformation-the-westerly-project',
+        'only_matching': True,
+    }]
+    _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
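+        # The page embeds a Zype player in an iframe; its 24-hex-digit id
+        # keys the Zype embed URL built from _ZYPE_TMPL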
+        video_id = self._search_regex(
+            r'<iframe[^>]+src=[\'"](?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})',
+            webpage, 'video id')
+        return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id)
diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py
new file mode 100644
index 0000000..f26937d
--- /dev/null
@@ -0,0 +1,142 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    mimetype2ext,
+)
+
+
+class ThreeQSDNIE(InfoExtractor):
+    IE_NAME = '3qsdn'
+    IE_DESC = '3Q SDN'
+    _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TESTS = [{
+        # ondemand from http://www.philharmonie.tv/veranstaltung/26/
+        'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
+        'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd',
+        'info_dict': {
+            'id': '0280d6b9-1215-11e6-b427-0cc47a188158',
+            'ext': 'mp4',
+            'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
+            'is_live': False,
+        },
+        'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'],
+    }, {
+        # live video stream
+        'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+        'info_dict': {
+            'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+            'ext': 'mp4',
+            'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
+        },
+        'expected_warnings': ['Failed to download MPD manifest'],
+    }, {
+        # live audio stream
+        'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # live audio stream with some 404 URLs
+        'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # geo restricted with 'This content is not available in your country'
+        'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # geo restricted with 'playout.3qsdn.com/forbidden'
+        'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48',
+        'only_matching': True,
+    }, {
+        # live video with rtmp link
+        'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
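+        # Used by the generic extractor to detect embedded 3Q SDN players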
+        mobj = re.search(
+            r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        js = self._download_webpage(
+            'http://playout.3qsdn.com/%s' % video_id, video_id,
+            query={'js': 'true'})
+
+        if any(p in js for p in (
+                '>This content is not available in your country',
+                'playout.3qsdn.com/forbidden')):
+            self.raise_geo_restricted()
+
+        stream_content = self._search_regex(
+            r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js,
+            'stream content', default='demand', group='content')
+
+        live = stream_content == 'live'
+
+        stream_type = self._search_regex(
+            r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js,
+            'stream type', default='video', group='type')
+
+        formats = []
+        urls = set()
+
+        def extract_formats(item_url, item={}):
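+            # Collect each distinct URL once, then branch on the container
+            # type (DASH/HLS/HDS manifests vs. progressive or RTSP links)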
+            if not item_url or item_url in urls:
+                return
+            urls.add(item_url)
+            ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
+            if ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    item_url, video_id, mpd_id='mpd', fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    item_url, video_id, 'mp4',
+                    entry_protocol='m3u8' if live else 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    item_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                if not self._is_valid_url(item_url, video_id):
+                    return
+                formats.append({
+                    'url': item_url,
+                    'format_id': item.get('quality'),
+                    'ext': 'mp4' if item_url.startswith('rtsp') else ext,
+                    'vcodec': 'none' if stream_type == 'audio' else None,
+                })
+
+        for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js):
+            f = self._parse_json(
+                item_js, video_id, transform_source=js_to_json, fatal=False)
+            if not f:
+                continue
+            extract_formats(f.get('src'), f)
+
+        # A more relaxed pass that collects additional URLs and acts as a
+        # future-proof fallback
+        for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
+            extract_formats(src)
+
+        self._sort_formats(formats)
+
+        title = self._live_title(video_id) if live else video_id
+
+        return {
+            'id': video_id,
+            'title': title,
+            'is_live': live,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py
new file mode 100644
index 0000000..66088b9
--- /dev/null
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+    try_get,
+    url_or_none,
+)
+
+
+class TikTokBaseIE(InfoExtractor):
+    def _extract_aweme(self, data):
+        video = data['video']
+        description = str_or_none(try_get(data, lambda x: x['desc']))
+        width = int_or_none(try_get(data, lambda x: video['width']))
+        height = int_or_none(try_get(data, lambda x: video['height']))
+
+        format_urls = set()
+        formats = []
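+        # Probe the known play/download address fields in order and keep
+        # every distinct URL as an mp4 format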
+        for format_id in (
+                'play_addr_lowbr', 'play_addr', 'play_addr_h264',
+                'download_addr'):
+            for format in try_get(
+                    video, lambda x: x[format_id]['url_list'], list) or []:
+                format_url = url_or_none(format)
+                if not format_url:
+                    continue
+                if format_url in format_urls:
+                    continue
+                format_urls.add(format_url)
+                formats.append({
+                    'url': format_url,
+                    'ext': 'mp4',
+                    'height': height,
+                    'width': width,
+                })
+        self._sort_formats(formats)
+
+        thumbnail = url_or_none(try_get(
+            video, lambda x: x['cover']['url_list'][0], compat_str))
+        uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
+        timestamp = int_or_none(data.get('create_time'))
+        comment_count = int_or_none(data.get('comment_count')) or int_or_none(
+            try_get(data, lambda x: x['statistics']['comment_count']))
+        repost_count = int_or_none(try_get(
+            data, lambda x: x['statistics']['share_count']))
+
+        aweme_id = data['aweme_id']
+
+        return {
+            'id': aweme_id,
+            'title': uploader or aweme_id,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'comment_count': comment_count,
+            'repost_count': repost_count,
+            'formats': formats,
+        }
+
+
+class TikTokIE(TikTokBaseIE):
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:m\.)?tiktok\.com/v|
+                                (?:www\.)?tiktok\.com/share/video
+                            )
+                            /(?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'https://m.tiktok.com/v/6606727368545406213.html',
+        'md5': 'd584b572e92fcd48888051f238022420',
+        'info_dict': {
+            'id': '6606727368545406213',
+            'ext': 'mp4',
+            'title': 'Zureeal',
+            'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
+            'thumbnail': r're:^https?://.*~noop.image',
+            'uploader': 'Zureeal',
+            'timestamp': 1538248586,
+            'upload_date': '20180929',
+            'comment_count': int,
+            'repost_count': int,
+        }
+    }, {
+        'url': 'https://www.tiktok.com/share/video/6606727368545406213',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'https://m.tiktok.com/v/%s.html' % video_id, video_id)
+        data = self._parse_json(self._search_regex(
+            r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id)
+        return self._extract_aweme(data)
+
+
+class TikTokUserIE(TikTokBaseIE):
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:m\.)?tiktok\.com/h5/share/usr|
+                                (?:www\.)?tiktok\.com/share/user
+                            )
+                            /(?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html',
+        'info_dict': {
+            'id': '188294915489964032',
+        },
+        'playlist_mincount': 24,
+    }, {
+        'url': 'https://www.tiktok.com/share/user/188294915489964032',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
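+        # The endpoint requires a _signature query parameter, but a dummy
+        # value appears to be accepted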
+        data = self._download_json(
+            'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id,
+            query={'_signature': '_'})
+        entries = []
+        for aweme in data['aweme_list']:
+            try:
+                entry = self._extract_aweme(aweme)
+            except ExtractorError:
+                continue
+            entry['extractor_key'] = TikTokIE.ie_key()
+            entries.append(entry)
+        return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
new file mode 100644
index 0000000..bc2def5
--- /dev/null
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TinyPicIE(InfoExtractor):
+    IE_NAME = 'tinypic'
+    IE_DESC = 'tinypic.com videos'
+    _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
+
+    _TESTS = [
+        {
+            'url': 'http://tinypic.com/player.php?v=6xw7tc%3E&s=5#.UtqZmbRFCM8',
+            'md5': '609b74432465364e72727ebc6203f044',
+            'info_dict': {
+                'id': '6xw7tc',
+                'ext': 'flv',
+                'title': 'shadow phenomenon weird',
+            },
+        },
+        {
+            'url': 'http://de.tinypic.com/player.php?v=dy90yh&s=8',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id, 'Downloading page')
+
+        mobj = re.search(r'(?m)fo\.addVariable\("file",\s"(?P<fileid>[\da-z]+)"\);\n'
+                         r'\s+fo\.addVariable\("s",\s"(?P<serverid>\d+)"\);', webpage)
+        if mobj is None:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        file_id = mobj.group('fileid')
+        server_id = mobj.group('serverid')
+
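+        # The page title lives in the keywords meta tag with a fixed
+        # promotional suffix appended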
+        KEYWORDS_SUFFIX = ', Video, images, photos, videos, myspace, ebay, video hosting, photo hosting'
+        keywords = self._html_search_meta('keywords', webpage, 'title', default='')
+        title = keywords[:-len(KEYWORDS_SUFFIX)] if keywords.endswith(KEYWORDS_SUFFIX) else ''
+
+        video_url = 'http://v%s.tinypic.com/%s.flv' % (server_id, file_id)
+        thumbnail = 'http://v%s.tinypic.com/%s_th.jpg' % (server_id, file_id)
+
+        return {
+            'id': file_id,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'title': title
+        }
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
new file mode 100644
index 0000000..419f9d9
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TMZIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://www.tmz.com/videos/0_okj015ty/',
+        'md5': '4d22a51ef205b6c06395d8394f72d560',
+        'info_dict': {
+            'id': '0_okj015ty',
+            'ext': 'mp4',
+            'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!',
+            'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie???  Or is she just showing off her amazing boobs?',
+            'timestamp': 1394747163,
+            'uploader_id': 'batchUser',
+            'upload_date': '20140313',
+        }
+    }, {
+        'url': 'http://www.tmz.com/videos/0-cegprt2p/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
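+        # TMZ URLs may use a hyphen where the Kaltura entry id has an
+        # underscore; normalize before building the kaltura: URL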
+        video_id = self._match_id(url).replace('-', '_')
+        return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id)
+
+
+class TMZArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+    _TEST = {
+        'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
+        'md5': '3316ff838ae5bb7f642537825e1e90d2',
+        'info_dict': {
+            'id': '0_6snoelag',
+            'ext': 'mov',
+            'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
+            'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake.  She\'s watching me."',
+            'timestamp': 1429467813,
+            'upload_date': '20150419',
+            'uploader_id': 'batchUser',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        embedded_video_info = self._parse_json(self._html_search_regex(
+            r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'),
+            video_id)
+
+        return self.url_result(
+            'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
new file mode 100644
index 0000000..b3573c6
--- /dev/null
@@ -0,0 +1,327 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    int_or_none,
+    parse_duration,
+    str_to_int,
+    unescapeHTML,
+    xpath_text,
+)
+
+
+class TNAFlixNetworkBaseIE(InfoExtractor):
+    # May be overridden in descendants if necessary
+    _CONFIG_REGEX = [
+        r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"',
+        r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"',
+        r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1',
+    ]
+    _HOST = 'tna'
+    _VKEY_SUFFIX = ''
+    _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"'
+    _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"'
+    _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"'
+    _VIEW_COUNT_REGEX = None
+    _COMMENT_COUNT_REGEX = None
+    _AVERAGE_RATING_REGEX = None
+    _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>'
+
+    def _extract_thumbnails(self, flix_xml):
+
+        def get_child(elem, names):
+            for name in names:
+                child = elem.find(name)
+                if child is not None:
+                    return child
+
+        timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage'])
+        if timeline is None:
+            return
+
+        pattern_el = get_child(timeline, ['imagePattern', 'pattern'])
+        if pattern_el is None or not pattern_el.text:
+            return
+
+        first_el = get_child(timeline, ['imageFirst', 'first'])
+        last_el = get_child(timeline, ['imageLast', 'last'])
+        if first_el is None or last_el is None:
+            return
+
+        first_text = first_el.text
+        last_text = last_el.text
+        if not first_text.isdigit() or not last_text.isdigit():
+            return
+
+        first = int(first_text)
+        last = int(last_text)
+        if first > last:
+            return
+
+        width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width'))
+        height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height'))
+
+        return [{
+            'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'),
+            'width': width,
+            'height': height,
+        } for i in range(first, last + 1)]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        for display_id_key in ('display_id', 'display_id_2'):
+            if display_id_key in mobj.groupdict():
+                display_id = mobj.group(display_id_key)
+                if display_id:
+                    break
+        else:
+            display_id = video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        cfg_url = self._proto_relative_url(self._html_search_regex(
+            self._CONFIG_REGEX, webpage, 'flashvars.config', default=None,
+            group='url'), 'http:')
+
+        if not cfg_url:
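+            # Reconstruct the config URL from hidden form inputs when the
+            # page does not expose flashvars.config directly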
+            inputs = self._hidden_inputs(webpage)
+            cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha'
+                       % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id))
+
+        cfg_xml = self._download_xml(
+            cfg_url, display_id, 'Downloading metadata',
+            transform_source=fix_xml_ampersands, headers={'Referer': url})
+
+        formats = []
+
+        def extract_video_url(vl):
+            # Any URL modification now results in HTTP Error 403: Forbidden
+            return unescapeHTML(vl.text)
+
+        video_link = cfg_xml.find('./videoLink')
+        if video_link is not None:
+            formats.append({
+                'url': extract_video_url(video_link),
+                'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'),
+            })
+
+        for item in cfg_xml.findall('./quality/item'):
+            video_link = item.find('./videoLink')
+            if video_link is None:
+                continue
+            res = item.find('res')
+            format_id = None if res is None else res.text
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]', format_id, 'height', default=None))
+            formats.append({
+                'url': self._proto_relative_url(extract_video_url(video_link), 'http:'),
+                'format_id': format_id,
+                'height': height,
+            })
+
+        self._sort_formats(formats)
+
+        thumbnail = self._proto_relative_url(
+            xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
+        thumbnails = self._extract_thumbnails(cfg_xml)
+
+        title = None
+        if self._TITLE_REGEX:
+            title = self._html_search_regex(
+                self._TITLE_REGEX, webpage, 'title', default=None)
+        if not title:
+            title = self._og_search_title(webpage)
+
+        age_limit = self._rta_search(webpage) or 18
+
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration', default=None))
+
+        def extract_field(pattern, name):
+            return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None
+
+        description = extract_field(self._DESCRIPTION_REGEX, 'description')
+        uploader = extract_field(self._UPLOADER_REGEX, 'uploader')
+        view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count'))
+        comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count'))
+        average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
+
+        categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
+        categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else []
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'thumbnails': thumbnails,
+            'duration': duration,
+            'age_limit': age_limit,
+            'uploader': uploader,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'average_rating': average_rating,
+            'categories': categories,
+            'formats': formats,
+        }
+
+
+class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
+    _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+
+    _TITLE_REGEX = r'<title>([^<]+)</title>'
+
+    _TESTS = [{
+        'url': 'https://player.tnaflix.com/video/6538',
+        'info_dict': {
+            'id': '6538',
+            'display_id': '6538',
+            'ext': 'mp4',
+            'title': 'Educational xxx video',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://player.empflix.com/video/33051',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1',
+            webpage)]
+
+
+class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE):
+    _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<'
+    _UPLOADER_REGEX = r'<span>by\s*<a[^>]+\bhref=["\']/profile/[^>]+>([^<]+)<'
+    _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>'
+
+
+class TNAFlixIE(TNAEMPFlixBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
+
+    _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)</title>'
+
+    _TESTS = [{
+        # anonymous uploader, no categories
+        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+        'md5': '7e569419fe6d69543d01e6be22f5f7c4',
+        'info_dict': {
+            'id': '553878',
+            'display_id': 'Carmella-Decesare-striptease',
+            'ext': 'mp4',
+            'title': 'Carmella Decesare - striptease',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'duration': 91,
+            'age_limit': 18,
+            'categories': ['Porn Stars'],
+        }
+    }, {
+        # non-anonymous uploader, categories
+        'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
+        'md5': '0f5d4d490dbfd117b8607054248a07c0',
+        'info_dict': {
+            'id': '6538',
+            'display_id': 'Educational-xxx-video',
+            'ext': 'mp4',
+            'title': 'Educational xxx video',
+            'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'duration': 164,
+            'age_limit': 18,
+            'uploader': 'bobwhite39',
+            'categories': list,
+        }
+    }, {
+        'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+        'only_matching': True,
+    }]
+
+
+class EMPFlixIE(TNAEMPFlixBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)'
+
+    _HOST = 'emp'
+    _VKEY_SUFFIX = '-1'
+
+    _TESTS = [{
+        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+        'md5': 'bc30d48b91a7179448a0bda465114676',
+        'info_dict': {
+            'id': '33051',
+            'display_id': 'Amateur-Finger-Fuck',
+            'ext': 'mp4',
+            'title': 'Amateur Finger Fuck',
+            'description': 'Amateur solo finger fucking.',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'duration': 83,
+            'age_limit': 18,
+            'uploader': 'cwbike',
+            'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'],
+        }
+    }, {
+        'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051',
+        'only_matching': True,
+    }]
+
+
+class MovieFapIE(TNAFlixNetworkBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html'
+
+    _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>'
+    _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>'
+    _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>'
+    _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>'
+
+    _TESTS = [{
+        # normal, multi-format video
+        'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html',
+        'md5': '26624b4e2523051b550067d547615906',
+        'info_dict': {
+            'id': 'be9867c9416c19f54a4a',
+            'display_id': 'experienced-milf-amazing-handjob',
+            'ext': 'mp4',
+            'title': 'Experienced MILF Amazing Handjob',
+            'description': 'Experienced MILF giving an Amazing Handjob',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'age_limit': 18,
+            'uploader': 'darvinfred06',
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'],
+        }
+    }, {
+        # quirky single-format case where the extension is given as fid, but the video is really an flv
+        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
+        'md5': 'fa56683e291fc80635907168a743c9ad',
+        'info_dict': {
+            'id': 'e5da0d3edce5404418f5',
+            'display_id': 'jeune-couple-russe',
+            'ext': 'flv',
+            'title': 'Jeune Couple Russe',
+            'description': 'Amateur',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'age_limit': 18,
+            'uploader': 'whiskeyjar',
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['Amateur', 'Teen'],
+        }
+    }]
diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py
new file mode 100644
index 0000000..ca2e36e
--- /dev/null
@@ -0,0 +1,213 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    parse_iso8601,
+    sanitized_Request,
+)
+
+
+class ToggleIE(InfoExtractor):
+    IE_NAME = 'toggle'
+    _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',
+        'info_dict': {
+            'id': '343115',
+            'ext': 'mp4',
+            'title': 'Lion Moms Premiere',
+            'description': 'md5:aea1149404bff4d7f7b6da11fafd8e6b',
+            'upload_date': '20150910',
+            'timestamp': 1441858274,
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        }
+    }, {
+        'note': 'DRM-protected video',
+        'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413',
+        'info_dict': {
+            'id': '341413',
+            'ext': 'wvm',
+            'title': 'Dug\'s Special Mission',
+            'description': 'md5:e86c6f4458214905c1772398fabc93e0',
+            'upload_date': '20150827',
+            'timestamp': 1440644006,
+        },
+        'params': {
+            'skip_download': 'DRM-protected wvm download',
+        }
+    }, {
+        # this also tests correct video id extraction
+        'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay',
+        'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861',
+        'info_dict': {
+            'id': '332861',
+            'ext': 'mp4',
+            'title': '28th SEA Games (5 Show) -  Episode  11',
+            'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa',
+            'upload_date': '20150605',
+            'timestamp': 1433480166,
+        },
+        'params': {
+            'skip_download': 'DRM-protected wvm download',
+        },
+        'skip': 'm3u8 links are geo-restricted'
+    }, {
+        'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mewatch.sg/en/movies/seven-days/321936',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585',
+        'only_matching': True,
+    }]
+
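+    # Negative preferences rank the DRM-protected wvm variants below plain
+    # mp4 formats, which default to -1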
+    _FORMAT_PREFERENCES = {
+        'wvm-STBMain': -10,
+        'wvm-iPadMain': -20,
+        'wvm-iPhoneMain': -30,
+        'wvm-Android': -40,
+    }
+    _API_USER = 'tvpapi_147'
+    _API_PASS = '11111'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            url, video_id, note='Downloading video page')
+
+        api_user = self._search_regex(
+            r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser',
+            default=self._API_USER, group='user')
+        api_pass = self._search_regex(
+            r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass',
+            default=self._API_PASS, group='pass')
+
+        params = {
+            'initObj': {
+                'Locale': {
+                    'LocaleLanguage': '',
+                    'LocaleCountry': '',
+                    'LocaleDevice': '',
+                    'LocaleUserState': 0
+                },
+                'Platform': 0,
+                'SiteGuid': 0,
+                'DomainID': '0',
+                'UDID': '',
+                'ApiUser': api_user,
+                'ApiPass': api_pass
+            },
+            'MediaID': video_id,
+            'mediaType': 0,
+        }
+
+        req = sanitized_Request(
+            'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo',
+            json.dumps(params).encode('utf-8'))
+        info = self._download_json(req, video_id, 'Downloading video info json')
+
+        title = info['MediaName']
+
+        formats = []
+        for video_file in info.get('Files', []):
+            video_url, vid_format = video_file.get('URL'), video_file.get('Format')
+            if not video_url or video_url == 'NA' or not vid_format:
+                continue
+            ext = determine_ext(video_url)
+            vid_format = vid_format.replace(' ', '')
+            # if geo-restricted, m3u8 is inaccessible, but mp4 is okay
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, ext='mp4', m3u8_id=vid_format,
+                    note='Downloading %s m3u8 information' % vid_format,
+                    errnote='Failed to download %s m3u8 information' % vid_format,
+                    fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    video_url, video_id, mpd_id=vid_format,
+                    note='Downloading %s MPD manifest' % vid_format,
+                    errnote='Failed to download %s MPD manifest' % vid_format,
+                    fatal=False))
+            elif ext == 'ism':
+                formats.extend(self._extract_ism_formats(
+                    video_url, video_id, ism_id=vid_format,
+                    note='Downloading %s ISM manifest' % vid_format,
+                    errnote='Failed to download %s ISM manifest' % vid_format,
+                    fatal=False))
+            elif ext in ('mp4', 'wvm'):
+                # wvm are drm-protected files
+                formats.append({
+                    'ext': ext,
+                    'url': video_url,
+                    'format_id': vid_format,
+                    'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1,
+                    'format_note': 'DRM-protected video' if ext == 'wvm' else None
+                })
+        if not formats:
+            # Most likely because the video is geo-blocked
+            raise ExtractorError('No downloadable videos found', expected=True)
+        self._sort_formats(formats)
+
+        duration = int_or_none(info.get('Duration'))
+        description = info.get('Description')
+        created_at = parse_iso8601(info.get('CreationDate') or None)
+
+        average_rating = float_or_none(info.get('Rating'))
+        view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter'))
+        like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter'))
+
+        thumbnails = []
+        for picture in info.get('Pictures', []):
+            if not isinstance(picture, dict):
+                continue
+            pic_url = picture.get('URL')
+            if not pic_url:
+                continue
+            thumbnail = {
+                'url': pic_url,
+            }
+            pic_size = picture.get('PicSize', '')
+            m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', pic_size)
+            if m:
+                thumbnail.update({
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                })
+            thumbnails.append(thumbnail)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': created_at,
+            'average_rating': average_rating,
+            'view_count': view_count,
+            'like_count': like_count,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tonline.py b/youtube_dl/extractor/tonline.py
new file mode 100644
index 0000000..cc11eae
--- /dev/null
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class TOnlineIE(InfoExtractor):
+    IE_NAME = 't-online.de'
+    _VALID_URL = r'https?://(?:www\.)?t-online\.de/tv/(?:[^/]+/)*id_(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.t-online.de/tv/sport/fussball/id_79166266/drittes-remis-zidane-es-muss-etwas-passieren-.html',
+        'md5': '7d94dbdde5f9d77c5accc73c39632c29',
+        'info_dict': {
+            'id': '79166266',
+            'ext': 'mp4',
+            'title': 'Drittes Remis! Zidane: "Es muss etwas passieren"',
+            'description': 'Es läuft nicht rund bei Real Madrid. Das 1:1 gegen den SD Eibar war das dritte Unentschieden in Folge in der Liga.',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'http://www.t-online.de/tv/id_%s/tid_json_video' % video_id, video_id)
+        title = video_data['subtitle']
+
+        formats = []
+        for asset in video_data.get('assets', []):
+            asset_source = asset.get('source') or asset.get('source2')
+            if not asset_source:
+                continue
+            formats_id = []
+            for field_key in ('type', 'profile'):
+                field_value = asset.get(field_key)
+                if field_value:
+                    formats_id.append(field_value)
+            formats.append({
+                'format_id': '-'.join(formats_id),
+                'url': asset_source,
+            })
+
+        thumbnails = []
+        for image in video_data.get('images', []):
+            image_source = image.get('source')
+            if not image_source:
+                continue
+            thumbnails.append({
+                'url': image_source,
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/toongoggles.py b/youtube_dl/extractor/toongoggles.py
new file mode 100644
index 0000000..b5ba1c0
--- /dev/null
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class ToonGogglesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?'
+    _TESTS = [{
+        'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football',
+        'md5': '18289fc2b951eff6b953a9d8f01e6831',
+        'info_dict': {
+            'id': '217147',
+            'ext': 'mp4',
+            'title': 'Football',
+            'uploader_id': '1',
+            'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.',
+            'upload_date': '20160718',
+            'timestamp': 1468879330,
+        }
+    }, {
+        'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world',
+        'info_dict': {
+            'id': '227759',
+            'title': 'Om Nom Stories Around The World',
+        },
+        'playlist_mincount': 11,
+    }]
+
+    def _call_api(self, action, page_id, query):
+        query.update({
+            'for_ng': 1,
+            'for_web': 1,
+            'show_meta': 1,
+            'version': 7.0,
+        })
+        return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query)
+
+    def _parse_episode_data(self, episode_data):
+        title = episode_data['episode_name']
+
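+        # Metadata comes from the Toon Goggles API; playback is delegated
+        # to Kaltura (partner id 513551)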
+        return {
+            '_type': 'url_transparent',
+            'id': episode_data['episode_id'],
+            'title': title,
+            'url': 'kaltura:513551:' + episode_data['entry_id'],
+            'thumbnail': episode_data.get('thumbnail_url'),
+            'description': episode_data.get('description'),
+            'duration': parse_duration(episode_data.get('hms')),
+            'series': episode_data.get('show_name'),
+            'season_number': int_or_none(episode_data.get('season_num')),
+            'episode_id': episode_data.get('episode_id'),
+            'episode': title,
+            'episode_number': int_or_none(episode_data.get('episode_num')),
+            'categories': episode_data.get('categories'),
+            'ie_key': 'Kaltura',
+        }
+
+    def _real_extract(self, url):
+        show_id, episode_id = re.match(self._VALID_URL, url).groups()
+        if episode_id:
+            episode_data = self._call_api('search', episode_id, {
+                'filter': 'episode',
+                'id': episode_id,
+            })['objects'][0]
+            return self._parse_episode_data(episode_data)
+        else:
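+            # 'max' is set absurdly high so a single request returns the
+            # show's full episode list.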
+            show_data = self._call_api('getepisodesbyshow', show_id, {
+                'max': 1000000000,
+                'showid': show_id,
+            })
+            entries = []
+            for episode_data in show_data.get('objects', []):
+                entries.append(self._parse_episode_data(episode_data))
+            return self.playlist_result(entries, show_id, show_data.get('show_name'))
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py
new file mode 100644
index 0000000..44b022f
--- /dev/null
@@ -0,0 +1,93 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .radiocanada import RadioCanadaIE
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    merge_dicts,
+)
+
+
+class TouTvIE(RadioCanadaIE):
+    _NETRC_MACHINE = 'toutv'
+    IE_NAME = 'tou.tv'
+    _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)'
+
+    _TESTS = [{
+        'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',
+        'info_dict': {
+            'id': '122017',
+            'ext': 'mp4',
+            'title': 'Saison 2015 Épisode 17',
+            'description': 'La photo de famille 2',
+            'upload_date': '20100717',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': '404 Not Found',
+    }, {
+        'url': 'http://ici.tou.tv/hackers',
+        'only_matching': True,
+    }, {
+        'url': 'https://ici.tou.tv/l-age-adulte/S01C501',
+        'only_matching': True,
+    }]
+    _CLIENT_KEY = '90505c8d-9c34-4f34-8da1-3a85bdc6d4f4'
+
+    def _real_initialize(self):
+        email, password = self._get_login_info()
+        if email is None:
+            return
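+        # Exchange the credentials for an access token; the client ID/secret
+        # pair is presumably the one embedded in the tou.tv web player.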
+        try:
+            self._access_token = self._download_json(
+                'https://services.radio-canada.ca/toutv/profiling/accounts/login',
+                None, 'Logging in', data=json.dumps({
+                    'ClientId': self._CLIENT_KEY,
+                    'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20',
+                    'Email': email,
+                    'Password': password,
+                    'Scope': 'id.write media-validation.read',
+                }).encode(), headers={
+                    'Authorization': 'client-key ' + self._CLIENT_KEY,
+                    'Content-Type': 'application/json;charset=utf-8',
+                })['access_token']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                error = self._parse_json(e.cause.read().decode(), None)['Message']
+                raise ExtractorError(error, expected=True)
+            raise
+        self._claims = self._call_api('validation/v2/getClaims')['claims']
+
+    def _real_extract(self, url):
+        path = self._match_id(url)
+        metadata = self._download_json(
+            'https://services.radio-canada.ca/toutv/presentation/%s' % path, path, query={
+                'client_key': self._CLIENT_KEY,
+                'device': 'web',
+                'version': 4,
+            })
+        # IsDrm does not necessarily mean the video is DRM protected (see
+        # https://github.com/ytdl-org/youtube-dl/issues/13994).
+        if metadata.get('IsDrm'):
+            self.report_warning('This video is probably DRM protected.', path)
+        video_id = metadata['IdMedia']
+        details = metadata['Details']
+
+        return merge_dicts({
+            'id': video_id,
+            'title': details.get('OriginalTitle'),
+            'description': details.get('Description'),
+            'thumbnail': details.get('ImageUrl'),
+            'duration': int_or_none(details.get('LengthInSeconds')),
+            'series': metadata.get('ProgramTitle'),
+            'season_number': int_or_none(metadata.get('SeasonNumber')),
+            'season': metadata.get('SeasonTitle'),
+            'episode_number': int_or_none(metadata.get('EpisodeNumber')),
+            'episode': metadata.get('EpisodeTitle'),
+        }, self._extract_info(metadata.get('AppCode', 'toutv'), video_id))
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
new file mode 100644
index 0000000..f705a06
--- /dev/null
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+import re
+
+
+class ToypicsIE(InfoExtractor):
+    IE_DESC = 'Toypics video'
+    _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
+        'md5': '16e806ad6d6f58079d210fe30985e08b',
+        'info_dict': {
+            'id': '514',
+            'ext': 'mp4',
+            'title': "Chance-Bulge'd, 2",
+            'age_limit': 18,
+            'uploader': 'kidsune',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        formats = self._parse_html5_media_entries(
+            url, webpage, video_id)[0]['formats']
+        title = self._html_search_regex([
+            r'<h1[^>]+class=["\']view-video-title[^>]+>([^<]+)</h',
+            r'<title>([^<]+) - Toypics</title>',
+        ], webpage, 'title')
+
+        uploader = self._html_search_regex(
+            r'More videos from <strong>([^<]+)</strong>', webpage, 'uploader',
+            fatal=False)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'uploader': uploader,
+            'age_limit': 18,
+        }
+
+
+class ToypicsUserIE(InfoExtractor):
+    IE_DESC = 'Toypics user profile'
+    _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://videos.toypics.net/Mikey',
+        'info_dict': {
+            'id': 'Mikey',
+        },
+        'playlist_mincount': 19,
+    }
+
+    def _real_extract(self, url):
+        username = self._match_id(url)
+
+        profile_page = self._download_webpage(
+            url, username, note='Retrieving profile page')
+
+        video_count = int(self._search_regex(
+            r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page,
+            'video count'))
+
+        PAGE_SIZE = 8
+        urls = []
+        # ceiling division: the last page may hold fewer than PAGE_SIZE videos
+        page_count = (video_count + PAGE_SIZE - 1) // PAGE_SIZE
+        for n in range(1, page_count + 1):
+            lpage_url = url + '/public/%d' % n
+            lpage = self._download_webpage(
+                lpage_url, username,
+                note='Downloading page %d/%d' % (n, page_count))
+            urls.extend(
+                re.findall(
+                    r'<div[^>]+class=["\']preview[^>]+>\s*<a[^>]+href="(https?://videos\.toypics\.net/view/[^"]+)"',
+                    lpage))
+
+        return {
+            '_type': 'playlist',
+            'id': username,
+            'entries': [{
+                '_type': 'url',
+                'url': eurl,
+                'ie_key': 'Toypics',
+            } for eurl in urls]
+        }
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py
new file mode 100644
index 0000000..747370d
--- /dev/null
@@ -0,0 +1,64 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TrailerAddictIE(InfoExtractor):
+    _WORKING = False
+    _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
+    _TEST = {
+        'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer',
+        'md5': '41365557f3c8c397d091da510e73ceb4',
+        'info_dict': {
+            'id': '76184',
+            'ext': 'mp4',
+            'title': 'Prince Avalanche Trailer',
+            'description': 'Trailer for Prince Avalanche.\n\nTwo highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind.',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('movie') + '/' + mobj.group('trailer_name')
+        webpage = self._download_webpage(url, name)
+
+        title = self._search_regex(r'<title>(.+?)</title>',
+                                   webpage, 'video title').replace(' - Trailer Addict', '')
+        view_count_str = self._search_regex(
+            r'<span class="views_n">([0-9,.]+)</span>',
+            webpage, 'view count', fatal=False)
+        view_count = (
+            None if view_count_str is None
+            else int(view_count_str.replace(',', '')))
+        video_id = self._search_regex(
+            r'<param\s+name="movie"\s+value="/emb/([0-9]+)"\s*/>',
+            webpage, 'video id')
+
+        # Presence of (no)watchplus function indicates HD quality is available
+        if re.search(r'function (?:no)?watchplus\(\)', webpage):
+            fvar = 'fvarhd'
+        else:
+            fvar = 'fvar'
+
+        info_url = 'http://www.traileraddict.com/%s.php?tid=%s' % (fvar, video_id)
+        info_webpage = self._download_webpage(info_url, video_id, 'Downloading the info webpage')
+
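+        # fvar(hd).php apparently returns urlencoded key/value pairs; fileurl
+        # has its query separator percent-encoded once, so restore it below.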
+        final_url = self._search_regex(r'&fileurl=(.+)',
+                                       info_webpage, 'Download url').replace('%3F', '?')
+        thumbnail_url = self._search_regex(r'&image=(.+?)&',
+                                           info_webpage, 'thumbnail url')
+
+        description = self._html_search_regex(
+            r'(?s)<div class="synopsis">.*?<div class="movie_label_info"[^>]*>(.*?)</div>',
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': video_id,
+            'url': final_url,
+            'title': title,
+            'thumbnail': thumbnail_url,
+            'description': description,
+            'view_count': view_count,
+        }
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py
new file mode 100644
index 0000000..a800449
--- /dev/null
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class TriluliluIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?trilulilu\.ro/(?:[^/]+/)?(?P<id>[^/#\?]+)'
+    _TESTS = [{
+        'url': 'http://www.trilulilu.ro/big-buck-bunny-1',
+        'md5': '68da087b676a6196a413549212f60cc6',
+        'info_dict': {
+            'id': 'ae2899e124140b',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'description': ':) pentru copilul din noi',
+            'uploader_id': 'chipy',
+            'upload_date': '20120304',
+            'timestamp': 1330830647,
+            'uploader': 'chipy',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }, {
+        'url': 'http://www.trilulilu.ro/adena-ft-morreti-inocenta',
+        'md5': '929dfb8729dc71750463af88bbbbf4a4',
+        'info_dict': {
+            'id': 'f299710e3c91c5',
+            'ext': 'mp4',
+            'title': 'Adena ft. Morreti - Inocenta',
+            'description': 'pop music',
+            'uploader_id': 'VEVOmixt',
+            'upload_date': '20151204',
+            'uploader': 'VEVOmixt',
+            'timestamp': 1449187937,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        media_info = self._download_json('http://m.trilulilu.ro/%s?format=json' % display_id, display_id)
+
+        age_limit = 0
+        errors = media_info.get('errors', {})
+        if errors.get('friends'):
+            raise ExtractorError('This video is private.', expected=True)
+        elif errors.get('geoblock'):
+            raise ExtractorError('This video is not available in your country.', expected=True)
+        elif errors.get('xxx_unlogged'):
+            age_limit = 18
+
+        media_class = media_info.get('class')
+        if media_class not in ('video', 'audio'):
+            raise ExtractorError('Media is neither video nor audio')
+
+        user = media_info.get('user', {})
+
+        thumbnail = media_info.get('cover_url')
+        if thumbnail:
+            # cover_url contains {width}/{height} placeholders; request a
+            # large cover (str.format returns a new string, so assign it back)
+            thumbnail = thumbnail.format(width='1600', height='1200')
+
+        # TODO: get correct ext for audio files
+        stream_type = media_info.get('stream_type')
+        formats = [{
+            'url': media_info['href'],
+            'ext': stream_type,
+        }]
+        if media_info.get('is_hd'):
+            formats.append({
+                'format_id': 'hd',
+                'url': media_info['hrefhd'],
+                'ext': stream_type,
+            })
+        if media_class == 'audio':
+            formats[0]['vcodec'] = 'none'
+        else:
+            formats[0]['format_id'] = 'sd'
+
+        return {
+            'id': media_info['identifier'].split('|')[1],
+            'display_id': display_id,
+            'formats': formats,
+            'title': media_info['title'],
+            'description': media_info.get('description'),
+            'thumbnail': thumbnail,
+            'uploader_id': user.get('username'),
+            'uploader': user.get('fullname'),
+            'timestamp': parse_iso8601(media_info.get('published'), ' '),
+            'duration': int_or_none(media_info.get('duration')),
+            'view_count': int_or_none(media_info.get('count_views')),
+            'like_count': int_or_none(media_info.get('count_likes')),
+            'comment_count': int_or_none(media_info.get('count_comments')),
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/trunews.py b/youtube_dl/extractor/trunews.py
new file mode 100644
index 0000000..cca5b5c
--- /dev/null
@@ -0,0 +1,34 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TruNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?trunews\.com/stream/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.trunews.com/stream/will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech',
+        'info_dict': {
+            'id': '5c5a21e65d3c196e1c0020cc',
+            'display_id': 'will-democrats-stage-a-circus-during-president-trump-s-state-of-the-union-speech',
+            'ext': 'mp4',
+            'title': "Will Democrats Stage a Circus During President Trump's State of the Union Speech?",
+            'description': 'md5:c583b72147cc92cf21f56a31aff7a670',
+            'duration': 3685,
+            'timestamp': 1549411440,
+            'upload_date': '20190206',
+        },
+        'add_ie': ['Zype'],
+    }
+    _ZYPE_TEMPL = 'https://player.zype.com/embed/%s.js?api_key=X5XnahkjCwJrT_l5zUqypnaLEObotyvtUKJWWlONxDoHVjP8vqxlArLV8llxMbyt'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
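+        # Resolve the stream slug to a Zype video id via the catalog API, then
+        # hand off to the Zype extractor through the player embed URL.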
+        zype_id = self._download_json(
+            'https://api.zype.com/videos', display_id, query={
+                'app_key': 'PUVKp9WgGUb3-JUw6EqafLx8tFVP6VKZTWbUOR-HOm__g4fNDt1bCsm_LgYf_k9H',
+                'per_page': 1,
+                'active': 'true',
+                'friendly_title': display_id,
+            })['response'][0]['_id']
+        return self.url_result(self._ZYPE_TEMPL % zype_id, 'Zype', zype_id)
diff --git a/youtube_dl/extractor/trutv.py b/youtube_dl/extractor/trutv.py
new file mode 100644
index 0000000..ce892c8
--- /dev/null
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .turner import TurnerBaseIE
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class TruTVIE(TurnerBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?trutv\.com/(?:shows|full-episodes)/(?P<series_slug>[0-9A-Za-z-]+)/(?:videos/(?P<clip_slug>[0-9A-Za-z-]+)|(?P<id>\d+))'
+    _TEST = {
+        'url': 'https://www.trutv.com/shows/the-carbonaro-effect/videos/sunlight-activated-flower.html',
+        'info_dict': {
+            'id': 'f16c03beec1e84cd7d1a51f11d8fcc29124cc7f1',
+            'ext': 'mp4',
+            'title': 'Sunlight-Activated Flower',
+            'description': "A customer is stunned when he sees Michael's sunlight-activated flower.",
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        series_slug, clip_slug, video_id = re.match(self._VALID_URL, url).groups()
+
+        if video_id:
+            path = 'episode'
+            display_id = video_id
+        else:
+            path = 'series/clip'
+            display_id = clip_slug
+
+        data = self._download_json(
+            'https://api.trutv.com/v2/web/%s/%s/%s' % (path, series_slug, display_id),
+            display_id)
+        video_data = data['episode'] if video_id else data['info']
+        media_id = video_data['mediaId']
+        title = video_data['title'].strip()
+
+        info = self._extract_ngtv_info(
+            media_id, {}, {
+                'url': url,
+                'site_name': 'truTV',
+                'auth_required': video_data.get('isAuthRequired'),
+            })
+
+        thumbnails = []
+        for image in video_data.get('images', []):
+            image_url = image.get('srcUrl')
+            if not image_url:
+                continue
+            thumbnails.append({
+                'url': image_url,
+                'width': int_or_none(image.get('width')),
+                'height': int_or_none(image.get('height')),
+            })
+
+        info.update({
+            'id': media_id,
+            'display_id': display_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnails': thumbnails,
+            'timestamp': parse_iso8601(video_data.get('publicationDate')),
+            'series': video_data.get('showTitle'),
+            'season_number': int_or_none(video_data.get('seasonNum')),
+            'episode_number': int_or_none(video_data.get('episodeNum')),
+        })
+        return info
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
new file mode 100644
index 0000000..db93b01
--- /dev/null
@@ -0,0 +1,86 @@
+from __future__ import unicode_literals
+
+import re
+
+from ..utils import (
+    int_or_none,
+    str_to_int,
+)
+from .keezmovies import KeezMoviesIE
+
+
+class Tube8IE(KeezMoviesIE):
+    _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
+        'md5': '65e20c48e6abff62ed0c3965fff13a39',
+        'info_dict': {
+            'id': '229795',
+            'display_id': 'kasia-music-video',
+            'ext': 'mp4',
+            'description': 'hot teen Kasia grinding',
+            'uploader': 'unknown',
+            'title': 'Kasia music video',
+            'age_limit': 18,
+            'duration': 230,
+            'categories': ['Teen'],
+            'tags': ['dancing'],
+        },
+    }, {
+        'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        webpage, info = self._extract_info(url)
+
+        if not info['title']:
+            info['title'] = self._html_search_regex(
+                r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
+
+        description = self._html_search_regex(
+            r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False)
+        uploader = self._html_search_regex(
+            r'<span class="username">\s*(.+?)\s*<',
+            webpage, 'uploader', fatal=False)
+
+        like_count = int_or_none(self._search_regex(
+            r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
+        dislike_count = int_or_none(self._search_regex(
+            r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            r'Views:\s*</dt>\s*<dd>([\d,\.]+)',
+            webpage, 'view count', fatal=False))
+        comment_count = str_to_int(self._search_regex(
+            r'<span id="allCommentsCount">(\d+)</span>',
+            webpage, 'comment count', fatal=False))
+
+        category = self._search_regex(
+            r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)',
+            webpage, 'category', fatal=False)
+        categories = [category] if category else None
+
+        tags_str = self._search_regex(
+            r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)',
+            webpage, 'tags', fatal=False)
+        tags = re.findall(
+            r'<a[^>]+href=[^>]+>([^<]+)', tags_str) if tags_str else None
+
+        info.update({
+            'description': description,
+            'uploader': uploader,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
+            'categories': categories,
+            'tags': tags,
+        })
+
+        return info
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
new file mode 100644
index 0000000..a51fa65
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    sanitized_Request,
+    urlencode_postdata,
+)
+
+
+class TubiTvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tubitv\.com/(?:video|movies|tv-shows)/(?P<id>[0-9]+)'
+    _LOGIN_URL = 'http://tubitv.com/login'
+    _NETRC_MACHINE = 'tubitv'
+    _GEO_COUNTRIES = ['US']
+    _TESTS = [{
+        'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday',
+        'md5': '43ac06be9326f41912dc64ccf7a80320',
+        'info_dict': {
+            'id': '283829',
+            'ext': 'mp4',
+            'title': 'The Comedian at The Friday',
+            'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.',
+            'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434',
+        },
+    }, {
+        'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories',
+        'only_matching': True,
+    }, {
+        'url': 'http://tubitv.com/movies/383676/tracker',
+        'only_matching': True,
+    }]
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        form_data = {
+            'username': username,
+            'password': password,
+        }
+        payload = urlencode_postdata(form_data)
+        request = sanitized_Request(self._LOGIN_URL, payload)
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_page = self._download_webpage(
+            request, None, False, 'Wrong login info')
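+        # The only reliable success signal is the logout button in the response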
+        if not re.search(r'id="tubi-logout"', login_page):
+            raise ExtractorError(
+                'Login failed (invalid username/password)', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'http://tubitv.com/oz/videos/%s/content' % video_id, video_id)
+        title = video_data['title']
+
+        formats = self._extract_m3u8_formats(
+            self._proto_relative_url(video_data['url']),
+            video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for thumbnail_url in video_data.get('thumbnails', []):
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': self._proto_relative_url(thumbnail_url),
+            })
+
+        subtitles = {}
+        for sub in video_data.get('subtitles', []):
+            sub_url = sub.get('url')
+            if not sub_url:
+                continue
+            subtitles.setdefault(sub.get('lang', 'English'), []).append({
+                'url': self._proto_relative_url(sub_url),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnails': thumbnails,
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'uploader_id': video_data.get('publisher_id'),
+        }
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
new file mode 100644
index 0000000..7421378
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TudouPlaylistIE(InfoExtractor):
+    IE_NAME = 'tudou:playlist'
+    _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
+    _TESTS = [{
+        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
+        'info_dict': {
+            'id': 'zzdE77v6Mmo',
+        },
+        'playlist_mincount': 209,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        playlist_data = self._download_json(
+            'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id)
+        entries = [self.url_result(
+            'http://www.tudou.com/programs/view/%s' % item['icode'],
+            'Tudou', item['icode'],
+            item['kw']) for item in playlist_data['items']]
+        return self.playlist_result(entries, playlist_id)
+
+
+class TudouAlbumIE(InfoExtractor):
+    IE_NAME = 'tudou:album'
+    _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})'
+    _TESTS = [{
+        'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
+        'info_dict': {
+            'id': 'v5qckFJvNJg',
+        },
+        'playlist_mincount': 45,
+    }]
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+        album_data = self._download_json(
+            'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id)
+        entries = [self.url_result(
+            'http://www.tudou.com/programs/view/%s' % item['icode'],
+            'Tudou', item['icode'],
+            item['kw']) for item in album_data['items']]
+        return self.playlist_result(entries, album_id)
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
new file mode 100644
index 0000000..ae584ad
--- /dev/null
@@ -0,0 +1,213 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    urlencode_postdata
+)
+
+
+class TumblrIE(InfoExtractor):
+    _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
+    _NETRC_MACHINE = 'tumblr'
+    _LOGIN_URL = 'https://www.tumblr.com/login'
+    _TESTS = [{
+        'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
+        'md5': '479bb068e5b16462f5176a6828829767',
+        'info_dict': {
+            'id': '54196191430',
+            'ext': 'mp4',
+            'title': 'tatiana maslany news, Orphan Black || DVD extra - behind the scenes ↳...',
+            'description': 'md5:37db8211e40b50c7c44e95da14f630b7',
+            'thumbnail': r're:http://.*\.jpg',
+        }
+    }, {
+        'url': 'http://5sostrum.tumblr.com/post/90208453769/yall-forgetting-the-greatest-keek-of-them-all',
+        'md5': 'bf348ef8c0ef84fbf1cbd6fa6e000359',
+        'info_dict': {
+            'id': '90208453769',
+            'ext': 'mp4',
+            'title': '5SOS STRUM ;]',
+            'description': 'md5:dba62ac8639482759c8eb10ce474586a',
+            'thumbnail': r're:http://.*\.jpg',
+        }
+    }, {
+        'url': 'http://hdvideotest.tumblr.com/post/130323439814/test-description-for-my-hd-video',
+        'md5': '7ae503065ad150122dc3089f8cf1546c',
+        'info_dict': {
+            'id': '130323439814',
+            'ext': 'mp4',
+            'title': 'HD Video Testing \u2014 Test description for my HD video',
+            'description': 'md5:97cc3ab5fcd27ee4af6356701541319c',
+            'thumbnail': r're:http://.*\.jpg',
+        },
+        'params': {
+            'format': 'hd',
+        },
+    }, {
+        'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
+        'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
+        'info_dict': {
+            'id': 'Wmur',
+            'ext': 'mp4',
+            'title': 'naked smoking & stretching',
+            'upload_date': '20150506',
+            'timestamp': 1430931613,
+            'age_limit': 18,
+            'uploader_id': '1638622',
+            'uploader': 'naked-yogi',
+        },
+        'add_ie': ['Vidme'],
+    }, {
+        'url': 'http://camdamage.tumblr.com/post/98846056295/',
+        'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6',
+        'info_dict': {
+            'id': '105463834',
+            'ext': 'mp4',
+            'title': 'Cam Damage-HD 720p',
+            'uploader': 'John Moyer',
+            'uploader_id': 'user32021558',
+        },
+        'add_ie': ['Vimeo'],
+    }, {
+        'url': 'http://sutiblr.tumblr.com/post/139638707273',
+        'md5': '2dd184b3669e049ba40563a7d423f95c',
+        'info_dict': {
+            'id': 'ir7qBEIKqvq',
+            'ext': 'mp4',
+            'title': 'Vine by sutiblr',
+            'alt_title': 'Vine by sutiblr',
+            'uploader': 'sutiblr',
+            'uploader_id': '1198993975374495744',
+            'upload_date': '20160220',
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+        },
+        'add_ie': ['Vine'],
+    }, {
+        'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or',
+        'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72',
+        'info_dict': {
+            'id': '-7LnUPGlSo',
+            'ext': 'mp4',
+            'title': 'Video by victoriassecret',
+            'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat',
+            'uploader_id': 'victoriassecret',
+            'thumbnail': r're:^https?://.*\.jpg'
+        },
+        'add_ie': ['Instagram'],
+    }]
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+        login_form.update({
+            'user[email]': username,
+            'user[password]': password
+        })
+
+        response, urlh = self._download_webpage_handle(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata(login_form), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': self._LOGIN_URL,
+            })
+
+        # Successful login
+        if '/dashboard' in urlh.geturl():
+            return
+
+        login_errors = self._parse_json(
+            self._search_regex(
+                r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response,
+                'login errors', default='[]'),
+            None, fatal=False)
+        if login_errors:
+            raise ExtractorError(
+                'Unable to login: %s' % login_errors[0], expected=True)
+
+        self.report_warning('Login has probably failed')
+
+    def _real_extract(self, url):
+        m_url = re.match(self._VALID_URL, url)
+        video_id = m_url.group('id')
+        blog = m_url.group('blog_name')
+
+        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
+        webpage, urlh = self._download_webpage_handle(url, video_id)
+
+        redirect_url = urlh.geturl()
+        if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'):
+            raise ExtractorError(
+                'This Tumblr may contain sensitive media. '
+                'Disable safe mode in your account settings '
+                'at https://www.tumblr.com/settings/account#safe_mode',
+                expected=True)
+
+        iframe_url = self._search_regex(
+            r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'',
+            webpage, 'iframe url', default=None)
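+        # No Tumblr-native video iframe: let the generic extractor handle
+        # whatever third-party player the post embeds instead.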
+        if iframe_url is None:
+            return self.url_result(redirect_url, 'Generic')
+
+        iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page')
+
+        duration = None
+        sources = []
+
+        sd_url = self._search_regex(
+            r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
+            'sd video url', default=None, group='url')
+        if sd_url:
+            sources.append((sd_url, 'sd'))
+
+        options = self._parse_json(
+            self._search_regex(
+                r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
+                'hd video url', default='', group='options'),
+            video_id, fatal=False)
+        if options:
+            duration = int_or_none(options.get('duration'))
+            hd_url = options.get('hdUrl')
+            if hd_url:
+                sources.append((hd_url, 'hd'))
+
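+        # enumerate() ranks sources in append order, so the hd variant added
+        # last receives the higher quality value.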
+        formats = [{
+            'url': video_url,
+            'ext': 'mp4',
+            'format_id': format_id,
+            'height': int_or_none(self._search_regex(
+                r'/(\d{3,4})$', video_url, 'height', default=None)),
+            'quality': quality,
+        } for quality, (video_url, format_id) in enumerate(sources)]
+
+        self._sort_formats(formats)
+
+        # The only place where you can get a title, it's not complete,
+        # but searching in other places doesn't work for all videos
+        video_title = self._html_search_regex(
+            r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>',
+            webpage, 'title')
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py
new file mode 100644
index 0000000..c7a5f5a
--- /dev/null
@@ -0,0 +1,183 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from ..compat import compat_urlparse
+
+
+class TuneInBaseIE(InfoExtractor):
+    _API_BASE_URL = 'http://tunein.com/tuner/tune/'
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        content_id = self._match_id(url)
+
+        content_info = self._download_json(
+            self._API_BASE_URL + self._API_URL_QUERY % content_id,
+            content_id, note='Downloading JSON metadata')
+
+        title = content_info['Title']
+        thumbnail = content_info.get('Logo')
+        location = content_info.get('Location')
+        streams_url = content_info.get('StreamUrl')
+        if not streams_url:
+            raise ExtractorError('No downloadable streams found', expected=True)
+        if not streams_url.startswith('http://'):
+            streams_url = compat_urlparse.urljoin(url, streams_url)
+
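+        # The stream listing is served as JSONP; strip the wrapping "(...);"
+        # so it can be parsed as plain JSON.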
+        streams = self._download_json(
+            streams_url, content_id, note='Downloading stream data',
+            transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams']
+
+        is_live = None
+        formats = []
+        for stream in streams:
+            if stream.get('Type') == 'Live':
+                is_live = True
+            reliability = stream.get('Reliability')
+            format_note = (
+                'Reliability: %d%%' % reliability
+                if reliability is not None else None)
+            formats.append({
+                'preference': (
+                    0 if reliability is None or reliability > 90
+                    else 1),
+                'abr': stream.get('Bandwidth'),
+                'ext': (stream.get('MediaType') or '').lower() or None,
+                'acodec': stream.get('MediaType'),
+                'vcodec': 'none',
+                'url': stream.get('Url'),
+                'source_preference': reliability,
+                'format_note': format_note,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': content_id,
+            'title': self._live_title(title) if is_live else title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'location': location,
+            'is_live': is_live,
+        }
+
+
+class TuneInClipIE(TuneInBaseIE):
+    IE_NAME = 'tunein:clip'
+    _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P<id>\d+)'
+    _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s'
+
+    _TESTS = [{
+        'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816',
+        'md5': '99f00d772db70efc804385c6b47f4e77',
+        'info_dict': {
+            'id': '816',
+            'title': '32m',
+            'ext': 'mp3',
+        },
+    }]
+
+
+class TuneInStationIE(TuneInBaseIE):
+    IE_NAME = 'tunein:station'
+    _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P<id>\d+)'
+    _API_URL_QUERY = '?tuneType=Station&stationId=%s'
+
+    @classmethod
+    def suitable(cls, url):
+        return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url)
+
+    _TESTS = [{
+        'url': 'http://tunein.com/radio/Jazz24-885-s34682/',
+        'info_dict': {
+            'id': '34682',
+            'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
+            'ext': 'mp3',
+            'location': 'Tacoma, WA',
+        },
+        'params': {
+            'skip_download': True,  # live stream
+        },
+    }, {
+        'url': 'http://tunein.com/embed/player/s6404/',
+        'only_matching': True,
+    }]
+
+
+class TuneInProgramIE(TuneInBaseIE):
+    IE_NAME = 'tunein:program'
+    _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId=|embed/player/p)(?P<id>\d+)'
+    _API_URL_QUERY = '?tuneType=Program&programId=%s'
+
+    _TESTS = [{
+        'url': 'http://tunein.com/radio/Jazz-24-p2506/',
+        'info_dict': {
+            'id': '2506',
+            'title': 'Jazz 24 on 91.3 WUKY-HD3',
+            'ext': 'mp3',
+            'location': 'Lexington, KY',
+        },
+        'params': {
+            'skip_download': True,  # live stream
+        },
+    }, {
+        'url': 'http://tunein.com/embed/player/p191660/',
+        'only_matching': True,
+    }]
+
+
+class TuneInTopicIE(TuneInBaseIE):
+    IE_NAME = 'tunein:topic'
+    _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:topic/.*?TopicId=|embed/player/t)(?P<id>\d+)'
+    _API_URL_QUERY = '?tuneType=Topic&topicId=%s'
+
+    _TESTS = [{
+        'url': 'http://tunein.com/topic/?TopicId=101830576',
+        'md5': 'c31a39e6f988d188252eae7af0ef09c9',
+        'info_dict': {
+            'id': '101830576',
+            'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)',
+            'ext': 'mp3',
+            'location': 'Belgium',
+        },
+    }, {
+        'url': 'http://tunein.com/embed/player/t101830576/',
+        'only_matching': True,
+    }]
+
+
+class TuneInShortenerIE(InfoExtractor):
+    IE_NAME = 'tunein:shortener'
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'https?://tun\.in/(?P<id>[A-Za-z0-9]+)'
+
+    _TEST = {
+        # test redirection
+        'url': 'http://tun.in/ser7s',
+        'info_dict': {
+            'id': '34682',
+            'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2',
+            'ext': 'mp3',
+            'location': 'Tacoma, WA',
+        },
+        'params': {
+            'skip_download': True,  # live stream
+        },
+    }
+
+    def _real_extract(self, url):
+        redirect_id = self._match_id(url)
+        # The server doesn't support HEAD requests
+        urlh = self._request_webpage(
+            url, redirect_id, note='Downloading redirect page')
+        url = urlh.geturl()
+        self.to_screen('Following redirect: %s' % url)
+        return self.url_result(url)
diff --git a/youtube_dl/extractor/tunepk.py b/youtube_dl/extractor/tunepk.py
new file mode 100644
index 0000000..9d42651
--- /dev/null
@@ -0,0 +1,90 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class TunePkIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?tune\.pk/(?:video/|player/embed_player\.php\?.*?\bvid=)|
+                            embed\.tune\.pk/play/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins',
+        'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55',
+        'info_dict': {
+            'id': '6919541',
+            'ext': 'mp4',
+            'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins',
+            'description': 'md5:eb5a04114fafef5cec90799a93a2d09c',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1487327564,
+            'upload_date': '20170217',
+            'uploader': 'Movie Trailers',
+            'duration': 107,
+            'view_count': int,
+        }
+    }, {
+        'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no',
+        'only_matching': True,
+    }, {
+        'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://tune.pk/video/%s' % video_id, video_id)
+
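+        # The page embeds the player config as a "new TunePlayer({...})" call;
+        # extract the JSON argument and read the details from it.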
+        details = self._parse_json(
+            self._search_regex(
+                r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'),
+            video_id)['details']
+
+        video = details['video']
+        title = video.get('title') or self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, 'title', fatal=True)
+
+        formats = self._parse_jwplayer_formats(
+            details['player']['sources'], video_id)
+        self._sort_formats(formats)
+
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'description', webpage, 'description')
+
+        thumbnail = video.get('thumb') or self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+
+        timestamp = unified_timestamp(video.get('date_added'))
+        uploader = try_get(
+            video, lambda x: x['uploader']['name'],
+            compat_str) or self._html_search_meta('author', webpage, 'author')
+
+        duration = int_or_none(video.get('duration'))
+        view_count = int_or_none(video.get('views'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py
new file mode 100644
index 0000000..be3eaa5
--- /dev/null
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+    xpath_text,
+)
+
+
+class TurboIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?turbo\.fr/videos-voiture/(?P<id>[0-9]+)-'
+    _API_URL = 'http://www.turbo.fr/api/tv/xml.php?player_generique=player_generique&id={0:}'
+    _TEST = {
+        'url': 'http://www.turbo.fr/videos-voiture/454443-turbo-du-07-09-2014-renault-twingo-3-bentley-continental-gt-speed-ces-guide-achat-dacia.html',
+        'md5': '33f4b91099b36b5d5a91f84b5bcba600',
+        'info_dict': {
+            'id': '454443',
+            'ext': 'mp4',
+            'duration': 3715,
+            'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ',
+            'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        playlist = self._download_xml(self._API_URL.format(video_id), video_id)
+        item = playlist.find('./channel/item')
+        if item is None:
+            raise ExtractorError('Playlist item was not found', expected=True)
+
+        title = xpath_text(item, './title', 'title')
+        duration = int_or_none(xpath_text(item, './durate', 'duration'))
+        thumbnail = xpath_text(item, './visuel_clip', 'thumbnail')
+        description = self._html_search_meta('description', webpage)
+
+        formats = []
+        get_quality = qualities(['3g', 'sd', 'hq'])
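+        # the XML item carries url_video_<quality> children; rank them 3g < sd < hq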
+        for child in item:
+            m = re.search(r'url_video_(?P<quality>.+)', child.tag)
+            if m:
+                quality = compat_str(m.group('quality'))
+                formats.append({
+                    'format_id': quality,
+                    'url': child.text,
+                    'quality': get_quality(quality),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'description': description,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py
new file mode 100644
index 0000000..4a6cbfb
--- /dev/null
@@ -0,0 +1,234 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .adobepass import AdobePassIE
+from ..compat import compat_str
+from ..utils import (
+    xpath_text,
+    int_or_none,
+    determine_ext,
+    float_or_none,
+    parse_duration,
+    xpath_attr,
+    update_url_query,
+    ExtractorError,
+    strip_or_none,
+    url_or_none,
+)
+
+
+class TurnerBaseIE(AdobePassIE):
+    _AKAMAI_SPE_TOKEN_CACHE = {}
+
+    def _extract_timestamp(self, video_data):
+        return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts'))
+
+    def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None):
+        secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*'
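+        # Tokens are granted per secure path, so cache them and reuse them
+        # across sibling format URLs under the same path.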
+        token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path)
+        if not token:
+            query = {
+                'path': secure_path,
+            }
+            if custom_tokenizer_query:
+                query.update(custom_tokenizer_query)
+            else:
+                query['videoId'] = content_id
+            if ap_data.get('auth_required'):
+                query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name'])
+            auth = self._download_xml(
+                tokenizer_src, content_id, query=query)
+            error_msg = xpath_text(auth, 'error/msg')
+            if error_msg:
+                raise ExtractorError(error_msg, expected=True)
+            token = xpath_text(auth, 'token')
+            if not token:
+                return video_url
+            self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token
+        return video_url + '?hdnea=' + token
+
+    def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}):
+        video_data = self._download_xml(data_src, video_id)
+        video_id = video_data.attrib['id']
+        title = xpath_text(video_data, 'headline', fatal=True)
+        content_id = xpath_text(video_data, 'contentId') or video_id
+        # rtmp_src = xpath_text(video_data, 'akamai/src')
+        # if rtmp_src:
+        #     splited_rtmp_src = rtmp_src.split(',')
+        #     if len(splited_rtmp_src) == 2:
+        #         rtmp_src = splited_rtmp_src[1]
+        # aifp = xpath_text(video_data, 'akamai/aifp', default='')
+
+        urls = []
+        formats = []
+        rex = re.compile(
+            r'(?P<width>[0-9]+)x(?P<height>[0-9]+)(?:_(?P<bitrate>[0-9]+))?')
+        # Possible formats locations: files/file, files/groupFiles/files
+        # and maybe others
+        for video_file in video_data.findall('.//file'):
+            video_url = video_file.text.strip()
+            if not video_url:
+                continue
+            ext = determine_ext(video_url)
+            if video_url.startswith('/mp4:protected/'):
+                continue
+                # TODO Correct extraction for these files
+                # protected_path_data = path_data.get('protected')
+                # if not protected_path_data or not rtmp_src:
+                #     continue
+                # protected_path = self._search_regex(
+                #     r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path')
+                # auth = self._download_webpage(
+                #     protected_path_data['tokenizer_src'], query={
+                #         'path': protected_path,
+                #         'videoId': content_id,
+                #         'aifp': aifp,
+                #     })
+                # token = xpath_text(auth, 'token')
+                # if not token:
+                #     continue
+                # video_url = rtmp_src + video_url + '?' + token
+            elif video_url.startswith('/secure/'):
+                secure_path_data = path_data.get('secure')
+                if not secure_path_data:
+                    continue
+                video_url = self._add_akamai_spe_token(
+                    secure_path_data['tokenizer_src'],
+                    secure_path_data['media_src'] + video_url,
+                    content_id, ap_data)
+            elif not re.match('https?://', video_url):
+                base_path_data = path_data.get(ext, path_data.get('default', {}))
+                media_src = base_path_data.get('media_src')
+                if not media_src:
+                    continue
+                video_url = media_src + video_url
+            if video_url in urls:
+                continue
+            urls.append(video_url)
+            format_id = video_file.get('bitrate')
+            if ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    video_url, video_id, fatal=False))
+            elif ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4',
+                    m3u8_id=format_id or 'hls', fatal=False)
+                if '/secure/' in video_url and '?hdnea=' in video_url:
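+                    # tokenized secure streams reportedly break when the
+                    # downloader tries to seek, so flag them unseekable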
+                    for f in m3u8_formats:
+                        f['_seekable'] = False
+                formats.extend(m3u8_formats)
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    update_url_query(video_url, {'hdcore': '3.7.0'}),
+                    video_id, f4m_id=format_id or 'hds', fatal=False))
+            else:
+                f = {
+                    'format_id': format_id,
+                    'url': video_url,
+                    'ext': ext,
+                }
+                # the bitrate attribute may be missing, so guard against None
+                mobj = rex.search((format_id or '') + video_url)
+                if mobj:
+                    f.update({
+                        'width': int(mobj.group('width')),
+                        'height': int(mobj.group('height')),
+                        'tbr': int_or_none(mobj.group('bitrate')),
+                    })
+                elif isinstance(format_id, compat_str):
+                    if format_id.isdigit():
+                        f['tbr'] = int(format_id)
+                    else:
+                        mobj = re.match(r'ios_(audio|[0-9]+)$', format_id)
+                        if mobj:
+                            if mobj.group(1) == 'audio':
+                                f.update({
+                                    'vcodec': 'none',
+                                    'ext': 'm4a',
+                                })
+                            else:
+                                f['tbr'] = int(mobj.group(1))
+                formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for source in video_data.findall('closedCaptions/source'):
+            for track in source.findall('track'):
+                track_url = url_or_none(track.get('url'))
+                if not track_url or track_url.endswith('/big'):
+                    continue
+                lang = track.get('lang') or track.get('label') or 'en'
+                subtitles.setdefault(lang, []).append({
+                    'url': track_url,
+                    'ext': {
+                        'scc': 'scc',
+                        'webvtt': 'vtt',
+                        'smptett': 'tt',
+                    }.get(source.get('format'))
+                })
+
+        thumbnails = [{
+            'id': image.get('cut'),
+            'url': image.text,
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in video_data.findall('images/image')]
+
+        is_live = xpath_text(video_data, 'isLive') == 'true'
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnails': thumbnails,
+            'thumbnail': xpath_text(video_data, 'poster'),
+            'description': strip_or_none(xpath_text(video_data, 'description')),
+            'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')),
+            'timestamp': self._extract_timestamp(video_data),
+            'upload_date': xpath_attr(video_data, 'metas', 'version'),
+            'series': xpath_text(video_data, 'showTitle'),
+            'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
+            'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
+            'is_live': is_live,
+        }
+
+    def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None):
+        streams_data = self._download_json(
+            'http://medium.ngtv.io/media/%s/tv' % media_id,
+            media_id)['media']['tv']
+        duration = None
+        chapters = []
+        formats = []
+        for supported_type in ('unprotected', 'bulkaes'):
+            stream_data = streams_data.get(supported_type, {})
+            m3u8_url = stream_data.get('secureUrl') or stream_data.get('url')
+            if not m3u8_url:
+                continue
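+            # Playlists protected with Akamai SPE need a tokenized URL
+            # before the HLS manifest can be fetched.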
+            if stream_data.get('playlistProtection') == 'spe':
+                m3u8_url = self._add_akamai_spe_token(
+                    'http://token.ngtv.io/token/token_spe',
+                    m3u8_url, media_id, ap_data or {}, tokenizer_query)
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+            duration = float_or_none(stream_data.get('totalRuntime'))
+
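+            # The chapter list is built only once, from whichever stream
+            # variant provides content segments first.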
+            if not chapters:
+                for chapter in stream_data.get('contentSegments', []):
+                    start_time = float_or_none(chapter.get('start'))
+                    chapter_duration = float_or_none(chapter.get('duration'))
+                    if start_time is None or chapter_duration is None:
+                        continue
+                    chapters.append({
+                        'start_time': start_time,
+                        'end_time': start_time + chapter_duration,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'formats': formats,
+            'chapters': chapters,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
new file mode 100644 (file)
index 0000000..4a19b9b
--- /dev/null
@@ -0,0 +1,192 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    js_to_json,
+    parse_iso8601,
+    remove_end,
+    strip_or_none,
+    try_get,
+)
+
+
+class TV2IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tv2.no/v/916509/',
+        'info_dict': {
+            'id': '916509',
+            'ext': 'flv',
+            'title': 'Se Frode Gryttens hyllest av Steven Gerrard',
+            'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
+            'timestamp': 1431715610,
+            'upload_date': '20150515',
+            'duration': 156.967,
+            'view_count': int,
+            'categories': list,
+        },
+    }
+    _API_DOMAIN = 'sumo.tv2.no'
+    _PROTOCOLS = ('HDS', 'HLS', 'DASH')
+    _GEO_COUNTRIES = ['NO']
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id)
+
+        formats = []
+        format_urls = []
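+        # Query the playback API once per protocol; a 401 response carries a
+        # JSON error body that distinguishes geo-blocking from login walls.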
+        for protocol in self._PROTOCOLS:
+            try:
+                data = self._download_json(
+                    api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol,
+                    video_id, 'Downloading play JSON')['playback']
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    error = self._parse_json(e.cause.read().decode(), video_id)['error']
+                    error_code = error.get('code')
+                    if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
+                        self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+                    elif error_code == 'SESSION_NOT_AUTHENTICATED':
+                        self.raise_login_required()
+                    raise ExtractorError(error['description'])
+                raise
+            items = try_get(data, lambda x: x['items']['item'])
+            if not items:
+                continue
+            if not isinstance(items, list):
+                items = [items]
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                video_url = item.get('url')
+                if not video_url or video_url in format_urls:
+                    continue
+                format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat'))
+                if not self._is_valid_url(video_url, video_id, format_id):
+                    continue
+                format_urls.append(video_url)
+                ext = determine_ext(video_url)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        video_url, video_id, f4m_id=format_id, fatal=False))
+                elif ext == 'm3u8':
+                    if not data.get('drmProtected'):
+                        formats.extend(self._extract_m3u8_formats(
+                            video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                            m3u8_id=format_id, fatal=False))
+                elif ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        video_url, video_id, format_id, fatal=False))
+                elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
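+                    # Smooth Streaming manifests are deliberately ignored.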
+                    pass
+                else:
+                    formats.append({
+                        'url': video_url,
+                        'format_id': format_id,
+                        'tbr': int_or_none(item.get('bitrate')),
+                        'filesize': int_or_none(item.get('fileSize')),
+                    })
+        if not formats and data.get('drmProtected'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+        self._sort_formats(formats)
+
+        asset = self._download_json(
+            api_base + '.json', video_id,
+            'Downloading metadata JSON')['asset']
+        title = asset['title']
+
+        thumbnails = [{
+            'id': thumbnail.get('@type'),
+            'url': thumbnail.get('url'),
+        } for _, thumbnail in (asset.get('imageVersions') or {}).items()]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': strip_or_none(asset.get('description')),
+            'thumbnails': thumbnails,
+            'timestamp': parse_iso8601(asset.get('createTime')),
+            'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')),
+            'view_count': int_or_none(asset.get('views')),
+            'categories': asset.get('keywords', '').split(','),
+            'formats': formats,
+        }
+
+
+class TV2ArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(?:/[^/]+)+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
+        'info_dict': {
+            'id': '6930542',
+            'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret',
+            'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'http://www.tv2.no/a/6930542',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        # Old embed pattern (looks unused nowadays)
+        assets = re.findall(r'data-assetid=["\'](\d+)', webpage)
+
+        if not assets:
+            # New embed pattern
+            for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage):
+                video = self._parse_json(
+                    v, playlist_id, transform_source=js_to_json, fatal=False)
+                if not video:
+                    continue
+                asset = video.get('assetId')
+                if asset:
+                    assets.append(asset)
+
+        entries = [
+            self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2')
+            for asset_id in assets]
+
+        title = remove_end(self._og_search_title(webpage), ' - TV2.no')
+        description = remove_end(self._og_search_description(webpage), ' - TV2.no')
+
+        return self.playlist_result(entries, playlist_id, title, description)
+
+
+class KatsomoIE(TV2IE):
+    _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321',
+        'info_dict': {
+            'id': '1181321',
+            'ext': 'mp4',
+            'title': 'MTV Uutiset Live',
+            'description': 'Päätöksen teki Pelicansin hallitus.',
+            'timestamp': 1575116484,
+            'upload_date': '20191130',
+            'duration': 37.12,
+            'view_count': int,
+            'categories': list,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+    _API_DOMAIN = 'api.katsomo.fi'
+    _PROTOCOLS = ('HLS', 'MPD')
+    _GEO_COUNTRIES = ['FI']
diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py
new file mode 100644 (file)
index 0000000..8bda934
--- /dev/null
@@ -0,0 +1,154 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    js_to_json,
+    url_or_none,
+)
+
+
+class TV2DKIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?:
+                            tvsyd|
+                            tv2ostjylland|
+                            tvmidtvest|
+                            tv2fyn|
+                            tv2east|
+                            tv2lorry|
+                            tv2nord
+                        )\.dk/
+                        (?:[^/]+/)*
+                        (?P<id>[^/?\#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player',
+        'info_dict': {
+            'id': '0_52jmwa0p',
+            'ext': 'mp4',
+            'title': '19:30 - 28. okt. 2019',
+            'timestamp': 1572290248,
+            'upload_date': '20191028',
+            'uploader_id': 'tvsyd',
+            'duration': 1347,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Kaltura'],
+    }, {
+        'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tv2ostjylland.dk/nyheder/28-10-2019/22/2200-nyhederne-mandag-d-28-oktober-2019?autoplay=1#player',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvmidtvest.dk/nyheder/27-10-2019/1930/1930-27-okt-2019',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tv2fyn.dk/artikel/fyn-kan-faa-landets-foerste-fabrik-til-groent-jetbraendstof',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tv2east.dk/artikel/gods-faar-indleveret-tonsvis-af-aebler-100-kilo-aebler-gaar-til-en-aeblebrandy',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tv2lorry.dk/koebenhavn/rasmus-paludan-evakueret-til-egen-demonstration#player',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        entries = []
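+        # Videos are embedded as Kaltura players; each element carries the
+        # entry and partner ids needed to build a kaltura: URL.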
+        for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage):
+            video = extract_attributes(video_el)
+            kaltura_id = video.get('data-entryid')
+            if not kaltura_id:
+                continue
+            partner_id = video.get('data-partnerid')
+            if not partner_id:
+                continue
+            entries.append(self.url_result(
+                'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura',
+                video_id=kaltura_id))
+        return self.playlist_result(entries, video_id)
+
+
+class TV2DKBornholmPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://play\.tv2bornholm\.dk/\?.*?\bid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://play.tv2bornholm.dk/?area=specifikTV&id=781021',
+        'info_dict': {
+            'id': '781021',
+            'ext': 'mp4',
+            'title': '12Nyheder-27.11.19',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'https://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id,
+            data=json.dumps({
+                'playlist_id': video_id,
+                'serienavn': '',
+            }).encode(), headers={
+                'X-Requested-With': 'XMLHttpRequest',
+                'Content-Type': 'application/json; charset=UTF-8',
+            })['d']
+
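+        # The AJAX endpoint returns the Flowplayer setup script as a string;
+        # the title and sources are scraped out of it below.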
+        # TODO: generalize flowplayer
+        title = self._search_regex(
+            r'title\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', video, 'title',
+            group='value')
+        sources = self._parse_json(self._search_regex(
+            r'(?s)sources:\s*(\[.+?\]),', video, 'sources'),
+            video_id, js_to_json)
+
+        formats = []
+        srcs = set()
+        for source in sources:
+            src = url_or_none(source.get('src'))
+            if not src:
+                continue
+            if src in srcs:
+                continue
+            srcs.add(src)
+            ext = determine_ext(src)
+            src_type = source.get('type')
+            if src_type == 'application/x-mpegurl' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    src, video_id, ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif src_type == 'application/dash+xml' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    src, video_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'url': src,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tv2hu.py b/youtube_dl/extractor/tv2hu.py
new file mode 100644 (file)
index 0000000..86017b7
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class TV2HuIE(InfoExtractor):
+    IE_NAME = 'tv2.hu'
+    _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html'
+    _TESTS = [{
+        'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html',
+        'md5': '585e58e2e090f34603804bb2c48e98d8',
+        'info_dict': {
+            'id': '217679',
+            'ext': 'mp4',
+            'title': 'Ezek megőrültek! - 1. adás 1. rész',
+            'upload_date': '20160826',
+            'thumbnail': r're:^https?://.*\.jpg$'
+        }
+    }, {
+        'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html',
+        'only_matching': True
+    }, {
+        'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        json_url = self._search_regex(
+            r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url')
+        json_data = self._download_json(json_url, video_id)
+
+        formats = []
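+        # Collect formats from both the primary and the backup bitrate lists.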
+        for b in ('bitrates', 'backupBitrates'):
+            bitrates = json_data.get(b, {})
+            m3u8_url = bitrates.get('hls')
+            if m3u8_url:
+                formats.extend(self._extract_wowza_formats(
+                    m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp']))
+
+            for mp4_url in bitrates.get('mp4', []):
+                height = int_or_none(self._search_regex(
+                    r'\.(\d+)p\.mp4', mp4_url, 'height', default=None))
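+                # The API does not expose a width; derive it from the height
+                # assuming a 16:9 aspect ratio.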
+                formats.append({
+                    'format_id': 'http' + ('-%d' % height if height else ''),
+                    'url': mp4_url,
+                    'height': height,
+                    'width': int_or_none(height / 9.0 * 16.0 if height else None),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage).strip(),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'upload_date': self._search_regex(
+                r'/vod/(\d{8})/', json_url, 'upload_date', default=None),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
new file mode 100644 (file)
index 0000000..c498b01
--- /dev/null
@@ -0,0 +1,124 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class TV4IE(InfoExtractor):
+    IE_DESC = 'tv4.se and tv4play.se'
+    _VALID_URL = r'''(?x)https?://(?:www\.)?
+        (?:
+            tv4\.se/(?:[^/]+)/klipp/(?:.*)-|
+            tv4play\.se/
+            (?:
+                (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)|
+                iframe/video/|
+                film/|
+                sport/|
+            )
+        )(?P<id>[0-9]+)'''
+    _GEO_COUNTRIES = ['SE']
+    _TESTS = [
+        {
+            'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650',
+            'md5': 'cb837212f342d77cec06e6dad190e96d',
+            'info_dict': {
+                'id': '2491650',
+                'ext': 'mp4',
+                'title': 'Kalla Fakta 5 (english subtitles)',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'timestamp': int,
+                'upload_date': '20131125',
+            },
+        },
+        {
+            'url': 'http://www.tv4play.se/iframe/video/3054113',
+            'md5': 'cb837212f342d77cec06e6dad190e96d',
+            'info_dict': {
+                'id': '3054113',
+                'ext': 'mp4',
+                'title': 'Så här jobbar ficktjuvarna - se avslöjande bilder',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'description': 'Unika bilder avslöjar hur turisternas fickor vittjas mitt på Stockholms central. Två experter på ficktjuvarna avslöjar knepen du ska se upp för.',
+                'timestamp': int,
+                'upload_date': '20150130',
+            },
+        },
+        {
+            'url': 'http://www.tv4play.se/sport/3060959',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.tv4play.se/film/2378136',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.tv4play.se/program/farang/3922081',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'https://playback-api.b17g.net/asset/%s' % video_id,
+            video_id, 'Downloading video info JSON', query={
+                'service': 'tv4',
+                'device': 'browser',
+                'protocol': 'hls,dash',
+                'drm': 'widevine',
+            })['metadata']
+
+        title = info['title']
+
+        manifest_url = self._download_json(
+            'https://playback-api.b17g.net/media/' + video_id,
+            video_id, query={
+                'service': 'tv4',
+                'device': 'browser',
+                'protocol': 'hls',
+            })['playbackItem']['manifestUrl']
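+        # DASH, HDS and Smooth Streaming manifests live at the same path as
+        # the HLS one, so their URLs are derived by rewriting the extension.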
+        formats = self._extract_m3u8_formats(
+            manifest_url, video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False)
+        formats.extend(self._extract_mpd_formats(
+            manifest_url.replace('.m3u8', '.mpd'),
+            video_id, mpd_id='dash', fatal=False))
+        formats.extend(self._extract_f4m_formats(
+            manifest_url.replace('.m3u8', '.f4m'),
+            video_id, f4m_id='hds', fatal=False))
+        formats.extend(self._extract_ism_formats(
+            re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url),
+            video_id, ism_id='mss', fatal=False))
+
+        if not formats and info.get('is_geo_restricted'):
+            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            # 'subtitles': subtitles,
+            'description': info.get('description'),
+            'timestamp': parse_iso8601(info.get('broadcast_date_time')),
+            'duration': int_or_none(info.get('duration')),
+            'thumbnail': info.get('image'),
+            'is_live': info.get('isLive') is True,
+            'series': info.get('seriesTitle'),
+            'season_number': int_or_none(info.get('seasonNumber')),
+            'episode': info.get('episodeTitle'),
+            'episode_number': int_or_none(info.get('episodeNumber')),
+        }
diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py
new file mode 100644 (file)
index 0000000..b7fe082
--- /dev/null
@@ -0,0 +1,117 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    int_or_none,
+    parse_duration,
+)
+
+
+class TV5MondePlusIE(InfoExtractor):
+    IE_DESC = 'TV5MONDE+'
+    _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        # movie
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit',
+        'md5': '8cbde5ea7b296cf635073e27895e227f',
+        'info_dict': {
+            'id': '822a4756-0712-7329-1859-a13ac7fd1407',
+            'display_id': 'rendez-vous-a-atlit',
+            'ext': 'mp4',
+            'title': 'Rendez-vous à Atlit',
+            'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb',
+            'upload_date': '20200130',
+        },
+    }, {
+        # series episode
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree',
+        'info_dict': {
+            'id': '0df7007c-4900-3936-c601-87a13a93a068',
+            'display_id': 'c-est-la-vie-ennemie-juree',
+            'ext': 'mp4',
+            'title': "C'est la vie - Ennemie jurée",
+            'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e',
+            'upload_date': '20200130',
+            'series': "C'est la vie",
+            'episode': 'Ennemie jurée',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver',
+        'only_matching': True,
+    }, {
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
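+        # Geo-blocking is announced as a plain-text message in the page
+        # ("This programme is unfortunately not available for your
+        # geographic area.").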
+        if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
+            self.raise_geo_restricted(countries=['FR'])
+
+        title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
+        vpl_data = extract_attributes(self._search_regex(
+            r'(<[^>]+class="video_player_loader"[^>]+>)',
+            webpage, 'video player loader'))
+
+        video_files = self._parse_json(
+            vpl_data['data-broadcast'], display_id).get('files', [])
+        formats = []
+        for video_file in video_files:
+            v_url = video_file.get('url')
+            if not v_url:
+                continue
+            video_format = video_file.get('format') or determine_ext(v_url)
+            if video_format == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    v_url, display_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': v_url,
+                    'format_id': video_format,
+                })
+        self._sort_formats(formats)
+
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage,
+            'description', fatal=False)
+
+        series = self._html_search_regex(
+            r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage,
+            'series', default=None)
+
+        if series and series != title:
+            title = '%s - %s' % (series, title)
+
+        upload_date = self._search_regex(
+            r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})',
+            webpage, 'upload date', default=None)
+        if upload_date:
+            upload_date = upload_date.replace('_', '')
+
+        video_id = self._search_regex(
+            (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+             r'id_contenu["\']\s*:\s*(\d+)'), webpage, 'video id',
+            default=display_id)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': vpl_data.get('data-image'),
+            'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)),
+            'upload_date': upload_date,
+            'formats': formats,
+            'series': series,
+            'episode': episode,
+        }
diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py
new file mode 100644 (file)
index 0000000..443f46e
--- /dev/null
@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    smuggle_url,
+)
+
+
+class TVAIE(InfoExtractor):
+    _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://videos.tva.ca/details/_5596811470001',
+        'info_dict': {
+            'id': '5596811470001',
+            'ext': 'mp4',
+            'title': 'Un extrait de l\'épisode du dimanche 8 octobre 2017 !',
+            'uploader_id': '5481942443001',
+            'upload_date': '20171003',
+            'timestamp': 1507064617,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'https://video.tva.ca/details/_5596811470001',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={
+                'Accept': 'application/json',
+            }, query={
+                'appId': '5955fc5f23eec60006c951f1',
+            })
+
+        def get_attribute(key):
+            for attribute in video_data.get('attributes', []):
+                if attribute.get('key') == key:
+                    return attribute.get('value')
+            return None
+
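+        # Hand playback off to the Brightcove player while keeping the
+        # metadata extracted here (url_transparent).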
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': get_attribute('title'),
+            'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}),
+            'description': get_attribute('description'),
+            'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'),
+            'duration': float_or_none(get_attribute('video-duration'), 1000),
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/tvanouvelles.py b/youtube_dl/extractor/tvanouvelles.py
new file mode 100644 (file)
index 0000000..1086176
--- /dev/null
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveNewIE
+
+
+class TVANouvellesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvanouvelles\.ca/videos/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tvanouvelles.ca/videos/5117035533001',
+        'info_dict': {
+            'id': '5117035533001',
+            'ext': 'mp4',
+            'title': 'L’industrie du taxi dénonce l’entente entre Québec et Uber: explications',
+            'description': 'md5:479653b7c8cf115747bf5118066bd8b3',
+            'uploader_id': '1741764581',
+            'timestamp': 1473352030,
+            'upload_date': '20160908',
+        },
+        'add_ie': ['BrightcoveNew'],
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1741764581/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        brightcove_id = self._match_id(url)
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            BrightcoveNewIE.ie_key(), brightcove_id)
+
+
+class TVANouvellesArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvanouvelles\.ca/(?:[^/]+/)+(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.tvanouvelles.ca/2016/11/17/des-policiers-qui-ont-la-meche-un-peu-courte',
+        'info_dict': {
+            'id': 'des-policiers-qui-ont-la-meche-un-peu-courte',
+            'title': 'Des policiers qui ont «la mèche un peu courte»?',
+            'description': 'md5:92d363c8eb0f0f030de9a4a84a90a3a0',
+        },
+        'playlist_mincount': 4,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if TVANouvellesIE.suitable(url) else super(TVANouvellesArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        entries = [
+            self.url_result(
+                'http://www.tvanouvelles.ca/videos/%s' % mobj.group('id'),
+                ie=TVANouvellesIE.ie_key(), video_id=mobj.group('id'))
+            for mobj in re.finditer(
+                r'data-video-id=(["\'])?(?P<id>\d+)', webpage)]
+
+        title = self._og_search_title(webpage, fatal=False)
+        description = self._og_search_description(webpage)
+
+        return self.playlist_result(entries, display_id, title, description)
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
new file mode 100644 (file)
index 0000000..008f64c
--- /dev/null
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+)
+
+
+class TVCIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
+        'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://www.tvc.ru/video/json/id/%s' % video_id, video_id)
+
+        formats = []
+        for info in video.get('path', {}).get('quality', []):
+            video_url = info.get('url')
+            if not video_url:
+                continue
+            format_id = self._search_regex(
+                r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url,
+                'format id', default=None)
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'width': int_or_none(info.get('width')),
+                'height': int_or_none(info.get('height')),
+                'tbr': int_or_none(info.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'thumbnail': video.get('picture'),
+            'duration': int_or_none(video.get('duration')),
+            'formats': formats,
+        }
+
+
+class TVCArticleIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+    _TESTS = [{
+        'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
+        'info_dict': {
+            'id': '74622',
+            'ext': 'mp4',
+            'title': 'События. "События". Эфир от 22.05.2015 14:30',
+            'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1122,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/news/show/id/69944',
+        'info_dict': {
+            'id': '75399',
+            'ext': 'mp4',
+            'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках',
+            'description': 'md5:f2098f71e21f309e89f69b525fd9846e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 278,
+        },
+    }, {
+        'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#',
+        'info_dict': {
+            'id': '2185',
+            'ext': 'mp4',
+            'title': 'Ещё не поздно. Эфир от 03.08.2013',
+            'description': 'md5:51fae9f3f8cfe67abce014e428e5b027',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 3316,
+        },
+    }]
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, self._match_id(url))
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'TVC',
+            'url': self._og_search_video_url(webpage),
+            'title': clean_html(self._og_search_title(webpage)),
+            'description': clean_html(self._og_search_description(webpage)),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
new file mode 100644 (file)
index 0000000..180259a
--- /dev/null
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    parse_age_limit,
+    try_get,
+    url_or_none,
+)
+
+
+class TvigleIE(InfoExtractor):
+    IE_NAME = 'tvigle'
+    IE_DESC = 'Интернет-телевидение Tvigle.ru'
+    _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))'
+
+    _GEO_BYPASS = False
+    _GEO_COUNTRIES = ['RU']
+
+    _TESTS = [
+        {
+            'url': 'http://www.tvigle.ru/video/sokrat/',
+            'info_dict': {
+                'id': '1848932',
+                'display_id': 'sokrat',
+                'ext': 'mp4',
+                'title': 'Сократ',
+                'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17',
+                'duration': 6586,
+                'age_limit': 12,
+            },
+            'skip': 'georestricted',
+        },
+        {
+            'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/',
+            'info_dict': {
+                'id': '5142516',
+                'ext': 'flv',
+                'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
+                'description': 'md5:027f7dc872948f14c96d19b4178428a4',
+                'duration': 186.080,
+                'age_limit': 0,
+            },
+            'skip': 'georestricted',
+        }, {
+            'url': 'https://cloud.tvigle.ru/video/5267604/',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        if not video_id:
+            webpage = self._download_webpage(url, display_id)
+            video_id = self._html_search_regex(
+                (r'<div[^>]+class=["\']player["\'][^>]+id=["\'](\d+)',
+                 r'cloudId\s*=\s*["\'](\d+)',
+                 r'class="video-preview current_playing" id="(\d+)"'),
+                webpage, 'video id')
+
+        video_data = self._download_json(
+            'http://cloud.tvigle.ru/api/play/video/%s/' % video_id, display_id)
+
+        item = video_data['playlist']['items'][0]
+
+        videos = item.get('videos')
+
+        error_message = item.get('errorMessage')
+        if not videos and error_message:
+            if item.get('isGeoBlocked') is True:
+                self.raise_geo_restricted(
+                    msg=error_message, countries=self._GEO_COUNTRIES)
+            else:
+                raise ExtractorError(
+                    '%s returned error: %s' % (self.IE_NAME, error_message),
+                    expected=True)
+
+        title = item['title']
+        description = item.get('description')
+        thumbnail = item.get('thumbnail')
+        duration = float_or_none(item.get('durationMilliseconds'), 1000)
+        age_limit = parse_age_limit(item.get('ageRestrictions'))
+
+        formats = []
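+        # item['videos'] maps a codec/protocol key ('hls', 'dash', ...) to
+        # either a manifest URL or a dict of per-quality progressive URLs.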
+        for vcodec, url_or_fmts in item['videos'].items():
+            if vcodec == 'hls':
+                m3u8_url = url_or_none(url_or_fmts)
+                if not m3u8_url:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif vcodec == 'dash':
+                mpd_url = url_or_none(url_or_fmts)
+                if not mpd_url:
+                    continue
+                formats.extend(self._extract_mpd_formats(
+                    mpd_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                if not isinstance(url_or_fmts, dict):
+                    continue
+                for format_id, video_url in url_or_fmts.items():
+                    if format_id == 'm3u8':
+                        continue
+                    video_url = url_or_none(video_url)
+                    if not video_url:
+                        continue
+                    height = self._search_regex(
+                        r'^(\d+)[pP]$', format_id, 'height', default=None)
+                    filesize = int_or_none(try_get(
+                        item, lambda x: x['video_files_size'][vcodec][format_id]))
+                    formats.append({
+                        'url': video_url,
+                        'format_id': '%s-%s' % (vcodec, format_id),
+                        'vcodec': vcodec,
+                        'height': int_or_none(height),
+                        'filesize': filesize,
+                    })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py
new file mode 100644 (file)
index 0000000..7911441
--- /dev/null
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .spike import ParamountNetworkIE
+
+
+class TVLandIE(ParamountNetworkIE):
+    IE_NAME = 'tvland.com'
+    _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|(?:full-)?episodes)/(?P<id>[^/?#.]+)'
+    _FEED_URL = 'http://www.tvland.com/feeds/mrss/'
+    _TESTS = [{
+        # Geo-restricted. Without a proxy the metadata is still there; with a
+        # proxy it redirects to http://m.tvland.com/app/
+        'url': 'https://www.tvland.com/episodes/s04pzf/everybody-loves-raymond-the-dog-season-1-ep-19',
+        'info_dict': {
+            'description': 'md5:84928e7a8ad6649371fbf5da5e1ad75a',
+            'title': 'The Dog',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.tvland.com/video-clips/4n87f2/younger-a-first-look-at-younger-season-6',
+        'md5': 'e2c6389401cf485df26c79c247b08713',
+        'info_dict': {
+            'id': '891f7d3c-5b5b-4753-b879-b7ba1a601757',
+            'ext': 'mp4',
+            'title': 'Younger|April 30, 2019|6|NO-EPISODE#|A First Look at Younger Season 6',
+            'description': 'md5:595ea74578d3a888ae878dfd1c7d4ab2',
+            'upload_date': '20190430',
+            'timestamp': 1556658000,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.tvland.com/full-episodes/iu0hz6/younger-a-kiss-is-just-a-kiss-season-3-ep-301',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/tvn24.py b/youtube_dl/extractor/tvn24.py
new file mode 100644 (file)
index 0000000..de0fb50
--- /dev/null
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    NO_DEFAULT,
+    unescapeHTML,
+)
+
+
+class TVN24IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:[^/]+)\.)?tvn24(?:bis)?\.pl/(?:[^/]+/)*(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.tvn24.pl/wiadomosci-z-kraju,3/oredzie-artura-andrusa,702428.html',
+        'md5': 'fbdec753d7bc29d96036808275f2130c',
+        'info_dict': {
+            'id': '1584444',
+            'ext': 'mp4',
+            'title': '"Święta mają być wesołe, dlatego, ludziska, wszyscy pod jemiołę"',
+            'description': 'Wyjątkowe orędzie Artura Andrusa, jednego z gości Szkła kontaktowego.',
+            'thumbnail': 're:https?://.*[.]jpeg',
+        }
+    }, {
+        # different layout
+        'url': 'https://tvnmeteo.tvn24.pl/magazyny/maja-w-ogrodzie,13/odcinki-online,1,4,1,0/pnacza-ptaki-i-iglaki-odc-691-hgtv-odc-29,1771763.html',
+        'info_dict': {
+            'id': '1771763',
+            'ext': 'mp4',
+            'title': 'Pnącza, ptaki i iglaki (odc. 691 /HGTV odc. 29)',
+            'thumbnail': 're:https?://.*',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://sport.tvn24.pl/pilka-nozna,105/ligue-1-kamil-glik-rozcial-glowe-monaco-tylko-remisuje-z-bastia,716522.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://tvn24bis.pl/poranek,146,m/gen-koziej-w-tvn24-bis-wracamy-do-czasow-zimnej-wojny,715660.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvn24.pl/magazyn-tvn24/angie-w-jednej-czwartej-polka-od-szarej-myszki-do-cesarzowej-europy,119,2158',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._search_regex(
+            r'<h\d+[^>]+class=["\']magazineItemHeader[^>]+>(.+?)</h',
+            webpage, 'title')
+
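+        # Player metadata is embedded as HTML-escaped JSON in data-*
+        # attributes of the page.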
+        def extract_json(attr, name, default=NO_DEFAULT, fatal=True):
+            return self._parse_json(
+                self._search_regex(
+                    r'\b%s=(["\'])(?P<json>(?!\1).+?)\1' % attr, webpage,
+                    name, group='json', default=default, fatal=fatal) or '{}',
+                display_id, transform_source=unescapeHTML, fatal=fatal)
+
+        quality_data = extract_json('data-quality', 'formats')
+
+        formats = []
+        for format_id, format_url in quality_data.items():
+            formats.append({
+                'url': format_url,
+                'format_id': format_id,
+                'height': int_or_none(format_id.rstrip('p')),
+            })
+        self._sort_formats(formats)
+
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_regex(
+            r'\bdata-poster=(["\'])(?P<url>(?!\1).+?)\1', webpage,
+            'thumbnail', group='url')
+
+        video_id = None
+
+        share_params = extract_json(
+            'data-share-params', 'share params', default=None)
+        if isinstance(share_params, dict):
+            video_id = share_params.get('id')
+
+        if not video_id:
+            video_id = self._search_regex(
+                r'data-vid-id=["\'](\d+)', webpage, 'video id',
+                default=None) or self._search_regex(
+                r',(\d+)\.html', url, 'video id', default=display_id)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tvnet.py b/youtube_dl/extractor/tvnet.py
new file mode 100644 (file)
index 0000000..4222ff9
--- /dev/null
@@ -0,0 +1,147 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+    url_or_none,
+)
+
+
+class TVNetIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+)\.tvnet\.gov\.vn/[^/]+/(?:\d+/)?(?P<id>\d+)(?:/|$)'
+    _TESTS = [{
+        # video
+        'url': 'http://de.tvnet.gov.vn/video/109788/vtv1---bac-tuyet-tai-lao-cai-va-ha-giang/tin-nong-24h',
+        'md5': 'b4d7abe0252c9b47774760b7519c7558',
+        'info_dict': {
+            'id': '109788',
+            'ext': 'mp4',
+            'title': 'VTV1 - Bắc tuyết tại Lào Cai và Hà Giang',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': False,
+            'view_count': int,
+        },
+    }, {
+        # audio
+        'url': 'http://vn.tvnet.gov.vn/radio/27017/vov1---ban-tin-chieu-10062018/doi-song-va-xa-hoi',
+        'md5': 'b5875ce9b0a2eecde029216d0e6db2ae',
+        'info_dict': {
+            'id': '27017',
+            'ext': 'm4a',
+            'title': 'VOV1 - Bản tin chiều (10/06/2018)',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': False,
+        },
+    }, {
+        'url': 'http://us.tvnet.gov.vn/video/118023/129999/ngay-0705',
+        'info_dict': {
+            'id': '129999',
+            'ext': 'mp4',
+            'title': 'VTV1 - Quốc hội với cử tri (11/06/2018)',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': False,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # live stream
+        'url': 'http://us.tvnet.gov.vn/kenh-truyen-hinh/1011/vtv1',
+        'info_dict': {
+            'id': '1011',
+            'ext': 'mp4',
+            'title': r're:^VTV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # radio live stream
+        'url': 'http://vn.tvnet.gov.vn/kenh-truyen-hinh/1014',
+        'info_dict': {
+            'id': '1014',
+            'ext': 'm4a',
+            'title': r're:VOV1 \| LiveTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'thumbnail': r're:(?i)https?://.*\.(?:jpg|png)',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://us.tvnet.gov.vn/phim/6136/25510/vtv3---ca-mot-doi-an-oan-tap-1-50/phim-truyen-hinh',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, default=None) or self._search_regex(
+            r'<title>([^<]+)<', webpage, 'title')
+        title = re.sub(r'\s*-\s*TV Net\s*$', '', title)
+
+        if '/video/' in url or '/radio/' in url:
+            is_live = False
+        elif '/kenh-truyen-hinh/' in url:
+            is_live = True
+        else:
+            is_live = None
+
+        data_file = unescapeHTML(self._search_regex(
+            r'data-file=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
+            'data file', group='url'))
+
+        stream_urls = set()
+        formats = []
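+        # Live streams use the generic HLS protocol; VOD uses the native
+        # HLS downloader.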
+        for stream in self._download_json(data_file, video_id):
+            if not isinstance(stream, dict):
+                continue
+            stream_url = url_or_none(stream.get('url'))
+            if stream_url in stream_urls or not stream_url:
+                continue
+            stream_urls.add(stream_url)
+            formats.extend(self._extract_m3u8_formats(
+                stream_url, video_id, 'mp4',
+                entry_protocol='m3u8' if is_live else 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        # better support for radio streams
+        if title.startswith('VOV'):
+            for f in formats:
+                f.update({
+                    'ext': 'm4a',
+                    'vcodec': 'none',
+                })
+
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or unescapeHTML(
+            self._search_regex(
+                r'data-image=(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage,
+                'thumbnail', default=None, group='url'))
+
+        if is_live:
+            title = self._live_title(title)
+
+        view_count = int_or_none(self._search_regex(
+            r'(?s)<div[^>]+\bclass=["\'].*?view-count[^>]+>.*?(\d+).*?</div>',
+            webpage, 'view count', default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'is_live': is_live,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py
new file mode 100644 (file)
index 0000000..26a5aea
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    get_element_by_class,
+    js_to_json,
+)
+
+
+class TVNoeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.tvnoe.cz/video/10362',
+        'md5': 'aee983f279aab96ec45ab6e2abb3c2ca',
+        'info_dict': {
+            'id': '10362',
+            'ext': 'mp4',
+            'series': 'Noční univerzita',
+            'title': 'prof. Tomáš Halík, Th.D. - Návrat náboženství a střet civilizací',
+            'description': 'md5:f337bae384e1a531a52c55ebc50fff41',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        iframe_url = self._search_regex(
+            r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe URL')
+
+        ifs_page = self._download_webpage(iframe_url, video_id)
+        jwplayer_data = self._find_jwplayer_data(
+            ifs_page, video_id, transform_source=js_to_json)
+        info_dict = self._parse_jwplayer_data(
+            jwplayer_data, video_id, require_title=False, base_url=iframe_url)
+
+        info_dict.update({
+            'id': video_id,
+            'title': clean_html(get_element_by_class(
+                'field-name-field-podnazev', webpage)),
+            'description': clean_html(get_element_by_class(
+                'field-name-body', webpage)),
+            'series': clean_html(get_element_by_class('title', webpage))
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py
new file mode 100644 (file)
index 0000000..9c8a8a0
--- /dev/null
@@ -0,0 +1,486 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    parse_duration,
+    str_or_none,
+    update_url_query,
+    urljoin,
+)
+
+
+class TVNowBaseIE(InfoExtractor):
+    _VIDEO_FIELDS = (
+        'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
+        'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode',
+        'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear',
+        'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo')
+
+    def _call_api(self, path, video_id, query):
+        return self._download_json(
+            'https://api.tvnow.de/v3/' + path, video_id, query=query)
+
+    def _extract_video(self, info, display_id):
+        video_id = compat_str(info['id'])
+        title = info['title']
+
+        formats = []
+        paths = []
+        for manifest_url in (info.get('manifest') or {}).values():
+            if not manifest_url:
+                continue
+            manifest_url = update_url_query(manifest_url, {'filter': ''})
+            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+            if path in paths:
+                continue
+            paths.append(path)
+
+            def url_repl(proto, suffix):
+                return re.sub(
+                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+                        '.ism/' + suffix, manifest_url))
+
+            def make_urls(proto, suffix):
+                urls = [url_repl(proto, suffix)]
+                hd_url = urls[0].replace('/manifest/', '/ngvod/')
+                if hd_url != urls[0]:
+                    urls.append(hd_url)
+                return urls
+
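+            # For each manifest path, collect DASH, Smooth Streaming and HLS
+            # formats and stop at the first path that yields any.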
+            for man_url in make_urls('dash', '.mpd'):
+                formats.extend(self._extract_mpd_formats(
+                    man_url, video_id, mpd_id='dash', fatal=False))
+            for man_url in make_urls('hss', 'Manifest'):
+                formats.extend(self._extract_ism_formats(
+                    man_url, video_id, ism_id='mss', fatal=False))
+            for man_url in make_urls('hls', '.m3u8'):
+                formats.extend(self._extract_m3u8_formats(
+                    man_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls',
+                    fatal=False))
+            if formats:
+                break
+        else:
+            if info.get('isDrm'):
+                raise ExtractorError(
+                    'Video %s is DRM protected' % video_id, expected=True)
+            if info.get('geoblocked'):
+                raise self.raise_geo_restricted()
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+        self._sort_formats(formats)
+
+        description = info.get('articleLong') or info.get('articleShort')
+        timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ')
+        duration = parse_duration(info.get('duration'))
+
+        f = info.get('format', {})
+
+        thumbnails = [{
+            'url': 'https://aistvnow-a.akamaihd.net/tvnow/movie/%s' % video_id,
+        }]
+        thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo')
+        if thumbnail:
+            thumbnails.append({
+                'url': thumbnail,
+            })
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'duration': duration,
+            'series': f.get('title'),
+            'season_number': int_or_none(info.get('season')),
+            'episode_number': int_or_none(info.get('episode')),
+            'episode': title,
+            'formats': formats,
+        }
+
+
+class TVNowIE(TVNowBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/
+                        (?P<show_id>[^/]+)/
+                        (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
+                    '''
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url)
+                or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url)
+                else super(TVNowIE, cls).suitable(url))
+
+    _TESTS = [{
+        'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player',
+        'info_dict': {
+            'id': '331082',
+            'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
+            'ext': 'mp4',
+            'title': 'Der neue Porsche 911 GT 3',
+            'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
+            'timestamp': 1495994400,
+            'upload_date': '20170528',
+            'duration': 5283,
+            'series': 'GRIP - Das Motormagazin',
+            'season_number': 14,
+            'episode_number': 405,
+            'episode': 'Der neue Porsche 911 GT 3',
+        },
+    }, {
+        # rtl2
+        'url': 'https://www.tvnow.de/rtl2/armes-deutschland/episode-0008/player',
+        'only_matching': True,
+    }, {
+        # rtlnitro
+        'url': 'https://www.tvnow.de/nitro/alarm-fuer-cobra-11-die-autobahnpolizei/auf-eigene-faust-pilot/player',
+        'only_matching': True,
+    }, {
+        # superrtl
+        'url': 'https://www.tvnow.de/superrtl/die-lustigsten-schlamassel-der-welt/u-a-ketchup-effekt/player',
+        'only_matching': True,
+    }, {
+        # ntv
+        'url': 'https://www.tvnow.de/ntv/startup-news/goetter-in-weiss/player',
+        'only_matching': True,
+    }, {
+        # vox
+        'url': 'https://www.tvnow.de/vox/auto-mobil/neues-vom-automobilmarkt-2017-11-19-17-00-00/player',
+        'only_matching': True,
+    }, {
+        # rtlplus
+        'url': 'https://www.tvnow.de/rtlplus/op-ruft-dr-bruckner/die-vernaehte-frau/player',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = '%s/%s' % mobj.group('show_id', 'id')
+
+        info = self._call_api(
+            'movies/' + display_id, display_id, query={
+                'fields': ','.join(self._VIDEO_FIELDS),
+            })
+
+        return self._extract_video(info, display_id)
+
+
+class TVNowNewIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?P<base_url>https?://
+                        (?:www\.)?tvnow\.(?:de|at|ch)/
+                        (?:shows|serien))/
+                        (?P<show>[^/]+)-\d+/
+                        [^/]+/
+                        episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url'))
+        show, episode = mobj.group('show', 'episode')
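+        # e.g. the test URL above becomes
+        # https://www.tvnow.de/_/grip-das-motormagazin/der-neue-porsche-911-gt-3,
+        # with '_' taking the place of the old URL scheme's station segment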
+        return self.url_result(
+            # Rewrite new URLs to the old format and use extraction via old API
+            # at api.tvnow.de as a loophole for bypassing premium content checks
+            '%s/%s/%s' % (base_url, show, episode),
+            ie=TVNowIE.ie_key(), video_id=mobj.group('id'))
+
+
+class TVNowNewBaseIE(InfoExtractor):
+    def _call_api(self, path, video_id, query=None):
+        result = self._download_json(
+            'https://apigw.tvnow.de/module/' + path, video_id, query=query)
+        error = result.get('error')
+        if error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error), expected=True)
+        return result
+
+
+r"""
+TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it
+when api.tvnow.de is shut down. This version can't bypass premium checks though.
+class TVNowIE(TVNowNewBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?tvnow\.(?:de|at|ch)/
+                        (?:shows|serien)/[^/]+/
+                        (?:[^/]+/)+
+                        (?P<display_id>[^/?$&]+)-(?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        # episode with annual navigation
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+        'info_dict': {
+            'id': '331082',
+            'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
+            'ext': 'mp4',
+            'title': 'Der neue Porsche 911 GT 3',
+            'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1495994400,
+            'upload_date': '20170528',
+            'duration': 5283,
+            'series': 'GRIP - Das Motormagazin',
+            'season_number': 14,
+            'episode_number': 405,
+            'episode': 'Der neue Porsche 911 GT 3',
+        },
+    }, {
+        # rtl2, episode with season navigation
+        'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124',
+        'only_matching': True,
+    }, {
+        # rtlnitro
+        'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822',
+        'only_matching': True,
+    }, {
+        # superrtl
+        'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120',
+        'only_matching': True,
+    }, {
+        # ntv
+        'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630',
+        'only_matching': True,
+    }, {
+        # vox
+        'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+        'only_matching': True,
+    }]
+
+    def _extract_video(self, info, url, display_id):
+        config = info['config']
+        source = config['source']
+
+        video_id = compat_str(info.get('id') or source['videoId'])
+        title = source['title'].strip()
+
+        paths = []
+        for manifest_url in (info.get('manifest') or {}).values():
+            if not manifest_url:
+                continue
+            manifest_url = update_url_query(manifest_url, {'filter': ''})
+            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+            if path in paths:
+                continue
+            paths.append(path)
+
+            def url_repl(proto, suffix):
+                return re.sub(
+                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+                        '.ism/' + suffix, manifest_url))
+
+            formats = self._extract_mpd_formats(
+                url_repl('dash', '.mpd'), video_id,
+                mpd_id='dash', fatal=False)
+            formats.extend(self._extract_ism_formats(
+                url_repl('hss', 'Manifest'),
+                video_id, ism_id='mss', fatal=False))
+            formats.extend(self._extract_m3u8_formats(
+                url_repl('hls', '.m3u8'), video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            if formats:
+                break
+        else:
+            if try_get(info, lambda x: x['rights']['isDrm']):
+                raise ExtractorError(
+                    'Video %s is DRM protected' % video_id, expected=True)
+            if try_get(config, lambda x: x['boards']['geoBlocking']['block']):
+                raise self.raise_geo_restricted()
+            if not info.get('free', True):
+                raise ExtractorError(
+                    'Video %s is not available for free' % video_id, expected=True)
+        self._sort_formats(formats)
+
+        description = source.get('description')
+        thumbnail = url_or_none(source.get('poster'))
+        timestamp = unified_timestamp(source.get('previewStart'))
+        duration = parse_duration(source.get('length'))
+
+        series = source.get('format')
+        season_number = int_or_none(self._search_regex(
+            r'staffel-(\d+)', url, 'season number', default=None))
+        episode_number = int_or_none(self._search_regex(
+            r'episode-(\d+)', url, 'episode number', default=None))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'series': series,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'episode': title,
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
+        info = self._call_api('player/' + video_id, video_id)
+        return self._extract_video(info, video_id, display_id)
+"""
+
+
+class TVNowListBaseIE(TVNowNewBaseIE):
+    _SHOW_VALID_URL = r'''(?x)
+                    (?P<base_url>
+                        https?://
+                            (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/
+                            [^/?#&]+-(?P<show_id>\d+)
+                    )
+                    '''
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if TVNowNewIE.suitable(url)
+                else super(TVNowListBaseIE, cls).suitable(url))
+
+    def _extract_items(self, url, show_id, list_id, query):
+        items = self._call_api(
+            'teaserrow/format/episode/' + show_id, list_id,
+            query=query)['items']
+
+        entries = []
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            item_url = urljoin(url, item.get('url'))
+            if not item_url:
+                continue
+            video_id = str_or_none(item.get('id') or item.get('videoId'))
+            item_title = item.get('subheadline') or item.get('text')
+            entries.append(self.url_result(
+                item_url, ie=TVNowNewIE.ie_key(), video_id=video_id,
+                video_title=item_title))
+
+        return self.playlist_result(entries, '%s/%s' % (show_id, list_id))
+
+
+class TVNowSeasonIE(TVNowListBaseIE):
+    _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL
+    _TESTS = [{
+        'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13',
+        'info_dict': {
+            'id': '1815/13',
+        },
+        'playlist_mincount': 22,
+    }]
+
+    def _real_extract(self, url):
+        _, show_id, season_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_items(
+            url, show_id, season_id, {'season': season_id})
+
+
+class TVNowAnnualIE(TVNowListBaseIE):
+    _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL
+    _TESTS = [{
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05',
+        'info_dict': {
+            'id': '1669/2017-05',
+        },
+        'playlist_mincount': 2,
+    }]
+
+    def _real_extract(self, url):
+        _, show_id, year, month = re.match(self._VALID_URL, url).groups()
+        return self._extract_items(
+            url, show_id, '%s-%s' % (year, month), {
+                'year': int(year),
+                'month': int(month),
+            })
+
+
+class TVNowShowIE(TVNowListBaseIE):
+    _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL
+    _TESTS = [{
+        # annual navigationType
+        'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669',
+        'info_dict': {
+            'id': '1669',
+        },
+        'playlist_mincount': 73,
+    }, {
+        # season navigationType
+        'url': 'https://www.tvnow.de/shows/armes-deutschland-11471',
+        'info_dict': {
+            'id': '11471',
+        },
+        'playlist_mincount': 3,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url)
+                or TVNowAnnualIE.suitable(url)
+                else super(TVNowShowIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        base_url, show_id = re.match(self._VALID_URL, url).groups()
+
+        result = self._call_api(
+            'teaserrow/format/navigation/' + show_id, show_id)
+
+        items = result['items']
+
+        entries = []
+        navigation = result.get('navigationType')
+        if navigation == 'annual':
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                year = int_or_none(item.get('year'))
+                if year is None:
+                    continue
+                months = item.get('months')
+                if not isinstance(months, list):
+                    continue
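+                # each entry of 'months' is apparently a single-key dict
+                # mapping the month number to its label (e.g. {'5': 'Mai'}),
+                # so the sole key is taken as the month number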
+                for month_dict in months:
+                    if not isinstance(month_dict, dict) or not month_dict:
+                        continue
+                    month_number = int_or_none(list(month_dict.keys())[0])
+                    if month_number is None:
+                        continue
+                    entries.append(self.url_result(
+                        '%s/%04d-%02d' % (base_url, year, month_number),
+                        ie=TVNowAnnualIE.ie_key()))
+        elif navigation == 'season':
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                season_number = int_or_none(item.get('season'))
+                if season_number is None:
+                    continue
+                entries.append(self.url_result(
+                    '%s/staffel-%d' % (base_url, season_number),
+                    ie=TVNowSeasonIE.ie_key()))
+        else:
+            raise ExtractorError('Unknown navigationType')
+
+        return self.playlist_result(entries, show_id)
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
new file mode 100644 (file)
index 0000000..accff75
--- /dev/null
@@ -0,0 +1,252 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    determine_ext,
+    ExtractorError,
+    get_element_by_attribute,
+    orderedSet,
+)
+
+
+class TVPIE(InfoExtractor):
+    IE_NAME = 'tvp'
+    IE_DESC = 'Telewizja Polska'
+    _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
+        'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
+        'info_dict': {
+            'id': '194536',
+            'ext': 'mp4',
+            'title': 'Czas honoru, odc. 13 – Władek',
+            'description': 'md5:437f48b93558370b031740546b696e24',
+        },
+    }, {
+        'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
+        'md5': 'b0005b542e5b4de643a9690326ab1257',
+        'info_dict': {
+            'id': '17916176',
+            'ext': 'mp4',
+            'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
+            'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
+        },
+    }, {
+        # page id is not the same as video id (#7799)
+        'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930',
+        'md5': '84cd3c8aec4840046e5ab712416b73d0',
+        'info_dict': {
+            'id': '33908820',
+            'ext': 'mp4',
+            'title': 'Wiadomości, 28.09.2017, 19:30',
+            'description': 'Wydanie główne codziennego serwisu informacyjnego.'
+        },
+        'skip': 'HTTP Error 404: Not Found',
+    }, {
+        'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
+        'only_matching': True,
+    }, {
+        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
+        'only_matching': True,
+    }, {
+        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
+        'only_matching': True,
+    }, {
+        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
+        'only_matching': True,
+    }, {
+        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
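+        # the watch page may embed an object_id different from the page id
+        # (see #7799), so prefer the embedded id when one is present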
+        video_id = self._search_regex([
+            r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
+            r"object_id\s*:\s*'(\d+)'",
+            r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id)
+        return {
+            '_type': 'url_transparent',
+            'url': 'tvp:' + video_id,
+            'description': self._og_search_description(
+                webpage, default=None) or self._html_search_meta(
+                'description', webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
+            'ie_key': 'TVPEmbed',
+        }
+
+
+class TVPEmbedIE(InfoExtractor):
+    IE_NAME = 'tvp:embed'
+    IE_DESC = 'Telewizja Polska'
+    _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'tvp:194536',
+        'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
+        'info_dict': {
+            'id': '194536',
+            'ext': 'mp4',
+            'title': 'Czas honoru, odc. 13 – Władek',
+        },
+    }, {
+        # not available
+        'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
+        'md5': '8c9cd59d16edabf39331f93bf8a766c7',
+        'info_dict': {
+            'id': '22670268',
+            'ext': 'mp4',
+            'title': 'Panorama, 07.12.2015, 15:40',
+        },
+        'skip': 'Transmisja została zakończona lub materiał niedostępny',
+    }, {
+        'url': 'tvp:22670268',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
+
+        error = self._html_search_regex(
+            r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>',
+            webpage, 'error', default=None) or clean_html(
+            get_element_by_attribute('class', 'msg error', webpage))
+        if error:
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, clean_html(error)), expected=True)
+
+        title = self._search_regex(
+            r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
+            webpage, 'title', group='title')
+        series_title = self._search_regex(
+            r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
+            webpage, 'series', group='series', default=None)
+        if series_title:
+            title = '%s, %s' % (series_title, title)
+
+        thumbnail = self._search_regex(
+            r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
+
+        video_url = self._search_regex(
+            r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
+            'formats', group='url', default=None)
+        if not video_url or 'material_niedostepny.mp4' in video_url:
+            video_url = self._download_json(
+                'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
+                video_id)['video_url']
+
+        formats = []
+        video_url_base = self._search_regex(
+            r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
+            video_url, 'video base url', default=None)
+        if video_url_base:
+            # TODO: <Group> is found instead of <AdaptationSet> in the MPD
+            # manifest, which the MPEG-DASH standard does not mention.
+            # Figure that out.
+            # formats.extend(self._extract_mpd_formats(
+            #     video_url_base + '.ism/video.mpd',
+            #     video_id, mpd_id='dash', fatal=False))
+            formats.extend(self._extract_ism_formats(
+                video_url_base + '.ism/Manifest',
+                video_id, 'mss', fatal=False))
+            formats.extend(self._extract_f4m_formats(
+                video_url_base + '.ism/video.f4m',
+                video_id, f4m_id='hds', fatal=False))
+            m3u8_formats = self._extract_m3u8_formats(
+                video_url_base + '.ism/video.m3u8', video_id,
+                'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+            self._sort_formats(m3u8_formats)
+            m3u8_formats = list(filter(
+                lambda f: f.get('vcodec') != 'none', m3u8_formats))
+            formats.extend(m3u8_formats)
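+            # progressive MP4 counterparts of the HLS variants appear to
+            # live at <video base url>-<n>.mp4, numbered from 2 upwards
+            # in ascending quality order; probe each candidate URL before
+            # adding it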
+            for i, m3u8_format in enumerate(m3u8_formats, 2):
+                http_url = '%s-%d.mp4' % (video_url_base, i)
+                if self._is_valid_url(http_url, video_id):
+                    f = m3u8_format.copy()
+                    f.update({
+                        'url': http_url,
+                        'format_id': f['format_id'].replace('hls', 'http'),
+                        'protocol': 'http',
+                    })
+                    formats.append(f)
+        else:
+            formats = [{
+                'format_id': 'direct',
+                'url': video_url,
+                'ext': determine_ext(video_url, 'mp4'),
+            }]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
+
+
+class TVPWebsiteIE(InfoExtractor):
+    IE_NAME = 'tvp:series'
+    _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'
+
+    _TESTS = [{
+        # series
+        'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
+        'info_dict': {
+            'id': '38678312',
+        },
+        'playlist_count': 115,
+    }, {
+        # film
+        'url': 'https://vod.tvp.pl/website/gloria,35139666',
+        'info_dict': {
+            'id': '36637049',
+            'ext': 'mp4',
+            'title': 'Gloria, Gloria',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['TVPEmbed'],
+    }, {
+        'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
+        'only_matching': True,
+    }]
+
+    def _entries(self, display_id, playlist_id):
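+        # paginate through the /video listing, collecting episode ids
+        # from the markup until a page yields none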
+        url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
+        for page_num in itertools.count(1):
+            page = self._download_webpage(
+                url, display_id, 'Downloading page %d' % page_num,
+                query={'page': page_num})
+
+            video_ids = orderedSet(re.findall(
+                r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id,
+                page))
+
+            if not video_ids:
+                break
+
+            for video_id in video_ids:
+                yield self.url_result(
+                    'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
+                    video_id=video_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, playlist_id = mobj.group('display_id', 'id')
+        return self.playlist_result(
+            self._entries(display_id, playlist_id), playlist_id)
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
new file mode 100644 (file)
index 0000000..3c2450d
--- /dev/null
@@ -0,0 +1,512 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+    qualities,
+    try_get,
+    update_url_query,
+    url_or_none,
+)
+
+
+class TVPlayIE(InfoExtractor):
+    IE_NAME = 'mtg'
+    IE_DESC = 'MTG services'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        mtg:|
+                        https?://
+                            (?:www\.)?
+                            (?:
+                                tvplay(?:\.skaties)?\.lv(?:/parraides)?|
+                                (?:tv3play|play\.tv3)\.lt(?:/programos)?|
+                                tv3play(?:\.tv3)?\.ee/sisu|
+                                (?:tv(?:3|6|8|10)play|viafree)\.se/program|
+                                (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer|
+                                play\.nova(?:tv)?\.bg/programi
+                            )
+                            /(?:[^/]+/)+
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [
+        {
+            'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true',
+            'md5': 'a1612fe0849455423ad8718fe049be21',
+            'info_dict': {
+                'id': '418113',
+                'ext': 'mp4',
+                'title': 'Kādi ir īri? - Viņas melo labāk',
+                'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.',
+                'series': 'Viņas melo labāk',
+                'season': '2.sezona',
+                'season_number': 2,
+                'duration': 25,
+                'timestamp': 1406097056,
+                'upload_date': '20140723',
+            },
+        },
+        {
+            'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true',
+            'info_dict': {
+                'id': '409229',
+                'ext': 'flv',
+                'title': 'Moterys meluoja geriau',
+                'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e',
+                'series': 'Moterys meluoja geriau',
+                'episode_number': 47,
+                'season': '1 sezonas',
+                'season_number': 1,
+                'duration': 1330,
+                'timestamp': 1403769181,
+                'upload_date': '20140626',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.tv3play.ee/sisu/kodu-keset-linna/238551?autostart=true',
+            'info_dict': {
+                'id': '238551',
+                'ext': 'flv',
+                'title': 'Kodu keset linna 398537',
+                'description': 'md5:7df175e3c94db9e47c0d81ffa5d68701',
+                'duration': 1257,
+                'timestamp': 1292449761,
+                'upload_date': '20101215',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true',
+            'info_dict': {
+                'id': '395385',
+                'ext': 'mp4',
+                'title': 'Husräddarna S02E07',
+                'description': 'md5:f210c6c89f42d4fc39faa551be813777',
+                'duration': 2574,
+                'timestamp': 1400596321,
+                'upload_date': '20140520',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true',
+            'info_dict': {
+                'id': '266636',
+                'ext': 'mp4',
+                'title': 'Den sista dokusåpan S01E08',
+                'description': 'md5:295be39c872520221b933830f660b110',
+                'duration': 1492,
+                'timestamp': 1330522854,
+                'upload_date': '20120229',
+                'age_limit': 18,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true',
+            'info_dict': {
+                'id': '282756',
+                'ext': 'mp4',
+                'title': 'Antikjakten S01E10',
+                'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8',
+                'duration': 2646,
+                'timestamp': 1348575868,
+                'upload_date': '20120925',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true',
+            'info_dict': {
+                'id': '230898',
+                'ext': 'mp4',
+                'title': 'Anna Anka søker assistent - Ep. 8',
+                'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474',
+                'duration': 2656,
+                'timestamp': 1277720005,
+                'upload_date': '20100628',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true',
+            'info_dict': {
+                'id': '21873',
+                'ext': 'mp4',
+                'title': 'Budbringerne program 10',
+                'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d',
+                'duration': 1297,
+                'timestamp': 1254205102,
+                'upload_date': '20090929',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true',
+            'info_dict': {
+                'id': '361883',
+                'ext': 'mp4',
+                'title': 'Hotelinspektør Alex Polizzi - Ep. 10',
+                'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81',
+                'duration': 2594,
+                'timestamp': 1393236292,
+                'upload_date': '20140224',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true',
+            'info_dict': {
+                'id': '624952',
+                'ext': 'flv',
+                'title': 'Здравей, България (12.06.2015 г.) ',
+                'description': 'md5:99f3700451ac5bb71a260268b8daefd7',
+                'duration': 8838,
+                'timestamp': 1434100372,
+                'upload_date': '20150612',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true',
+            'only_matching': True,
+        },
+        {
+            # views is null
+            'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
+            'only_matching': True,
+        },
+        {
+            'url': 'mtg:418113',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
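+        # derive the geo-bypass country from the site's TLD, e.g. .lv -> LV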
+        geo_country = self._search_regex(
+            r'https?://[^/]+\.([a-z]{2})', url,
+            'geo country', default=None)
+        if geo_country:
+            self._initialize_geo_bypass({'countries': [geo_country.upper()]})
+        video = self._download_json(
+            'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
+
+        title = video['title']
+
+        try:
+            streams = self._download_json(
+                'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id,
+                video_id, 'Downloading streams JSON')
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                msg = self._parse_json(e.cause.read().decode('utf-8'), video_id)
+                raise ExtractorError(msg['msg'], expected=True)
+            raise
+
+        quality = qualities(['hls', 'medium', 'high'])
+        formats = []
+        for format_id, video_url in streams.get('streams', {}).items():
+            video_url = url_or_none(video_url)
+            if not video_url:
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    update_url_query(video_url, {
+                        'hdcore': '3.5.0',
+                        'plugin': 'aasp-3.5.0.151.81'
+                    }), video_id, f4m_id='hds', fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                fmt = {
+                    'format_id': format_id,
+                    'quality': quality(format_id),
+                    'ext': ext,
+                }
+                if video_url.startswith('rtmp'):
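+                    # split the rtmp URL into connection URL, app and
+                    # play path; e.g. a hypothetical
+                    #   rtmp://cdn.example.com/vod/mp4:clips/123.mp4
+                    # yields url 'rtmp://cdn.example.com/vod', app 'vod'
+                    # and play_path 'mp4:clips/123.mp4'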
+                    m = re.search(
+                        r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url)
+                    if not m:
+                        continue
+                    fmt.update({
+                        'ext': 'flv',
+                        'url': m.group('url'),
+                        'app': m.group('app'),
+                        'play_path': m.group('playpath'),
+                        'preference': -1,
+                    })
+                else:
+                    fmt.update({
+                        'url': video_url,
+                    })
+                formats.append(fmt)
+
+        if not formats and video.get('is_geo_blocked'):
+            self.raise_geo_restricted(
+                'This content might not be available in your country due to copyright reasons')
+
+        self._sort_formats(formats)
+
+        # TODO: webvtt in m3u8
+        subtitles = {}
+        sami_path = video.get('sami_path')
+        if sami_path:
+            lang = self._search_regex(
+                r'_([a-z]{2})\.xml', sami_path, 'lang',
+                default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1])
+            subtitles[lang] = [{
+                'url': sami_path,
+            }]
+
+        series = video.get('format_title')
+        episode_number = int_or_none(video.get('format_position', {}).get('episode'))
+        season = video.get('_embedded', {}).get('season', {}).get('title')
+        season_number = int_or_none(video.get('format_position', {}).get('season'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'series': series,
+            'episode_number': episode_number,
+            'season': season,
+            'season_number': season_number,
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('created_at')),
+            'view_count': try_get(video, lambda x: x['views']['total'], int),
+            'age_limit': int_or_none(video.get('age_limit', 0)),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class ViafreeIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        viafree\.(?P<country>dk|no|se)
+                        /(?P<id>program(?:mer)?/(?:[^/]+/)+[^/?#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
+        'info_dict': {
+            'id': '757786',
+            'ext': 'mp4',
+            'title': 'Det beste vorspielet - Sesong 2 - Episode 1',
+            'description': 'md5:b632cb848331404ccacd8cd03e83b4c3',
+            'series': 'Det beste vorspielet',
+            'season_number': 2,
+            'duration': 1116,
+            'timestamp': 1471200600,
+            'upload_date': '20160814',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # with relatedClips
+        'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1',
+        'only_matching': True,
+    }, {
+        # Different og:image URL schema
+        'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
+
+    @classmethod
+    def suitable(cls, url):
+        return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        country, path = re.match(self._VALID_URL, url).groups()
+        content = self._download_json(
+            'https://viafree-content.mtg-api.com/viafree-content/v1/%s/path/%s' % (country, path), path)
+        program = content['_embedded']['viafreeBlocks'][0]['_embedded']['program']
+        guid = program['guid']
+        meta = content['meta']
+        title = meta['title']
+
+        try:
+            stream_href = self._download_json(
+                program['_links']['streamLink']['href'], guid,
+                headers=self.geo_verification_headers())['embedded']['prioritizedStreams'][0]['links']['stream']['href']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                self.raise_geo_restricted(countries=[country])
+            raise
+
+        formats = self._extract_m3u8_formats(stream_href, guid, 'mp4')
+        self._sort_formats(formats)
+        episode = program.get('episode') or {}
+
+        return {
+            'id': guid,
+            'title': title,
+            'thumbnail': meta.get('image'),
+            'description': meta.get('description'),
+            'series': episode.get('seriesTitle'),
+            'episode_number': int_or_none(episode.get('episodeNumber')),
+            'season_number': int_or_none(episode.get('seasonNumber')),
+            'duration': int_or_none(try_get(program, lambda x: x['video']['duration']['milliseconds']), 1000),
+            'timestamp': parse_iso8601(try_get(program, lambda x: x['availability']['start'])),
+            'formats': formats,
+        }
+
+
+class TVPlayHomeIE(InfoExtractor):
+    _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/',
+        'info_dict': {
+            'id': '366367',
+            'ext': 'mp4',
+            'title': 'Aferistai',
+            'description': 'Aferistai. Kalėdinė pasaka.',
+            'series': 'Aferistai [N-7]',
+            'season': '1 sezonas',
+            'season_number': 1,
+            'duration': 464,
+            'timestamp': 1394209658,
+            'upload_date': '20140307',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [TVPlayIE.ie_key()],
+    }, {
+        'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/',
+        'only_matching': True,
+    }, {
+        'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._search_regex(
+            r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id')
+
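+        # short asset ids appear to belong to the legacy MTG platform,
+        # so hand those off to TVPlayIE via the mtg: scheme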
+        if len(video_id) < 8:
+            return self.url_result(
+                'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id)
+
+        m3u8_url = self._search_regex(
+            r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'm3u8 url', group='url')
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+            'title', default=None, group='value') or self._html_search_meta(
+            'title', webpage, default=None) or self._og_search_title(
+            webpage)
+
+        description = self._html_search_meta(
+            'description', webpage,
+            default=None) or self._og_search_description(webpage)
+
+        thumbnail = self._search_regex(
+            r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'thumbnail', default=None, group='url') or self._html_search_meta(
+            'thumbnail', webpage, default=None) or self._og_search_thumbnail(
+            webpage)
+
+        duration = int_or_none(self._search_regex(
+            r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration',
+            fatal=False))
+
+        season = self._search_regex(
+            (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1',
+             r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+            'season', default=None, group='value')
+        season_number = int_or_none(self._search_regex(
+            r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number',
+            default=None))
+        episode = self._search_regex(
+            (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+             r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+            'episode', default=None, group='value')
+        episode_number = int_or_none(self._search_regex(
+            r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number',
+            default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'season': season,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py
new file mode 100644 (file)
index 0000000..8f8686a
--- /dev/null
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    extract_attributes,
+    try_get,
+    urlencode_postdata,
+    ExtractorError,
+)
+
+
+class TVPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tvplayer\.com/watch/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://tvplayer.com/watch/bbcone',
+        'info_dict': {
+            'id': '89',
+            'ext': 'mp4',
+            'title': r're:^BBC One [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        current_channel = extract_attributes(self._search_regex(
+            r'(<div[^>]+class="[^"]*current-channel[^"]*"[^>]*>)',
+            webpage, 'channel element'))
+        title = current_channel['data-name']
+
+        resource_id = current_channel['data-id']
+
+        token = self._search_regex(
+            r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+            'token', group='token')
+
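+        # exchange the page token for a playback context whose 'validate'
+        # value appears to authorise the stream request below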
+        context = self._download_json(
+            'https://tvplayer.com/watch/context', display_id,
+            'Downloading JSON context', query={
+                'resource': resource_id,
+                'gen': token,
+            })
+
+        validate = context['validate']
+        platform = try_get(
+            context, lambda x: x['platform']['key'], compat_str) or 'firefox'
+
+        try:
+            response = self._download_json(
+                'http://api.tvplayer.com/api/v2/stream/live',
+                display_id, 'Downloading JSON stream', headers={
+                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                }, data=urlencode_postdata({
+                    'id': resource_id,
+                    'service': 1,
+                    'platform': platform,
+                    'validate': validate,
+                }))['tvplayer']['response']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                response = self._parse_json(
+                    e.cause.read().decode(), resource_id)['tvplayer']['response']
+                raise ExtractorError(
+                    '%s said: %s' % (self.IE_NAME, response['error']), expected=True)
+            raise
+
+        formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
+        self._sort_formats(formats)
+
+        return {
+            'id': resource_id,
+            'display_id': display_id,
+            'title': self._live_title(title),
+            'formats': formats,
+            'is_live': True,
+        }
diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py
new file mode 100644 (file)
index 0000000..2b10d9b
--- /dev/null
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_ext,
+    mimetype2ext,
+)
+
+
+class TweakersIE(InfoExtractor):
+    _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
+        'md5': 'fe73e417c093a788e0160c4025f88b15',
+        'info_dict': {
+            'id': '9926',
+            'ext': 'mp4',
+            'title': 'New Nintendo 3DS XL - Op alle fronten beter',
+            'description': 'md5:3789b21fed9c0219e9bcaacd43fab280',
+            'thumbnail': r're:^https?://.*\.jpe?g$',
+            'duration': 386,
+            'uploader_id': 's7JeEm',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id,
+            video_id)['items'][0]
+
+        title = video_data['title']
+
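+        # the playlist JSON groups progressive sources by quality under
+        # locations.progressive; each location carries the dimensions
+        # and one or more source URLs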
+        formats = []
+        for location in video_data.get('locations', {}).get('progressive', []):
+            format_id = location.get('label')
+            width = int_or_none(location.get('width'))
+            height = int_or_none(location.get('height'))
+            for source in location.get('sources', []):
+                source_url = source.get('src')
+                if not source_url:
+                    continue
+                ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                formats.append({
+                    'format_id': format_id,
+                    'url': source_url,
+                    'width': width,
+                    'height': height,
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('poster'),
+            'duration': int_or_none(video_data.get('duration')),
+            'uploader_id': video_data.get('account'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
new file mode 100644 (file)
index 0000000..74d1404
--- /dev/null
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    int_or_none,
+    xpath_attr,
+    xpath_element,
+)
+
+
+class TwentyFourVideoIE(InfoExtractor):
+    IE_NAME = '24video'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?P<host>
+                            (?:(?:www|porno?)\.)?24video\.
+                            (?:net|me|xxx|sexy?|tube|adult|site|vip)
+                        )/
+                        (?:
+                            video/(?:(?:view|xml)/)?|
+                            player/new24_play\.swf\?id=
+                        )
+                        (?P<id>\d+)
+                    '''
+
+    _TESTS = [{
+        'url': 'http://www.24video.net/video/view/1044982',
+        'md5': 'e09fc0901d9eaeedac872f154931deeb',
+        'info_dict': {
+            'id': '1044982',
+            'ext': 'mp4',
+            'title': 'Эротика каменного века',
+            'description': 'Как смотрели порно в каменном веке.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'SUPERTELO',
+            'duration': 31,
+            'timestamp': 1275937857,
+            'upload_date': '20100607',
+            'age_limit': 18,
+            'like_count': int,
+            'dislike_count': int,
+        },
+    }, {
+        'url': 'http://www.24video.net/player/new24_play.swf?id=1044982',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.24video.me/video/view/1044982',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.24video.tube/video/view/2363750',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.24video.site/video/view/2640421',
+        'only_matching': True,
+    }, {
+        'url': 'https://porno.24video.net/video/2640421-vsya-takaya-gibkaya-i-v-masle',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.24video.vip/video/view/1044982',
+        'only_matching': True,
+    }, {
+        'url': 'https://porn.24video.net/video/2640421-vsya-takay',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+
+        webpage = self._download_webpage(
+            'http://%s/video/view/%s' % (host, video_id), video_id)
+
+        title = self._og_search_title(webpage)
+        description = self._html_search_regex(
+            r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>',
+            webpage, 'description', fatal=False, group='description')
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = int_or_none(self._og_search_property(
+            'duration', webpage, 'duration', fatal=False))
+        timestamp = parse_iso8601(self._search_regex(
+            r'<time[^>]+\bdatetime="([^"]+)"[^>]+itemprop="uploadDate"',
+            webpage, 'upload date', fatal=False))
+
+        uploader = self._html_search_regex(
+            r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>',
+            webpage, 'uploader', fatal=False)
+
+        view_count = int_or_none(self._html_search_regex(
+            r'<span class="video-views">(\d+) просмотр',
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._html_search_regex(
+            r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари',
+            webpage, 'comment count', default=None))
+
+        # this request sets cookies that the play request below appears
+        # to rely on
+        self._download_xml(
+            'http://%s/video/xml/%s?mode=init' % (host, video_id),
+            video_id, 'Downloading init XML')
+
+        video_xml = self._download_xml(
+            'http://%s/video/xml/%s?mode=play' % (host, video_id),
+            video_id, 'Downloading video XML')
+
+        video = xpath_element(video_xml, './/video', 'video', fatal=True)
+
+        formats = [{
+            'url': xpath_attr(video, '', 'url', 'video URL', fatal=True),
+        }]
+
+        like_count = int_or_none(video.get('ratingPlus'))
+        dislike_count = int_or_none(video.get('ratingMinus'))
+        age_limit = 18 if video.get('adult') == 'true' else 0
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py
new file mode 100644 (file)
index 0000000..a42977f
--- /dev/null
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    try_get,
+)
+
+
+class TwentyMinutenIE(InfoExtractor):
+    IE_NAME = '20min'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?20min\.ch/
+                        (?:
+                            videotv/*\?.*?\bvid=|
+                            videoplayer/videoplayer\.html\?.*?\bvideoId@
+                        )
+                        (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2',
+        'md5': 'e7264320db31eed8c38364150c12496e',
+        'info_dict': {
+            'id': '469148',
+            'ext': 'mp4',
+            'title': '85 000 Franken für 15 perfekte Minuten',
+            'thumbnail': r're:https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.20min.ch/videoplayer/videoplayer.html?params=client@twentyDE|videoId@523629',
+        'info_dict': {
+            'id': '523629',
+            'ext': 'mp4',
+            'title': 'So kommen Sie bei Eis und Schnee sicher an',
+            'description': 'md5:117c212f64b25e3d95747e5276863f7d',
+            'thumbnail': r're:https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738',
+        'only_matching': True,
+    }]
+
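+    # Used by the generic extractor to discover embedded 20min players on
+    # third-party pages.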
+    @staticmethod
+    def _extract_urls(webpage):
+        return [m.group('url') for m in re.finditer(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer\.html\?.*?\bvideoId@\d+.*?)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_json(
+            'http://api.20min.ch/video/%s/show' % video_id,
+            video_id)['content']
+
+        title = video['title']
+
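+        # Progressive MP4s live on the podcast host: '' maps to the SD file and
+        # the 'h' suffix to the HD file (<id>.mp4 vs <id>h.mp4).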
+        formats = [{
+            'format_id': format_id,
+            'url': 'http://podcast.20min-tv.ch/podcast/20min/%s%s.mp4' % (video_id, p),
+            'quality': quality,
+        } for quality, (format_id, p) in enumerate([('sd', ''), ('hd', 'h')])]
+        self._sort_formats(formats)
+
+        description = video.get('lead')
+        thumbnail = video.get('thumbnail')
+
+        def extract_count(kind):
+            return try_get(
+                video,
+                lambda x: int_or_none(x['communityobject']['thumbs_%s' % kind]))
+
+        like_count = extract_count('up')
+        dislike_count = extract_count('down')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/twentythreevideo.py b/youtube_dl/extractor/twentythreevideo.py
new file mode 100644 (file)
index 0000000..aa0c6e9
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class TwentyThreeVideoIE(InfoExtractor):
+    IE_NAME = '23video'
+    _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
+    _TEST = {
+        'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',
+        'md5': '75fcf216303eb1dae9920d651f85ced4',
+        'info_dict': {
+            'id': '20448876',
+            'ext': 'mp4',
+            'title': 'Video Marketing Minute: Personalized Video',
+            'timestamp': 1513855354,
+            'upload_date': '20171221',
+            'uploader_id': '12258964',
+            'uploader': 'Rasmus Bysted',
+        }
+    }
+
+    def _real_extract(self, url):
+        domain, query, photo_id = re.match(self._VALID_URL, url).groups()
+        base_url = 'https://video.%s' % domain
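+        # The list API response is not bare JSON; trim it down to the outermost
+        # {...} object before parsing.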
+        photo_data = self._download_json(
+            base_url + '/api/photo/list?' + query, photo_id, query={
+                'format': 'json',
+            }, transform_source=lambda s: self._search_regex(r'(?s)({.+})', s, 'photo data'))['photo']
+        title = photo_data['title']
+
+        formats = []
+
+        audio_path = photo_data.get('audio_download')
+        if audio_path:
+            formats.append({
+                'format_id': 'audio',
+                'url': base_url + audio_path,
+                'filesize': int_or_none(photo_data.get('audio_size')),
+                'vcodec': 'none',
+            })
+
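+        # photo_data exposes flat '<prefix>download'/'<prefix>width'/... keys for
+        # each rendition; this helper turns one prefix into a format or thumbnail.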
+        def add_common_info_to_list(l, template, id_field, id_value):
+            f_base = template % id_value
+            f_path = photo_data.get(f_base + 'download')
+            if not f_path:
+                return
+            l.append({
+                id_field: id_value,
+                'url': base_url + f_path,
+                'width': int_or_none(photo_data.get(f_base + 'width')),
+                'height': int_or_none(photo_data.get(f_base + 'height')),
+                'filesize': int_or_none(photo_data.get(f_base + 'size')),
+            })
+
+        for f in ('mobile_high', 'medium', 'hd', '1080p', '4k'):
+            add_common_info_to_list(formats, 'video_%s_', 'format_id', f)
+
+        thumbnails = []
+        for t in ('quad16', 'quad50', 'quad75', 'quad100', 'small', 'portrait', 'standard', 'medium', 'large', 'original'):
+            add_common_info_to_list(thumbnails, '%s_', 'id', t)
+
+        return {
+            'id': photo_id,
+            'title': title,
+            'timestamp': int_or_none(photo_data.get('creation_date_epoch')),
+            'duration': int_or_none(photo_data.get('video_length')),
+            'view_count': int_or_none(photo_data.get('view_count')),
+            'comment_count': int_or_none(photo_data.get('number_of_comments')),
+            'uploader_id': photo_data.get('user_id'),
+            'uploader': photo_data.get('display_name'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/twitcasting.py b/youtube_dl/extractor/twitcasting.py
new file mode 100644 (file)
index 0000000..2dbe89f
--- /dev/null
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import urlencode_postdata
+
+import re
+
+
+class TwitCastingIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609',
+        'md5': '745243cad58c4681dc752490f7540d7f',
+        'info_dict': {
+            'id': '2357609',
+            'ext': 'mp4',
+            'title': 'Live #2357609',
+            'uploader_id': 'ivetesangalo',
+            'description': "Moi! I'm live on TwitCasting from my iPhone.",
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://twitcasting.tv/mttbernardini/movie/3689740',
+        'info_dict': {
+            'id': '3689740',
+            'ext': 'mp4',
+            'title': 'Live playing something #3689740',
+            'uploader_id': 'mttbernardini',
+            'description': "I'm live on TwitCasting from my iPad. password: abc (Santa Marinella/Lazio, Italia)",
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,
+            'videopassword': 'abc',
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        uploader_id = mobj.group('uploader_id')
+
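+        # If a password was supplied via --video-password, fetch the page via
+        # POST with the password as form data.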
+        video_password = self._downloader.params.get('videopassword')
+        request_data = None
+        if video_password:
+            request_data = urlencode_postdata({
+                'password': video_password,
+            })
+        webpage = self._download_webpage(url, video_id, data=request_data)
+
+        title = self._html_search_regex(
+            r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</',
+            webpage, 'title', default=None) or self._html_search_meta(
+            'twitter:title', webpage, fatal=True)
+
+        m3u8_url = self._search_regex(
+            (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+             r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'),
+            webpage, 'm3u8 url', group='url')
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:description', webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader_id': uploader_id,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
new file mode 100644 (file)
index 0000000..3f0f7e2
--- /dev/null
@@ -0,0 +1,798 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+import random
+import json
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_kwargs,
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlencode,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    orderedSet,
+    parse_duration,
+    parse_iso8601,
+    qualities,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
+    urljoin,
+)
+
+
+class TwitchBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:(?:www|go|m)\.)?twitch\.tv'
+
+    _API_BASE = 'https://api.twitch.tv'
+    _USHER_BASE = 'https://usher.ttvnw.net'
+    _LOGIN_FORM_URL = 'https://www.twitch.tv/login'
+    _LOGIN_POST_URL = 'https://passport.twitch.tv/login'
+    _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko'
+    _NETRC_MACHINE = 'twitch'
+
+    def _handle_error(self, response):
+        if not isinstance(response, dict):
+            return
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
+                expected=True)
+
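+    # Every API request carries the Kraken v5 Accept header and the web Client-ID;
+    # 400/410 responses still contain a JSON error body, so they are downloaded
+    # as expected statuses and surfaced via _handle_error().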
+    def _call_api(self, path, item_id, *args, **kwargs):
+        headers = kwargs.get('headers', {}).copy()
+        headers.update({
+            'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8',
+            'Client-ID': self._CLIENT_ID,
+        })
+        kwargs.update({
+            'headers': headers,
+            'expected_status': (400, 410),
+        })
+        response = self._download_json(
+            '%s/%s' % (self._API_BASE, path), item_id,
+            *args, **compat_kwargs(kwargs))
+        self._handle_error(response)
+        return response
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        def fail(message):
+            raise ExtractorError(
+                'Unable to login. Twitch said: %s' % message, expected=True)
+
+        def login_step(page, urlh, note, data):
+            form = self._hidden_inputs(page)
+            form.update(data)
+
+            page_url = urlh.geturl()
+            post_url = self._search_regex(
+                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
+                'post url', default=self._LOGIN_POST_URL, group='url')
+            post_url = urljoin(page_url, post_url)
+
+            headers = {
+                'Referer': page_url,
+                'Origin': page_url,
+                'Content-Type': 'text/plain;charset=UTF-8',
+            }
+
+            response = self._download_json(
+                post_url, None, note, data=json.dumps(form).encode(),
+                headers=headers, expected_status=400)
+            error = response.get('error_description') or response.get('error_code')
+            if error:
+                fail(error)
+
+            if 'Authenticated successfully' in response.get('message', ''):
+                return None, None
+
+            redirect_url = urljoin(
+                post_url,
+                response.get('redirect') or response['redirect_path'])
+            return self._download_webpage_handle(
+                redirect_url, None, 'Downloading login redirect page',
+                headers=headers)
+
+        login_page, handle = self._download_webpage_handle(
+            self._LOGIN_FORM_URL, None, 'Downloading login page')
+
+        # Some TOR nodes and public proxies are blocked completely
+        if 'blacklist_message' in login_page:
+            fail(clean_html(login_page))
+
+        redirect_page, handle = login_step(
+            login_page, handle, 'Logging in', {
+                'username': username,
+                'password': password,
+                'client_id': self._CLIENT_ID,
+            })
+
+        # Successful login
+        if not redirect_page:
+            return
+
+        if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None:
+            # TODO: Add mechanism to request an SMS or phone call
+            tfa_token = self._get_tfa_info('two-factor authentication token')
+            login_step(redirect_page, handle, 'Submitting TFA token', {
+                'authy_token': tfa_token,
+                'remember_2fa': 'true',
+            })
+
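+    # Twitch labels the original-quality rendition 'Source'; rank it highest,
+    # falling back to detecting '/chunked/' in the URL when the label is absent.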
+    def _prefer_source(self, formats):
+        try:
+            source = next(f for f in formats if f['format_id'] == 'Source')
+            source['quality'] = 10
+        except StopIteration:
+            for f in formats:
+                if '/chunked/' in f['url']:
+                    f.update({
+                        'quality': 10,
+                        'format_note': 'Source',
+                    })
+        self._sort_formats(formats)
+
+
+class TwitchItemBaseIE(TwitchBaseIE):
+    def _download_info(self, item, item_id):
+        return self._extract_info(self._call_api(
+            'kraken/videos/%s%s' % (item, item_id), item_id,
+            'Downloading %s info JSON' % self._ITEM_TYPE))
+
+    def _extract_media(self, item_id):
+        info = self._download_info(self._ITEM_SHORTCUT, item_id)
+        response = self._call_api(
+            'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id,
+            'Downloading %s playlist JSON' % self._ITEM_TYPE)
+        entries = []
+        chunks = response['chunks']
+        qualities = list(chunks.keys())
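+        # chunks maps quality name -> fragment list; zipping pairs up the Nth
+        # fragment of every quality, so each part becomes one playlist entry.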
+        for num, fragment in enumerate(zip(*chunks.values()), start=1):
+            formats = []
+            for fmt_num, fragment_fmt in enumerate(fragment):
+                format_id = qualities[fmt_num]
+                fmt = {
+                    'url': fragment_fmt['url'],
+                    'format_id': format_id,
+                    'quality': 1 if format_id == 'live' else 0,
+                }
+                m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
+                if m:
+                    fmt['height'] = int(m.group('height'))
+                formats.append(fmt)
+            self._sort_formats(formats)
+            entry = dict(info)
+            entry['id'] = '%s_%d' % (entry['id'], num)
+            entry['title'] = '%s part %d' % (entry['title'], num)
+            entry['formats'] = formats
+            entries.append(entry)
+        return self.playlist_result(entries, info['id'], info['title'])
+
+    def _extract_info(self, info):
+        status = info.get('status')
+        if status == 'recording':
+            is_live = True
+        elif status == 'recorded':
+            is_live = False
+        else:
+            is_live = None
+        _QUALITIES = ('small', 'medium', 'large')
+        quality_key = qualities(_QUALITIES)
+        thumbnails = []
+        preview = info.get('preview')
+        if isinstance(preview, dict):
+            for thumbnail_id, thumbnail_url in preview.items():
+                thumbnail_url = url_or_none(thumbnail_url)
+                if not thumbnail_url:
+                    continue
+                if thumbnail_id not in _QUALITIES:
+                    continue
+                thumbnails.append({
+                    'url': thumbnail_url,
+                    'preference': quality_key(thumbnail_id),
+                })
+        return {
+            'id': info['_id'],
+            'title': info.get('title') or 'Untitled Broadcast',
+            'description': info.get('description'),
+            'duration': int_or_none(info.get('length')),
+            'thumbnails': thumbnails,
+            'uploader': info.get('channel', {}).get('display_name'),
+            'uploader_id': info.get('channel', {}).get('name'),
+            'timestamp': parse_iso8601(info.get('recorded_at')),
+            'view_count': int_or_none(info.get('views')),
+            'is_live': is_live,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_media(self._match_id(url))
+
+
+class TwitchVideoIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:video'
+    _VALID_URL = r'%s/[^/]+/b/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'video'
+    _ITEM_SHORTCUT = 'a'
+
+    _TEST = {
+        'url': 'http://www.twitch.tv/riotgames/b/577357806',
+        'info_dict': {
+            'id': 'a577357806',
+            'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
+        },
+        'playlist_mincount': 12,
+        'skip': 'HTTP Error 404: Not Found',
+    }
+
+
+class TwitchChapterIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:chapter'
+    _VALID_URL = r'%s/[^/]+/c/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
+    _ITEM_TYPE = 'chapter'
+    _ITEM_SHORTCUT = 'c'
+
+    _TESTS = [{
+        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
+        'info_dict': {
+            'id': 'c5285812',
+            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
+        },
+        'playlist_mincount': 3,
+        'skip': 'HTTP Error 404: Not Found',
+    }, {
+        'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
+        'only_matching': True,
+    }]
+
+
+class TwitchVodIE(TwitchItemBaseIE):
+    IE_NAME = 'twitch:vod'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/|
+                            player\.twitch\.tv/\?.*?\bvideo=v?
+                        )
+                        (?P<id>\d+)
+                    '''
+    _ITEM_TYPE = 'vod'
+    _ITEM_SHORTCUT = 'v'
+
+    _TESTS = [{
+        'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
+        'info_dict': {
+            'id': 'v6528877',
+            'ext': 'mp4',
+            'title': 'LCK Summer Split - Week 6 Day 1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 17208,
+            'timestamp': 1435131709,
+            'upload_date': '20150624',
+            'uploader': 'Riot Games',
+            'uploader_id': 'riotgames',
+            'view_count': int,
+            'start_time': 310,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # Untitled broadcast (title is None)
+        'url': 'http://www.twitch.tv/belkao_o/v/11230755',
+        'info_dict': {
+            'id': 'v11230755',
+            'ext': 'mp4',
+            'title': 'Untitled Broadcast',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1638,
+            'timestamp': 1439746708,
+            'upload_date': '20150816',
+            'uploader': 'BelkAO_o',
+            'uploader_id': 'belkao_o',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'HTTP Error 404: Not Found',
+    }, {
+        'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.twitch.tv/videos/6528877',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.twitch.tv/beagsandjam/v/247478721',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.twitch.tv/northernlion/video/291940395',
+        'only_matching': True,
+    }, {
+        'url': 'https://player.twitch.tv/?video=480452374',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+
+        info = self._download_info(self._ITEM_SHORTCUT, item_id)
+        access_token = self._call_api(
+            'api/vods/%s/access_token' % item_id, item_id,
+            'Downloading %s access token' % self._ITEM_TYPE)
+
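+        # The usher playlist request must be signed with the token/sig pair from
+        # the access token response (passed as nauth/nauthsig).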
+        formats = self._extract_m3u8_formats(
+            '%s/vod/%s.m3u8?%s' % (
+                self._USHER_BASE, item_id,
+                compat_urllib_parse_urlencode({
+                    'allow_source': 'true',
+                    'allow_audio_only': 'true',
+                    'allow_spectre': 'true',
+                    'player': 'twitchweb',
+                    'playlist_include_framerate': 'true',
+                    'nauth': access_token['token'],
+                    'nauthsig': access_token['sig'],
+                })),
+            item_id, 'mp4', entry_protocol='m3u8_native')
+
+        self._prefer_source(formats)
+        info['formats'] = formats
+
+        parsed_url = compat_urllib_parse_urlparse(url)
+        query = compat_parse_qs(parsed_url.query)
+        if 't' in query:
+            info['start_time'] = parse_duration(query['t'][0])
+
+        if info.get('timestamp') is not None:
+            info['subtitles'] = {
+                'rechat': [{
+                    'url': update_url_query(
+                        'https://api.twitch.tv/v5/videos/%s/comments' % item_id, {
+                            'client_id': self._CLIENT_ID,
+                        }),
+                    'ext': 'json',
+                }],
+            }
+
+        return info
+
+
+class TwitchPlaylistBaseIE(TwitchBaseIE):
+    _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d'
+    _PAGE_LIMIT = 100
+
+    def _extract_playlist(self, channel_name):
+        info = self._call_api(
+            'kraken/users?login=%s' % channel_name,
+            channel_name, 'Downloading channel info JSON')
+        info = info['users'][0]
+        channel_id = info['_id']
+        channel_name = info.get('display_name') or info.get('name') or channel_name
+        entries = []
+        offset = 0
+        limit = self._PAGE_LIMIT
+        broken_paging_detected = False
+        counter_override = None
+        for counter in itertools.count(1):
+            response = self._call_api(
+                self._PLAYLIST_PATH % (channel_id, offset, limit),
+                channel_id,
+                'Downloading %s JSON page %s'
+                % (self._PLAYLIST_TYPE, counter_override or counter))
+            page_entries = self._extract_playlist_page(response)
+            if not page_entries:
+                break
+            total = int_or_none(response.get('_total'))
+            # Since the beginning of March 2016 Twitch's paging mechanism has been
+            # completely broken on the Twitch side: it simply ignores the limit and
+            # returns the whole offset number of videos. Work around this by just
+            # requesting all videos at once.
+            # Update: the pagination bug was fixed by Twitch on 2016-03-15.
+            if not broken_paging_detected and total and len(page_entries) > limit:
+                self.report_warning(
+                    'Twitch pagination is broken on twitch side, requesting all videos at once',
+                    channel_id)
+                broken_paging_detected = True
+                offset = total
+                counter_override = '(all at once)'
+                continue
+            entries.extend(page_entries)
+            if broken_paging_detected or (total and len(page_entries) >= total):
+                break
+            offset += limit
+        return self.playlist_result(
+            [self._make_url_result(entry) for entry in orderedSet(entries)],
+            channel_id, channel_name)
+
+    def _make_url_result(self, url):
+        try:
+            video_id = 'v%s' % TwitchVodIE._match_id(url)
+            return self.url_result(url, TwitchVodIE.ie_key(), video_id=video_id)
+        except AssertionError:
+            return self.url_result(url)
+
+    def _extract_playlist_page(self, response):
+        videos = response.get('videos')
+        return [video['url'] for video in videos] if videos else []
+
+    def _real_extract(self, url):
+        return self._extract_playlist(self._match_id(url))
+
+
+class TwitchProfileIE(TwitchPlaylistBaseIE):
+    IE_NAME = 'twitch:profile'
+    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
+    _PLAYLIST_TYPE = 'profile'
+
+    _TESTS = [{
+        'url': 'http://www.twitch.tv/vanillatv/profile',
+        'info_dict': {
+            'id': '22744919',
+            'title': 'VanillaTV',
+        },
+        'playlist_mincount': 412,
+    }, {
+        'url': 'http://m.twitch.tv/vanillatv/profile',
+        'only_matching': True,
+    }]
+
+
+class TwitchVideosBaseIE(TwitchPlaylistBaseIE):
+    _VALID_URL_VIDEOS_BASE = r'%s/(?P<id>[^/]+)/videos' % TwitchBaseIE._VALID_URL_BASE
+    _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcast_type='
+
+
+class TwitchAllVideosIE(TwitchVideosBaseIE):
+    IE_NAME = 'twitch:videos:all'
+    _VALID_URL = r'%s/all' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE
+    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight'
+    _PLAYLIST_TYPE = 'all videos'
+
+    _TESTS = [{
+        'url': 'https://www.twitch.tv/spamfish/videos/all',
+        'info_dict': {
+            'id': '497952',
+            'title': 'Spamfish',
+        },
+        'playlist_mincount': 869,
+    }, {
+        'url': 'https://m.twitch.tv/spamfish/videos/all',
+        'only_matching': True,
+    }]
+
+
+class TwitchUploadsIE(TwitchVideosBaseIE):
+    IE_NAME = 'twitch:videos:uploads'
+    _VALID_URL = r'%s/uploads' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE
+    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload'
+    _PLAYLIST_TYPE = 'uploads'
+
+    _TESTS = [{
+        'url': 'https://www.twitch.tv/spamfish/videos/uploads',
+        'info_dict': {
+            'id': '497952',
+            'title': 'Spamfish',
+        },
+        'playlist_mincount': 0,
+    }, {
+        'url': 'https://m.twitch.tv/spamfish/videos/uploads',
+        'only_matching': True,
+    }]
+
+
+class TwitchPastBroadcastsIE(TwitchVideosBaseIE):
+    IE_NAME = 'twitch:videos:past-broadcasts'
+    _VALID_URL = r'%s/past-broadcasts' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE
+    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive'
+    _PLAYLIST_TYPE = 'past broadcasts'
+
+    _TESTS = [{
+        'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts',
+        'info_dict': {
+            'id': '497952',
+            'title': 'Spamfish',
+        },
+        'playlist_mincount': 0,
+    }, {
+        'url': 'https://m.twitch.tv/spamfish/videos/past-broadcasts',
+        'only_matching': True,
+    }]
+
+
+class TwitchHighlightsIE(TwitchVideosBaseIE):
+    IE_NAME = 'twitch:videos:highlights'
+    _VALID_URL = r'%s/highlights' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE
+    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight'
+    _PLAYLIST_TYPE = 'highlights'
+
+    _TESTS = [{
+        'url': 'https://www.twitch.tv/spamfish/videos/highlights',
+        'info_dict': {
+            'id': '497952',
+            'title': 'Spamfish',
+        },
+        'playlist_mincount': 805,
+    }, {
+        'url': 'https://m.twitch.tv/spamfish/videos/highlights',
+        'only_matching': True,
+    }]
+
+
+class TwitchStreamIE(TwitchBaseIE):
+    IE_NAME = 'twitch:stream'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?:www|go|m)\.)?twitch\.tv/|
+                            player\.twitch\.tv/\?.*?\bchannel=
+                        )
+                        (?P<id>[^/#?]+)
+                    '''
+
+    _TESTS = [{
+        'url': 'http://www.twitch.tv/shroomztv',
+        'info_dict': {
+            'id': '12772022048',
+            'display_id': 'shroomztv',
+            'ext': 'mp4',
+            'title': 're:^ShroomzTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'H1Z1 - lonewolfing with ShroomzTV | A3 Battle Royale later - @ShroomzTV',
+            'is_live': True,
+            'timestamp': 1421928037,
+            'upload_date': '20150122',
+            'uploader': 'ShroomzTV',
+            'uploader_id': 'shroomztv',
+            'view_count': int,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.twitch.tv/miracle_doto#profile-0',
+        'only_matching': True,
+    }, {
+        'url': 'https://player.twitch.tv/?channel=lotsofs',
+        'only_matching': True,
+    }, {
+        'url': 'https://go.twitch.tv/food',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.twitch.tv/food',
+        'only_matching': True,
+    }]
+
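+    # twitch.tv/<name> would also match VOD, chapter, clip and playlist URLs,
+    # so defer to every more specific extractor before claiming the URL.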
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if any(ie.suitable(url) for ie in (
+                    TwitchVideoIE,
+                    TwitchChapterIE,
+                    TwitchVodIE,
+                    TwitchProfileIE,
+                    TwitchAllVideosIE,
+                    TwitchUploadsIE,
+                    TwitchPastBroadcastsIE,
+                    TwitchHighlightsIE,
+                    TwitchClipsIE))
+                else super(TwitchStreamIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        channel_name = self._match_id(url)
+
+        access_token = self._call_api(
+            'api/channels/%s/access_token' % channel_name, channel_name,
+            'Downloading access token JSON')
+
+        token = access_token['token']
+        channel_id = compat_str(self._parse_json(
+            token, channel_name)['channel_id'])
+
+        stream = self._call_api(
+            'kraken/streams/%s?stream_type=all' % channel_id,
+            channel_id, 'Downloading stream JSON').get('stream')
+
+        if not stream:
+            raise ExtractorError('%s is offline' % channel_id, expected=True)
+
+        # The channel name in the URL may be typed with different casing than the original
+        # one (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON), which would lead to an
+        # invalid m3u8 URL. Work around this by using the original channel name from the
+        # stream JSON, falling back to lowercase if it's not available.
+        channel_name = try_get(
+            stream, lambda x: x['channel']['name'],
+            compat_str) or channel_name.lower()
+
+        query = {
+            'allow_source': 'true',
+            'allow_audio_only': 'true',
+            'allow_spectre': 'true',
+            'p': random.randint(1000000, 10000000),
+            'player': 'twitchweb',
+            'playlist_include_framerate': 'true',
+            'segment_preference': '4',
+            'sig': access_token['sig'].encode('utf-8'),
+            'token': token.encode('utf-8'),
+        }
+        formats = self._extract_m3u8_formats(
+            '%s/api/channel/hls/%s.m3u8?%s'
+            % (self._USHER_BASE, channel_name, compat_urllib_parse_urlencode(query)),
+            channel_id, 'mp4')
+        self._prefer_source(formats)
+
+        view_count = stream.get('viewers')
+        timestamp = parse_iso8601(stream.get('created_at'))
+
+        channel = stream['channel']
+        title = self._live_title(channel.get('display_name') or channel.get('name'))
+        description = channel.get('status')
+
+        thumbnails = []
+        for thumbnail_key, thumbnail_url in stream['preview'].items():
+            m = re.search(r'(?P<width>\d+)x(?P<height>\d+)\.jpg$', thumbnail_key)
+            if not m:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+
+        return {
+            'id': str_or_none(stream.get('_id')) or channel_id,
+            'display_id': channel_name,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'uploader': channel.get('display_name'),
+            'uploader_id': channel.get('name'),
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'formats': formats,
+            'is_live': True,
+        }
+
+
+class TwitchClipsIE(TwitchBaseIE):
+    IE_NAME = 'twitch:clips'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|
+                            (?:(?:www|go|m)\.)?twitch\.tv/[^/]+/clip/
+                        )
+                        (?P<id>[^/?#&]+)
+                    '''
+
+    _TESTS = [{
+        'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat',
+        'md5': '761769e1eafce0ffebfb4089cb3847cd',
+        'info_dict': {
+            'id': '42850523',
+            'ext': 'mp4',
+            'title': 'EA Play 2016 Live from the Novo Theatre',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1465767393,
+            'upload_date': '20160612',
+            'creator': 'EA',
+            'uploader': 'stereotype_',
+            'uploader_id': '43566419',
+        },
+    }, {
+        # multiple formats
+        'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan',
+        'only_matching': True,
+    }, {
+        'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank',
+        'only_matching': True,
+    }, {
+        'url': 'https://go.twitch.tv/rossbroadcast/clip/ConfidentBraveHumanChefFrank',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
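+        # Clip metadata is fetched from Twitch's GraphQL endpoint using the same
+        # web Client-ID as the REST calls.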
+        clip = self._download_json(
+            'https://gql.twitch.tv/gql', video_id, data=json.dumps({
+                'query': '''{
+  clip(slug: "%s") {
+    broadcaster {
+      displayName
+    }
+    createdAt
+    curator {
+      displayName
+      id
+    }
+    durationSeconds
+    id
+    tiny: thumbnailURL(width: 86, height: 45)
+    small: thumbnailURL(width: 260, height: 147)
+    medium: thumbnailURL(width: 480, height: 272)
+    title
+    videoQualities {
+      frameRate
+      quality
+      sourceURL
+    }
+    viewCount
+  }
+}''' % video_id,
+            }).encode(), headers={
+                'Client-ID': self._CLIENT_ID,
+            })['data']['clip']
+
+        if not clip:
+            raise ExtractorError(
+                'This clip is no longer available', expected=True)
+
+        formats = []
+        for option in clip.get('videoQualities', []):
+            if not isinstance(option, dict):
+                continue
+            source = url_or_none(option.get('sourceURL'))
+            if not source:
+                continue
+            formats.append({
+                'url': source,
+                'format_id': option.get('quality'),
+                'height': int_or_none(option.get('quality')),
+                'fps': int_or_none(option.get('frameRate')),
+            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for thumbnail_id in ('tiny', 'small', 'medium'):
+            thumbnail_url = clip.get(thumbnail_id)
+            if not thumbnail_url:
+                continue
+            thumb = {
+                'id': thumbnail_id,
+                'url': thumbnail_url,
+            }
+            mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url)
+            if mobj:
+                thumb.update({
+                    'height': int(mobj.group(2)),
+                    'width': int(mobj.group(1)),
+                })
+            thumbnails.append(thumb)
+
+        return {
+            'id': clip.get('id') or video_id,
+            'title': clip.get('title') or video_id,
+            'formats': formats,
+            'duration': int_or_none(clip.get('durationSeconds')),
+            'view_count': int_or_none(clip.get('viewCount')),
+            'timestamp': unified_timestamp(clip.get('createdAt')),
+            'thumbnails': thumbnails,
+            'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str),
+            'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str),
+            'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str),
+        }
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
new file mode 100644 (file)
index 0000000..4284487
--- /dev/null
@@ -0,0 +1,610 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_parse_qs,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    dict_get,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    try_get,
+    strip_or_none,
+    unified_timestamp,
+    update_url_query,
+    xpath_text,
+)
+
+from .periscope import (
+    PeriscopeBaseIE,
+    PeriscopeIE,
+)
+
+
+class TwitterBaseIE(InfoExtractor):
+    _API_BASE = 'https://api.twitter.com/1.1/'
+    _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/'
+    _GUEST_TOKEN = None
+
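+    # A variant is either an HLS master playlist or a progressive MP4 with an
+    # optional bitrate; WxH dimensions are recovered from the URL when present.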
+    def _extract_variant_formats(self, variant, video_id):
+        variant_url = variant.get('url')
+        if not variant_url:
+            return []
+        elif '.m3u8' in variant_url:
+            return self._extract_m3u8_formats(
+                variant_url, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False)
+        else:
+            tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None
+            f = {
+                'url': variant_url,
+                'format_id': 'http' + ('-%d' % tbr if tbr else ''),
+                'tbr': tbr,
+            }
+            self._search_dimensions_in_video_url(f, variant_url)
+            return [f]
+
+    def _extract_formats_from_vmap_url(self, vmap_url, video_id):
+        vmap_data = self._download_xml(vmap_url, video_id)
+        formats = []
+        urls = []
+        for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
+            video_variant.attrib['url'] = compat_urllib_parse_unquote(
+                video_variant.attrib['url'])
+            urls.append(video_variant.attrib['url'])
+            formats.extend(self._extract_variant_formats(
+                video_variant.attrib, video_id))
+        video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
+        if video_url not in urls:
+            formats.extend(self._extract_variant_formats({'url': video_url}, video_id))
+        return formats
+
+    @staticmethod
+    def _search_dimensions_in_video_url(a_format, video_url):
+        m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
+        if m:
+            a_format.update({
+                'width': int(m.group('width')),
+                'height': int(m.group('height')),
+            })
+
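+    # The hardcoded bearer token below appears to be the public token used by
+    # Twitter's web client; a guest token is fetched once and cached on the class.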
+    def _call_api(self, path, video_id, query={}):
+        headers = {
+            'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
+        }
+        if not self._GUEST_TOKEN:
+            self._GUEST_TOKEN = self._download_json(
+                self._API_BASE + 'guest/activate.json', video_id,
+                'Downloading guest token', data=b'',
+                headers=headers)['guest_token']
+        headers['x-guest-token'] = self._GUEST_TOKEN
+        try:
+            return self._download_json(
+                self._API_BASE + path, video_id, headers=headers, query=query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                raise ExtractorError(self._parse_json(
+                    e.cause.read().decode(),
+                    video_id)['errors'][0]['message'], expected=True)
+            raise
+
+
+class TwitterCardIE(InfoExtractor):
+    IE_NAME = 'twitter:card'
+    _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'
+    _TESTS = [
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+            # MD5 checksums differ depending on where the video is served from,
+            # so no fixed 'md5' is asserted for this test
+            'info_dict': {
+                'id': '560070183650213889',
+                'ext': 'mp4',
+                'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.",
+                'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96',
+                'uploader': 'Twitter',
+                'uploader_id': 'Twitter',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'duration': 30.033,
+                'timestamp': 1422366112,
+                'upload_date': '20150127',
+            },
+        },
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
+            'md5': '7137eca597f72b9abbe61e5ae0161399',
+            'info_dict': {
+                'id': '623160978427936768',
+                'ext': 'mp4',
+                'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.",
+                'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA",
+                'uploader': 'NASA',
+                'uploader_id': 'NASA',
+                'timestamp': 1437408129,
+                'upload_date': '20150720',
+            },
+        },
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
+            'md5': 'b6d9683dd3f48e340ded81c0e917ad46',
+            'info_dict': {
+                'id': 'dq4Oj5quskI',
+                'ext': 'mp4',
+                'title': 'Ubuntu 11.10 Overview',
+                'description': 'md5:a831e97fa384863d6e26ce48d1c43376',
+                'upload_date': '20111013',
+                'uploader': 'OMG! UBUNTU!',
+                'uploader_id': 'omgubuntu',
+            },
+            'add_ie': ['Youtube'],
+        },
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
+            'md5': '6dabeaca9e68cbb71c99c322a4b42a11',
+            'info_dict': {
+                'id': 'iBb2x00UVlv',
+                'ext': 'mp4',
+                'upload_date': '20151113',
+                'uploader_id': '1189339351084113920',
+                'uploader': 'ArsenalTerje',
+                'title': 'Vine by ArsenalTerje',
+                'timestamp': 1447451307,
+            },
+            'add_ie': ['Vine'],
+        }, {
+            'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
+            'md5': '884812a2adc8aaf6fe52b15ccbfa3b88',
+            'info_dict': {
+                'id': '705235433198714880',
+                'ext': 'mp4',
+                'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.",
+                'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns",
+                'uploader': 'Brent Yarina',
+                'uploader_id': 'BTNBrentYarina',
+                'timestamp': 1456976204,
+                'upload_date': '20160303',
+            },
+            'skip': 'This content is no longer available.',
+        }, {
+            'url': 'https://twitter.com/i/videos/752274308186120192',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        status_id = self._match_id(url)
+        return self.url_result(
+            'https://twitter.com/statuses/' + status_id,
+            TwitterIE.ie_key(), status_id)
+
+
+class TwitterIE(TwitterBaseIE):
+    IE_NAME = 'twitter'
+    _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'https://twitter.com/freethenipple/status/643211948184596480',
+        'info_dict': {
+            'id': '643211948184596480',
+            'ext': 'mp4',
+            'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ',
+            'uploader': 'FREE THE NIPPLE',
+            'uploader_id': 'freethenipple',
+            'duration': 12.922,
+            'timestamp': 1442188653,
+            'upload_date': '20150913',
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
+        'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
+        'info_dict': {
+            'id': '657991469417025536',
+            'ext': 'mp4',
+            'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai',
+            'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"',
+            'thumbnail': r're:^https?://.*\.png',
+            'uploader': 'Gifs',
+            'uploader_id': 'giphz',
+        },
+        'expected_warnings': ['height', 'width'],
+        'skip': 'Account suspended',
+    }, {
+        'url': 'https://twitter.com/starwars/status/665052190608723968',
+        'info_dict': {
+            'id': '665052190608723968',
+            'ext': 'mp4',
+            'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
+            'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
+            'uploader_id': 'starwars',
+            'uploader': 'Star Wars',
+            'timestamp': 1447395772,
+            'upload_date': '20151113',
+        },
+    }, {
+        'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
+        'info_dict': {
+            'id': '705235433198714880',
+            'ext': 'mp4',
+            'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.",
+            'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns",
+            'uploader_id': 'BTNBrentYarina',
+            'uploader': 'Brent Yarina',
+            'timestamp': 1456976204,
+            'upload_date': '20160303',
+        },
+        'params': {
+            # The same video as https://twitter.com/i/videos/tweet/705235433198714880
+            # Test case of TwitterCardIE
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
+        'info_dict': {
+            'id': '700207533655363584',
+            'ext': 'mp4',
+            'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel',
+            'description': 'BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'uploader': 'simon vetugo',
+            'uploader_id': 'simonvertugo',
+            'duration': 30.0,
+            'timestamp': 1455777459,
+            'upload_date': '20160218',
+        },
+    }, {
+        'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',
+        'md5': '89a15ed345d13b86e9a5a5e051fa308a',
+        'info_dict': {
+            'id': 'MIOxnrUteUd',
+            'ext': 'mp4',
+            'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン',
+            'uploader': 'TAKUMA',
+            'uploader_id': '1004126642786242560',
+            'timestamp': 1402826626,
+            'upload_date': '20140615',
+        },
+        'add_ie': ['Vine'],
+    }, {
+        'url': 'https://twitter.com/captainamerica/status/719944021058060289',
+        'info_dict': {
+            'id': '719944021058060289',
+            'ext': 'mp4',
+            'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.',
+            'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI',
+            'uploader_id': 'CaptainAmerica',
+            'uploader': 'Captain America',
+            'duration': 3.17,
+            'timestamp': 1460483005,
+            'upload_date': '20160412',
+        },
+    }, {
+        'url': 'https://twitter.com/OPP_HSD/status/779210622571536384',
+        'info_dict': {
+            'id': '1zqKVVlkqLaKB',
+            'ext': 'mp4',
+            'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence',
+            'upload_date': '20160923',
+            'uploader_id': '1PmKqpJdOJQoY',
+            'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police',
+            'timestamp': 1474613214,
+        },
+        'add_ie': ['Periscope'],
+    }, {
+        # has mp4 formats via mobile API
+        'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
+        'info_dict': {
+            'id': '852138619213144067',
+            'ext': 'mp4',
+            'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
+            'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة   https://t.co/xg6OhpyKfN',
+            'uploader': 'عالم الأخبار',
+            'uploader_id': 'news_al3alm',
+            'duration': 277.4,
+            'timestamp': 1492000653,
+            'upload_date': '20170412',
+        },
+    }, {
+        'url': 'https://twitter.com/i/web/status/910031516746514432',
+        'info_dict': {
+            'id': '910031516746514432',
+            'ext': 'mp4',
+            'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo',
+            'uploader': 'Préfet de Guadeloupe',
+            'uploader_id': 'Prefet971',
+            'duration': 47.48,
+            'timestamp': 1505803395,
+            'upload_date': '20170919',
+        },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
+    }, {
+        # card via api.twitter.com/1.1/videos/tweet/config
+        'url': 'https://twitter.com/LisPower1/status/1001551623938805763',
+        'info_dict': {
+            'id': '1001551623938805763',
+            'ext': 'mp4',
+            'title': 're:.*?Shep is on a roll today.*?',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09',
+            'uploader': 'Lis Power',
+            'uploader_id': 'LisPower1',
+            'duration': 111.278,
+            'timestamp': 1527623489,
+            'upload_date': '20180529',
+        },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        },
+    }, {
+        'url': 'https://twitter.com/foobar/status/1087791357756956680',
+        'info_dict': {
+            'id': '1087791357756956680',
+            'ext': 'mp4',
+            'title': 'Twitter - A new is coming.  Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'description': 'md5:6dfd341a3310fb97d80d2bf7145df976',
+            'uploader': 'Twitter',
+            'uploader_id': 'Twitter',
+            'duration': 61.567,
+            'timestamp': 1548184644,
+            'upload_date': '20190122',
+        },
+    }, {
+        # not available in Periscope
+        'url': 'https://twitter.com/ViviEducation/status/1136534865145286656',
+        'info_dict': {
+            'id': '1vOGwqejwoWxB',
+            'ext': 'mp4',
+            'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019',
+            'uploader': 'Vivi',
+            'uploader_id': '1eVjYOLGkGrQL',
+        },
+        'add_ie': ['TwitterBroadcast'],
+    }, {
+        # Twitch Clip Embed
+        'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
+        'only_matching': True,
+    }, {
+        # promo_video_website card
+        'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        twid = self._match_id(url)
+        status = self._call_api(
+            'statuses/show/%s.json' % twid, twid, {
+                'cards_platform': 'Web-12',
+                'include_cards': 1,
+                'include_reply_count': 1,
+                'include_user_entities': 0,
+                'tweet_mode': 'extended',
+            })
+
+        title = description = status['full_text'].replace('\n', ' ')
+        # strip trailing 'https -_t.co_BJYgOjSeGA'-style URL junk from the title
+        # so it doesn't end up in filenames
+        title = re.sub(r'\s+(https?://[^ ]+)', '', title)
+        user = status.get('user') or {}
+        uploader = user.get('name')
+        if uploader:
+            title = '%s - %s' % (uploader, title)
+        uploader_id = user.get('screen_name')
+
+        tags = []
+        for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []):
+            hashtag_text = hashtag.get('text')
+            if not hashtag_text:
+                continue
+            tags.append(hashtag_text)
+
+        info = {
+            'id': twid,
+            'title': title,
+            'description': description,
+            'uploader': uploader,
+            'timestamp': unified_timestamp(status.get('created_at')),
+            'uploader_id': uploader_id,
+            'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None,
+            'like_count': int_or_none(status.get('favorite_count')),
+            'repost_count': int_or_none(status.get('retweet_count')),
+            'comment_count': int_or_none(status.get('reply_count')),
+            'age_limit': 18 if status.get('possibly_sensitive') else 0,
+            'tags': tags,
+        }
+
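+        # Native video/GIF attachments live in extended_entities; everything else
+        # (Amplify, player, Periscope and broadcast embeds) is described by a card.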
+        media = try_get(status, lambda x: x['extended_entities']['media'][0])
+        if media and media.get('type') != 'photo':
+            video_info = media.get('video_info') or {}
+
+            formats = []
+            for variant in video_info.get('variants', []):
+                formats.extend(self._extract_variant_formats(variant, twid))
+            self._sort_formats(formats)
+
+            thumbnails = []
+            media_url = media.get('media_url_https') or media.get('media_url')
+            if media_url:
+                def add_thumbnail(name, size):
+                    thumbnails.append({
+                        'id': name,
+                        'url': update_url_query(media_url, {'name': name}),
+                        'width': int_or_none(size.get('w') or size.get('width')),
+                        'height': int_or_none(size.get('h') or size.get('height')),
+                    })
+                for name, size in media.get('sizes', {}).items():
+                    add_thumbnail(name, size)
+                add_thumbnail('orig', media.get('original_info') or {})
+
+            info.update({
+                'formats': formats,
+                'thumbnails': thumbnails,
+                'duration': float_or_none(video_info.get('duration_millis'), 1000),
+            })
+        else:
+            card = status.get('card')
+            if card:
+                binding_values = card['binding_values']
+
+                def get_binding_value(k):
+                    o = binding_values.get(k) or {}
+                    return try_get(o, lambda x: x[x['type'].lower() + '_value'])
+
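+                # binding values are keyed by their declared type, e.g.
+                # {'type': 'STRING', 'string_value': '...'} (sample shape
+                # inferred from the lookup above; concrete values vary)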
+                card_name = card['name'].split(':')[-1]
+                if card_name in ('amplify', 'promo_video_website'):
+                    is_amplify = card_name == 'amplify'
+                    vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
+                    content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
+                    formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
+                    self._sort_formats(formats)
+
+                    thumbnails = []
+                    for suffix in ('_small', '', '_large', '_x_large', '_original'):
+                        image = get_binding_value('player_image' + suffix) or {}
+                        image_url = image.get('url')
+                        if not image_url or '/player-placeholder' in image_url:
+                            continue
+                        thumbnails.append({
+                            'id': suffix[1:] if suffix else 'medium',
+                            'url': image_url,
+                            'width': int_or_none(image.get('width')),
+                            'height': int_or_none(image.get('height')),
+                        })
+
+                    info.update({
+                        'formats': formats,
+                        'thumbnails': thumbnails,
+                        'duration': int_or_none(get_binding_value(
+                            'content_duration_seconds')),
+                    })
+                elif card_name == 'player':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('player_url'),
+                    })
+                elif card_name == 'periscope_broadcast':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('url') or get_binding_value('player_url'),
+                        'ie_key': PeriscopeIE.ie_key(),
+                    })
+                elif card_name == 'broadcast':
+                    info.update({
+                        '_type': 'url',
+                        'url': get_binding_value('broadcast_url'),
+                        'ie_key': TwitterBroadcastIE.ie_key(),
+                    })
+                else:
+                    raise ExtractorError('Unsupported Twitter Card.')
+            else:
+                expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url'])
+                if not expanded_url:
+                    raise ExtractorError("There's no video in this tweet.")
+                info.update({
+                    '_type': 'url',
+                    'url': expanded_url,
+                })
+        return info
+
+
+class TwitterAmplifyIE(TwitterBaseIE):
+    IE_NAME = 'twitter:amplify'
+    _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})'
+
+    _TEST = {
+        'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+        'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
+        'info_dict': {
+            'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
+            'ext': 'mp4',
+            'title': 'Twitter Video',
+            'thumbnail': 're:^https?://.*',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        vmap_url = self._html_search_meta(
+            'twitter:amplify:vmap', webpage, 'vmap url')
+        formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
+
+        thumbnails = []
+        thumbnail = self._html_search_meta(
+            'twitter:image:src', webpage, 'thumbnail', fatal=False)
+
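+        # reads the twitter:<target>:width/height meta pair, e.g.
+        # target 'image' -> twitter:image:width and twitter:image:height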
+        def _find_dimension(target):
+            w = int_or_none(self._html_search_meta(
+                'twitter:%s:width' % target, webpage, fatal=False))
+            h = int_or_none(self._html_search_meta(
+                'twitter:%s:height' % target, webpage, fatal=False))
+            return w, h
+
+        if thumbnail:
+            thumbnail_w, thumbnail_h = _find_dimension('image')
+            thumbnails.append({
+                'url': thumbnail,
+                'width': thumbnail_w,
+                'height': thumbnail_h,
+            })
+
+        video_w, video_h = _find_dimension('player')
+        formats[0].update({
+            'width': video_w,
+            'height': video_h,
+        })
+
+        return {
+            'id': video_id,
+            'title': 'Twitter Video',
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
+
+
+class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
+    IE_NAME = 'twitter:broadcast'
+    _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})'
+
+    _TEST = {
+        # untitled Periscope video
+        'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj',
+        'info_dict': {
+            'id': '1yNGaQLWpejGj',
+            'ext': 'mp4',
+            'title': 'Andrea May Sahouri - Periscope Broadcast',
+            'uploader': 'Andrea May Sahouri',
+            'uploader_id': '1PXEdBZWpGwKe',
+        },
+    }
+
+    def _real_extract(self, url):
+        broadcast_id = self._match_id(url)
+        broadcast = self._call_api(
+            'broadcasts/show.json', broadcast_id,
+            {'ids': broadcast_id})['broadcasts'][broadcast_id]
+        info = self._parse_broadcast_data(broadcast, broadcast_id)
+        media_key = broadcast['media_key']
+        source = self._call_api(
+            'live_video_stream/status/' + media_key, media_key)['source']
+        m3u8_url = source.get('noRedirectPlaybackUrl') or source['location']
+        if '/live_video_stream/geoblocked/' in m3u8_url:
+            self.raise_geo_restricted()
+        m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse(
+            m3u8_url).query).get('type', [None])[0]
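+        # the playlist URL's 'type' query param (e.g. '?type=replay',
+        # illustrative) doubles as the m3u8 format id below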
+        state, width, height = self._extract_common_format_info(broadcast)
+        info['formats'] = self._extract_pscp_m3u8_formats(
+            m3u8_url, broadcast_id, m3u8_id, state, width, height)
+        return info
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
new file mode 100644 (file)
index 0000000..60e364d
--- /dev/null
@@ -0,0 +1,481 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_kwargs,
+    compat_str,
+    compat_urllib_request,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    js_to_json,
+    sanitized_Request,
+    try_get,
+    unescapeHTML,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class UdemyIE(InfoExtractor):
+    IE_NAME = 'udemy'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?udemy\.com/
+                        (?:
+                            [^#]+\#/lecture/|
+                            lecture/view/?\?lectureId=|
+                            [^/]+/learn/v4/t/lecture/
+                        )
+                        (?P<id>\d+)
+                    '''
+    _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
+    _ORIGIN_URL = 'https://www.udemy.com'
+    _NETRC_MACHINE = 'udemy'
+
+    _TESTS = [{
+        'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757',
+        'md5': '98eda5b657e752cf945d8445e261b5c5',
+        'info_dict': {
+            'id': '160614',
+            'ext': 'mp4',
+            'title': 'Introduction and Installation',
+            'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876',
+            'duration': 579.29,
+        },
+        'skip': 'Requires udemy account credentials',
+    }, {
+        # new URL schema
+        'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906',
+        'only_matching': True,
+    }, {
+        # no url in outputs format entry
+        'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812',
+        'only_matching': True,
+    }, {
+        # only outputs rendition
+        'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0',
+        'only_matching': True,
+    }, {
+        'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757',
+        'only_matching': True,
+    }]
+
+    def _extract_course_info(self, webpage, video_id):
+        course = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'ng-init=["\'].*\bcourse=({.+?})[;"\']',
+                webpage, 'course', default='{}')),
+            video_id, fatal=False) or {}
+        course_id = course.get('id') or self._search_regex(
+            [
+                r'data-course-id=["\'](\d+)',
+                r'&quot;courseId&quot;\s*:\s*(\d+)'
+            ], webpage, 'course id')
+        return course_id, course.get('title')
+
+    def _enroll_course(self, base_url, webpage, course_id):
+        def combine_url(base_url, url):
+            return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url
+
+        checkout_url = unescapeHTML(self._search_regex(
+            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1',
+            webpage, 'checkout url', group='url', default=None))
+        if checkout_url:
+            raise ExtractorError(
+                'Course %s is not free. You have to pay for it before you can download. '
+                'Use this URL to confirm purchase: %s'
+                % (course_id, combine_url(base_url, checkout_url)),
+                expected=True)
+
+        enroll_url = unescapeHTML(self._search_regex(
+            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1',
+            webpage, 'enroll url', group='url', default=None))
+        if enroll_url:
+            webpage = self._download_webpage(
+                combine_url(base_url, enroll_url),
+                course_id, 'Enrolling in the course',
+                headers={'Referer': base_url})
+            if '>You have enrolled in' in webpage:
+                self.to_screen('%s: Successfully enrolled in the course' % course_id)
+
+    def _download_lecture(self, course_id, lecture_id):
+        return self._download_json(
+            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+            % (course_id, lecture_id),
+            lecture_id, 'Downloading lecture JSON', query={
+                'fields[lecture]': 'title,description,view_html,asset',
+                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data',
+            })
+
+    def _handle_error(self, response):
+        if not isinstance(response, dict):
+            return
+        error = response.get('error')
+        if error:
+            error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message'))
+            error_data = error.get('data')
+            if error_data:
+                error_str += ' - %s' % error_data.get('formErrors')
+            raise ExtractorError(error_str, expected=True)
+
+    def _download_webpage_handle(self, *args, **kwargs):
+        headers = kwargs.get('headers', {}).copy()
+        headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
+        kwargs['headers'] = headers
+        ret = super(UdemyIE, self)._download_webpage_handle(
+            *args, **compat_kwargs(kwargs))
+        if not ret:
+            return ret
+        webpage, _ = ret
+        if any(p in webpage for p in (
+                '>Please verify you are a human',
+                'Access to this page has been denied because we believe you are using automation tools to browse the website',
+                '"_pxCaptcha"')):
+            raise ExtractorError(
+                'Udemy asks you to solve a CAPTCHA. Login with browser, '
+                'solve CAPTCHA, then export cookies and pass cookie file to '
+                'youtube-dlc with --cookies.', expected=True)
+        return ret
+
+    def _download_json(self, url_or_request, *args, **kwargs):
+        headers = {
+            'X-Udemy-Snail-Case': 'true',
+            'X-Requested-With': 'XMLHttpRequest',
+        }
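+        # replay Udemy's auth cookies as request headers, which is
+        # presumably what the api-2.0 endpoints expect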
+        for cookie in self._downloader.cookiejar:
+            if cookie.name == 'client_id':
+                headers['X-Udemy-Client-Id'] = cookie.value
+            elif cookie.name == 'access_token':
+                headers['X-Udemy-Bearer-Token'] = cookie.value
+                headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value
+
+        if isinstance(url_or_request, compat_urllib_request.Request):
+            for header, value in headers.items():
+                url_or_request.add_header(header, value)
+        else:
+            url_or_request = sanitized_Request(url_or_request, headers=headers)
+
+        response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs)
+        self._handle_error(response)
+        return response
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_popup = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login popup')
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'href=["\'](?:https://www\.udemy\.com)?/user/logout/',
+                r'>Logout<'))
+
+        # already logged in
+        if is_logged(login_popup):
+            return
+
+        login_form = self._form_hidden_inputs('login-form', login_popup)
+
+        login_form.update({
+            'email': username,
+            'password': password,
+        })
+
+        response = self._download_webpage(
+            self._LOGIN_URL, None, 'Logging in',
+            data=urlencode_postdata(login_form),
+            headers={
+                'Referer': self._ORIGIN_URL,
+                'Origin': self._ORIGIN_URL,
+            })
+
+        if not is_logged(response):
+            error = self._html_search_regex(
+                r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>',
+                response, 'error message', default=None)
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _real_extract(self, url):
+        lecture_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, lecture_id)
+
+        course_id, _ = self._extract_course_info(webpage, lecture_id)
+
+        try:
+            lecture = self._download_lecture(course_id, lecture_id)
+        except ExtractorError as e:
+            # Error could possibly mean we are not enrolled in the course
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                self._enroll_course(url, webpage, course_id)
+                lecture = self._download_lecture(course_id, lecture_id)
+            else:
+                raise
+
+        title = lecture['title']
+        description = lecture.get('description')
+
+        asset = lecture['asset']
+
+        asset_type = asset.get('asset_type') or asset.get('assetType')
+        if asset_type != 'Video':
+            raise ExtractorError(
+                'Lecture %s is not a video' % lecture_id, expected=True)
+
+        stream_url = asset.get('stream_url') or asset.get('streamUrl')
+        if stream_url:
+            youtube_url = self._search_regex(
+                r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None)
+            if youtube_url:
+                return self.url_result(youtube_url, 'Youtube')
+
+        video_id = compat_str(asset['id'])
+        thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl')
+        duration = float_or_none(asset.get('data', {}).get('duration'))
+
+        subtitles = {}
+        automatic_captions = {}
+
+        formats = []
+
+        def extract_output_format(src, f_id):
+            return {
+                'url': src.get('url'),
+                'format_id': '%sp' % (src.get('height') or f_id),
+                'width': int_or_none(src.get('width')),
+                'height': int_or_none(src.get('height')),
+                'vbr': int_or_none(src.get('video_bitrate_in_kbps')),
+                'vcodec': src.get('video_codec'),
+                'fps': int_or_none(src.get('frame_rate')),
+                'abr': int_or_none(src.get('audio_bitrate_in_kbps')),
+                'acodec': src.get('audio_codec'),
+                'asr': int_or_none(src.get('audio_sample_rate')),
+                'tbr': int_or_none(src.get('total_bitrate_in_kbps')),
+                'filesize': int_or_none(src.get('file_size_in_bytes')),
+            }
+
+        outputs = asset.get('data', {}).get('outputs')
+        if not isinstance(outputs, dict):
+            outputs = {}
+
+        def add_output_format_meta(f, key):
+            output = outputs.get(key)
+            if isinstance(output, dict):
+                output_format = extract_output_format(output, key)
+                output_format.update(f)
+                return output_format
+            return f
+
+        def extract_formats(source_list):
+            if not isinstance(source_list, list):
+                return
+            for source in source_list:
+                video_url = url_or_none(source.get('file') or source.get('src'))
+                if not video_url:
+                    continue
+                if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False))
+                    continue
+                format_id = source.get('label')
+                f = {
+                    'url': video_url,
+                    'format_id': '%sp' % format_id,
+                    'height': int_or_none(format_id),
+                }
+                if format_id:
+                    # Some videos contain additional metadata (e.g.
+                    # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
+                    f = add_output_format_meta(f, format_id)
+                formats.append(f)
+
+        def extract_subtitles(track_list):
+            if not isinstance(track_list, list):
+                return
+            for track in track_list:
+                if not isinstance(track, dict):
+                    continue
+                if track.get('kind') != 'captions':
+                    continue
+                src = url_or_none(track.get('src'))
+                if not src:
+                    continue
+                lang = track.get('language') or track.get(
+                    'srclang') or track.get('label')
+                sub_dict = automatic_captions if track.get(
+                    'autogenerated') is True else subtitles
+                sub_dict.setdefault(lang, []).append({
+                    'url': src,
+                })
+
+        for url_kind in ('download', 'stream'):
+            urls = asset.get('%s_urls' % url_kind)
+            if isinstance(urls, dict):
+                extract_formats(urls.get('Video'))
+
+        captions = asset.get('captions')
+        if isinstance(captions, list):
+            for cc in captions:
+                if not isinstance(cc, dict):
+                    continue
+                cc_url = url_or_none(cc.get('url'))
+                if not cc_url:
+                    continue
+                lang = try_get(cc, lambda x: x['locale']['locale'], compat_str)
+                sub_dict = (automatic_captions if cc.get('source') == 'auto'
+                            else subtitles)
+                sub_dict.setdefault(lang or 'en', []).append({
+                    'url': cc_url,
+                })
+
+        view_html = lecture.get('view_html')
+        if view_html:
+            view_html_urls = set()
+            for source in re.findall(r'<source[^>]+>', view_html):
+                attributes = extract_attributes(source)
+                src = attributes.get('src')
+                if not src:
+                    continue
+                res = attributes.get('data-res')
+                height = int_or_none(res)
+                if src in view_html_urls:
+                    continue
+                view_html_urls.add(src)
+                if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8':
+                    m3u8_formats = self._extract_m3u8_formats(
+                        src, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id='hls', fatal=False)
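+                    # HLS URLs embed resolution and bitrate, e.g.
+                    # /hls_720_1500/ (illustrative values); recover
+                    # height/tbr from them when the manifest lacks both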
+                    for f in m3u8_formats:
+                        m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url'])
+                        if m:
+                            if not f.get('height'):
+                                f['height'] = int(m.group('height'))
+                            if not f.get('tbr'):
+                                f['tbr'] = int(m.group('tbr'))
+                    formats.extend(m3u8_formats)
+                else:
+                    formats.append(add_output_format_meta({
+                        'url': src,
+                        'format_id': '%dp' % height if height else None,
+                        'height': height,
+                    }, res))
+
+            # react rendition since 2017.04.15 (see
+            # https://github.com/ytdl-org/youtube-dl/issues/12744)
+            data = self._parse_json(
+                self._search_regex(
+                    r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html,
+                    'setup data', default='{}', group='data'), video_id,
+                transform_source=unescapeHTML, fatal=False)
+            if data and isinstance(data, dict):
+                extract_formats(data.get('sources'))
+                if not duration:
+                    duration = int_or_none(data.get('duration'))
+                extract_subtitles(data.get('tracks'))
+
+            if not subtitles and not automatic_captions:
+                text_tracks = self._parse_json(
+                    self._search_regex(
+                        r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html,
+                        'text tracks', default='{}', group='data'), video_id,
+                    transform_source=lambda s: js_to_json(unescapeHTML(s)),
+                    fatal=False)
+                extract_subtitles(text_tracks)
+
+        if not formats and outputs:
+            for format_id, output in outputs.items():
+                f = extract_output_format(output, format_id)
+                if f.get('url'):
+                    formats.append(f)
+
+        self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+            'automatic_captions': automatic_captions,
+        }
+
+
+class UdemyCourseIE(UdemyIE):
+    IE_NAME = 'udemy:course'
+    _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.udemy.com/java-tutorial/',
+        'only_matching': True,
+    }, {
+        'url': 'https://wipro.udemy.com/java-tutorial/',
+        'only_matching': True,
+    }]
+
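+    # defer single-lecture URLs to UdemyIE; only treat the rest as courses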
+    @classmethod
+    def suitable(cls, url):
+        return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_path = self._match_id(url)
+
+        webpage = self._download_webpage(url, course_path)
+
+        course_id, title = self._extract_course_info(webpage, course_path)
+
+        self._enroll_course(url, webpage, course_id)
+
+        response = self._download_json(
+            'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id,
+            course_id, 'Downloading course curriculum', query={
+                'fields[chapter]': 'title,object_index',
+                'fields[lecture]': 'title,asset',
+                'page_size': '1000',
+            })
+
+        entries = []
+        chapter, chapter_number = [None] * 2
+        for entry in response['results']:
+            clazz = entry.get('_class')
+            if clazz == 'lecture':
+                asset = entry.get('asset')
+                if isinstance(asset, dict):
+                    asset_type = asset.get('asset_type') or asset.get('assetType')
+                    if asset_type != 'Video':
+                        continue
+                lecture_id = entry.get('id')
+                if lecture_id:
+                    entry = {
+                        '_type': 'url_transparent',
+                        'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']),
+                        'title': entry.get('title'),
+                        'ie_key': UdemyIE.ie_key(),
+                    }
+                    if chapter_number:
+                        entry['chapter_number'] = chapter_number
+                    if chapter:
+                        entry['chapter'] = chapter
+                    entries.append(entry)
+            elif clazz == 'chapter':
+                chapter_number = entry.get('object_index')
+                chapter = entry.get('title')
+
+        return self.playlist_result(entries, course_id, title)
diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py
new file mode 100644 (file)
index 0000000..2c8e5c7
--- /dev/null
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+)
+from ..compat import compat_urlparse
+
+
+class UDNEmbedIE(InfoExtractor):
+    IE_DESC = '聯合影音'
+    _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
+    _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
+    _TESTS = [{
+        'url': 'http://video.udn.com/embed/news/300040',
+        'info_dict': {
+            'id': '300040',
+            'ext': 'mp4',
+            'title': '生物老師男變女 全校挺"做自己"',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to parse JSON Expecting value'],
+    }, {
+        'url': 'https://video.udn.com/embed/news/300040',
+        'only_matching': True,
+    }, {
+        # From https://video.udn.com/news/303776
+        'url': 'https://video.udn.com/play/news/303776',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        page = self._download_webpage(url, video_id)
+
+        options_str = self._html_search_regex(
+            r'var\s+options\s*=\s*([^;]+);', page, 'options')
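+        # js_to_json turns the JS object literal (unquoted keys, single
+        # quotes) into strict JSON that _parse_json can load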
+        trans_options_str = js_to_json(options_str)
+        options = self._parse_json(trans_options_str, 'options', fatal=False) or {}
+        if options:
+            video_urls = options['video']
+            title = options['title']
+            poster = options.get('poster')
+        else:
+            video_urls = self._parse_json(self._html_search_regex(
+                r'"video"\s*:\s*({.+?})\s*,', trans_options_str, 'video urls'), 'video urls')
+            title = self._html_search_regex(
+                r"title\s*:\s*'(.+?)'\s*,", options_str, 'title')
+            poster = self._html_search_regex(
+                r"poster\s*:\s*'(.+?)'\s*,", options_str, 'poster', default=None)
+
+        if video_urls.get('youtube'):
+            return self.url_result(video_urls.get('youtube'), 'Youtube')
+
+        formats = []
+        for video_type, api_url in video_urls.items():
+            if not api_url:
+                continue
+
+            video_url = self._download_webpage(
+                compat_urlparse.urljoin(url, api_url), video_id,
+                note='retrieve url for %s video' % video_type)
+
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, ext='mp4', m3u8_id='hls'))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    video_url, video_id, f4m_id='hds'))
+            else:
+                mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+)\.mp4', video_url)
+                a_format = {
+                    'url': video_url,
+                    # video_type may be 'mp4', which confuses YoutubeDL
+                    'format_id': 'http-' + video_type,
+                }
+                if mobj:
+                    a_format.update({
+                        'height': int_or_none(mobj.group('height')),
+                        'tbr': int_or_none(mobj.group('tbr')),
+                    })
+                formats.append(a_format)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'thumbnail': poster,
+        }
diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py
new file mode 100644 (file)
index 0000000..3d74ba0
--- /dev/null
@@ -0,0 +1,16 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .imggaming import ImgGamingBaseIE
+
+
+class UFCTVIE(ImgGamingBaseIE):
+    _VALID_URL = ImgGamingBaseIE._VALID_URL_TEMPL % r'(?:(?:app|www)\.)?(?:ufc\.tv|(?:ufc)?fightpass\.com)|ufcfightpass\.img(?:dge|gaming)\.com'
+    _NETRC_MACHINE = 'ufctv'
+    _REALM = 'ufc'
+
+
+class UFCArabiaIE(ImgGamingBaseIE):
+    _VALID_URL = ImgGamingBaseIE._VALID_URL_TEMPL % r'(?:(?:app|www)\.)?ufcarabia\.(?:ae|com)'
+    _NETRC_MACHINE = 'ufcarabia'
+    _REALM = 'admufc'
diff --git a/youtube_dl/extractor/uktvplay.py b/youtube_dl/extractor/uktvplay.py
new file mode 100644 (file)
index 0000000..2137502
--- /dev/null
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class UKTVPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001',
+        'info_dict': {
+            'id': '2117008346001',
+            'ext': 'mp4',
+            'title': 'Pincers',
+            'description': 'Pincers',
+            'uploader_id': '1242911124001',
+            'upload_date': '20130124',
+            'timestamp': 1359049267,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download MPD manifest']
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % video_id,
+            'BrightcoveNew', video_id)
diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py
new file mode 100644 (file)
index 0000000..d815cd9
--- /dev/null
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+    parse_iso8601,
+)
+
+
+class UMGDeIE(InfoExtractor):
+    IE_NAME = 'umg:de'
+    IE_DESC = 'Universal Music Deutschland'
+    _VALID_URL = r'https?://(?:www\.)?universal-music\.de/[^/]+/videos/[^/?#]+-(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.universal-music.de/sido/videos/jedes-wort-ist-gold-wert-457803',
+        'md5': 'ebd90f48c80dcc82f77251eb1902634f',
+        'info_dict': {
+            'id': '457803',
+            'ext': 'mp4',
+            'title': 'Jedes Wort ist Gold wert',
+            'timestamp': 1513591800,
+            'upload_date': '20171218',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://api.universal-music.de/graphql',
+            video_id, query={
+                'query': '''{
+  universalMusic(channel:16) {
+    video(id:%s) {
+      headline
+      formats {
+        formatId
+        url
+        type
+        width
+        height
+        mimeType
+        fileSize
+      }
+      duration
+      createdDate
+    }
+  }
+}''' % video_id})['data']['universalMusic']['video']
+
+        title = video_data['headline']
+        hls_url_template = 'http://mediadelivery.universal-music-services.de/vod/mp4:autofill/storage/' + '/'.join(list(video_id)) + '/content/%s/file/playlist.m3u8'
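+        # '/'.join(list(video_id)) spreads the id over per-digit path
+        # segments, e.g. '457803' -> 'storage/4/5/7/8/0/3/content/...'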
+
+        thumbnails = []
+        formats = []
+
+        def add_m3u8_format(format_id):
+            m3u8_formats = self._extract_m3u8_formats(
+                hls_url_template % format_id, video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False)
+            if m3u8_formats and m3u8_formats[0].get('height'):
+                formats.extend(m3u8_formats)
+
+        for f in video_data.get('formats', []):
+            f_url = f.get('url')
+            mime_type = f.get('mimeType')
+            if not f_url or mime_type == 'application/mxf':
+                continue
+            fmt = {
+                'url': f_url,
+                'width': int_or_none(f.get('width')),
+                'height': int_or_none(f.get('height')),
+                'filesize': parse_filesize(f.get('fileSize')),
+            }
+            f_type = f.get('type')
+            if f_type == 'Image':
+                thumbnails.append(fmt)
+            elif f_type == 'Video':
+                format_id = f.get('formatId')
+                if format_id:
+                    fmt['format_id'] = format_id
+                    if mime_type == 'video/mp4':
+                        add_m3u8_format(format_id)
+                urlh = self._request_webpage(f_url, video_id, fatal=False)
+                if urlh:
+                    first_byte = urlh.read(1)
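+                    # keep only progressive files that look like FLV ('F')
+                    # or MP4 (boxes start with a 4-byte size, hence b'\x00')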
+                    if first_byte not in (b'F', b'\x00'):
+                        continue
+                    formats.append(fmt)
+        if not formats:
+            for format_id in (867, 836, 940):
+                add_m3u8_format(format_id)
+        self._sort_formats(formats, ('width', 'height', 'filesize', 'tbr'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': int_or_none(video_data.get('duration')),
+            'timestamp': parse_iso8601(video_data.get('createdDate'), ' '),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py
new file mode 100644 (file)
index 0000000..a724cdb
--- /dev/null
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import qualities
+
+
+class UnistraIE(InfoExtractor):
+    _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)'
+
+    _TESTS = [
+        {
+            'url': 'http://utv.unistra.fr/video.php?id_video=154',
+            'md5': '736f605cfdc96724d55bb543ab3ced24',
+            'info_dict': {
+                'id': '154',
+                'ext': 'mp4',
+                'title': 'M!ss Yella',
+                'description': 'md5:104892c71bd48e55d70b902736b81bbf',
+            },
+        },
+        {
+            'url': 'http://utv.unistra.fr/index.php?id_video=437',
+            'md5': '1ddddd6cccaae76f622ce29b8779636d',
+            'info_dict': {
+                'id': '437',
+                'ext': 'mp4',
+                'title': 'Prix Louise Weiss 2014',
+                'description': 'md5:cc3a8735f079f4fb6b0b570fc10c135a',
+            },
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        files = set(re.findall(r'file\s*:\s*"(/[^"]+)"', webpage))
+
+        quality = qualities(['SD', 'HD'])
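+        # qualities() returns a ranking function over the given list,
+        # so here 'SD' maps to 0 and 'HD' to 1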
+        formats = []
+        for file_path in files:
+            format_id = 'HD' if file_path.endswith('-HD.mp4') else 'SD'
+            formats.append({
+                'url': 'http://vod-flash.u-strasbg.fr:8080%s' % file_path,
+                'format_id': format_id,
+                'quality': quality(format_id)
+            })
+        self._sort_formats(formats)
+
+        title = self._html_search_regex(
+            r'<title>UTV - (.*?)</', webpage, 'title')
+        description = self._html_search_regex(
+            r'<meta name="Description" content="(.*?)"', webpage, 'description', flags=re.DOTALL)
+        thumbnail = self._search_regex(
+            r'image: "(.*?)"', webpage, 'thumbnail')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/unity.py b/youtube_dl/extractor/unity.py
new file mode 100644 (file)
index 0000000..73daacf
--- /dev/null
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+
+
+class UnityIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?unity3d\.com/learn/tutorials/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://unity3d.com/learn/tutorials/topics/animation/animate-anything-mecanim',
+        'info_dict': {
+            'id': 'jWuNtik0C8E',
+            'ext': 'mp4',
+            'title': 'Live Training 22nd September 2014 -  Animate Anything',
+            'description': 'md5:e54913114bd45a554c56cdde7669636e',
+            'duration': 2893,
+            'uploader': 'Unity',
+            'uploader_id': 'Unity3D',
+            'upload_date': '20140926',
+        }
+    }, {
+        'url': 'https://unity3d.com/learn/tutorials/projects/2d-ufo-tutorial/following-player-camera?playlist=25844',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        youtube_id = self._search_regex(
+            r'data-video-id="([_0-9a-zA-Z-]+)"',
+            webpage, 'youtube ID')
+        return self.url_result(youtube_id, ie=YoutubeIE.ie_key(), video_id=video_id)
diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py
new file mode 100644 (file)
index 0000000..628adf2
--- /dev/null
@@ -0,0 +1,144 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_urlencode,
+)
+from ..utils import (
+    clean_html,
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    qualities,
+    update_url_query,
+)
+
+
+class UOLIE(InfoExtractor):
+    IE_NAME = 'uol.com.br'
+    _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)'
+    _TESTS = [{
+        'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931',
+        'md5': '4f1e26683979715ff64e4e29099cf020',
+        'info_dict': {
+            'id': '15951931',
+            'ext': 'mp4',
+            'title': 'Miss simpatia é encontrada morta',
+            'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2',
+            'timestamp': 1470421860,
+            'upload_date': '20160805',
+        }
+    }, {
+        'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326',
+        'md5': '2850a0e8dfa0a7307e04a96c5bdc5bc2',
+        'info_dict': {
+            'id': '15954259',
+            'ext': 'mp4',
+            'title': 'Incêndio destrói uma das maiores casas noturnas de Londres',
+            'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. Não há informações sobre vítimas.',
+            'timestamp': 1470674520,
+            'upload_date': '20160808',
+        }
+    }, {
+        'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/15954259',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_data = self._download_json(
+            # https://api.mais.uol.com.br/apiuol/v4/player/data/[MEDIA_ID]
+            'https://api.mais.uol.com.br/apiuol/v3/media/detail/' + video_id,
+            video_id)['item']
+        media_id = compat_str(video_data['mediaId'])
+        title = video_data['title']
+        ver = video_data.get('revision', 2)
+
+        uol_formats = self._download_json(
+            'https://croupier.mais.uol.com.br/v3/formats/%s/jsonp' % media_id,
+            media_id)
+        quality = qualities(['mobile', 'WEBM', '360p', '720p', '1080p'])
+        formats = []
+        for format_id, f in uol_formats.items():
+            if not isinstance(f, dict):
+                continue
+            f_url = f.get('url') or f.get('secureUrl')
+            if not f_url:
+                continue
+            query = {
+                'ver': ver,
+                'r': 'http://mais.uol.com.br',
+            }
+            for k in ('token', 'sign'):
+                v = f.get(k)
+                if v:
+                    query[k] = v
+            f_url = update_url_query(f_url, query)
+            if format_id == 'HLS':
+                m3u8_formats = self._extract_m3u8_formats(
+                    f_url, media_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False)
+                encoded_query = compat_urllib_parse_urlencode(query)
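+                # carry the token/sign auth params through to every
+                # HLS segment request as well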
+                for m3u8_f in m3u8_formats:
+                    m3u8_f['extra_param_to_segment_url'] = encoded_query
+                    m3u8_f['url'] = update_url_query(m3u8_f['url'], query)
+                formats.extend(m3u8_formats)
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': f_url,
+                'quality': quality(format_id),
+                'preference': -1,
+            })
+        self._sort_formats(formats)
+
+        tags = []
+        for tag in video_data.get('tags', []):
+            tag_description = tag.get('description')
+            if not tag_description:
+                continue
+            tags.append(tag_description)
+
+        thumbnails = []
+        for q in ('Small', 'Medium', 'Wmedium', 'Large', 'Wlarge', 'Xlarge'):
+            q_url = video_data.get('thumb' + q)
+            if not q_url:
+                continue
+            thumbnails.append({
+                'id': q,
+                'url': q_url,
+            })
+
+        return {
+            'id': media_id,
+            'title': title,
+            'description': clean_html(video_data.get('description')),
+            'thumbnails': thumbnails,
+            'duration': parse_duration(video_data.get('duration')),
+            'tags': tags,
+            'formats': formats,
+            'timestamp': parse_iso8601(video_data.get('publishDate'), ' '),
+            'view_count': int_or_none(video_data.get('viewsQtty')),
+        }
diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py
new file mode 100644 (file)
index 0000000..f06bf5b
--- /dev/null
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    ExtractorError,
+)
+
+
+class UplynkIE(InfoExtractor):
+    IE_NAME = 'uplynk'
+    _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?'
+    _TEST = {
+        'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8',
+        'info_dict': {
+            'id': 'e89eaf2ce9054aa89d92ddb2d817a52e',
+            'ext': 'mp4',
+            'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4',
+            'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _extract_uplynk_info(self, uplynk_content_url):
+        path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
+        display_id = video_id or external_id
+        formats = self._extract_m3u8_formats(
+            'http://content.uplynk.com/%s.m3u8' % path,
+            display_id, 'mp4', 'm3u8_native')
+        if session_id:
+            for f in formats:
+                f['extra_param_to_segment_url'] = 'pbs=' + session_id
+        self._sort_formats(formats)
+        asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id)
+        if asset.get('error') == 1:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, asset['msg']), expected=True)
+
+        return {
+            'id': asset['asset'],
+            'title': asset['desc'],
+            'thumbnail': asset.get('default_poster_url'),
+            'duration': float_or_none(asset.get('duration')),
+            'uploader_id': asset.get('owner'),
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_uplynk_info(url)
+
+
+class UplynkPreplayIE(UplynkIE):
+    IE_NAME = 'uplynk:preplay'
+    _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json'
+    _TEST = None
+
+    def _real_extract(self, url):
+        path, external_id, video_id = re.match(self._VALID_URL, url).groups()
+        display_id = video_id or external_id
+        preplay = self._download_json(url, display_id)
+        content_url = 'http://content.uplynk.com/%s.m3u8' % path
+        session_id = preplay.get('sid')
+        if session_id:
+            content_url += '?pbs=' + session_id
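+        # e.g. http://content.uplynk.com/ext/<32-hex>/<external_id>.m3u8?pbs=<sid>
+        # (shape per _VALID_URL above; concrete values illustrative)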
+        return self._extract_uplynk_info(content_url)
diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py
new file mode 100644 (file)
index 0000000..8f6edab
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse,
+)
+from ..utils import (
+    unified_strdate,
+)
+
+
+class UrortIE(InfoExtractor):
+    IE_DESC = 'NRK P3 Urørt'
+    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
+
+    _TEST = {
+        'url': 'https://urort.p3.no/#!/Band/Gerilja',
+        'md5': '5ed31a924be8a05e47812678a86e127b',
+        'info_dict': {
+            'id': '33124-24',
+            'ext': 'mp3',
+            'title': 'The Bomb',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'uploader': 'Gerilja',
+            'uploader_id': 'Gerilja',
+            'upload_date': '20100323',
+        },
+        'params': {
+            'matchtitle': '^The Bomb$',  # To test, we want just one video
+        }
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
+        json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr
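+        # Breeze OData query; decoded, the filter reads e.g.
+        # InternalBandUrl eq 'Gerilja' (band slug from the test above)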
+        songs = self._download_json(json_url, playlist_id)
+        entries = []
+        for s in songs:
+            formats = [{
+                'tbr': f.get('Quality'),
+                'ext': f['FileType'],
+                'format_id': '%s-%s' % (f['FileType'], f.get('Quality', '')),
+                'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'],
+                'preference': 3 if f['FileType'] == 'mp3' else 2,
+            } for f in s['Files']]
+            self._sort_formats(formats)
+            e = {
+                'id': '%d-%s' % (s['BandId'], s['$id']),
+                'title': s['Title'],
+                'uploader_id': playlist_id,
+                'uploader': s.get('BandName', playlist_id),
+                'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+                'upload_date': unified_strdate(s.get('Released')),
+                'formats': formats,
+            }
+            entries.append(e)
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_id,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
new file mode 100644 (file)
index 0000000..6030b7c
--- /dev/null
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unified_timestamp
+
+
+class URPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ur(?:play|skola)\.se/(?:program|Produkter)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://urplay.se/program/203704-ur-samtiden-livet-universum-och-rymdens-markliga-musik-om-vetenskap-kritiskt-tankande-och-motstand',
+        'md5': 'ff5b0c89928f8083c74bbd5099c9292d',
+        'info_dict': {
+            'id': '203704',
+            'ext': 'mp4',
+            'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
+            'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
+            'timestamp': 1513512768,
+            'upload_date': '20171217',
+        },
+    }, {
+        'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
+        'info_dict': {
+            'id': '190031',
+            'ext': 'mp4',
+            'title': 'Tripp, Trapp, Träd : Sovkudde',
+            'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
+            'timestamp': 1440093600,
+            'upload_date': '20150820',
+        },
+    }, {
+        'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        urplayer_data = self._parse_json(self._search_regex(
+            r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
+        host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
+
+        formats = []
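+        # probe both the SD ('file_http*') and HD ('file_http_hd*')
+        # stream pointers exposed by the player config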
+        for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
+            file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
+            if file_http:
+                formats.extend(self._extract_wowza_formats(
+                    'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp']))
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for subtitle in urplayer_data.get('subtitles', []):
+            subtitle_url = subtitle.get('file')
+            kind = subtitle.get('kind')
+            if not subtitle_url or (kind and kind != 'captions'):
+                continue
+            subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
+                'url': subtitle_url,
+            })
+
+        return {
+            'id': video_id,
+            'title': urplayer_data['title'],
+            'description': self._og_search_description(webpage),
+            'thumbnail': urplayer_data.get('image'),
+            'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')),
+            'series': urplayer_data.get('series_title'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py
new file mode 100644 (file)
index 0000000..54c7495
--- /dev/null
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    NO_DEFAULT,
+    smuggle_url,
+    update_url_query,
+)
+
+
+class USANetworkIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
+        'md5': '33c0d2ba381571b414024440d08d57fd',
+        'info_dict': {
+            'id': '3086229',
+            'ext': 'mp4',
+            'title': 'HPE Cybersecurity',
+            'description': 'The more we digitize our world, the more vulnerable we are.',
+            'upload_date': '20160818',
+            'timestamp': 1471535460,
+            'uploader': 'NBCU-USA',
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        def _x(name, default=NO_DEFAULT):
+            return self._search_regex(
+                r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
+                webpage, name, default=default, group='value')
+
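+        # e.g. _x('mpx-guid') reads data-mpx-guid="..." from the markup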
+        video_id = _x('mpx-guid')
+        title = _x('episode-title')
+        mpx_account_id = _x('mpx-account-id', '2304992029')
+
+        query = {
+            'mbr': 'true',
+        }
+        if _x('is-full-episode', None) == '1':
+            query['manifest'] = 'm3u'
+
+        if _x('is-entitlement', None) == '1':
+            adobe_pass = {}
+            drupal_settings = self._search_regex(
+                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+                webpage, 'drupal settings', fatal=False)
+            if drupal_settings:
+                drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
+                if drupal_settings:
+                    adobe_pass = drupal_settings.get('adobePass', {})
+            resource = self._get_mvpd_resource(
+                adobe_pass.get('adobePassResourceId', 'usa'),
+                title, video_id, _x('episode-rating', 'TV-14'))
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
+
+        info = self._search_json_ld(webpage, video_id, default={})
+        info.update({
+            '_type': 'url_transparent',
+            'url': smuggle_url(update_url_query(
+                'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
+                query), {'force_smil_url': True}),
+            'id': video_id,
+            'title': title,
+            'series': _x('show-title', None),
+            'episode': title,
+            'ie_key': 'ThePlatform',
+        })
+        return info
diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py
new file mode 100644 (file)
index 0000000..b210344
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_attribute,
+    parse_duration,
+    try_get,
+    update_url_query,
+)
+from ..compat import compat_str
+
+
+class USATodayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)'
+    _TESTS = [{
+        # Brightcove Partner ID = 29906170001
+        'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/',
+        'md5': '033587d2529dc3411a1ab3644c3b8827',
+        'info_dict': {
+            'id': '4799374959001',
+            'ext': 'mp4',
+            'title': 'US, France warn Syrian regime ahead of new peace talks',
+            'timestamp': 1457891045,
+            'description': 'md5:7e50464fdf2126b0f533748d3c78d58f',
+            'uploader_id': '29906170001',
+            'upload_date': '20160313',
+        }
+    }, {
+        # ui-video-data[asset_metadata][items][brightcoveaccount] = 28911775001
+        'url': 'https://www.usatoday.com/story/tech/science/2018/08/21/yellowstone-supervolcano-eruption-stop-worrying-its-blow/973633002/',
+        'info_dict': {
+            'id': '5824495846001',
+            'ext': 'mp4',
+            'title': 'Yellowstone more likely to crack rather than explode',
+            'timestamp': 1534790612,
+            'description': 'md5:3715e7927639a4f16b474e9391687c62',
+            'uploader_id': '28911775001',
+            'upload_date': '20180820',
+        }
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
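+        # request the page with ?ajax=true and read the video metadata JSON
+        # from its ui-video-data element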
+        webpage = self._download_webpage(update_url_query(url, {'ajax': 'true'}), display_id)
+        ui_video_data = get_element_by_attribute('class', 'ui-video-data', webpage)
+        if not ui_video_data:
+            raise ExtractorError('No video on the webpage', expected=True)
+        video_data = self._parse_json(ui_video_data, display_id)
+        item = try_get(video_data, lambda x: x['asset_metadata']['items'], dict) or {}
+
+        return {
+            '_type': 'url_transparent',
+            'url': self.BRIGHTCOVE_URL_TEMPLATE % (item.get('brightcoveaccount', '29906170001'), item.get('brightcoveid') or video_data['brightcove_id']),
+            'id': compat_str(video_data['id']),
+            'title': video_data['title'],
+            'thumbnail': video_data.get('thumbnail'),
+            'description': video_data.get('description'),
+            'duration': parse_duration(video_data.get('length')),
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
new file mode 100644 (file)
index 0000000..582090d
--- /dev/null
@@ -0,0 +1,281 @@
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    encode_data_uri,
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    mimetype2ext,
+    str_or_none,
+)
+
+
+class UstreamIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
+    IE_NAME = 'ustream'
+    _TESTS = [{
+        'url': 'http://www.ustream.tv/recorded/20274954',
+        'md5': '088f151799e8f572f84eb62f17d73e5c',
+        'info_dict': {
+            'id': '20274954',
+            'ext': 'flv',
+            'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',
+            'description': 'Young Americans for Liberty February 7, 2012 2:28 AM',
+            'timestamp': 1328577035,
+            'upload_date': '20120207',
+            'uploader': 'yaliberty',
+            'uploader_id': '6780869',
+        },
+    }, {
+        # From http://sportscanada.tv/canadagames/index.php/week2/figure-skating/444
+        # Title and uploader available only from params JSON
+        'url': 'http://www.ustream.tv/embed/recorded/59307601?ub=ff0000&lc=ff0000&oc=ffffff&uc=ffffff&v=3&wmode=direct',
+        'md5': '5a2abf40babeac9812ed20ae12d34e10',
+        'info_dict': {
+            'id': '59307601',
+            'ext': 'flv',
+            'title': '-CG11- Canada Games Figure Skating',
+            'uploader': 'sportscanadatv',
+        },
+        'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.',
+    }, {
+        'url': 'http://www.ustream.tv/embed/10299409',
+        'info_dict': {
+            'id': '10299409',
+        },
+        'playlist_count': 3,
+    }, {
+        'url': 'http://www.ustream.tv/recorded/91343263',
+        'info_dict': {
+            'id': '91343263',
+            'ext': 'mp4',
+            'title': 'GitHub Universe - General Session - Day 1',
+            'upload_date': '20160914',
+            'description': 'GitHub Universe - General Session - Day 1',
+            'timestamp': 1473872730,
+            'uploader': 'wa0dnskeqkr',
+            'uploader_id': '38977840',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 download
+        },
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
+    def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None):
+        def num_to_hex(n):
+            return hex(n)[2:]
+
+        rnd = random.randrange
+
+        if not extra_note:
+            extra_note = ''
+
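+        # first ask a random recorded-lp-live node for a viewer session; the
+        # reply names the host and connectionId to query for stream info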
+        conn_info = self._download_json(
+            'http://r%d-1-%s-recorded-lp-live.ums.ustream.tv/1/ustream' % (rnd(1e8), video_id),
+            video_id, note='Downloading connection info' + extra_note,
+            query={
+                'type': 'viewer',
+                'appId': app_id_ver[0],
+                'appVersion': app_id_ver[1],
+                'rsid': '%s:%s' % (num_to_hex(rnd(1e8)), num_to_hex(rnd(1e8))),
+                'rpin': '_rpin.%d' % rnd(1e15),
+                'referrer': url,
+                'media': video_id,
+                'application': 'recorded',
+            })
+        host = conn_info[0]['args'][0]['host']
+        connection_id = conn_info[0]['args'][0]['connectionId']
+
+        return self._download_json(
+            'http://%s/1/ustream?connectionId=%s' % (host, connection_id),
+            video_id, note='Downloading stream info' + extra_note)
+
+    def _get_streams(self, url, video_id, app_id_ver):
+        # Sometimes the return dict does not have 'stream'
+        for trial_count in range(3):
+            stream_info = self._get_stream_info(
+                url, video_id, app_id_ver,
+                extra_note=' (try %d)' % (trial_count + 1) if trial_count > 0 else '')
+            if 'stream' in stream_info[0]['args'][0]:
+                return stream_info[0]['args'][0]['stream']
+        return []
+
+    def _parse_segmented_mp4(self, dash_stream_info):
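+        # chunk URL templates use '%' as a placeholder: the first occurrence
+        # is replaced with the chunk index, any remaining with the chunk hash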
+        def resolve_dash_template(template, idx, chunk_hash):
+            return template.replace('%', compat_str(idx), 1).replace('%', chunk_hash)
+
+        formats = []
+        for stream in dash_stream_info['streams']:
+            # Use only one provider to avoid too many formats
+            provider = dash_stream_info['providers'][0]
+            fragments = [{
+                'url': resolve_dash_template(
+                    provider['url'] + stream['initUrl'], 0, dash_stream_info['hashes']['0'])
+            }]
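+            # hashes are keyed by the first index of each 10-chunk group,
+            # hence the idx // 10 * 10 lookup below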
+            for idx in range(dash_stream_info['videoLength'] // dash_stream_info['chunkTime']):
+                fragments.append({
+                    'url': resolve_dash_template(
+                        provider['url'] + stream['segmentUrl'], idx,
+                        dash_stream_info['hashes'][compat_str(idx // 10 * 10)])
+                })
+            content_type = stream['contentType']
+            kind = content_type.split('/')[0]
+            f = {
+                'format_id': '-'.join(filter(None, [
+                    'dash', kind, str_or_none(stream.get('bitrate'))])),
+                'protocol': 'http_dash_segments',
+                # TODO: generate an MPD doc for external players?
+                'url': encode_data_uri(b'<MPD/>', 'text/xml'),
+                'ext': mimetype2ext(content_type),
+                'height': stream.get('height'),
+                'width': stream.get('width'),
+                'fragments': fragments,
+            }
+            if kind == 'video':
+                f.update({
+                    'vcodec': stream.get('codec'),
+                    'acodec': 'none',
+                    'vbr': stream.get('bitrate'),
+                })
+            else:
+                f.update({
+                    'vcodec': 'none',
+                    'acodec': stream.get('codec'),
+                    'abr': stream.get('bitrate'),
+                })
+            formats.append(f)
+        return formats
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        # some sites use this embed format (see: https://github.com/ytdl-org/youtube-dl/issues/2990)
+        if m.group('type') == 'embed/recorded':
+            video_id = m.group('id')
+            desktop_url = 'http://www.ustream.tv/recorded/' + video_id
+            return self.url_result(desktop_url, 'Ustream')
+        if m.group('type') == 'embed':
+            video_id = m.group('id')
+            webpage = self._download_webpage(url, video_id)
+            content_video_ids = self._parse_json(self._search_regex(
+                r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage,
+                'content video IDs'), video_id)
+            return self.playlist_result(
+                map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids),
+                video_id)
+
+        params = self._download_json(
+            'https://api.ustream.tv/videos/%s.json' % video_id, video_id)
+
+        error = params.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+        video = params['video']
+
+        title = video['title']
+        filesize = float_or_none(video.get('file_size'))
+
+        formats = [{
+            'id': video_id,
+            'url': video_url,
+            'ext': format_id,
+            'filesize': filesize,
+        } for format_id, video_url in video['media_urls'].items() if video_url]
+
+        if not formats:
+            hls_streams = self._get_streams(url, video_id, app_id_ver=(11, 2))
+            if hls_streams:
+                # m3u8_native leads to intermittent ContentTooShortError
+                formats.extend(self._extract_m3u8_formats(
+                    hls_streams[0]['url'], video_id, ext='mp4', m3u8_id='hls'))
+
+            # DASH streams handling is incomplete as 'url' is missing:
+            # dash_streams = self._get_streams(url, video_id, app_id_ver=(3, 1))
+            # if dash_streams:
+            #     formats.extend(self._parse_segmented_mp4(dash_streams))
+
+        self._sort_formats(formats)
+
+        description = video.get('description')
+        timestamp = int_or_none(video.get('created_at'))
+        duration = float_or_none(video.get('length'))
+        view_count = int_or_none(video.get('views'))
+
+        uploader = video.get('owner', {}).get('username')
+        uploader_id = video.get('owner', {}).get('id')
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+        } for thumbnail_id, thumbnail_url in video.get('thumbnail', {}).items()]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'formats': formats,
+        }
+
+
+class UstreamChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ustream\.tv/channel/(?P<slug>.+)'
+    IE_NAME = 'ustream:channel'
+    _TEST = {
+        'url': 'http://www.ustream.tv/channel/channeljapan',
+        'info_dict': {
+            'id': '10874166',
+        },
+        'playlist_mincount': 17,
+    }
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        display_id = m.group('slug')
+        webpage = self._download_webpage(url, display_id)
+        channel_id = self._html_search_meta('ustream:channel_id', webpage)
+
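+        # page through the social stream API; each JSON reply carries one
+        # page of video markup plus the URL of the next page (falsy when done)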
+        BASE = 'http://www.ustream.tv'
+        next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id
+        video_ids = []
+        while next_url:
+            reply = self._download_json(
+                compat_urlparse.urljoin(BASE, next_url), display_id,
+                note='Downloading video information (next: %d)' % (len(video_ids) + 1))
+            video_ids.extend(re.findall(r'data-content-id="(\d.*)"', reply['data']))
+            next_url = reply['nextUrl']
+
+        entries = [
+            self.url_result('http://www.ustream.tv/recorded/' + vid, 'Ustream')
+            for vid in video_ids]
+        return {
+            '_type': 'playlist',
+            'id': channel_id,
+            'display_id': display_id,
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py
new file mode 100644 (file)
index 0000000..56509be
--- /dev/null
@@ -0,0 +1,125 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+    unescapeHTML,
+)
+
+
+class UstudioIE(InfoExtractor):
+    IE_NAME = 'ustudio'
+    _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
+        'md5': '58bbfca62125378742df01fc2abbdef6',
+        'info_dict': {
+            'id': 'Uxu2my9bgSph',
+            'display_id': 'san_francisco_golden_gate_bridge',
+            'ext': 'mp4',
+            'title': 'San Francisco: Golden Gate Bridge',
+            'description': 'md5:23925500697f2c6d4830e387ba51a9be',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'upload_date': '20111107',
+            'uploader': 'Tony Farley',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
+
+        config = self._download_xml(
+            'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
+            display_id)
+
+        def extract(kind):
+            return [{
+                'url': unescapeHTML(item.attrib['url']),
+                'width': int_or_none(item.get('width')),
+                'height': int_or_none(item.get('height')),
+            } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
+
+        formats = extract('video')
+        self._sort_formats(formats)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>',
+            webpage, 'upload date', fatal=False))
+        uploader = self._search_regex(
+            r'Uploaded by\s*<a[^>]*>([^<]+)<',
+            webpage, 'uploader', fatal=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnails': extract('image'),
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'formats': formats,
+        }
+
+
+class UstudioEmbedIE(InfoExtractor):
+    IE_NAME = 'ustudio:embed'
+    _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T',
+        'md5': '47c0be52a09b23a7f40de9469cec58f4',
+        'info_dict': {
+            'id': 'Uw7G1kMCe65T',
+            'ext': 'mp4',
+            'title': '5 Things IT Should Know About Video',
+            'description': 'md5:93d32650884b500115e158c5677d25ad',
+            'uploader_id': 'DeN7VdYRDKhP',
+        }
+    }
+
+    def _real_extract(self, url):
+        uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+        video_data = self._download_json(
+            'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id),
+            video_id)['videos'][0]
+        title = video_data['name']
+
+        formats = []
+        for ext, qualities in video_data.get('transcodes', {}).items():
+            for quality in qualities:
+                quality_url = quality.get('url')
+                if not quality_url:
+                    continue
+                height = int_or_none(quality.get('height'))
+                formats.append({
+                    'format_id': '%s-%dp' % (ext, height) if height else ext,
+                    'url': quality_url,
+                    'width': int_or_none(quality.get('width')),
+                    'height': height,
+                })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for image in video_data.get('images', []):
+            image_url = image.get('url')
+            if not image_url:
+                continue
+            thumbnails.append({
+                'url': image_url,
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'uploader_id': uploader_id,
+            'tags': video_data.get('keywords'),
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py
new file mode 100644 (file)
index 0000000..f474ed7
--- /dev/null
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_parse_qs,
+)
+from ..utils import (
+    clean_html,
+    remove_start,
+)
+
+
+class Varzesh3IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?'
+    _TESTS = [{
+        'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/',
+        'md5': '2a933874cb7dce4366075281eb49e855',
+        'info_dict': {
+            'id': '76337',
+            'ext': 'mp4',
+            'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا',
+            'description': 'فصل ۲۰۱۵-۲۰۱۴',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+        'skip': 'HTTP 404 Error',
+    }, {
+        'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87',
+        'md5': '841b7cd3afbc76e61708d94e53a4a4e7',
+        'info_dict': {
+            'id': '112785',
+            'ext': 'mp4',
+            'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره',
+            'description': 'فوتبال 120',
+        },
+        'expected_warnings': ['description'],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._search_regex(
+            r'<source[^>]+src="([^"]+)"', webpage, 'video url')
+
+        title = remove_start(self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
+
+        description = self._html_search_regex(
+            r'(?s)<div class="matn">(.+?)</div>',
+            webpage, 'description', default=None)
+        if description is None:
+            description = clean_html(self._html_search_meta('description', webpage))
+
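+        # as a fallback, reuse the preview image passed to the page's
+        # Facebook share link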
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        if thumbnail is None:
+            fb_sharer_url = self._search_regex(
+                r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php\?[^"]+)"',
+                webpage, 'facebook sharer URL', fatal=False)
+            if fb_sharer_url:  # guard against a missing share link
+                sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query)
+                thumbnail = sharer_params.get('p[images][0]', [None])[0]
+
+        video_id = self._search_regex(
+            r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'",
+            webpage, display_id, default=None)
+        if video_id is None:
+            video_id = self._search_regex(
+                r'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id',
+                default=display_id)
+
+        return {
+            'url': video_url,
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
new file mode 100644 (file)
index 0000000..8152ace
--- /dev/null
@@ -0,0 +1,105 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class Vbox7IE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?vbox7\.com/
+                        (?:
+                            play:|
+                            (?:
+                                emb/external\.php|
+                                player/ext\.swf
+                            )\?.*?\bvid=
+                        )
+                        (?P<id>[\da-fA-F]+)
+                    '''
+    _GEO_COUNTRIES = ['BG']
+    _TESTS = [{
+        'url': 'http://vbox7.com/play:0946fff23c',
+        'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
+        'info_dict': {
+            'id': '0946fff23c',
+            'ext': 'mp4',
+            'title': 'Борисов: Притеснен съм за бъдещето на България',
+            'description': 'По думите му е опасно страната ни да бъде обявена за "сигурна"',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 1470982814,
+            'upload_date': '20160812',
+            'uploader': 'zdraveibulgaria',
+        },
+        'params': {
+            'proxy': '127.0.0.1:8118',
+        },
+    }, {
+        'url': 'http://vbox7.com/play:249bb972c2',
+        'md5': '99f65c0c9ef9b682b97313e052734c3f',
+        'info_dict': {
+            'id': '249bb972c2',
+            'ext': 'mp4',
+            'title': 'Смях! Чудо - чист за секунди - Скрита камера',
+        },
+        'skip': 'georestricted',
+    }, {
+        'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
+        'only_matching': True,
+    }, {
+        'url': 'http://i49.vbox7.com/player/ext.swf?vid=0946fff23c&autoplay=1',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        response = self._download_json(
+            'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id,
+            video_id)
+
+        if 'error' in response:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, response['error']), expected=True)
+
+        video = response['options']
+
+        title = video['title']
+        video_url = video['src']
+
+        if '/na.mp4' in video_url:
+            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+        uploader = video.get('uploader')
+
+        webpage = self._download_webpage(
+            'http://vbox7.com/play:%s' % video_id, video_id, fatal=None)
+
+        info = {}
+
+        if webpage:
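+            # the page's JSON-LD block mangles the "@context" key as
+            # "/*@context"; patch it before parsing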
+            info = self._search_json_ld(
+                webpage.replace('"/*@context"', '"@context"'), video_id,
+                fatal=False)
+
+        info.update({
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'uploader': uploader,
+            'thumbnail': self._proto_relative_url(
+                info.get('thumbnail') or self._og_search_thumbnail(webpage),
+                'http:'),
+        })
+        return info
diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py
new file mode 100644 (file)
index 0000000..a6dc3c8
--- /dev/null
@@ -0,0 +1,118 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    get_element_by_id,
+)
+
+
+class VeeHDIE(InfoExtractor):
+    _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'
+
+    # VeeHD videos seem to have multiple copies on several servers, all of
+    # which have different MD5 checksums, so the md5 field is omitted in all tests
+    _TESTS = [{
+        'url': 'http://veehd.com/video/4639434_Solar-Sinter',
+        'info_dict': {
+            'id': '4639434',
+            'ext': 'mp4',
+            'title': 'Solar Sinter',
+            'uploader_id': 'VideoEyes',
+            'description': 'md5:46a840e8692ddbaffb5f81d9885cb457',
+        },
+        'skip': 'Video deleted',
+    }, {
+        'url': 'http://veehd.com/video/4905758_Elysian-Fields-Channeling',
+        'info_dict': {
+            'id': '4905758',
+            'ext': 'mp4',
+            'title': 'Elysian Fields - Channeling',
+            'description': 'md5:360e4e95fdab58aefbea0f2a19e5604b',
+            'uploader_id': 'spotted',
+        }
+    }, {
+        'url': 'http://veehd.com/video/2046729_2012-2009-DivX-Trailer',
+        'info_dict': {
+            'id': '2046729',
+            'ext': 'avi',
+            'title': '2012 (2009) DivX Trailer',
+            'description': 'md5:75435ee95255e6a9838ac6f6f3a2396b',
+            'uploader_id': 'Movie_Trailers',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # VeeHD seems to send garbage on the first request.
+        # See https://github.com/ytdl-org/youtube-dl/issues/2102
+        self._download_webpage(url, video_id, 'Requesting webpage')
+        webpage = self._download_webpage(url, video_id)
+
+        if 'This video has been removed<' in webpage:
+            raise ExtractorError('Video %s has been removed' % video_id, expected=True)
+
+        player_path = self._search_regex(
+            r'\$\("#playeriframe"\).attr\({src : "(.+?)"',
+            webpage, 'player path')
+        player_url = compat_urlparse.urljoin(url, player_path)
+
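+        # the player page gets the same warm-up request as the main page,
+        # presumably because its first response is garbage as well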
+        self._download_webpage(player_url, video_id, 'Requesting player page')
+        player_page = self._download_webpage(
+            player_url, video_id, 'Downloading player page')
+
+        video_url = None
+
+        config_json = self._search_regex(
+            r'value=\'config=({.+?})\'', player_page, 'config json', default=None)
+
+        if config_json:
+            config = json.loads(config_json)
+            video_url = compat_urllib_parse_unquote(config['clip']['url'])
+
+        if not video_url:
+            video_url = self._html_search_regex(
+                r'<embed[^>]+type="video/divx"[^>]+src="([^"]+)"',
+                player_page, 'video url', default=None)
+
+        if not video_url:
+            iframe_src = self._search_regex(
+                r'<iframe[^>]+src="/?([^"]+)"', player_page, 'iframe url')
+            iframe_url = 'http://veehd.com/%s' % iframe_src
+
+            self._download_webpage(iframe_url, video_id, 'Requesting iframe page')
+            iframe_page = self._download_webpage(
+                iframe_url, video_id, 'Downloading iframe page')
+
+            video_url = self._search_regex(
+                r"file\s*:\s*'([^']+)'", iframe_page, 'video url')
+
+        title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0])
+        uploader_id = self._html_search_regex(
+            r'<a href="/profile/\d+">(.+?)</a>',
+            webpage, 'uploader')
+        thumbnail = self._search_regex(
+            r'<img id="veehdpreview" src="(.+?)"',
+            webpage, 'thumbnail')
+        description = self._html_search_regex(
+            r'<td class="infodropdown".*?<div>(.*?)<ul',
+            webpage, 'description', flags=re.DOTALL)
+
+        return {
+            '_type': 'video',
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'uploader_id': uploader_id,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
new file mode 100644 (file)
index 0000000..1c44c14
--- /dev/null
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    qualities,
+)
+
+
+class VeohIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
+
+    _TESTS = [{
+        'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+        'md5': '9e7ecc0fd8bbee7a69fe38953aeebd30',
+        'info_dict': {
+            'id': 'v56314296nk7Zdmz3',
+            'ext': 'mp4',
+            'title': 'Straight Backs Are Stronger',
+            'uploader': 'LUMOback',
+            'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
+        },
+    }, {
+        'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
+        'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
+        'info_dict': {
+            'id': '27701988',
+            'ext': 'mp4',
+            'title': 'Chile workers cover up to avoid skin damage',
+            'description': 'md5:2bd151625a60a32822873efc246ba20d',
+            'uploader': 'afp-news',
+            'duration': 123,
+        },
+        'skip': 'This video has been deleted.',
+    }, {
+        'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
+        'md5': '4fde7b9e33577bab2f2f8f260e30e979',
+        'note': 'Embedded ooyala video',
+        'info_dict': {
+            'id': '69525809',
+            'ext': 'mp4',
+            'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
+            'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
+            'uploader': 'newsy-videos',
+        },
+        'skip': 'This video has been deleted.',
+    }, {
+        'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
+        'only_matching': True,
+    }]
+
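+    # NOTE: apparently an unused leftover; it references self._extract_formats,
+    # which is not defined on this class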
+    def _extract_video(self, source):
+        return {
+            'id': source.get('videoId'),
+            'title': source.get('title'),
+            'description': source.get('description'),
+            'thumbnail': source.get('highResImage') or source.get('medResImage'),
+            'uploader': source.get('username'),
+            'duration': int_or_none(source.get('length')),
+            'view_count': int_or_none(source.get('views')),
+            'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
+            'formats': self._extract_formats(source),
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video = self._download_json(
+            'https://www.veoh.com/watch/getVideo/' + video_id,
+            video_id)['video']
+        title = video['title']
+
+        thumbnail_url = None
+        q = qualities(['HQ', 'Regular'])
+        formats = []
+        for f_id, f_url in video.get('src', {}).items():
+            if not f_url:
+                continue
+            if f_id == 'poster':
+                thumbnail_url = f_url
+            else:
+                formats.append({
+                    'format_id': f_id,
+                    'quality': q(f_id),
+                    'url': f_url,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': thumbnail_url,
+            'uploader': video.get('author', {}).get('nickname'),
+            'duration': int_or_none(video.get('lengthBySec')) or parse_duration(video.get('length')),
+            'view_count': int_or_none(video.get('views')),
+            'formats': formats,
+            'average_rating': int_or_none(video.get('rating')),
+            'comment_count': int_or_none(video.get('numOfComments')),
+        }
diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py
new file mode 100644 (file)
index 0000000..5ab7168
--- /dev/null
@@ -0,0 +1,121 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+from .rutv import RUTVIE
+
+
+class VestiIE(InfoExtractor):
+    IE_DESC = 'Вести.Ru'
+    _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.vesti.ru/videos?vid=575582&cid=1',
+            'info_dict': {
+                'id': '765035',
+                'ext': 'mp4',
+                'title': 'Вести.net: биткоины в России не являются законными',
+                'description': 'md5:d4bb3859dc1177b28a94c5014c35a36b',
+                'duration': 302,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.vesti.ru/doc.html?id=1349233',
+            'info_dict': {
+                'id': '773865',
+                'ext': 'mp4',
+                'title': 'Участники митинга штурмуют Донецкую областную администрацию',
+                'description': 'md5:1a160e98b3195379b4c849f2f4958009',
+                'duration': 210,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.vesti.ru/only_video.html?vid=576180',
+            'info_dict': {
+                'id': '766048',
+                'ext': 'mp4',
+                'title': 'США заморозило, Британию затопило',
+                'description': 'md5:f0ed0695ec05aed27c56a70a58dc4cc1',
+                'duration': 87,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://hitech.vesti.ru/news/view/id/4000',
+            'info_dict': {
+                'id': '766888',
+                'ext': 'mp4',
+                'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+                'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+                'duration': 279,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
+            'info_dict': {
+                'id': '766403',
+                'ext': 'mp4',
+                'title': 'XXII зимние Олимпийские игры. Российские хоккеисты стартовали на Олимпиаде с победы',
+                'description': 'md5:55805dfd35763a890ff50fa9e35e31b3',
+                'duration': 271,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'skip': 'Blocked outside Russia',
+        },
+        {
+            'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
+            'info_dict': {
+                'id': '51499',
+                'ext': 'flv',
+                'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
+                'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Broadcast has finished'
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id, 'Downloading page')
+
+        mobj = re.search(
+            r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
+            page)
+        if mobj:
+            video_id = mobj.group('id')
+            page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
+                                          'Downloading video page')
+
+        rutv_url = RUTVIE._extract_url(page)
+        if rutv_url:
+            return self.url_result(rutv_url, 'RUTV')
+
+        raise ExtractorError('No video found', expected=True)
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
new file mode 100644 (file)
index 0000000..4ea9f1b
--- /dev/null
@@ -0,0 +1,374 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+    compat_HTTPError,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class VevoBaseIE(InfoExtractor):
+    def _extract_json(self, webpage, video_id):
+        return self._parse_json(
+            self._search_regex(
+                r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>',
+                webpage, 'initial store'),
+            video_id)
+
+
+class VevoIE(VevoBaseIE):
+    '''
+    Accepts urls from vevo.com or in the format 'vevo:{id}'
+    (currently used by MTVIE and MySpaceIE)
+    '''
+    _VALID_URL = r'''(?x)
+        (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?|
+           https?://cache\.vevo\.com/m/html/embed\.html\?video=|
+           https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
+           https?://embed\.vevo\.com/.*?[?&]isrc=|
+           vevo:)
+        (?P<id>[^&?#]+)'''
+
+    _TESTS = [{
+        'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
+        'md5': '95ee28ee45e70130e3ab02b0f579ae23',
+        'info_dict': {
+            'id': 'GB1101300280',
+            'ext': 'mp4',
+            'title': 'Hurts - Somebody to Die For',
+            'timestamp': 1372057200,
+            'upload_date': '20130624',
+            'uploader': 'Hurts',
+            'track': 'Somebody to Die For',
+            'artist': 'Hurts',
+            'genre': 'Pop',
+        },
+        'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
+    }, {
+        'note': 'v3 SMIL format',
+        'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
+        'md5': 'f6ab09b034f8c22969020b042e5ac7fc',
+        'info_dict': {
+            'id': 'USUV71302923',
+            'ext': 'mp4',
+            'title': 'Cassadee Pope - I Wish I Could Break Your Heart',
+            'timestamp': 1392796919,
+            'upload_date': '20140219',
+            'uploader': 'Cassadee Pope',
+            'track': 'I Wish I Could Break Your Heart',
+            'artist': 'Cassadee Pope',
+            'genre': 'Country',
+        },
+        'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
+    }, {
+        'note': 'Age-limited video',
+        'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
+        'info_dict': {
+            'id': 'USRV81300282',
+            'ext': 'mp4',
+            'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
+            'age_limit': 18,
+            'timestamp': 1372888800,
+            'upload_date': '20130703',
+            'uploader': 'Justin Timberlake',
+            'track': 'Tunnel Vision (Explicit)',
+            'artist': 'Justin Timberlake',
+            'genre': 'Pop',
+        },
+        'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
+    }, {
+        'note': 'No video_info',
+        'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000',
+        'md5': '8b83cc492d72fc9cf74a02acee7dc1b0',
+        'info_dict': {
+            'id': 'USUV71503000',
+            'ext': 'mp4',
+            'title': 'K Camp ft. T.I. - Till I Die',
+            'age_limit': 18,
+            'timestamp': 1449468000,
+            'upload_date': '20151207',
+            'uploader': 'K Camp',
+            'track': 'Till I Die',
+            'artist': 'K Camp',
+            'genre': 'Hip-Hop',
+        },
+        'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
+    }, {
+        'note': 'Featured test',
+        'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190',
+        'md5': 'd28675e5e8805035d949dc5cf161071d',
+        'info_dict': {
+            'id': 'USUV71402190',
+            'ext': 'mp4',
+            'title': 'Lemaitre ft. LoLo - Wait',
+            'age_limit': 0,
+            'timestamp': 1413432000,
+            'upload_date': '20141016',
+            'uploader': 'Lemaitre',
+            'track': 'Wait',
+            'artist': 'Lemaitre',
+            'genre': 'Electronic',
+        },
+        'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'],
+    }, {
+        'note': 'Only available via webpage',
+        'url': 'http://www.vevo.com/watch/GBUV71600656',
+        'md5': '67e79210613865b66a47c33baa5e37fe',
+        'info_dict': {
+            'id': 'GBUV71600656',
+            'ext': 'mp4',
+            'title': 'ABC - Viva Love',
+            'age_limit': 0,
+            'timestamp': 1461830400,
+            'upload_date': '20160428',
+            'uploader': 'ABC',
+            'track': 'Viva Love',
+            'artist': 'ABC',
+            'genre': 'Pop',
+        },
+        'expected_warnings': ['Failed to download video versions info'],
+    }, {
+        # no genres available
+        'url': 'http://www.vevo.com/watch/INS171400764',
+        'only_matching': True,
+    }, {
+        # Another case available only via the webpage; using streams/streamsV3 formats
+        # Geo-restricted to Netherlands/Germany
+        'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909',
+        'only_matching': True,
+    }, {
+        'url': 'https://embed.vevo.com/?isrc=USH5V1923499&partnerId=4d61b777-8023-4191-9ede-497ed6c24647&partnerAdCode=',
+        'only_matching': True,
+    }]
+    _VERSIONS = {
+        0: 'youtube',  # only in AuthenticateVideo videoVersions
+        1: 'level3',
+        2: 'akamai',
+        3: 'level3',
+        4: 'amazon',
+    }
+
+    def _initialize_api(self, video_id):
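+        # request an anonymous OAuth token; the legacy_token in the reply is
+        # appended to every apiv2.vevo.com request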
+        webpage = self._download_webpage(
+            'https://accounts.vevo.com/token', None,
+            note='Retrieving oauth token',
+            errnote='Unable to retrieve oauth token',
+            data=json.dumps({
+                'client_id': 'SPupX1tvqFEopQ1YS6SS',
+                'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
+
+        if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
+            self.raise_geo_restricted(
+                '%s said: This page is currently unavailable in your region' % self.IE_NAME)
+
+        auth_info = self._parse_json(webpage, video_id)
+        self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
+
+    def _call_api(self, path, *args, **kwargs):
+        try:
+            data = self._download_json(self._api_url_template % path, *args, **kwargs)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError):
+                errors = self._parse_json(e.cause.read().decode(), None)['errors']
+                error_message = ', '.join([error['message'] for error in errors])
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
+            raise
+        return data
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        self._initialize_api(video_id)
+
+        video_info = self._call_api(
+            'video/%s' % video_id, video_id, 'Downloading api video info',
+            'Failed to download video info')
+
+        video_versions = self._call_api(
+            'video/%s/streams' % video_id, video_id,
+            'Downloading video versions info',
+            'Failed to download video versions info',
+            fatal=False)
+
+        # Some videos are only available via webpage (e.g.
+        # https://github.com/ytdl-org/youtube-dl/issues/9366)
+        if not video_versions:
+            webpage = self._download_webpage(url, video_id)
+            json_data = self._extract_json(webpage, video_id)
+            if 'streams' in json_data.get('default', {}):
+                video_versions = json_data['default']['streams'][video_id][0]
+            else:
+                video_versions = [
+                    value
+                    for key, value in json_data['apollo']['data'].items()
+                    if key.startswith('%s.streams' % video_id)]
+
+        uploader = None
+        artist = None
+        featured_artist = None
+        artists = video_info.get('artists')
+        for curr_artist in artists or []:  # 'artists' may be absent from the API reply
+            if curr_artist.get('role') == 'Featured':
+                featured_artist = curr_artist['name']
+            else:
+                artist = uploader = curr_artist['name']
+
+        formats = []
+        for video_version in video_versions:
+            version = self._VERSIONS.get(video_version.get('version'), 'generic')
+            version_url = video_version.get('url')
+            if not version_url:
+                continue
+
+            if '.ism' in version_url:
+                continue
+            elif '.mpd' in version_url:
+                formats.extend(self._extract_mpd_formats(
+                    version_url, video_id, mpd_id='dash-%s' % version,
+                    note='Downloading %s MPD information' % version,
+                    errnote='Failed to download %s MPD information' % version,
+                    fatal=False))
+            elif '.m3u8' in version_url:
+                formats.extend(self._extract_m3u8_formats(
+                    version_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls-%s' % version,
+                    note='Downloading %s m3u8 information' % version,
+                    errnote='Failed to download %s m3u8 information' % version,
+                    fatal=False))
+            else:
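+                # progressive HTTP URLs encode the stream parameters in the
+                # file name (e.g. a hypothetical ..._640x360_h264_2000_aac_128.mp4)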
+                m = re.search(r'''(?xi)
+                    _(?P<width>[0-9]+)x(?P<height>[0-9]+)
+                    _(?P<vcodec>[a-z0-9]+)
+                    _(?P<vbr>[0-9]+)
+                    _(?P<acodec>[a-z0-9]+)
+                    _(?P<abr>[0-9]+)
+                    \.(?P<ext>[a-z0-9]+)''', version_url)
+                if not m:
+                    continue
+
+                formats.append({
+                    'url': version_url,
+                    'format_id': 'http-%s-%s' % (version, video_version['quality']),
+                    'vcodec': m.group('vcodec'),
+                    'acodec': m.group('acodec'),
+                    'vbr': int(m.group('vbr')),
+                    'abr': int(m.group('abr')),
+                    'ext': m.group('ext'),
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                })
+        self._sort_formats(formats)
+
+        track = video_info['title']
+        if featured_artist:
+            artist = '%s ft. %s' % (artist, featured_artist)
+        title = '%s - %s' % (artist, track) if artist else track
+
+        genres = video_info.get('genres')
+        genre = (
+            genres[0] if genres and isinstance(genres, list)
+            and isinstance(genres[0], compat_str) else None)
+
+        is_explicit = video_info.get('isExplicit')
+        if is_explicit is True:
+            age_limit = 18
+        elif is_explicit is False:
+            age_limit = 0
+        else:
+            age_limit = None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'),
+            'timestamp': parse_iso8601(video_info.get('releaseDate')),
+            'uploader': uploader,
+            'duration': int_or_none(video_info.get('duration')),
+            'view_count': int_or_none(video_info.get('views', {}).get('total')),
+            'age_limit': age_limit,
+            'track': track,
+            'artist': uploader,
+            'genre': genre,
+        }
+
+
+class VevoPlaylistIE(VevoBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29',
+        'info_dict': {
+            'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29',
+            'title': 'Best-Of: Birdman',
+        },
+        'playlist_count': 10,
+    }, {
+        'url': 'http://www.vevo.com/watch/genre/rock',
+        'info_dict': {
+            'id': 'rock',
+            'title': 'Rock',
+        },
+        'playlist_count': 20,
+    }, {
+        'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0',
+        'md5': '32dcdfddddf9ec6917fc88ca26d36282',
+        'info_dict': {
+            'id': 'USCMV1100073',
+            'ext': 'mp4',
+            'title': 'Birdman - Y.U. MAD',
+            'timestamp': 1323417600,
+            'upload_date': '20111209',
+            'uploader': 'Birdman',
+            'track': 'Y.U. MAD',
+            'artist': 'Birdman',
+            'genre': 'Rap/Hip-Hop',
+        },
+        'expected_warnings': ['Unable to download SMIL file'],
+    }, {
+        'url': 'http://www.vevo.com/watch/genre/rock?index=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        playlist_kind = mobj.group('kind')
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        index = qs.get('index', [None])[0]
+
+        if index:
+            video_id = self._search_regex(
+                r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>',
+                webpage, 'video id', default=None, group='id')
+            if video_id:
+                return self.url_result('vevo:%s' % video_id, VevoIE.ie_key())
+
+        playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind]
+
+        playlist = (list(playlists.values())[0]
+                    if playlist_kind == 'playlist' else playlists[playlist_id])
+
+        entries = [
+            self.url_result('vevo:%s' % src, VevoIE.ie_key())
+            for src in playlist['isrcs']]
+
+        return self.playlist_result(
+            entries, playlist.get('playlistId') or playlist_id,
+            playlist.get('name'), playlist.get('description'))
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
new file mode 100644 (file)
index 0000000..fe7a26b
--- /dev/null
@@ -0,0 +1,307 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .xstream import XstreamIE
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    try_get,
+)
+
+
+class VGTVIE(XstreamIE):
+    IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet'
+    _GEO_BYPASS = False
+
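+    # each supported host maps to an app name, and each app name to the
+    # vendor id used when querying the svp.vg.no asset API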
+    _HOST_TO_APPNAME = {
+        'vgtv.no': 'vgtv',
+        'bt.no/tv': 'bttv',
+        'aftenbladet.no/tv': 'satv',
+        'fvn.no/fvntv': 'fvntv',
+        'aftenposten.no/webtv': 'aptv',
+        'ap.vgtv.no/webtv': 'aptv',
+        'tv.aftonbladet.se/abtv': 'abtv',
+        'www.aftonbladet.se/tv': 'abtv',
+    }
+
+    _APP_NAME_TO_VENDOR = {
+        'vgtv': 'vgtv',
+        'bttv': 'bt',
+        'satv': 'sa',
+        'fvntv': 'fvn',
+        'aptv': 'ap',
+        'abtv': 'ab',
+    }
+
+    _VALID_URL = r'''(?x)
+                    (?:https?://(?:www\.)?
+                    (?P<host>
+                        %s
+                    )
+                    /?
+                    (?:
+                        (?:\#!/)?(?:video|live)/|
+                        embed?.*id=|
+                        a(?:rticles)?/
+                    )|
+                    (?P<appname>
+                        %s
+                    ):)
+                    (?P<id>\d+)
+                    ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys()))
+
+    _TESTS = [
+        {
+            # streamType: vod
+            'url': 'http://www.vgtv.no/#!/video/84196/hevnen-er-soet-episode-10-abu',
+            'md5': 'b8be7a234cebb840c0d512c78013e02f',
+            'info_dict': {
+                'id': '84196',
+                'ext': 'mp4',
+                'title': 'Hevnen er søt: Episode 10 - Abu',
+                'description': 'md5:e25e4badb5f544b04341e14abdc72234',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'duration': 648.000,
+                'timestamp': 1404626400,
+                'upload_date': '20140706',
+                'view_count': int,
+            },
+        },
+        {
+            # streamType: wasLive
+            'url': 'http://www.vgtv.no/#!/live/100764/opptak-vgtv-foelger-em-kvalifiseringen',
+            'info_dict': {
+                'id': '100764',
+                'ext': 'flv',
+                'title': 'OPPTAK: VGTV følger EM-kvalifiseringen',
+                'description': 'md5:3772d9c0dc2dff92a886b60039a7d4d3',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'duration': 9103.0,
+                'timestamp': 1410113864,
+                'upload_date': '20140907',
+                'view_count': int,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'skip': 'Video is no longer available',
+        },
+        {
+            # streamType: wasLive
+            'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla',
+            'info_dict': {
+                'id': '113063',
+                'ext': 'mp4',
+                'title': 'V75 fra Solvalla 30.05.15',
+                'description': 'md5:b3743425765355855f88e096acc93231',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'duration': 25966,
+                'timestamp': 1432975582,
+                'upload_date': '20150530',
+                'view_count': int,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more',
+            'md5': 'fd828cd29774a729bf4d4425fe192972',
+            'info_dict': {
+                'id': '21039',
+                'ext': 'mp4',
+                'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',
+                'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',
+                'duration': 66,
+                'timestamp': 1417002452,
+                'upload_date': '20141126',
+                'view_count': int,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil',
+            'only_matching': True,
+        },
+        {
+            # geoblocked
+            'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.aftonbladet.se/tv/a/36015',
+            'only_matching': True,
+        },
+        {
+            'url': 'abtv:140026',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        host = mobj.group('host')
+        appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname')
+        vendor = self._APP_NAME_TO_VENDOR[appname]
+
+        data = self._download_json(
+            'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website'
+            % (vendor, video_id, appname),
+            video_id, 'Downloading media JSON')
+
+        if data.get('status') == 'inactive':
+            raise ExtractorError(
+                'Video %s is no longer available' % video_id, expected=True)
+
+        info = {
+            'formats': [],
+        }
+        if len(video_id) == 5 and appname == 'bttv':
+            info = self._extract_video_info('btno', video_id)
+
+        streams = data['streamUrls']
+        stream_type = data.get('streamType')
+        is_live = stream_type == 'live'
+        formats = []
+
+        hls_url = streams.get('hls')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4',
+                entry_protocol='m3u8' if is_live else 'm3u8_native',
+                m3u8_id='hls', fatal=False))
+
+        hds_url = streams.get('hds')
+        if hds_url:
+            hdcore_sign = 'hdcore=3.7.0'
+            f4m_formats = self._extract_f4m_formats(
+                hds_url + '?%s' % hdcore_sign, video_id, f4m_id='hds', fatal=False)
+            if f4m_formats:
+                for entry in f4m_formats:
+                    # URLs without the extra param induce a 404 error
+                    entry.update({'extra_param_to_segment_url': hdcore_sign})
+                    formats.append(entry)
+
+        mp4_urls = streams.get('pseudostreaming') or []
+        mp4_url = streams.get('mp4')
+        if mp4_url:
+            mp4_urls.append(mp4_url)
+        for mp4_url in mp4_urls:
+            format_info = {
+                'url': mp4_url,
+            }
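+            # progressive MP4 URLs carry <width>_<height>_<tbr> in the filename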
+            mobj = re.search(r'(\d+)_(\d+)_(\d+)', mp4_url)
+            if mobj:
+                tbr = int(mobj.group(3))
+                format_info.update({
+                    'width': int(mobj.group(1)),
+                    'height': int(mobj.group(2)),
+                    'tbr': tbr,
+                    'format_id': 'mp4-%s' % tbr,
+                })
+            formats.append(format_info)
+
+        info['formats'].extend(formats)
+
+        if not info['formats']:
+            properties = try_get(
+                data, lambda x: x['streamConfiguration']['properties'], list)
+            if properties and 'geoblocked' in properties:
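+                # the country code is derived from the TLD of the matched host
+                # (e.g. vgtv.no -> NO)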
+                self.raise_geo_restricted(
+                    countries=[host.rpartition('.')[-1].partition('/')[0].upper()])
+
+        self._sort_formats(info['formats'])
+
+        info.update({
+            'id': video_id,
+            'title': self._live_title(data['title']) if is_live else data['title'],
+            'description': data['description'],
+            'thumbnail': data['images']['main'] + '?t[]=900x506q80',
+            'timestamp': data['published'],
+            'duration': float_or_none(data['duration'], 1000),
+            'view_count': data['displays'],
+            'is_live': is_live,
+        })
+        return info
+
+
+class BTArticleIE(InfoExtractor):
+    IE_NAME = 'bt:article'
+    IE_DESC = 'Bergens Tidende Articles'
+    _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
+    _TEST = {
+        'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html',
+        'md5': '2acbe8ad129b3469d5ae51b1158878df',
+        'info_dict': {
+            'id': '23199',
+            'ext': 'mp4',
+            'title': 'Alrekstad internat',
+            'description': 'md5:dc81a9056c874fedb62fc48a300dac58',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 191,
+            'timestamp': 1289991323,
+            'upload_date': '20101117',
+            'view_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage(url, self._match_id(url))
+        video_id = self._search_regex(
+            r'<video[^>]+data-id="(\d+)"', webpage, 'video id')
+        return self.url_result('bttv:%s' % video_id, 'VGTV')
+
+
+class BTVestlendingenIE(InfoExtractor):
+    IE_NAME = 'bt:vestlendingen'
+    IE_DESC = 'Bergens Tidende - Vestlendingen'
+    _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',
+        'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+        'info_dict': {
+            'id': '86588',
+            'ext': 'mov',
+            'title': 'Otto Wollertsen',
+            'description': 'Vestlendingen Otto Fredrik Wollertsen',
+            'timestamp': 1430473209,
+            'upload_date': '20150501',
+        },
+        'skip': '404 Error',
+    }, {
+        'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255',
+        'md5': 'a2893f8632e96389f4bdf36aa9463ceb',
+        'info_dict': {
+            'id': '86255',
+            'ext': 'mov',
+            'title': 'Du må tåle å fryse og være sulten',
+            'description': 'md5:b8046f4d022d5830ddab04865791d063',
+            'upload_date': '20150321',
+            'timestamp': 1426942023,
+        },
+    }]
+
+    def _real_extract(self, url):
+        return self.url_result('bttv:%s' % self._match_id(url), 'VGTV')
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
new file mode 100644 (file)
index 0000000..dff94a2
--- /dev/null
@@ -0,0 +1,41 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .mtv import MTVServicesInfoExtractor
+
+
+class VH1IE(MTVServicesInfoExtractor):
+    IE_NAME = 'vh1.com'
+    _FEED_URL = 'http://www.vh1.com/feeds/mrss/'
+    _TESTS = [{
+        'url': 'http://www.vh1.com/episodes/0umwpq/hip-hop-squares-kent-jones-vs-nick-young-season-1-ep-120',
+        'info_dict': {
+            'title': 'Kent Jones vs. Nick Young',
+            'description': 'Come to Play. Stay to Party. With Mike Epps, TIP, O’Shea Jackson Jr., T-Pain, Tisha Campbell-Martin and more.',
+        },
+        'playlist_mincount': 4,
+    }, {
+        # Clip
+        'url': 'http://www.vh1.com/video-clips/t74mif/scared-famous-scared-famous-extended-preview',
+        'info_dict': {
+            'id': '0a50c2d2-a86b-4141-9565-911c7e2d0b92',
+            'ext': 'mp4',
+            'title': 'Scared Famous|October 9, 2017|1|NO-EPISODE#|Scared Famous + Extended Preview',
+            'description': 'md5:eff5551a274c473a29463de40f7b09da',
+            'upload_date': '20171009',
+            'timestamp': 1507574700,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
+
+    _VALID_URL = r'https?://(?:www\.)?vh1\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)'
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        mgid = self._extract_triforce_mgid(webpage)
+        return self._get_videos_info(mgid)
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
new file mode 100644 (file)
index 0000000..e374995
--- /dev/null
@@ -0,0 +1,337 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+import hashlib
+import json
+import random
+import re
+import time
+
+from .adobepass import AdobePassIE
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    OnDemandPagedList,
+    parse_age_limit,
+    str_or_none,
+    try_get,
+)
+
+
+class ViceBaseIE(InfoExtractor):
+    def _call_api(self, resource, resource_key, resource_id, locale, fields, args=''):
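+        # issue a GraphQL query (sent as the "query" URL parameter) and unwrap
+        # the requested resource from the response envelope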
+        return self._download_json(
+            'https://video.vice.com/api/v1/graphql', resource_id, query={
+                'query': '''{
+  %s(locale: "%s", %s: "%s"%s) {
+    %s
+  }
+}''' % (resource, locale, resource_key, resource_id, args, fields),
+            })['data'][resource]
+
+
+class ViceIE(ViceBaseIE, AdobePassIE):
+    IE_NAME = 'vice'
+    _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})'
+    _TESTS = [{
+        'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7',
+        'info_dict': {
+            'id': '58c69e38a55424f1227dc3f7',
+            'ext': 'mp4',
+            'title': '10 Questions You Always Wanted To Ask: Pet Cremator',
+            'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5',
+            'uploader': 'vice',
+            'uploader_id': '57a204088cb727dec794c67b',
+            'timestamp': 1489664942,
+            'upload_date': '20170316',
+            'age_limit': 14,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # geo restricted to US
+        'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
+        'info_dict': {
+            'id': '5816510690b70e6c5fd39a56',
+            'ext': 'mp4',
+            'uploader': 'vice',
+            'title': 'The Signal From Tölva',
+            'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
+            'uploader_id': '57a204088cb727dec794c67b',
+            'timestamp': 1477941983,
+            'upload_date': '20161031',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
+        'info_dict': {
+            'id': '581b12b60a0e1f4c0fb6ea2f',
+            'ext': 'mp4',
+            'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
+            'description': 'Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.',
+            'uploader': 'vice',
+            'uploader_id': '57a204088cb727dec794c67b',
+            'timestamp': 1485368119,
+            'upload_date': '20170125',
+            'age_limit': 14,
+        },
+        'params': {
+            # AES-encrypted m3u8
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
+        'only_matching': True,
+    }, {
+        'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060',
+        'only_matching': True,
+    }, {
+        'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})',
+            webpage)
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = ViceIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    def _real_extract(self, url):
+        locale, video_id = re.match(self._VALID_URL, url).groups()
+
+        video = self._call_api('videos', 'id', video_id, locale, '''body
+    locked
+    rating
+    thumbnail_url
+    title''')[0]
+        title = video['title'].strip()
+        rating = video.get('rating')
+
+        query = {}
+        if video.get('locked'):
+            resource = self._get_mvpd_resource(
+                'VICELAND', title, video_id, rating)
+            query['tvetoken'] = self._extract_mvpd_auth(
+                url, video_id, 'VICELAND', resource)
+
+        # The signature generation algorithm is reverse engineered from
+        # signatureGenerator in
+        # webpack:///../shared/~/vice-player/dist/js/vice-player.js in
+        # https://www.viceland.com/assets/common/js/web.vendor.bundle.js;
+        # the new JS is located at
+        # https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js
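+        # exp sets the signature expiry 1440 seconds (24 minutes) ahead; "sign"
+        # below is the SHA-512 digest of "<video_id>:GET:<exp>"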
+        exp = int(time.time()) + 1440
+
+        query.update({
+            'exp': exp,
+            'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
+            'skipadstitching': 1,
+            'platform': 'desktop',
+            'rn': random.randint(10000, 100000),
+        })
+
+        try:
+            preplay = self._download_json(
+                'https://vms.vice.com/%s/video/preplay/%s' % (locale, video_id),
+                video_id, query=query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401):
+                error = json.loads(e.cause.read().decode())
+                error_message = error.get('error_description') or error['details']
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, error_message), expected=True)
+            raise
+
+        video_data = preplay['video']
+        formats = self._extract_m3u8_formats(
+            preplay['playURL'], video_id, 'mp4', 'm3u8_native')
+        self._sort_formats(formats)
+        episode = video_data.get('episode') or {}
+        channel = video_data.get('channel') or {}
+        season = video_data.get('season') or {}
+
+        subtitles = {}
+        for subtitle in preplay.get('subtitleURLs', []):
+            cc_url = subtitle.get('url')
+            if not cc_url:
+                continue
+            language_code = try_get(
+                subtitle, lambda x: x['languages'][0]['language_code'],
+                compat_str) or 'en'
+            subtitles.setdefault(language_code, []).append({
+                'url': cc_url,
+            })
+
+        return {
+            'formats': formats,
+            'id': video_id,
+            'title': title,
+            'description': clean_html(video.get('body')),
+            'thumbnail': video.get('thumbnail_url'),
+            'duration': int_or_none(video_data.get('video_duration')),
+            'timestamp': int_or_none(video_data.get('created_at'), 1000),
+            'age_limit': parse_age_limit(video_data.get('video_rating') or rating),
+            'series': try_get(video_data, lambda x: x['show']['base']['display_title'], compat_str),
+            'episode_number': int_or_none(episode.get('episode_number')),
+            'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
+            'season_number': int_or_none(season.get('season_number')),
+            'season_id': str_or_none(season.get('id') or video_data.get('season_id')),
+            'uploader': channel.get('name'),
+            'uploader_id': str_or_none(channel.get('id')),
+            'subtitles': subtitles,
+        }
+
+
+class ViceShowIE(ViceBaseIE):
+    IE_NAME = 'vice:show'
+    _VALID_URL = r'https?://(?:video\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/show/(?P<id>[^/?#&]+)'
+    _PAGE_SIZE = 25
+    _TESTS = [{
+        'url': 'https://video.vice.com/en_us/show/fck-thats-delicious',
+        'info_dict': {
+            'id': '57a2040c8cb727dec794c901',
+            'title': 'F*ck, That’s Delicious',
+            'description': 'The life and eating habits of rap’s greatest bon vivant, Action Bronson.',
+        },
+        'playlist_mincount': 64,
+    }, {
+        'url': 'https://www.vicetv.com/en_us/show/fck-thats-delicious',
+        'only_matching': True,
+    }]
+
+    def _fetch_page(self, locale, show_id, page):
+        videos = self._call_api('videos', 'show_id', show_id, locale, '''body
+    id
+    url''', ', page: %d, per_page: %d' % (page + 1, self._PAGE_SIZE))
+        for video in videos:
+            yield self.url_result(
+                video['url'], ViceIE.ie_key(), video.get('id'))
+
+    def _real_extract(self, url):
+        locale, display_id = re.match(self._VALID_URL, url).groups()
+        show = self._call_api('shows', 'slug', display_id, locale, '''dek
+    id
+    title''')[0]
+        show_id = show['id']
+
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, locale, show_id),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, show_id, show.get('title'), show.get('dek'))
+
+
+class ViceArticleIE(ViceBaseIE):
+    IE_NAME = 'vice:article'
+    _VALID_URL = r'https://(?:www\.)?vice\.com/(?P<locale>[^/]+)/article/(?:[0-9a-z]{6}/)?(?P<id>[^?#]+)'
+
+    _TESTS = [{
+        'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
+        'info_dict': {
+            'id': '58dc0a3dee202d2a0ccfcbd8',
+            'ext': 'mp4',
+            'title': 'Mormon War on Porn',
+            'description': 'md5:1c5d91fe25fa8aa304f9def118b92dbf',
+            'uploader': 'vice',
+            'uploader_id': '57a204088cb727dec794c67b',
+            'timestamp': 1491883129,
+            'upload_date': '20170411',
+            'age_limit': 17,
+        },
+        'params': {
+            # AES-encrypted m3u8
+            'skip_download': True,
+        },
+        'add_ie': [ViceIE.ie_key()],
+    }, {
+        'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
+        'md5': '13010ee0bc694ea87ec40724397c2349',
+        'info_dict': {
+            'id': '3jstaBeXgAs',
+            'ext': 'mp4',
+            'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
+            'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
+            'uploader': 'Motherboard',
+            'uploader_id': 'MotherboardTV',
+            'upload_date': '20140529',
+        },
+        'add_ie': [YoutubeIE.ie_key()],
+    }, {
+        'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded',
+        'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
+        'info_dict': {
+            'id': '57f41d3556a0a80f54726060',
+            'ext': 'mp4',
+            'title': "Making The World's First Male Sex Doll",
+            'description': 'md5:19b00b215b99961cf869c40fbe9df755',
+            'uploader': 'vice',
+            'uploader_id': '57a204088cb727dec794c67b',
+            'timestamp': 1476919911,
+            'upload_date': '20161019',
+            'age_limit': 17,
+        },
+        'params': {
+            'skip_download': True,
+            'format': 'bestvideo',
+        },
+        'add_ie': [ViceIE.ie_key()],
+    }, {
+        'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        locale, display_id = re.match(self._VALID_URL, url).groups()
+
+        article = self._call_api('articles', 'slug', display_id, locale, '''body
+    embed_code''')[0]
+        body = article['body']
+
+        def _url_res(video_url, ie_key):
+            return {
+                '_type': 'url_transparent',
+                'url': video_url,
+                'display_id': display_id,
+                'ie_key': ie_key,
+            }
+
+        vice_url = ViceIE._extract_url(body)
+        if vice_url:
+            return _url_res(vice_url, ViceIE.ie_key())
+
+        embed_code = self._search_regex(
+            r'embedCode=([^&\'"]+)', body,
+            'ooyala embed code', default=None)
+        if embed_code:
+            return _url_res('ooyala:%s' % embed_code, 'Ooyala')
+
+        youtube_url = YoutubeIE._extract_url(body)
+        if youtube_url:
+            return _url_res(youtube_url, YoutubeIE.ie_key())
+
+        video_url = self._html_search_regex(
+            r'data-video-url="([^"]+)"',
+            article['embed_code'], 'video URL')
+
+        return _url_res(video_url, ViceIE.ie_key())
diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py
new file mode 100644 (file)
index 0000000..91f45b7
--- /dev/null
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    remove_end,
+    unified_strdate,
+)
+
+
+class VidbitIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2',
+        'md5': '1a34b7f14defe3b8fafca9796892924d',
+        'info_dict': {
+            'id': 'jkL2yDOEq2',
+            'ext': 'mp4',
+            'title': 'Intro to VidBit',
+            'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7',
+            'thumbnail': r're:https?://.*\.jpg$',
+            'upload_date': '20160618',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id)
+
+        video_url = title = None
+
+        config = self._parse_json(self._search_regex(
+            r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'),
+            video_id, transform_source=js_to_json)
+        if config:
+            if config.get('file'):
+                video_url = compat_urlparse.urljoin(url, config['file'])
+            title = config.get('title')
+
+        if not video_url:
+            video_url = compat_urlparse.urljoin(url, self._search_regex(
+                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+                webpage, 'video URL', group='url'))
+
+        if not title:
+            title = remove_end(
+                self._html_search_regex(
+                    (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'),
+                    webpage, 'title', default=None) or self._og_search_title(webpage),
+                ' - VidBit')
+
+        description = self._html_search_meta(
+            ('description', 'og:description', 'twitter:description'),
+            webpage, 'description')
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'datePublished', webpage, 'upload date'))
+
+        view_count = int_or_none(self._search_regex(
+            r'<strong>(\d+)</strong> views',
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._search_regex(
+            r'id=["\']cmt_num["\'][^>]*>\((\d+)\)',
+            webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
new file mode 100644 (file)
index 0000000..6423584
--- /dev/null
@@ -0,0 +1,138 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    int_or_none,
+)
+
+
+class ViddlerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?'
+    _TESTS = [{
+        'url': 'http://www.viddler.com/v/43903784',
+        'md5': '9eee21161d2c7f5b39690c3e325fab2f',
+        'info_dict': {
+            'id': '43903784',
+            'ext': 'mov',
+            'title': 'Video Made Easy',
+            'description': 'md5:6a697ebd844ff3093bd2e82c37b409cd',
+            'uploader': 'viddler',
+            'timestamp': 1335371429,
+            'upload_date': '20120425',
+            'duration': 100.89,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'view_count': int,
+            'comment_count': int,
+            'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'],
+        }
+    }, {
+        'url': 'http://www.viddler.com/v/4d03aad9/',
+        'md5': 'f12c5a7fa839c47a79363bfdf69404fb',
+        'info_dict': {
+            'id': '4d03aad9',
+            'ext': 'ts',
+            'title': 'WALL-TO-GORTAT',
+            'upload_date': '20150126',
+            'uploader': 'deadspin',
+            'timestamp': 1422285291,
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://www.viddler.com/player/221ebbbd/0/',
+        'md5': '740511f61d3d1bb71dc14a0fe01a1c10',
+        'info_dict': {
+            'id': '221ebbbd',
+            'ext': 'mov',
+            'title': 'LETeens-Grammar-snack-third-conditional',
+            'description': ' ',
+            'upload_date': '20140929',
+            'uploader': 'BCLETeens',
+            'timestamp': 1411997190,
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        # secret protected
+        'url': 'http://www.viddler.com/v/890c0985?secret=34051570',
+        'info_dict': {
+            'id': '890c0985',
+            'ext': 'mp4',
+            'title': 'Complete Property Training - Traineeships',
+            'description': ' ',
+            'upload_date': '20130606',
+            'uploader': 'TiffanyBowtell',
+            'timestamp': 1370496993,
+            'view_count': int,
+            'comment_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id, secret = re.match(self._VALID_URL, url).groups()
+
+        query = {
+            'video_id': video_id,
+            'key': 'v0vhrt7bg2xq1vyxhkct',
+        }
+        if secret:
+            query['secret'] = secret
+
+        data = self._download_json(
+            'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json',
+            video_id, headers={'Referer': url}, query=query)['video']
+
+        formats = []
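+        # each ready file can yield up to three variants: the plain URL (lowest
+        # source preference), an HTML5 source and a CDN mirror (highest)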
+        for filed in data['files']:
+            if filed.get('status', 'ready') != 'ready':
+                continue
+            format_id = filed.get('profile_id') or filed['profile_name']
+            f = {
+                'format_id': format_id,
+                'format_note': filed['profile_name'],
+                'url': self._proto_relative_url(filed['url']),
+                'width': int_or_none(filed.get('width')),
+                'height': int_or_none(filed.get('height')),
+                'filesize': int_or_none(filed.get('size')),
+                'ext': filed.get('ext'),
+                'source_preference': -1,
+            }
+            formats.append(f)
+
+            if filed.get('cdn_url'):
+                f = f.copy()
+                f['url'] = self._proto_relative_url(filed['cdn_url'], 'http:')
+                f['format_id'] = format_id + '-cdn'
+                f['source_preference'] = 1
+                formats.append(f)
+
+            if filed.get('html5_video_source'):
+                f = f.copy()
+                f['url'] = self._proto_relative_url(filed['html5_video_source'])
+                f['format_id'] = format_id + '-html5'
+                f['source_preference'] = 0
+                formats.append(f)
+        self._sort_formats(formats)
+
+        categories = [
+            t.get('text') for t in data.get('tags', []) if 'text' in t]
+
+        return {
+            'id': video_id,
+            'title': data['title'],
+            'formats': formats,
+            'description': data.get('description'),
+            'timestamp': int_or_none(data.get('upload_time')),
+            'thumbnail': self._proto_relative_url(data.get('thumbnail_url')),
+            'uploader': data.get('author'),
+            'duration': float_or_none(data.get('length')),
+            'view_count': int_or_none(data.get('view_count')),
+            'comment_count': int_or_none(data.get('comment_count')),
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py
new file mode 100644 (file)
index 0000000..a03614c
--- /dev/null
@@ -0,0 +1,164 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import random
+import string
+import struct
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    parse_codecs,
+    xpath_element,
+    xpath_text,
+)
+from ..compat import (
+    compat_b64decode,
+    compat_ord,
+    compat_parse_qs,
+)
+
+
+class VideaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        videa(?:kid)?\.hu/
+                        (?:
+                            videok/(?:[^/]+/)*[^?#&]+-|
+                            player\?.*?\bv=|
+                            player/v/
+                        )
+                        (?P<id>[^?#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ',
+        'md5': '97a7af41faeaffd9f1fc864a7c7e7603',
+        'info_dict': {
+            'id': '8YfIAjxwWGwT8HVQ',
+            'ext': 'mp4',
+            'title': 'Az őrült kígyász 285 kígyót enged szabadon',
+            'thumbnail': r're:^https?://.*',
+            'duration': 21,
+        },
+    }, {
+        'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
+        'only_matching': True,
+    }, {
+        'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ',
+        'only_matching': True,
+    }, {
+        'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
+        'only_matching': True,
+    }, {
+        'url': 'https://videakid.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH',
+        'only_matching': True,
+    }, {
+        'url': 'https://videakid.hu/player?v=8YfIAjxwWGwT8HVQ',
+        'only_matching': True,
+    }, {
+        'url': 'https://videakid.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
+            webpage)]
+
+    def rc4(self, ciphertext, key):
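+        # standard RC4: the key-scheduling loop below initializes S, then the
+        # PRGA keystream is XORed with the ciphertext byte by byte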
+        res = b''
+
+        key_len = len(key)
+        S = list(range(256))
+
+        j = 0
+        for i in range(256):
+            j = (j + S[i] + ord(key[i % key_len])) % 256
+            S[i], S[j] = S[j], S[i]
+
+        i = 0
+        j = 0
+        for m in range(len(ciphertext)):
+            i = (i + 1) % 256
+            j = (j + S[i]) % 256
+            S[i], S[j] = S[j], S[i]
+            k = S[(S[i] + S[j]) % 256]
+            res += struct.pack("B", k ^ compat_ord(ciphertext[m]))
+
+        return res
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id, fatal=True)
+        error = self._search_regex(
+            r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        video_src_params_raw = self._search_regex(
+            r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"',
+            webpage, 'video_src_params')
+        video_src_params = compat_parse_qs(video_src_params_raw)
+        player_page = self._download_webpage(
+            'https://videa.hu/videojs_player?%s' % video_src_params_raw,
+            video_id, fatal=True)
+        nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce')
+        random_seed = ''.join(random.choice(
+            string.ascii_uppercase + string.ascii_lowercase + string.digits)
+            for _ in range(8))
+        static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'
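+        # the 64-character nonce splits into two halves; the position of each
+        # char of the first half within the static secret selects a char from
+        # the second half, recovering the request token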
+        left = nonce[:32]
+        right = nonce[32:]
+        result = ''
+        for i in range(32):
+            result += right[i - (static_secret.index(left[i]) - 31)]
+
+        video_src_params['_s'] = random_seed
+        video_src_params['_t'] = result[:16]
+        encryption_key_stem = result[16:] + random_seed
+
+        [b64_info, handle] = self._download_webpage_handle(
+            'http://videa.hu/videaplayer_get_xml.php', video_id,
+            query=video_src_params, fatal=True)
+
+        encrypted_info = compat_b64decode(b64_info)
+        key = encryption_key_stem + handle.info()['x-videa-xs']
+        info_str = self.rc4(encrypted_info, key).decode('utf8')
+        info = self._parse_xml(info_str, video_id)
+
+        video = xpath_element(info, './/video', 'video', fatal=True)
+        sources = xpath_element(info, './/video_sources', 'sources', fatal=True)
+        hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True)
+
+        title = xpath_text(video, './title', fatal=True)
+
+        formats = []
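+        # every source URL has to be signed with the matching md5 hash and
+        # expiry timestamp from the <hash_values> element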
+        for source in sources.findall('./video_source'):
+            source_url = source.text
+            if not source_url:
+                continue
+            source_url += '?md5=%s&expires=%s' % (
+                hash_values.find('hash_value_%s' % source.get('name')).text,
+                source.get('exp'))
+            f = parse_codecs(source.get('codecs'))
+            f.update({
+                'url': source_url,
+                'ext': mimetype2ext(source.get('mimetype')) or 'mp4',
+                'format_id': source.get('name'),
+                'width': int_or_none(source.get('width')),
+                'height': int_or_none(source.get('height')),
+            })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        thumbnail = xpath_text(video, './poster_src')
+        duration = int_or_none(xpath_text(video, './duration'))
+
+        age_limit = None
+        is_adult = xpath_text(video, './is_adult_content', default=None)
+        if is_adult:
+            age_limit = 18 if is_adult == '1' else 0
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
new file mode 100644 (file)
index 0000000..fe70db7
--- /dev/null
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .internetvideoarchive import InternetVideoArchiveIE
+
+
+class VideoDetectiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.videodetective.com/movies/kick-ass-2/194487',
+        'info_dict': {
+            'id': '194487',
+            'ext': 'mp4',
+            'title': 'Kick-Ass 2',
+            'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        query = 'customerid=69249&publishedid=' + video_id
+        return self.url_result(
+            InternetVideoArchiveIE._build_json_url(query),
+            ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py
new file mode 100644 (file)
index 0000000..cd3f50a
--- /dev/null
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class VideofyMeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.videofy\.me/.+?|p\.videofy\.me/v)/(?P<id>\d+)(&|#|$)'
+    IE_NAME = 'videofy.me'
+
+    _TEST = {
+        'url': 'http://www.videofy.me/thisisvideofyme/1100701',
+        'md5': 'c77d700bdc16ae2e9f3c26019bd96143',
+        'info_dict': {
+            'id': '1100701',
+            'ext': 'mp4',
+            'title': 'This is VideofyMe',
+            'description': '',
+            'upload_date': '20130326',
+            'timestamp': 1364288959,
+            'uploader': 'VideofyMe',
+            'uploader_id': 'thisisvideofyme',
+            'view_count': int,
+            'likes': int,
+            'comment_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        config = self._download_json(
+            'http://vf-player-info-loader.herokuapp.com/%s.json' % video_id,
+            video_id)['videoinfo']
+
+        video = config.get('video')
+        blog = config.get('blog', {})
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'url': video['sources']['source']['url'],
+            'thumbnail': video.get('thumb'),
+            'description': video.get('description'),
+            'timestamp': parse_iso8601(video.get('date')),
+            'uploader': blog.get('name'),
+            'uploader_id': blog.get('identifier'),
+            'view_count': int_or_none(self._search_regex(
+                r'([0-9]+)', video.get('views'), 'view count', fatal=False)),
+            'likes': int_or_none(video.get('likes')),
+            'comment_count': int_or_none(video.get('nrOfComments')),
+        }
diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py
new file mode 100644 (file)
index 0000000..e3eda33
--- /dev/null
@@ -0,0 +1,307 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    orderedSet,
+    parse_duration,
+    str_or_none,
+    unified_strdate,
+    url_or_none,
+    xpath_element,
+    xpath_text,
+)
+
+
+class VideomoreIE(InfoExtractor):
+    IE_NAME = 'videomore'
+    _VALID_URL = r'''(?x)
+                    videomore:(?P<sid>\d+)$|
+                    https?://(?:player\.)?videomore\.ru/
+                        (?:
+                            (?:
+                                embed|
+                                [^/]+/[^/]+
+                            )/|
+                            [^/]*\?.*?\btrack_id=
+                        )
+                        (?P<id>\d+)
+                        (?:[/?#&]|\.(?:xml|json)|$)
+                    '''
+    _TESTS = [{
+        'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617',
+        'md5': '44455a346edc0d509ac5b5a5b531dc35',
+        'info_dict': {
+            'id': '367617',
+            'ext': 'flv',
+            'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук',
+            'series': 'Кино в деталях',
+            'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2910,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 16,
+        },
+    }, {
+        'url': 'http://videomore.ru/embed/259974',
+        'info_dict': {
+            'id': '259974',
+            'ext': 'flv',
+            'title': 'Молодежка 2 сезон 40 серия',
+            'series': 'Молодежка',
+            'episode': '40 серия',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 2809,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 16,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://videomore.ru/molodezhka/sezon_promo/341073',
+        'info_dict': {
+            'id': '341073',
+            'ext': 'flv',
+            'title': 'Промо Команда проиграла из-за Бакина?',
+            'episode': 'Команда проиграла из-за Бакина?',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 29,
+            'age_limit': 16,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://videomore.ru/elki_3?track_id=364623',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomore.ru/embed/364623',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomore.ru/video/tracks/364623.xml',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomore.ru/video/tracks/364623.json',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomore.ru/video/tracks/158031/quotes/33248',
+        'only_matching': True,
+    }, {
+        'url': 'videomore:367617',
+        'only_matching': True,
+    }, {
+        'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1',
+            webpage)
+        if not mobj:
+            mobj = re.search(
+                r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)',
+                webpage)
+
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('sid') or mobj.group('id')
+
+        video = self._download_xml(
+            'http://videomore.ru/video/tracks/%s.xml' % video_id,
+            video_id, 'Downloading video XML')
+
+        item = xpath_element(video, './/playlist/item', fatal=True)
+
+        title = xpath_text(
+            item, ('./title', './episode_name'), 'title', fatal=True)
+
+        video_url = xpath_text(item, './video_url', 'video url', fatal=True)
+        formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds')
+        self._sort_formats(formats)
+
+        thumbnail = xpath_text(item, './thumbnail_url')
+        duration = int_or_none(xpath_text(item, './duration'))
+        view_count = int_or_none(xpath_text(item, './views'))
+        comment_count = int_or_none(xpath_text(item, './count_comments'))
+        age_limit = int_or_none(xpath_text(item, './min_age'))
+
+        series = xpath_text(item, './project_name')
+        episode = xpath_text(item, './episode_name')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'series': series,
+            'episode': episode,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
+
+
+class VideomoreVideoIE(InfoExtractor):
+    IE_NAME = 'videomore:video'
+    _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$'
+    _TESTS = [{
+        # single video with og:video:iframe
+        'url': 'http://videomore.ru/elki_3',
+        'info_dict': {
+            'id': '364623',
+            'ext': 'flv',
+            'title': 'Ёлки 3',
+            'description': '',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 5579,
+            'age_limit': 6,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # season single series with og:video:iframe
+        'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya',
+        'only_matching': True,
+    }, {
+        'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk',
+        'only_matching': True,
+    }, {
+        # single video without og:video:iframe
+        'url': 'http://videomore.ru/marin_i_ego_druzya',
+        'info_dict': {
+            'id': '359073',
+            'ext': 'flv',
+            'title': '1 серия. Здравствуй, Аквавилль!',
+            'description': 'md5:c6003179538b5d353e7bcd5b1372b2d7',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 754,
+            'age_limit': 6,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if VideomoreIE.suitable(url) else super(VideomoreVideoIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._og_search_property(
+            'video:iframe', webpage, 'video url', default=None)
+
+        if not video_url:
+            video_id = self._search_regex(
+                (r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml',
+                 r'track-id=["\'](\d+)',
+                 r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id')
+            video_url = 'videomore:%s' % video_id
+        else:
+            video_id = None
+
+        return self.url_result(
+            video_url, ie=VideomoreIE.ie_key(), video_id=video_id)
+
+
+class VideomoreSeasonIE(InfoExtractor):
+    IE_NAME = 'videomore:season'
+    _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$'
+    _TESTS = [{
+        'url': 'http://videomore.ru/molodezhka/sezon_promo',
+        'info_dict': {
+            'id': 'molodezhka/sezon_promo',
+            'title': 'Молодежка Промо',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url))
+                else super(VideomoreSeasonIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        data = self._parse_json(
+            self._html_search_regex(
+                r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1',
+                webpage, 'data', default='{}', group='value'),
+            display_id, fatal=False)
+
+        entries = []
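+        # prefer the episode list from the seasons-tracks JSON; fall back to
+        # track ids scraped from the markup, then to widget item links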
+
+        if data:
+            episodes = data.get('episodes')
+            if isinstance(episodes, list):
+                for ep in episodes:
+                    if not isinstance(ep, dict):
+                        continue
+                    ep_id = int_or_none(ep.get('id'))
+                    ep_url = url_or_none(ep.get('url'))
+                    if ep_id:
+                        e = {
+                            'url': 'videomore:%s' % ep_id,
+                            'id': compat_str(ep_id),
+                        }
+                    elif ep_url:
+                        e = {'url': ep_url}
+                    else:
+                        continue
+                    e.update({
+                        '_type': 'url',
+                        'ie_key': VideomoreIE.ie_key(),
+                        'title': str_or_none(ep.get('title')),
+                        'thumbnail': url_or_none(ep.get('image')),
+                        'duration': parse_duration(ep.get('duration')),
+                        'episode_number': int_or_none(ep.get('number')),
+                        'upload_date': unified_strdate(ep.get('date')),
+                    })
+                    entries.append(e)
+
+        if not entries:
+            entries = [
+                self.url_result(
+                    'videomore:%s' % video_id, ie=VideomoreIE.ie_key(),
+                    video_id=video_id)
+                for video_id in orderedSet(re.findall(
+                    r':(?:id|key)=["\'](\d+)["\']', webpage))]
+
+        if not entries:
+            entries = [
+                self.url_result(item) for item in re.findall(
+                    r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"'
+                    % display_id, webpage)]
+
+        return self.playlist_result(entries, display_id, title)
diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py
new file mode 100644 (file)
index 0000000..e5f964d
--- /dev/null
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    parse_age_limit,
+    qualities,
+    random_birthday,
+    try_get,
+    unified_timestamp,
+    urljoin,
+)
+
+
+class VideoPressIE(InfoExtractor):
+    _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'https://videopress.com/embed/kUJmAcSf',
+        'md5': '706956a6c875873d51010921310e4bc6',
+        'info_dict': {
+            'id': 'kUJmAcSf',
+            'ext': 'mp4',
+            'title': 'VideoPress Demo',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 634.6,
+            'timestamp': 1434983935,
+            'upload_date': '20150622',
+            'age_limit': 0,
+        },
+    }, {
+        # 17+, requires birth_* params
+        'url': 'https://videopress.com/embed/iH3gstfZ',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        query = random_birthday('birth_year', 'birth_month', 'birth_day')
+        video = self._download_json(
+            'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
+            video_id, query=query)
+
+        title = video['title']
+
+        def base_url(scheme):
+            return try_get(
+                video, lambda x: x['file_url_base'][scheme], compat_str)
+
+        base_url = base_url('https') or base_url('http')
+
+        QUALITIES = ('std', 'dvd', 'hd')
+        quality = qualities(QUALITIES)
+
+        formats = []
+        for format_id, f in video['files'].items():
+            if not isinstance(f, dict):
+                continue
+            for ext, path in f.items():
+                if ext in ('mp4', 'ogg'):
+                    formats.append({
+                        'url': urljoin(base_url, path),
+                        'format_id': '%s-%s' % (format_id, ext),
+                        'ext': determine_ext(path, ext),
+                        'quality': quality(format_id),
+                    })
+        original_url = try_get(video, lambda x: x['original'], compat_str)
+        if original_url:
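+            # the original file ranks above all named qualities (std/dvd/hd)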
+            formats.append({
+                'url': original_url,
+                'format_id': 'original',
+                'quality': len(QUALITIES),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('poster'),
+            'duration': float_or_none(video.get('duration'), 1000),
+            'timestamp': unified_timestamp(video.get('upload_date')),
+            'age_limit': parse_age_limit(video.get('rating')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
new file mode 100644 (file)
index 0000000..b48baf0
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VidioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015',
+        'md5': 'cd2801394afc164e9775db6a140b91fe',
+        'info_dict': {
+            'id': '165683',
+            'display_id': 'dj_ambred-booyah-live-2015',
+            'ext': 'mp4',
+            'title': 'DJ_AMBRED - Booyah (Live 2015)',
+            'description': 'md5:27dc15f819b6a78a626490881adbadf8',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 149,
+            'like_count': int,
+        },
+    }, {
+        'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._og_search_title(webpage)
+
+        m3u8_url = duration = thumbnail = None
+
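+        # metadata comes primarily from the data-json-clips attribute; missing
+        # fields fall back to individual data attributes and Open Graph tags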
+        clips = self._parse_json(
+            self._html_search_regex(
+                r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1',
+                webpage, 'video data', default='[]', group='data'),
+            display_id, fatal=False)
+        if clips:
+            clip = clips[0]
+            m3u8_url = clip.get('sources', [{}])[0].get('file')
+            duration = clip.get('clip_duration')
+            thumbnail = clip.get('image')
+
+        m3u8_url = m3u8_url or self._search_regex(
+            r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'hls url', group='url')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+        self._sort_formats(formats)
+
+        duration = int_or_none(duration or self._search_regex(
+            r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage,
+            'duration', fatal=False, group='duration'))
+        thumbnail = thumbnail or self._og_search_thumbnail(webpage)
+
+        like_count = int_or_none(self._search_regex(
+            (r'<span[^>]+data-comment-vote-count=["\'](\d+)',
+             r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'),
+            webpage, 'like count', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'like_count': like_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/vidlii.py b/youtube_dl/extractor/vidlii.py
new file mode 100644 (file)
index 0000000..f477425
--- /dev/null
+++ b/youtube_dl/extractor/vidlii.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    get_element_by_id,
+    int_or_none,
+    strip_or_none,
+    unified_strdate,
+    urljoin,
+)
+
+
+class VidLiiIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidlii\.com/(?:watch|embed)\?.*?\bv=(?P<id>[0-9A-Za-z_-]{11})'
+    _TESTS = [{
+        'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v',
+        'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2',
+        'info_dict': {
+            'id': 'tJluaH4BJ3v',
+            'ext': 'mp4',
+            'title': 'Vidlii is against me',
+            'description': 'md5:fa3f119287a2bfb922623b52b1856145',
+            'thumbnail': r're:https://.*\.jpg',
+            'uploader': 'APPle5auc31995',
+            'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995',
+            'upload_date': '20171107',
+            'duration': 212,
+            'view_count': int,
+            'comment_count': int,
+            'average_rating': float,
+            'categories': ['News & Politics'],
+            'tags': ['Vidlii', 'Jan', 'Videogames'],
+        }
+    }, {
+        'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://www.vidlii.com/watch?v=%s' % video_id, video_id)
+
+        video_url = self._search_regex(
+            r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage,
+            'video url', group='url')
+
+        title = self._search_regex(
+            (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage,
+            'title')
+
+        description = self._html_search_meta(
+            ('description', 'twitter:description'), webpage,
+            default=None) or strip_or_none(
+            get_element_by_id('des_text', webpage))
+
+        thumbnail = self._html_search_meta(
+            'twitter:image', webpage, default=None)
+        if not thumbnail:
+            thumbnail_path = self._search_regex(
+                r'img\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+                'thumbnail', fatal=False, group='url')
+            if thumbnail_path:
+                thumbnail = urljoin(url, thumbnail_path)
+
+        uploader = self._search_regex(
+            r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)',
+            webpage, 'uploader', fatal=False)
+        uploader_url = 'https://www.vidlii.com/user/%s' % uploader if uploader else None
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'datePublished', webpage, default=None) or self._search_regex(
+            r'<date>([^<]+)', webpage, 'upload date', fatal=False))
+
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'duration',
+            default=None) or self._search_regex(
+            r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
+        view_count = int_or_none(self._search_regex(
+            (r'<strong>(\d+)</strong> views',
+             r'Views\s*:\s*<strong>(\d+)</strong>'),
+            webpage, 'view count', fatal=False))
+
+        comment_count = int_or_none(self._search_regex(
+            (r'<span[^>]+id=["\']cmt_num[^>]+>(\d+)',
+             r'Comments\s*:\s*<strong>(\d+)'),
+            webpage, 'comment count', fatal=False))
+
+        average_rating = float_or_none(self._search_regex(
+            r'rating\s*:\s*([\d.]+)', webpage, 'average rating', fatal=False))
+
+        category = self._html_search_regex(
+            r'<div>Category\s*:\s*</div>\s*<div>\s*<a[^>]+>([^<]+)', webpage,
+            'category', fatal=False)
+        categories = [category] if category else None
+
+        tags = [
+            strip_or_none(tag)
+            for tag in re.findall(
+                r'<a[^>]+\bhref=["\']/results\?.*?q=[^>]*>([^<]+)',
+                webpage) if strip_or_none(tag)
+        ] or None
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_url': uploader_url,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'average_rating': average_rating,
+            'categories': categories,
+            'tags': tags,
+        }
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
new file mode 100644 (file)
index 0000000..174e69c
--- /dev/null
+++ b/youtube_dl/extractor/vidme.py
@@ -0,0 +1,297 @@
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    parse_iso8601,
+    url_or_none,
+)
+
+
+class VidmeIE(InfoExtractor):
+    IE_NAME = 'vidme'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{,5})(?:[^\da-zA-Z]|$)'
+    _TESTS = [{
+        'url': 'https://vid.me/QNB',
+        'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
+        'info_dict': {
+            'id': 'QNB',
+            'ext': 'mp4',
+            'title': 'Fishing for piranha - the easy way',
+            'description': 'source: https://www.facebook.com/photo.php?v=312276045600871',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1406313244,
+            'upload_date': '20140725',
+            'age_limit': 0,
+            'duration': 119.92,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }, {
+        'url': 'https://vid.me/Gc6M',
+        'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
+        'info_dict': {
+            'id': 'Gc6M',
+            'ext': 'mp4',
+            'title': 'O Mere Dil ke chain - Arnav and Khushi VM',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1441211642,
+            'upload_date': '20150902',
+            'uploader': 'SunshineM',
+            'uploader_id': '3552827',
+            'age_limit': 0,
+            'duration': 223.72,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # tests uploader field
+        'url': 'https://vid.me/4Iib',
+        'info_dict': {
+            'id': '4Iib',
+            'ext': 'mp4',
+            'title': 'The Carver',
+            'description': 'md5:e9c24870018ae8113be936645b93ba3c',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1433203629,
+            'upload_date': '20150602',
+            'uploader': 'Thomas',
+            'uploader_id': '109747',
+            'age_limit': 0,
+            'duration': 97.86,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching
+        'url': 'https://vid.me/e/Wmur',
+        'info_dict': {
+            'id': 'Wmur',
+            'ext': 'mp4',
+            'title': 'naked smoking & stretching',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1430931613,
+            'upload_date': '20150506',
+            'uploader': 'naked-yogi',
+            'uploader_id': '1638622',
+            'age_limit': 18,
+            'duration': 653.27,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # nsfw, user-disabled
+        'url': 'https://vid.me/dzGJ',
+        'only_matching': True,
+    }, {
+        # suspended
+        'url': 'https://vid.me/Ox3G',
+        'only_matching': True,
+    }, {
+        # deleted
+        'url': 'https://vid.me/KTPm',
+        'only_matching': True,
+    }, {
+        # no formats in the API response
+        'url': 'https://vid.me/e5g',
+        'info_dict': {
+            'id': 'e5g',
+            'ext': 'mp4',
+            'title': 'Video upload (e5g)',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'timestamp': 1401480195,
+            'upload_date': '20140530',
+            'uploader': None,
+            'uploader_id': None,
+            'age_limit': 0,
+            'duration': 483,
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            response = self._download_json(
+                'https://api.vid.me/videoByUrl/%s' % video_id, video_id)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                response = self._parse_json(e.cause.read(), video_id)
+            else:
+                raise
+
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error), expected=True)
+
+        video = response['video']
+
+        if video.get('state') == 'deleted':
+            raise ExtractorError(
+                'Vidme said: Sorry, this video has been deleted.',
+                expected=True)
+
+        if video.get('state') in ('user-disabled', 'suspended'):
+            raise ExtractorError(
+                'Vidme said: This video has been suspended either due to a copyright claim, '
+                'or for violating the terms of use.',
+                expected=True)
+
+        formats = []
+        for f in video.get('formats', []):
+            format_url = url_or_none(f.get('uri'))
+            if not format_url:
+                continue
+            format_type = f.get('type')
+            if format_type == 'dash':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+            elif format_type == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'format_id': f.get('type'),
+                    'url': format_url,
+                    'width': int_or_none(f.get('width')),
+                    'height': int_or_none(f.get('height')),
+                    'preference': 0 if f.get('type', '').endswith(
+                        'clip') else 1,
+                })
+
+        if not formats and video.get('complete_url'):
+            formats.append({
+                'url': video.get('complete_url'),
+                'width': int_or_none(video.get('width')),
+                'height': int_or_none(video.get('height')),
+            })
+
+        self._sort_formats(formats)
+
+        title = video['title']
+        description = video.get('description')
+        thumbnail = video.get('thumbnail_url')
+        timestamp = parse_iso8601(video.get('date_created'), ' ')
+        uploader = video.get('user', {}).get('username')
+        uploader_id = video.get('user', {}).get('user_id')
+        age_limit = 18 if video.get('nsfw') is True else 0
+        duration = float_or_none(video.get('duration'))
+        view_count = int_or_none(video.get('view_count'))
+        like_count = int_or_none(video.get('likes_count'))
+        comment_count = int_or_none(video.get('comment_count'))
+
+        return {
+            'id': video_id,
+            'title': title or 'Video upload (%s)' % video_id,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'age_limit': age_limit,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'formats': formats,
+        }
+
+
+class VidmeListBaseIE(InfoExtractor):
+    # Max possible limit according to https://docs.vid.me/#api-Videos-List
+    _LIMIT = 100
+
+    def _entries(self, user_id, user_name):
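+        # Page through the list API with limit/offset and stop on an empty
+        # page or once the reported total has been reached.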
+        for page_num in itertools.count(1):
+            page = self._download_json(
+                'https://api.vid.me/videos/%s?user=%s&limit=%d&offset=%d'
+                % (self._API_ITEM, user_id, self._LIMIT, (page_num - 1) * self._LIMIT),
+                user_name, 'Downloading user %s page %d' % (self._API_ITEM, page_num))
+
+            videos = page.get('videos', [])
+            if not videos:
+                break
+
+            for video in videos:
+                video_url = video.get('full_url') or video.get('embed_url')
+                if video_url:
+                    yield self.url_result(video_url, VidmeIE.ie_key())
+
+            total = int_or_none(page.get('page', {}).get('total'))
+            if total and self._LIMIT * page_num >= total:
+                break
+
+    def _real_extract(self, url):
+        user_name = self._match_id(url)
+
+        user_id = self._download_json(
+            'https://api.vid.me/userByUsername?username=%s' % user_name,
+            user_name)['user']['user_id']
+
+        return self.playlist_result(
+            self._entries(user_id, user_name), user_id,
+            '%s - %s' % (user_name, self._TITLE))
+
+
+class VidmeUserIE(VidmeListBaseIE):
+    IE_NAME = 'vidme:user'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)'
+    _API_ITEM = 'list'
+    _TITLE = 'Videos'
+    _TESTS = [{
+        'url': 'https://vid.me/MasakoX',
+        'info_dict': {
+            'id': '16112341',
+            'title': 'MasakoX - %s' % _TITLE,
+        },
+        'playlist_mincount': 191,
+    }, {
+        'url': 'https://vid.me/unsQuare_netWork',
+        'only_matching': True,
+    }]
+
+
+class VidmeUserLikesIE(VidmeListBaseIE):
+    IE_NAME = 'vidme:user:likes'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes'
+    _API_ITEM = 'likes'
+    _TITLE = 'Likes'
+    _TESTS = [{
+        'url': 'https://vid.me/ErinAlexis/likes',
+        'info_dict': {
+            'id': '6483530',
+            'title': 'ErinAlexis - %s' % _TITLE,
+        },
+        'playlist_mincount': 415,
+    }, {
+        'url': 'https://vid.me/Kaleidoscope-Ish/likes',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
new file mode 100644 (file)
index 0000000..4e79a0b
--- /dev/null
+++ b/youtube_dl/extractor/vidzi.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    js_to_json,
+    NO_DEFAULT,
+    PACKED_CODES_RE,
+)
+
+
+class VidziIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://vidzi.tv/cghql9yq6emu.html',
+        'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
+        'info_dict': {
+            'id': 'cghql9yq6emu',
+            'ext': 'mp4',
+            'title': 'youtube-dlc test video  1\\\\2\'3/4<5\\\\6ä7↭',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://vidzi.cc/cghql9yq6emu.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://vidzi.si/rph9gztxj1et.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://vidzi.nu/cghql9yq6emu.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://vidzi.tv/%s' % video_id, video_id)
+        title = self._html_search_regex(
+            r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
+
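+        # Try the raw page first, then every P.A.C.K.E.R.-packed script in
+        # it; only the last candidate is allowed to fail fatally.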
+        codes = [webpage]
+        codes.extend([
+            decode_packed_codes(mobj.group(0)).replace('\\\'', '\'')
+            for mobj in re.finditer(PACKED_CODES_RE, webpage)])
+        for num, code in enumerate(codes, 1):
+            jwplayer_data = self._parse_json(
+                self._search_regex(
+                    r'setup\(([^)]+)\)', code, 'jwplayer data',
+                    default=NO_DEFAULT if num == len(codes) else '{}'),
+                video_id, transform_source=lambda s: js_to_json(
+                    re.sub(r'\s*\+\s*window\[.+?\]', '', s)))
+            if jwplayer_data:
+                break
+
+        info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+        info_dict['title'] = title
+
+        return info_dict
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
new file mode 100644 (file)
index 0000000..dbd5ba9
--- /dev/null
+++ b/youtube_dl/extractor/vier.py
@@ -0,0 +1,266 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..utils import (
+    urlencode_postdata,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class VierIE(InfoExtractor):
+    IE_NAME = 'vier'
+    IE_DESC = 'vier.be and vijf.be'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?(?P<site>vier|vijf)\.be/
+                        (?:
+                            (?:
+                                [^/]+/videos|
+                                video(?:/[^/]+)*
+                            )/
+                            (?P<display_id>[^/]+)(?:/(?P<id>\d+))?|
+                            (?:
+                                video/v3/embed|
+                                embed/video/public
+                            )/(?P<embed_id>\d+)
+                        )
+                    '''
+    _NETRC_MACHINE = 'vier'
+    _TESTS = [{
+        'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+        'md5': 'e4ae2054a6b040ef1e289e20d111b46e',
+        'info_dict': {
+            'id': '16129',
+            'display_id': 'het-wordt-warm-de-moestuin',
+            'ext': 'mp4',
+            'title': 'Het wordt warm in De Moestuin',
+            'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
+            'upload_date': '20121025',
+            'series': 'Plan B',
+            'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'],
+        },
+    }, {
+        'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
+        'info_dict': {
+            'id': '2561614',
+            'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
+            'ext': 'mp4',
+            'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',
+            'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',
+            'upload_date': '20170228',
+            'series': 'Temptation Island',
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+        'info_dict': {
+            'id': '2674839',
+            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+            'ext': 'mp4',
+            'title': 'Jani gaat naar Tokio - Aflevering 4',
+            'description': 'md5:aa8d611541db6ae9e863125704511f88',
+            'upload_date': '20170501',
+            'series': 'Jani gaat',
+            'episode_number': 4,
+            'tags': ['Jani Gaat', 'Volledige Aflevering'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        # Requires account credentials, but extraction is bypassed via the
+        # v3/embed page, which carries no metadata
+        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+        'info_dict': {
+            'id': '2674839',
+            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+            'ext': 'mp4',
+            'title': 'jani-gaat-naar-tokio-aflevering-4',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Log in to extract metadata'],
+    }, {
+        # Without video id in URL
+        'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.vier.be/video/v3/embed/16129',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vijf.be/embed/video/public/4093',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
+        self._logged_in = False
+
+    def _login(self, site):
+        username, password = self._get_login_info()
+        if username is None or password is None:
+            return
+
+        login_page = self._download_webpage(
+            'http://www.%s.be/user/login' % site,
+            None, note='Logging in', errnote='Unable to log in',
+            data=urlencode_postdata({
+                'form_id': 'user_login',
+                'name': username,
+                'pass': password,
+            }),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        login_error = self._html_search_regex(
+            r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
+            login_page, 'login error', default=None)
+        if login_error:
+            self.report_warning('Unable to log in: %s' % login_error)
+        else:
+            self._logged_in = True
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        embed_id = mobj.group('embed_id')
+        display_id = mobj.group('display_id') or embed_id
+        video_id = mobj.group('id') or embed_id
+        site = mobj.group('site')
+
+        if not self._logged_in:
+            self._login(site)
+
+        webpage = self._download_webpage(url, display_id)
+
+        if r'id="user-login"' in webpage:
+            self.report_warning(
+                'Log in to extract metadata', video_id=display_id)
+            webpage = self._download_webpage(
+                'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
+                display_id)
+
+        video_id = self._search_regex(
+            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
+            webpage, 'video id', default=video_id or display_id)
+
+        playlist_url = self._search_regex(
+            r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1',
+            webpage, 'm3u8 url', default=None, group='url')
+
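+        # If no m3u8 URL is exposed directly, build the Wowza VOD playlist
+        # URL from the application and filename page attributes.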
+        if not playlist_url:
+            application = self._search_regex(
+                [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+                webpage, 'application', default=site + '_vod')
+            filename = self._search_regex(
+                [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+                webpage, 'filename')
+            playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
+
+        formats = self._extract_wowza_formats(
+            playlist_url, display_id, skip_protocols=['dash'])
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage, default=display_id)
+        description = self._html_search_regex(
+            r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>',
+            webpage, 'description', default=None, group='value')
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+        upload_date = unified_strdate(self._html_search_regex(
+            r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})',
+            webpage, 'upload date', default=None, group='value'))
+
+        series = self._search_regex(
+            r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+            'series', default=None, group='value')
+        episode_number = int_or_none(self._search_regex(
+            r'(?i)aflevering (\d+)', title, 'episode number', default=None))
+        tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'series': series,
+            'episode_number': episode_number,
+            'tags': tags,
+            'formats': formats,
+        }
+
+
+class VierVideosIE(InfoExtractor):
+    IE_NAME = 'vier:videos'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
+    _TESTS = [{
+        'url': 'http://www.vier.be/demoestuin/videos',
+        'info_dict': {
+            'id': 'demoestuin',
+        },
+        'playlist_mincount': 153,
+    }, {
+        'url': 'http://www.vijf.be/temptationisland/videos',
+        'info_dict': {
+            'id': 'temptationisland',
+        },
+        'playlist_mincount': 159,
+    }, {
+        'url': 'http://www.vier.be/demoestuin/videos?page=6',
+        'info_dict': {
+            'id': 'demoestuin-page6',
+        },
+        'playlist_mincount': 20,
+    }, {
+        'url': 'http://www.vier.be/demoestuin/videos?page=7',
+        'info_dict': {
+            'id': 'demoestuin-page7',
+        },
+        'playlist_mincount': 13,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        program = mobj.group('program')
+        site = mobj.group('site')
+
+        page_id = mobj.group('page')
+        if page_id:
+            page_id = int(page_id)
+            start_page = page_id
+            playlist_id = '%s-page%d' % (program, page_id)
+        else:
+            start_page = 0
+            playlist_id = program
+
+        entries = []
+        for current_page_id in itertools.count(start_page):
+            current_page = self._download_webpage(
+                'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),
+                program,
+                'Downloading page %d' % (current_page_id + 1))
+            page_entries = [
+                self.url_result('http://www.' + site + '.be' + video_url, 'Vier')
+                for video_url in re.findall(
+                    r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
+            entries.extend(page_entries)
+            if page_id or '>Meer<' not in current_page:
+                break
+
+        return self.playlist_result(entries, playlist_id)
diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py
new file mode 100644 (file)
index 0000000..d6b92b1
--- /dev/null
+++ b/youtube_dl/extractor/viewlift.py
@@ -0,0 +1,252 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+)
+
+
+class ViewLiftBaseIE(InfoExtractor):
+    _API_BASE = 'https://prod-api.viewlift.com/'
+    _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv'
+    _SITE_MAP = {
+        'ftfnext': 'lax',
+        'funnyforfree': 'snagfilms',
+        'hoichoi': 'hoichoitv',
+        'kiddovid': 'snagfilms',
+        'laxsportsnetwork': 'lax',
+        'legapallacanestro': 'lnp',
+        'marquee': 'marquee-tv',
+        'monumentalsportsnetwork': 'monumental-network',
+        'moviespree': 'bingeflix',
+        'pflmma': 'pfl',
+        'snagxtreme': 'snagfilms',
+        'theidentitytb': 'tampabay',
+        'vayafilm': 'snagfilms',
+    }
+    _TOKENS = {}
+
+    def _call_api(self, site, path, video_id, query):
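+        # Tokens are cached per site: sign in when credentials are available,
+        # otherwise request an anonymous token.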
+        token = self._TOKENS.get(site)
+        if not token:
+            token_query = {'site': site}
+            email, password = self._get_login_info(netrc_machine=site)
+            if email:
+                resp = self._download_json(
+                    self._API_BASE + 'identity/signin', video_id,
+                    'Logging in', query=token_query, data=json.dumps({
+                        'email': email,
+                        'password': password,
+                    }).encode())
+            else:
+                resp = self._download_json(
+                    self._API_BASE + 'identity/anonymous-token', video_id,
+                    'Downloading authorization token', query=token_query)
+            self._TOKENS[site] = token = resp['authorizationToken']
+        return self._download_json(
+            self._API_BASE + path, video_id,
+            headers={'Authorization': token}, query=query)
+
+
+class ViewLiftEmbedIE(ViewLiftBaseIE):
+    IE_NAME = 'viewlift:embed'
+    _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX
+    _TESTS = [{
+        'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
+        'md5': '2924e9215c6eff7a55ed35b72276bd93',
+        'info_dict': {
+            'id': '74849a00-85a9-11e1-9660-123139220831',
+            'ext': 'mp4',
+            'title': '#whilewewatch',
+            'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8',
+            'timestamp': 1334350096,
+            'upload_date': '20120413',
+        }
+    }, {
+        # invalid labels, 360p is better than 480p
+        'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036',
+        'md5': '882fca19b9eb27ef865efeeaed376a48',
+        'info_dict': {
+            'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
+            'ext': 'mp4',
+            'title': 'Life in Limbo',
+        },
+        'skip': 'The video does not exist',
+    }, {
+        'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX,
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        domain, film_id = re.match(self._VALID_URL, url).groups()
+        site = domain.split('.')[-2]
+        if site in self._SITE_MAP:
+            site = self._SITE_MAP[site]
+        try:
+            content_data = self._call_api(
+                site, 'entitlement/video/status', film_id, {
+                    'id': film_id
+                })['video']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage')
+                if error_message == 'User does not have a valid subscription or has not purchased this content.':
+                    self.raise_login_required()
+                raise ExtractorError(error_message, expected=True)
+            raise
+        gist = content_data['gist']
+        title = gist['title']
+        video_assets = content_data['streamingInfo']['videoAssets']
+
+        formats = []
+        mpeg_video_assets = video_assets.get('mpeg') or []
+        for video_asset in mpeg_video_assets:
+            video_asset_url = video_asset.get('url')
+            if not video_asset_url:
+                continue
+            bitrate = int_or_none(video_asset.get('bitrate'))
+            height = int_or_none(self._search_regex(
+                r'^_?(\d+)[pP]$', video_asset.get('renditionValue'),
+                'height', default=None))
+            formats.append({
+                'url': video_asset_url,
+                'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''),
+                'tbr': bitrate,
+                'height': height,
+                'vcodec': video_asset.get('codec'),
+            })
+
+        hls_url = video_assets.get('hls')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats, ('height', 'tbr', 'format_id'))
+
+        info = {
+            'id': film_id,
+            'title': title,
+            'description': gist.get('description'),
+            'thumbnail': gist.get('videoImageUrl'),
+            'duration': int_or_none(gist.get('runtime')),
+            'age_limit': parse_age_limit(content_data.get('parentalRating')),
+            'timestamp': int_or_none(gist.get('publishDate'), 1000),
+            'formats': formats,
+        }
+        for k in ('categories', 'tags'):
+            info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')]
+        return info
+
+
+class ViewLiftIE(ViewLiftBaseIE):
+    IE_NAME = 'viewlift'
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX
+    _TESTS = [{
+        'url': 'http://www.snagfilms.com/films/title/lost_for_life',
+        'md5': '19844f897b35af219773fd63bdec2942',
+        'info_dict': {
+            'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
+            'display_id': 'lost_for_life',
+            'ext': 'mp4',
+            'title': 'Lost for Life',
+            'description': 'md5:ea10b5a50405ae1f7b5269a6ec594102',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 4489,
+            'categories': 'mincount:3',
+            'age_limit': 14,
+            'upload_date': '20150421',
+            'timestamp': 1429656820,
+        }
+    }, {
+        'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
+        'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
+        'info_dict': {
+            'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
+            'display_id': 'the_world_cut_project/india',
+            'ext': 'mp4',
+            'title': 'India',
+            'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 979,
+            'timestamp': 1399478279,
+            'upload_date': '20140507',
+        }
+    }, {
+        'url': 'http://main.snagfilms.com/augie_alone/s_2_ep_12_love',
+        'info_dict': {
+            'id': '00000148-7b53-de26-a9fb-fbf306f70020',
+            'display_id': 'augie_alone/s_2_ep_12_love',
+            'ext': 'mp4',
+            'title': 'S. 2 Ep. 12 - Love',
+            'description': 'Augie finds love.',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 107,
+            'upload_date': '20141012',
+            'timestamp': 1413129540,
+            'age_limit': 17,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://main.snagfilms.com/films/title/the_freebie',
+        'only_matching': True,
+    }, {
+        # Film is not playable in your area.
+        'url': 'http://www.snagfilms.com/films/title/inside_mecca',
+        'only_matching': True,
+    }, {
+        # Film is not available.
+        'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.winnersview.com/videos/the-good-son',
+        'only_matching': True,
+    }, {
+        # Was once Kaltura embed
+        'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        domain, path, display_id = re.match(self._VALID_URL, url).groups()
+        site = domain.split('.')[-2]
+        if site in self._SITE_MAP:
+            site = self._SITE_MAP[site]
+        modules = self._call_api(
+            site, 'content/pages', display_id, {
+                'includeContent': 'true',
+                'moduleOffset': 1,
+                'path': path,
+                'site': site,
+            })['modules']
+        film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule')
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),
+            'id': film_id,
+            'display_id': display_id,
+            'ie_key': 'ViewLiftEmbed',
+        }
diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py
new file mode 100644 (file)
index 0000000..a0abbae
--- /dev/null
+++ b/youtube_dl/extractor/viidea.py
@@ -0,0 +1,204 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    js_to_json,
+    parse_duration,
+    parse_iso8601,
+)
+
+
+class ViideaIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
+            videolectures\.net|
+            flexilearn\.viidea\.net|
+            presentations\.ocwconsortium\.org|
+            video\.travel-zoom\.si|
+            video\.pomp-forum\.si|
+            tv\.nil\.si|
+            video\.hekovnik\.com|
+            video\.szko\.si|
+            kpk\.viidea\.com|
+            inside\.viidea\.net|
+            video\.kiberpipa\.org|
+            bvvideo\.si|
+            kongres\.viidea\.net|
+            edemokracija\.viidea\.com
+        )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$'''
+
+    _TESTS = [{
+        'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
+        'info_dict': {
+            'id': '20171',
+            'display_id': 'promogram_igor_mekjavic_eng',
+            'ext': 'mp4',
+            'title': 'Automatics, robotics and biocybernetics',
+            'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+            'thumbnail': r're:http://.*\.jpg',
+            'timestamp': 1372349289,
+            'upload_date': '20130627',
+            'duration': 565,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # video with invalid direct format links (HTTP 403)
+        'url': 'http://videolectures.net/russir2010_filippova_nlp/',
+        'info_dict': {
+            'id': '14891',
+            'display_id': 'russir2010_filippova_nlp',
+            'ext': 'flv',
+            'title': 'NLP at Google',
+            'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3',
+            'thumbnail': r're:http://.*\.jpg',
+            'timestamp': 1284375600,
+            'upload_date': '20100913',
+            'duration': 5352,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }, {
+        # event playlist
+        'url': 'http://videolectures.net/deeplearning2015_montreal/',
+        'info_dict': {
+            'id': '23181',
+            'title': 'Deep Learning Summer School, Montreal 2015',
+            'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7',
+            'thumbnail': r're:http://.*\.jpg',
+            'timestamp': 1438560000,
+        },
+        'playlist_count': 30,
+    }, {
+        # multi part lecture
+        'url': 'http://videolectures.net/mlss09uk_bishop_ibi/',
+        'info_dict': {
+            'id': '9737',
+            'display_id': 'mlss09uk_bishop_ibi',
+            'title': 'Introduction To Bayesian Inference',
+            'thumbnail': r're:http://.*\.jpg',
+            'timestamp': 1251622800,
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '9737_part1',
+                'display_id': 'mlss09uk_bishop_ibi_part1',
+                'ext': 'wmv',
+                'title': 'Introduction To Bayesian Inference (Part 1)',
+                'thumbnail': r're:http://.*\.jpg',
+                'duration': 4622,
+                'timestamp': 1251622800,
+                'upload_date': '20090830',
+            },
+        }, {
+            'info_dict': {
+                'id': '9737_part2',
+                'display_id': 'mlss09uk_bishop_ibi_part2',
+                'ext': 'wmv',
+                'title': 'Introduction To Bayesian Inference (Part 2)',
+                'thumbnail': r're:http://.*\.jpg',
+                'duration': 5641,
+                'timestamp': 1251622800,
+                'upload_date': '20090830',
+            },
+        }],
+        'playlist_count': 2,
+    }]
+
+    def _real_extract(self, url):
+        lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups()
+
+        webpage = self._download_webpage(url, lecture_slug)
+
+        cfg = self._parse_json(self._search_regex(
+            [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function',
+             r'cfg\s*:\s*({[^}]+})'],
+            webpage, 'cfg'), lecture_slug, js_to_json)
+
+        lecture_id = compat_str(cfg['obj_id'])
+
+        base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
+
+        try:
+            lecture_data = self._download_json(
+                '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
+                lecture_id)['lecture'][0]
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                msg = self._parse_json(
+                    e.cause.read().decode('utf-8'), lecture_id)
+                raise ExtractorError(msg['detail'], expected=True)
+            raise
+
+        lecture_info = {
+            'id': lecture_id,
+            'display_id': lecture_slug,
+            'title': lecture_data['title'],
+            'timestamp': parse_iso8601(lecture_data.get('time')),
+            'description': lecture_data.get('description_wiki'),
+            'thumbnail': lecture_data.get('thumb'),
+        }
+
+        playlist_entries = []
+        lecture_type = lecture_data.get('type')
+        parts = [compat_str(video) for video in cfg.get('videos', [])]
+        if parts:
+            multipart = len(parts) > 1
+
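+            # Each part has its own SMIL playlist; multipart lectures get
+            # per-part ids, display ids and titles.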
+            def extract_part(part_id):
+                smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id)
+                smil = self._download_smil(smil_url, lecture_id)
+                info = self._parse_smil(smil, smil_url, lecture_id)
+                self._sort_formats(info['formats'])
+                info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id)
+                info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id)
+                if multipart:
+                    info['title'] += ' (Part %s)' % part_id
+                switch = smil.find('.//switch')
+                if switch is not None:
+                    info['duration'] = parse_duration(switch.attrib.get('dur'))
+                item_info = lecture_info.copy()
+                item_info.update(info)
+                return item_info
+
+            if explicit_part_id or not multipart:
+                result = extract_part(explicit_part_id or parts[0])
+            else:
+                result = {
+                    '_type': 'multi_video',
+                    'entries': [extract_part(part) for part in parts],
+                }
+                result.update(lecture_info)
+
+            # Immediately return the explicitly requested part or a non-event item
+            if explicit_part_id or lecture_type != 'evt':
+                return result
+
+            playlist_entries.append(result)
+
+        # It's probably a playlist
+        if not parts or lecture_type == 'evt':
+            playlist_webpage = self._download_webpage(
+                '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id)
+            entries = [
+                self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea')
+                for _, video_url in re.findall(
+                    r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)]
+            playlist_entries.extend(entries)
+
+        playlist = self.playlist_result(playlist_entries, lecture_id)
+        playlist.update(lecture_info)
+        return playlist
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
new file mode 100644 (file)
index 0000000..9e41712
--- /dev/null
+++ b/youtube_dl/extractor/viki.py
@@ -0,0 +1,386 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import hmac
+import itertools
+import json
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    parse_iso8601,
+    sanitized_Request,
+)
+
+
+class VikiBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/'
+    _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com'
+    _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s'
+
+    _APP = '100005a'
+    _APP_VERSION = '2.2.5.1428709186'
+    _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad'
+
+    _GEO_BYPASS = False
+    _NETRC_MACHINE = 'viki'
+
+    _token = None
+
+    _ERRORS = {
+        'geo': 'Sorry, this content is not available in your region.',
+        'upcoming': 'Sorry, this content is not yet available.',
+        # 'paywall': 'paywall',
+    }
+
+    def _prepare_call(self, path, timestamp=None, post_data=None):
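+        # Every API request is signed: sig is an HMAC-SHA1 of the query
+        # string (including app id and timestamp) keyed with the app secret.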
+        path += '?' if '?' not in path else '&'
+        if not timestamp:
+            timestamp = int(time.time())
+        query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp)
+        if self._token:
+            query += '&token=%s' % self._token
+        sig = hmac.new(
+            self._APP_SECRET.encode('ascii'),
+            query.encode('ascii'),
+            hashlib.sha1
+        ).hexdigest()
+        url = self._API_URL_TEMPLATE % (query, sig)
+        return sanitized_Request(
+            url, json.dumps(post_data).encode('utf-8')) if post_data else url
+
+    def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
+        resp = self._download_json(
+            self._prepare_call(path, timestamp, post_data), video_id, note, headers={'x-viki-app-ver': self._APP_VERSION}, expected_status=[200, 400, 404])
+
+        error = resp.get('error')
+        if error:
+            if error == 'invalid timestamp':
+                resp = self._download_json(
+                    self._prepare_call(path, int(resp['current_timestamp']), post_data),
+                    video_id, '%s (retry)' % note, headers={'x-viki-app-ver': self._APP_VERSION}, expected_status=[200, 400, 404])
+                error = resp.get('error')
+            if error:
+                self._raise_error(resp['error'])
+
+        return resp
+
+    def _raise_error(self, error):
+        raise ExtractorError(
+            '%s returned error: %s' % (self.IE_NAME, error),
+            expected=True)
+
+    def _check_errors(self, data):
+        for reason, status in data.get('blocking', {}).items():
+            if status and reason in self._ERRORS:
+                message = self._ERRORS[reason]
+                if reason == 'geo':
+                    self.raise_geo_restricted(msg=message)
+                raise ExtractorError('%s said: %s' % (
+                    self.IE_NAME, message), expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_form = {
+            'login_id': username,
+            'password': password,
+        }
+
+        login = self._call_api(
+            'sessions.json', None,
+            'Logging in', post_data=login_form)
+
+        self._token = login.get('token')
+        if not self._token:
+            self.report_warning('Unable to get session token, login has probably failed')
+
+    @staticmethod
+    def dict_selection(dict_obj, preferred_key, allow_fallback=True):
+        if preferred_key in dict_obj:
+            return dict_obj.get(preferred_key)
+
+        if not allow_fallback:
+            return
+
+        filtered_values = [v for v in dict_obj.values() if v]
+        return filtered_values[0] if filtered_values else None
+
+
+class VikiIE(VikiBaseIE):
+    IE_NAME = 'viki'
+    _VALID_URL = r'%s(?:videos|player)/(?P<id>[0-9]+v)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
+        'info_dict': {
+            'id': '1023585v',
+            'ext': 'mp4',
+            'title': 'Heirs Episode 14',
+            'uploader': 'SBS',
+            'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+            'upload_date': '20131121',
+            'age_limit': 13,
+        },
+        'skip': 'Blocked in the US',
+    }, {
+        # clip
+        'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
+        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
+        'info_dict': {
+            'id': '1067139v',
+            'ext': 'mp4',
+            'title': "'The Avengers: Age of Ultron' Press Conference",
+            'description': 'md5:d70b2f9428f5488321bfe1db10d612ea',
+            'duration': 352,
+            'timestamp': 1430380829,
+            'upload_date': '20150430',
+            'uploader': 'Arirang TV',
+            'like_count': int,
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
+        'info_dict': {
+            'id': '1048879v',
+            'ext': 'mp4',
+            'title': 'Ankhon Dekhi',
+            'duration': 6512,
+            'timestamp': 1408532356,
+            'upload_date': '20140820',
+            'uploader': 'Spuul',
+            'like_count': int,
+            'age_limit': 13,
+        },
+        'skip': 'Blocked in the US',
+    }, {
+        # episode
+        'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
+        'md5': '5fa476a902e902783ac7a4d615cdbc7a',
+        'info_dict': {
+            'id': '44699v',
+            'ext': 'mp4',
+            'title': 'Boys Over Flowers - Episode 1',
+            'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
+            'duration': 4204,
+            'timestamp': 1270496524,
+            'upload_date': '20100405',
+            'uploader': 'group8',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        # youtube external
+        'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
+        'md5': '63f8600c1da6f01b7640eee7eca4f1da',
+        'info_dict': {
+            'id': '50562v',
+            'ext': 'webm',
+            'title': 'Poor Nastya [COMPLETE] - Episode 1',
+            'description': '',
+            'duration': 606,
+            'timestamp': 1274949505,
+            'upload_date': '20101213',
+            'uploader': 'ad14065n',
+            'uploader_id': 'ad14065n',
+            'like_count': int,
+            'age_limit': 13,
+        }
+    }, {
+        'url': 'http://www.viki.com/player/44699v',
+        'only_matching': True,
+    }, {
+        # non-English description
+        'url': 'http://www.viki.com/videos/158036v-love-in-magic',
+        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+        'info_dict': {
+            'id': '158036v',
+            'ext': 'mp4',
+            'uploader': 'I Planet Entertainment',
+            'upload_date': '20111122',
+            'timestamp': 1321985454,
+            'description': 'md5:44b1e46619df3a072294645c770cef36',
+            'title': 'Love In Magic',
+            'age_limit': 13,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._call_api(
+            'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
+
+        self._check_errors(video)
+
+        title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
+        if not title:
+            title = ('Episode %d' % video.get('number')) if video.get('type') == 'episode' else (video.get('id') or video_id)
+            container_titles = video.get('container', {}).get('titles', {})
+            container_title = self.dict_selection(container_titles, 'en')
+            title = '%s - %s' % (container_title, title)
+
+        description = self.dict_selection(video.get('descriptions', {}), 'en')
+
+        duration = int_or_none(video.get('duration'))
+        timestamp = parse_iso8601(video.get('created_at'))
+        uploader = video.get('author')
+        like_count = int_or_none(video.get('likes', {}).get('count'))
+        age_limit = parse_age_limit(video.get('rating'))
+
+        thumbnails = []
+        for thumbnail_id, thumbnail in video.get('images', {}).items():
+            thumbnails.append({
+                'id': thumbnail_id,
+                'url': thumbnail.get('url'),
+            })
+
+        subtitles = {}
+        for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+            subtitles[subtitle_lang] = [{
+                'ext': subtitles_format,
+                'url': self._prepare_call(
+                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+            } for subtitles_format in ('srt', 'vtt')]
+
+        result = {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'like_count': like_count,
+            'age_limit': age_limit,
+            'thumbnails': thumbnails,
+            'subtitles': subtitles,
+        }
+
+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
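+        # Externally hosted videos (e.g. YouTube, see the test above) are handed
+        # off via url_transparent so the metadata collected above is preserved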
+        if 'external' in streams:
+            result.update({
+                '_type': 'url_transparent',
+                'url': streams['external']['url'],
+            })
+            return result
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            for protocol, format_dict in stream_dict.items():
+                # rtmps URLs do not seem to work
+                if protocol == 'rtmps':
+                    continue
+                format_url = format_dict['url']
+                if format_id == 'm3u8':
+                    m3u8_formats = self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native',
+                        m3u8_id='m3u8-%s' % protocol, fatal=False)
+                    # Despite the CODECS metadata in the m3u8, all video-only
+                    # formats are actually video+audio
+                    for f in m3u8_formats:
+                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+                            f['acodec'] = None
+                    formats.extend(m3u8_formats)
+                elif format_url.startswith('rtmp'):
+                    mobj = re.search(
+                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
+                        format_url)
+                    if not mobj:
+                        continue
+                    formats.append({
+                        'format_id': 'rtmp-%s' % format_id,
+                        'ext': 'flv',
+                        'url': mobj.group('url'),
+                        'play_path': mobj.group('playpath'),
+                        'app': mobj.group('app'),
+                        'page_url': url,
+                    })
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': height,
+                    })
+        self._sort_formats(formats)
+
+        result['formats'] = formats
+        return result
+
+
+class VikiChannelIE(VikiBaseIE):
+    IE_NAME = 'viki:channel'
+    _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P<id>[0-9]+c)' % VikiBaseIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'http://www.viki.com/tv/50c-boys-over-flowers',
+        'info_dict': {
+            'id': '50c',
+            'title': 'Boys Over Flowers',
+            'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+        },
+        'playlist_mincount': 71,
+    }, {
+        'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
+        'info_dict': {
+            'id': '1354c',
+            'title': 'Poor Nastya [COMPLETE]',
+            'description': 'md5:05bf5471385aa8b21c18ad450e350525',
+        },
+        'playlist_count': 127,
+    }, {
+        'url': 'http://www.viki.com/news/24569c-showbiz-korea',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viki.com/artists/2141c-shinee',
+        'only_matching': True,
+    }]
+
+    _PER_PAGE = 25
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        channel = self._call_api(
+            'containers/%s.json' % channel_id, channel_id,
+            'Downloading channel JSON')
+
+        self._check_errors(channel)
+
+        title = self.dict_selection(channel['titles'], 'en')
+
+        description = self.dict_selection(channel['descriptions'], 'en')
+
+        entries = []
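+        # walk every content type, paging until the API reports no next page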
+        for video_type in ('episodes', 'clips', 'movies'):
+            for page_num in itertools.count(1):
+                page = self._call_api(
+                    'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d'
+                    % (channel_id, video_type, self._PER_PAGE, page_num), channel_id,
+                    'Downloading %s JSON page #%d' % (video_type, page_num))
+                for video in page['response']:
+                    video_id = video['id']
+                    entries.append(self.url_result(
+                        'https://www.viki.com/videos/%s' % video_id, 'Viki'))
+                if not page['pagination']['next']:
+                    break
+
+        return self.playlist_result(entries, channel_id, title, description)
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
new file mode 100644 (file)
index 0000000..9839657
--- /dev/null
@@ -0,0 +1,1128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import functools
+import json
+import re
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_kwargs,
+    compat_HTTPError,
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    clean_html,
+    determine_ext,
+    dict_get,
+    ExtractorError,
+    js_to_json,
+    int_or_none,
+    merge_dicts,
+    OnDemandPagedList,
+    parse_filesize,
+    RegexNotFoundError,
+    sanitized_Request,
+    smuggle_url,
+    std_headers,
+    str_or_none,
+    try_get,
+    unified_timestamp,
+    unsmuggle_url,
+    urlencode_postdata,
+    urljoin,
+    unescapeHTML,
+)
+
+
+class VimeoBaseInfoExtractor(InfoExtractor):
+    _NETRC_MACHINE = 'vimeo'
+    _LOGIN_REQUIRED = False
+    _LOGIN_URL = 'https://vimeo.com/log_in'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return
+        webpage = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+        token, vuid = self._extract_xsrft_and_vuid(webpage)
+        data = {
+            'action': 'login',
+            'email': username,
+            'password': password,
+            'service': 'vimeo',
+            'token': token,
+        }
+        self._set_vimeo_cookie('vuid', vuid)
+        try:
+            self._download_webpage(
+                self._LOGIN_URL, None, 'Logging in',
+                data=urlencode_postdata(data), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Referer': self._LOGIN_URL,
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418:
+                raise ExtractorError(
+                    'Unable to log in: bad username or password',
+                    expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _verify_video_password(self, url, video_id, webpage):
+        password = self._downloader.params.get('videopassword')
+        if password is None:
+            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+        token, vuid = self._extract_xsrft_and_vuid(webpage)
+        data = urlencode_postdata({
+            'password': password,
+            'token': token,
+        })
+        if url.startswith('http://'):
+            # Vimeo only supports HTTPS now, but the user may have given an HTTP URL
+            url = url.replace('http://', 'https://')
+        password_request = sanitized_Request(url + '/password', data)
+        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        password_request.add_header('Referer', url)
+        self._set_vimeo_cookie('vuid', vuid)
+        return self._download_webpage(
+            password_request, video_id,
+            'Verifying the password', 'Wrong password')
+
+    def _extract_xsrft_and_vuid(self, webpage):
+        xsrft = self._search_regex(
+            r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
+            webpage, 'login token', group='xsrft')
+        vuid = self._search_regex(
+            r'["\']vuid["\']\s*:\s*(["\'])(?P<vuid>.+?)\1',
+            webpage, 'vuid', group='vuid')
+        return xsrft, vuid
+
+    def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs):
+        vimeo_config = self._search_regex(
+            r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));',
+            webpage, 'vimeo config', *args, **compat_kwargs(kwargs))
+        if vimeo_config:
+            return self._parse_json(vimeo_config, video_id)
+
+    def _set_vimeo_cookie(self, name, value):
+        self._set_cookie('vimeo.com', name, value)
+
+    def _vimeo_sort_formats(self, formats):
+        # Bitrates are completely broken. A single m3u8 may contain entries in kbps and
+        # bps at the same time without actual units specified. This leads to wrong sorting.
+        self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id'))
+
+    def _parse_config(self, config, video_id):
+        video_data = config['video']
+        video_title = video_data['title']
+        live_event = video_data.get('live_event') or {}
+        is_live = live_event.get('status') == 'started'
+
+        formats = []
+        config_files = video_data.get('files') or config['request'].get('files', {})
+        for f in config_files.get('progressive', []):
+            video_url = f.get('url')
+            if not video_url:
+                continue
+            formats.append({
+                'url': video_url,
+                'format_id': 'http-%s' % f.get('quality'),
+                'width': int_or_none(f.get('width')),
+                'height': int_or_none(f.get('height')),
+                'fps': int_or_none(f.get('fps')),
+                'tbr': int_or_none(f.get('bitrate')),
+            })
+
+        # TODO: fix handling of 308 status code returned for live archive manifest requests
+        sep_pattern = r'/sep/video/'
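+        # manifest URLs under /sep/video/ serve audio and video separately; probe
+        # both the muxed (/video/) and separated (/sep/video/) variants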
+        for files_type in ('hls', 'dash'):
+            for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
+                manifest_url = cdn_data.get('url')
+                if not manifest_url:
+                    continue
+                format_id = '%s-%s' % (files_type, cdn_name)
+                sep_manifest_urls = []
+                if re.search(sep_pattern, manifest_url):
+                    for suffix, repl in (('', 'video'), ('_sep', 'sep/video')):
+                        sep_manifest_urls.append((format_id + suffix, re.sub(
+                            sep_pattern, '/%s/' % repl, manifest_url)))
+                else:
+                    sep_manifest_urls = [(format_id, manifest_url)]
+                for f_id, m_url in sep_manifest_urls:
+                    if files_type == 'hls':
+                        formats.extend(self._extract_m3u8_formats(
+                            m_url, video_id, 'mp4',
+                            'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
+                            note='Downloading %s m3u8 information' % cdn_name,
+                            fatal=False))
+                    elif files_type == 'dash':
+                        if 'json=1' in m_url:
+                            real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
+                            if real_m_url:
+                                m_url = real_m_url
+                        mpd_formats = self._extract_mpd_formats(
+                            m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
+                            'Downloading %s MPD information' % cdn_name,
+                            fatal=False)
+                        formats.extend(mpd_formats)
+
+        live_archive = live_event.get('archive') or {}
+        live_archive_source_url = live_archive.get('source_url')
+        if live_archive_source_url and live_archive.get('status') == 'done':
+            formats.append({
+                'format_id': 'live-archive-source',
+                'url': live_archive_source_url,
+                'preference': 1,
+            })
+
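+        # demote audio-only (most) and video-only formats below full audio+video ones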
+        for f in formats:
+            if f.get('vcodec') == 'none':
+                f['preference'] = -50
+            elif f.get('acodec') == 'none':
+                f['preference'] = -40
+
+        subtitles = {}
+        text_tracks = config['request'].get('text_tracks')
+        if text_tracks:
+            for tt in text_tracks:
+                subtitles[tt['lang']] = [{
+                    'ext': 'vtt',
+                    'url': urljoin('https://vimeo.com', tt['url']),
+                }]
+
+        thumbnails = []
+        if not is_live:
+            for key, thumb in video_data.get('thumbs', {}).items():
+                thumbnails.append({
+                    'id': key,
+                    'width': int_or_none(key),
+                    'url': thumb,
+                })
+            thumbnail = video_data.get('thumbnail')
+            if thumbnail:
+                thumbnails.append({
+                    'url': thumbnail,
+                })
+
+        owner = video_data.get('owner') or {}
+        video_uploader_url = owner.get('url')
+
+        return {
+            'id': str_or_none(video_data.get('id')) or video_id,
+            'title': self._live_title(video_title) if is_live else video_title,
+            'uploader': owner.get('name'),
+            'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None,
+            'uploader_url': video_uploader_url,
+            'thumbnails': thumbnails,
+            'duration': int_or_none(video_data.get('duration')),
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': is_live,
+        }
+
+    def _extract_original_format(self, url, video_id):
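+        # the load_download_config endpoint exposes the originally uploaded file;
+        # source files flagged is_cold or is_defrosting are not yet downloadable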
+        download_data = self._download_json(
+            url, video_id, fatal=False,
+            query={'action': 'load_download_config'},
+            headers={'X-Requested-With': 'XMLHttpRequest'})
+        if download_data:
+            source_file = download_data.get('source_file')
+            if isinstance(source_file, dict):
+                download_url = source_file.get('download_url')
+                if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'):
+                    source_name = source_file.get('public_name', 'Original')
+                    if self._is_valid_url(download_url, video_id, '%s video' % source_name):
+                        ext = (try_get(
+                            source_file, lambda x: x['extension'],
+                            compat_str) or determine_ext(
+                            download_url, None) or 'mp4').lower()
+                        return {
+                            'url': download_url,
+                            'ext': ext,
+                            'width': int_or_none(source_file.get('width')),
+                            'height': int_or_none(source_file.get('height')),
+                            'filesize': parse_filesize(source_file.get('size')),
+                            'format_id': source_name,
+                            'preference': 1,
+                        }
+
+
+class VimeoIE(VimeoBaseInfoExtractor):
+    """Information extractor for vimeo.com."""
+
+    # _VALID_URL matches Vimeo URLs
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:
+                                www|
+                                player
+                            )
+                            \.
+                        )?
+                        vimeo(?:pro)?\.com/
+                        (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
+                        (?:.*?/)?
+                        (?:
+                            (?:
+                                play_redirect_hls|
+                                moogaloop\.swf)\?clip_id=
+                            )?
+                        (?:videos?/)?
+                        (?P<id>[0-9]+)
+                        (?:/[\da-f]+)?
+                        /?(?:[?&].*)?(?:[#].*)?$
+                    '''
+    IE_NAME = 'vimeo'
+    _TESTS = [
+        {
+            'url': 'http://vimeo.com/56015672#at=0',
+            'md5': '8879b6cc097e987f02484baf890129e5',
+            'info_dict': {
+                'id': '56015672',
+                'ext': 'mp4',
+                'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+                'description': 'md5:2d3305bad981a06ff79f027f19865021',
+                'timestamp': 1355990239,
+                'upload_date': '20121220',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434',
+                'uploader_id': 'user7108434',
+                'uploader': 'Filippo Valsorda',
+                'duration': 10,
+                'license': 'by-sa',
+            },
+            'params': {
+                'format': 'best[protocol=https]',
+            },
+        },
+        {
+            'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
+            'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
+            'note': 'Vimeo Pro video (#1197)',
+            'info_dict': {
+                'id': '68093876',
+                'ext': 'mp4',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus',
+                'uploader_id': 'openstreetmapus',
+                'uploader': 'OpenStreetMap US',
+                'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+                'description': 'md5:2c362968038d4499f4d79f88458590c1',
+                'duration': 1595,
+                'upload_date': '20130610',
+                'timestamp': 1370893156,
+            },
+            'params': {
+                'format': 'best[protocol=https]',
+            },
+        },
+        {
+            'url': 'http://player.vimeo.com/video/54469442',
+            'md5': '619b811a4417aa4abe78dc653becf511',
+            'note': 'Videos that embed the url in the player page',
+            'info_dict': {
+                'id': '54469442',
+                'ext': 'mp4',
+                'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
+                'uploader': 'The BLN & Business of Software',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
+                'uploader_id': 'theblnbusinessofsoftware',
+                'duration': 3610,
+                'description': None,
+            },
+            'params': {
+                'format': 'best[protocol=https]',
+            },
+            'expected_warnings': ['Unable to download JSON metadata'],
+        },
+        {
+            'url': 'http://vimeo.com/68375962',
+            'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
+            'note': 'Video protected with password',
+            'info_dict': {
+                'id': '68375962',
+                'ext': 'mp4',
+                'title': 'youtube-dl password protected test video',
+                'timestamp': 1371200155,
+                'upload_date': '20130614',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
+                'uploader_id': 'user18948128',
+                'uploader': 'Jaime Marquínez Ferrándiz',
+                'duration': 10,
+                'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
+            },
+            'params': {
+                'format': 'best[protocol=https]',
+                'videopassword': 'youtube-dl',
+            },
+        },
+        {
+            'url': 'http://vimeo.com/channels/keypeele/75629013',
+            'md5': '2f86a05afe9d7abc0b9126d229bbe15d',
+            'info_dict': {
+                'id': '75629013',
+                'ext': 'mp4',
+                'title': 'Key & Peele: Terrorist Interrogation',
+                'description': 'md5:8678b246399b070816b12313e8b4eb5c',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio',
+                'uploader_id': 'atencio',
+                'uploader': 'Peter Atencio',
+                'channel_id': 'keypeele',
+                'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele',
+                'timestamp': 1380339469,
+                'upload_date': '20130928',
+                'duration': 187,
+            },
+            'expected_warnings': ['Unable to download JSON metadata'],
+        },
+        {
+            'url': 'http://vimeo.com/76979871',
+            'note': 'Video with subtitles',
+            'info_dict': {
+                'id': '76979871',
+                'ext': 'mp4',
+                'title': 'The New Vimeo Player (You Know, For Videos)',
+                'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
+                'timestamp': 1381846109,
+                'upload_date': '20131015',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff',
+                'uploader_id': 'staff',
+                'uploader': 'Vimeo Staff',
+                'duration': 62,
+            }
+        },
+        {
+            # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/
+            'url': 'https://player.vimeo.com/video/98044508',
+            'note': 'The js code contains assignments to the same variable as the config',
+            'info_dict': {
+                'id': '98044508',
+                'ext': 'mp4',
+                'title': 'Pier Solar OUYA Official Trailer',
+                'uploader': 'Tulio Gonçalves',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user28849593',
+                'uploader_id': 'user28849593',
+            },
+        },
+        {
+            # contains original format
+            'url': 'https://vimeo.com/33951933',
+            'md5': '53c688fa95a55bf4b7293d37a89c5c53',
+            'info_dict': {
+                'id': '33951933',
+                'ext': 'mp4',
+                'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',
+                'uploader': 'The DMCI',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci',
+                'uploader_id': 'dmci',
+                'timestamp': 1324343742,
+                'upload_date': '20111220',
+                'description': 'md5:ae23671e82d05415868f7ad1aec21147',
+            },
+        },
+        {
+            # only available via https://vimeo.com/channels/tributes/6213729 and
+            # not via https://vimeo.com/6213729
+            'url': 'https://vimeo.com/channels/tributes/6213729',
+            'info_dict': {
+                'id': '6213729',
+                'ext': 'mp4',
+                'title': 'Vimeo Tribute: The Shining',
+                'uploader': 'Casey Donahue',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue',
+                'uploader_id': 'caseydonahue',
+                'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes',
+                'channel_id': 'tributes',
+                'timestamp': 1250886430,
+                'upload_date': '20090821',
+                'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['Unable to download JSON metadata'],
+        },
+        {
+            # redirects to ondemand extractor and should be passed through it
+            # for successful extraction
+            'url': 'https://vimeo.com/73445910',
+            'info_dict': {
+                'id': '73445910',
+                'ext': 'mp4',
+                'title': 'The Reluctant Revolutionary',
+                'uploader': '10Ft Films',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms',
+                'uploader_id': 'tenfootfilms',
+                'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384',
+                'upload_date': '20130830',
+                'timestamp': 1377853339,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['Unable to download JSON metadata'],
+        },
+        {
+            'url': 'http://player.vimeo.com/video/68375962',
+            'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
+            'info_dict': {
+                'id': '68375962',
+                'ext': 'mp4',
+                'title': 'youtube-dl password protected test video',
+                'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128',
+                'uploader_id': 'user18948128',
+                'uploader': 'Jaime Marquínez Ferrándiz',
+                'duration': 10,
+            },
+            'params': {
+                'format': 'best[protocol=https]',
+                'videopassword': 'youtube-dl',
+            },
+        },
+        {
+            'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://vimeo.com/109815029',
+            'note': 'Video not completely processed, "failed" seed status',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://vimeo.com/groups/travelhd/videos/22439234',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://vimeo.com/album/2632481/video/79010983',
+            'only_matching': True,
+        },
+        {
+            # source file returns 403: Forbidden
+            'url': 'https://vimeo.com/7809605',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://vimeo.com/160743502/abd0e13fb4',
+            'only_matching': True,
+        }
+        # https://gettingthingsdone.com/workflowmap/
+        # vimeo embed with check-password page protected by Referer header
+    ]
+
+    @staticmethod
+    def _smuggle_referrer(url, referrer_url):
+        return smuggle_url(url, {'http_headers': {'Referer': referrer_url}})
+
+    @staticmethod
+    def _extract_urls(url, webpage):
+        urls = []
+        # Look for embedded (iframe) Vimeo player
+        for mobj in re.finditer(
+                r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1',
+                webpage):
+            urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url))
+        PLAIN_EMBED_RE = (
+            # Look for embedded (swf embed) Vimeo player
+            r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1',
+            # Also look for non-standard embedded Vimeo players
+            r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1',
+        )
+        for embed_re in PLAIN_EMBED_RE:
+            for mobj in re.finditer(embed_re, webpage):
+                urls.append(mobj.group('url'))
+        return urls
+
+    @staticmethod
+    def _extract_url(url, webpage):
+        urls = VimeoIE._extract_urls(url, webpage)
+        return urls[0] if urls else None
+
+    def _verify_player_video_password(self, url, video_id, headers):
+        password = self._downloader.params.get('videopassword')
+        if password is None:
+            raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
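+        # the check-password endpoint expects the password base64-encoded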
+        data = urlencode_postdata({
+            'password': base64.b64encode(password.encode()),
+        })
+        headers = merge_dicts(headers, {
+            'Content-Type': 'application/x-www-form-urlencoded',
+        })
+        checked = self._download_json(
+            url + '/check-password', video_id,
+            'Verifying the password', data=data, headers=headers)
+        if checked is False:
+            raise ExtractorError('Wrong video password', expected=True)
+        return checked
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        url, data = unsmuggle_url(url, {})
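+        # embedding pages smuggle an http_headers dict (notably Referer), which
+        # embed-only videos require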
+        headers = std_headers.copy()
+        if 'http_headers' in data:
+            headers.update(data['http_headers'])
+        if 'Referer' not in headers:
+            headers['Referer'] = url
+
+        channel_id = self._search_regex(
+            r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+
+        # Extract ID from URL
+        video_id = self._match_id(url)
+        orig_url = url
+        is_pro = 'vimeopro.com/' in url
+        is_player = '://player.vimeo.com/video/' in url
+        if is_pro:
+            # Some videos require portfolio_id to be present in the player URL
+            # https://github.com/ytdl-org/youtube-dl/issues/20070
+            url = self._extract_url(url, self._download_webpage(url, video_id))
+            if not url:
+                url = 'https://vimeo.com/' + video_id
+        elif is_player:
+            url = 'https://player.vimeo.com/video/' + video_id
+        elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
+            url = 'https://vimeo.com/' + video_id
+
+        try:
+            # Retrieve video webpage to extract further information
+            webpage, urlh = self._download_webpage_handle(
+                url, video_id, headers=headers)
+            redirect_url = urlh.geturl()
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                errmsg = ee.cause.read()
+                if b'Because of its privacy settings, this video cannot be played here' in errmsg:
+                    raise ExtractorError(
+                        'Cannot download embed-only video without embedding '
+                        'URL. Please call youtube-dlc with the URL of the page '
+                        'that embeds this video.',
+                        expected=True)
+            raise
+
+        # Now we begin extracting as much information as we can from what we
+        # retrieved. First we extract the information common to all extractors,
+        # and later we extract those that are Vimeo-specific.
+        self.report_extraction(video_id)
+
+        vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
+        if vimeo_config:
+            seed_status = vimeo_config.get('seed_status', {})
+            if seed_status.get('state') == 'failed':
+                raise ExtractorError(
+                    '%s said: %s' % (self.IE_NAME, seed_status['title']),
+                    expected=True)
+
+        cc_license = None
+        timestamp = None
+        video_description = None
+
+        # Extract the config JSON
+        try:
+            try:
+                config_url = self._html_search_regex(
+                    r' data-config-url="(.+?)"', webpage,
+                    'config URL', default=None)
+                if not config_url:
+                    # Sometimes a new React-based page is served instead of the old one,
+                    # which requires a different config URL extraction approach (see
+                    # https://github.com/ytdl-org/youtube-dl/pull/7209)
+                    page_config = self._parse_json(self._search_regex(
+                        r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+                        webpage, 'page config'), video_id)
+                    config_url = page_config['player']['config_url']
+                    cc_license = page_config.get('cc_license')
+                    timestamp = try_get(
+                        page_config, lambda x: x['clip']['uploaded_on'],
+                        compat_str)
+                    video_description = clean_html(dict_get(
+                        page_config, ('description', 'description_html_escaped')))
+                config = self._download_json(config_url, video_id)
+            except RegexNotFoundError:
+                # For Pro videos or player.vimeo.com URLs
+                # we try to find out which variable the config dict is assigned to
+                m_variable_name = re.search(r'(\w)\.video\.id', webpage)
+                if m_variable_name is not None:
+                    config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
+                else:
+                    config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+                config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
+                config_re.append(r'\bconfig\s*=\s*({.+?})\s*;')
+                config = self._search_regex(config_re, webpage, 'info section',
+                                            flags=re.DOTALL)
+                config = json.loads(config)
+        except Exception as e:
+            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
+                raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
+
+            if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
+                if '_video_password_verified' in data:
+                    raise ExtractorError('video password verification failed!')
+                self._verify_video_password(redirect_url, video_id, webpage)
+                return self._real_extract(
+                    smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
+            else:
+                raise ExtractorError('Unable to extract info section',
+                                     cause=e)
+        else:
+            if config.get('view') == 4:
+                config = self._verify_player_video_password(redirect_url, video_id, headers)
+
+        vod = config.get('video', {}).get('vod', {})
+
+        def is_rented():
+            if '>You rented this title.<' in webpage:
+                return True
+            if config.get('user', {}).get('purchased'):
+                return True
+            for purchase_option in vod.get('purchase_options', []):
+                if purchase_option.get('purchased'):
+                    return True
+                label = purchase_option.get('label_string')
+                if label and (label.startswith('You rented this') or label.endswith(' remaining')):
+                    return True
+            return False
+
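+        # if the on-demand title was rented and this page only carries the
+        # trailer, redirect to the full feature video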
+        if is_rented() and vod.get('is_trailer'):
+            feature_id = vod.get('feature_id')
+            if feature_id and not data.get('force_feature_id', False):
+                return self.url_result(smuggle_url(
+                    'https://player.vimeo.com/player/%s' % feature_id,
+                    {'force_feature_id': True}), 'Vimeo')
+
+        # Extract video description
+        if not video_description:
+            video_description = self._html_search_regex(
+                r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
+                webpage, 'description', default=None)
+        if not video_description:
+            video_description = self._html_search_meta(
+                'description', webpage, default=None)
+        if not video_description and is_pro:
+            orig_webpage = self._download_webpage(
+                orig_url, video_id,
+                note='Downloading webpage for description',
+                fatal=False)
+            if orig_webpage:
+                video_description = self._html_search_meta(
+                    'description', orig_webpage, default=None)
+        if not video_description and not is_player:
+            self._downloader.report_warning('Cannot find video description')
+
+        # Extract upload date
+        if not timestamp:
+            timestamp = self._search_regex(
+                r'<time[^>]+datetime="([^"]+)"', webpage,
+                'timestamp', default=None)
+
+        try:
+            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
+            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
+            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
+        except RegexNotFoundError:
+            # This info is only available in vimeo.com/{id} URLs
+            view_count = None
+            like_count = None
+            comment_count = None
+
+        formats = []
+
+        source_format = self._extract_original_format(
+            'https://vimeo.com/' + video_id, video_id)
+        if source_format:
+            formats.append(source_format)
+
+        info_dict_config = self._parse_config(config, video_id)
+        formats.extend(info_dict_config['formats'])
+        self._vimeo_sort_formats(formats)
+
+        json_ld = self._search_json_ld(webpage, video_id, default={})
+
+        if not cc_license:
+            cc_license = self._search_regex(
+                r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
+                webpage, 'license', default=None, group='license')
+
+        channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None
+
+        info_dict = {
+            'formats': formats,
+            'timestamp': unified_timestamp(timestamp),
+            'description': video_description,
+            'webpage_url': url,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'license': cc_license,
+            'channel_id': channel_id,
+            'channel_url': channel_url,
+        }
+
+        info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
+
+        return info_dict
+
+
+class VimeoOndemandIE(VimeoIE):
+    IE_NAME = 'vimeo:ondemand'
+    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        # ondemand video not available via https://vimeo.com/id
+        'url': 'https://vimeo.com/ondemand/20704',
+        'md5': 'c424deda8c7f73c1dfb3edd7630e2f35',
+        'info_dict': {
+            'id': '105442900',
+            'ext': 'mp4',
+            'title': 'המעבדה - במאי יותם פלדמן',
+            'uploader': 'גם סרטים',
+            'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
+            'uploader_id': 'gumfilms',
+            'description': 'md5:4c027c965e439de4baab621e48b60791',
+            'upload_date': '20140906',
+            'timestamp': 1410032453,
+        },
+        'params': {
+            'format': 'best[protocol=https]',
+        },
+        'expected_warnings': ['Unable to download JSON metadata'],
+    }, {
+        # requires Referer to be passed along with og:video:url
+        'url': 'https://vimeo.com/ondemand/36938/126682985',
+        'info_dict': {
+            'id': '126584684',
+            'ext': 'mp4',
+            'title': 'Rävlock, rätt läte på rätt plats',
+            'uploader': 'Lindroth & Norin',
+            'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin',
+            'uploader_id': 'lindrothnorin',
+            'description': 'md5:c3c46a90529612c8279fb6af803fc0df',
+            'upload_date': '20150502',
+            'timestamp': 1430586422,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Unable to download JSON metadata'],
+    }, {
+        'url': 'https://vimeo.com/ondemand/nazmaalik',
+        'only_matching': True,
+    }, {
+        'url': 'https://vimeo.com/ondemand/141692381',
+        'only_matching': True,
+    }, {
+        'url': 'https://vimeo.com/ondemand/thelastcolony/150274832',
+        'only_matching': True,
+    }]
+
+
+class VimeoChannelIE(VimeoBaseInfoExtractor):
+    IE_NAME = 'vimeo:channel'
+    _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])'
+    _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+    _TITLE = None
+    _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
+    _TESTS = [{
+        'url': 'https://vimeo.com/channels/tributes',
+        'info_dict': {
+            'id': 'tributes',
+            'title': 'Vimeo Tributes',
+        },
+        'playlist_mincount': 25,
+    }]
+    _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s'
+
+    def _page_url(self, base_url, pagenum):
+        return '%s/videos/page:%d/' % (base_url, pagenum)
+
+    def _extract_list_title(self, webpage):
+        return self._TITLE or self._html_search_regex(
+            self._TITLE_RE, webpage, 'list title', fatal=False)
+
+    def _title_and_entries(self, list_id, base_url):
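+        # generator: the first item yielded is the list title, the rest are entries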
+        for pagenum in itertools.count(1):
+            page_url = self._page_url(base_url, pagenum)
+            webpage = self._download_webpage(
+                page_url, list_id,
+                'Downloading page %s' % pagenum)
+
+            if pagenum == 1:
+                yield self._extract_list_title(webpage)
+
+            # Try extracting href first since not all videos are available via
+            # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729)
+            clips = re.findall(
+                r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage)
+            if clips:
+                for video_id, video_url, video_title in clips:
+                    yield self.url_result(
+                        compat_urlparse.urljoin(base_url, video_url),
+                        VimeoIE.ie_key(), video_id=video_id, video_title=video_title)
+            # More relaxed fallback
+            else:
+                for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
+                    yield self.url_result(
+                        'https://vimeo.com/%s' % video_id,
+                        VimeoIE.ie_key(), video_id=video_id)
+
+            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+                break
+
+    def _extract_videos(self, list_id, base_url):
+        title_and_entries = self._title_and_entries(list_id, base_url)
+        list_title = next(title_and_entries)
+        return self.playlist_result(title_and_entries, list_id, list_title)
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id)
+
+
+class VimeoUserIE(VimeoChannelIE):
+    IE_NAME = 'vimeo:user'
+    _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos|[#?]|$)'
+    _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
+    _TESTS = [{
+        'url': 'https://vimeo.com/nkistudio/videos',
+        'info_dict': {
+            'title': 'Nki',
+            'id': 'nkistudio',
+        },
+        'playlist_mincount': 66,
+    }]
+    _BASE_URL_TEMPL = 'https://vimeo.com/%s'
+
+
+class VimeoAlbumIE(VimeoBaseInfoExtractor):
+    IE_NAME = 'vimeo:album'
+    _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P<id>\d+)(?:$|[?#]|/(?!video))'
+    _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
+    _TESTS = [{
+        'url': 'https://vimeo.com/album/2632481',
+        'info_dict': {
+            'id': '2632481',
+            'title': 'Staff Favorites: November 2013',
+        },
+        'playlist_mincount': 13,
+    }, {
+        'note': 'Password-protected album',
+        'url': 'https://vimeo.com/album/3253534',
+        'info_dict': {
+            'title': 'test',
+            'id': '3253534',
+        },
+        'playlist_count': 1,
+        'params': {
+            'videopassword': 'youtube-dl',
+        }
+    }]
+    _PAGE_SIZE = 100
+
+    def _fetch_page(self, album_id, authorization, hashed_pass, page):
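+        # OnDemandPagedList passes zero-based page indices; the Vimeo API is one-based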
+        api_page = page + 1
+        query = {
+            'fields': 'link,uri',
+            'page': api_page,
+            'per_page': self._PAGE_SIZE,
+        }
+        if hashed_pass:
+            query['_hashed_pass'] = hashed_pass
+        videos = self._download_json(
+            'https://api.vimeo.com/albums/%s/videos' % album_id,
+            album_id, 'Downloading page %d' % api_page, query=query, headers={
+                'Authorization': 'jwt ' + authorization,
+            })['data']
+        for video in videos:
+            link = video.get('link')
+            if not link:
+                continue
+            uri = video.get('uri')
+            video_id = self._search_regex(r'/videos/(\d+)', uri, 'video_id', default=None) if uri else None
+            yield self.url_result(link, VimeoIE.ie_key(), video_id)
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+        webpage = self._download_webpage(url, album_id)
+        viewer = self._parse_json(self._search_regex(
+            r'bootstrap_data\s*=\s*({.+?})</script>',
+            webpage, 'bootstrap data'), album_id)['viewer']
+        jwt = viewer['jwt']
+        album = self._download_json(
+            'https://api.vimeo.com/albums/' + album_id,
+            album_id, headers={'Authorization': 'jwt ' + jwt},
+            query={'fields': 'description,name,privacy'})
+        hashed_pass = None
+        if try_get(album, lambda x: x['privacy']['view']) == 'password':
+            password = self._downloader.params.get('videopassword')
+            if not password:
+                raise ExtractorError(
+                    'This album is protected by a password, use the --video-password option',
+                    expected=True)
+            self._set_vimeo_cookie('vuid', viewer['vuid'])
+            try:
+                hashed_pass = self._download_json(
+                    'https://vimeo.com/showcase/%s/auth' % album_id,
+                    album_id, 'Verifying the password', data=urlencode_postdata({
+                        'password': password,
+                        'token': viewer['xsrft'],
+                    }), headers={
+                        'X-Requested-With': 'XMLHttpRequest',
+                    })['hashed_pass']
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    raise ExtractorError('Wrong password', expected=True)
+                raise
+        entries = OnDemandPagedList(functools.partial(
+            self._fetch_page, album_id, jwt, hashed_pass), self._PAGE_SIZE)
+        return self.playlist_result(
+            entries, album_id, album.get('name'), album.get('description'))
+
+
+class VimeoGroupsIE(VimeoChannelIE):
+    IE_NAME = 'vimeo:group'
+    _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)'
+    _TESTS = [{
+        'url': 'https://vimeo.com/groups/kattykay',
+        'info_dict': {
+            'id': 'kattykay',
+            'title': 'Katty Kay',
+        },
+        'playlist_mincount': 27,
+    }]
+    _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s'
+
+
+class VimeoReviewIE(VimeoBaseInfoExtractor):
+    IE_NAME = 'vimeo:review'
+    IE_DESC = 'Review pages on vimeo'
+    _VALID_URL = r'(?P<url>https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)/[0-9a-f]{10})'
+    _TESTS = [{
+        'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d',
+        'md5': 'c507a72f780cacc12b2248bb4006d253',
+        'info_dict': {
+            'id': '75524534',
+            'ext': 'mp4',
+            'title': "DICK HARDWICK 'Comedian'",
+            'uploader': 'Richard Hardwick',
+            'uploader_id': 'user21297594',
+            'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
+        },
+        'expected_warnings': ['Unable to download JSON metadata'],
+    }, {
+        'note': 'video player needs Referer',
+        'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
+        'md5': '6295fdab8f4bf6a002d058b2c6dce276',
+        'info_dict': {
+            'id': '91613211',
+            'ext': 'mp4',
+            'title': 're:(?i)^Death by dogma versus assembling agile . Sander Hoogendoorn',
+            'uploader': 'DevWeek Events',
+            'duration': 2773,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader_id': 'user22258446',
+        },
+        'skip': 'video gone',
+    }, {
+        'note': 'Password protected',
+        'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
+        'info_dict': {
+            'id': '138823582',
+            'ext': 'mp4',
+            'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
+            'uploader': 'TMB',
+            'uploader_id': 'user37284429',
+        },
+        'params': {
+            'videopassword': 'holygrail',
+        },
+        'skip': 'video gone',
+    }]
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        page_url, video_id = re.match(self._VALID_URL, url).groups()
+        clip_data = self._download_json(
+            page_url.replace('/review/', '/review/data/'),
+            video_id)['clipData']
+        config_url = clip_data['configUrl']
+        config = self._download_json(config_url, video_id)
+        info_dict = self._parse_config(config, video_id)
+        source_format = self._extract_original_format(
+            page_url + '/action', video_id)
+        if source_format:
+            info_dict['formats'].append(source_format)
+        self._vimeo_sort_formats(info_dict['formats'])
+        info_dict['description'] = clean_html(clip_data.get('description'))
+        return info_dict
+
+
+class VimeoWatchLaterIE(VimeoChannelIE):
+    IE_NAME = 'vimeo:watchlater'
+    IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
+    _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater'
+    _TITLE = 'Watch Later'
+    _LOGIN_REQUIRED = True
+    _TESTS = [{
+        'url': 'https://vimeo.com/watchlater',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
+        self._login()
+
+    def _page_url(self, base_url, pagenum):
+        url = '%s/page:%d/' % (base_url, pagenum)
+        request = sanitized_Request(url)
+        # Set the header to get a partial HTML page with the IDs;
+        # the normal page doesn't contain them.
+        request.add_header('X-Requested-With', 'XMLHttpRequest')
+        return request
+
+    def _real_extract(self, url):
+        return self._extract_videos('watchlater', 'https://vimeo.com/watchlater')
+
+
+class VimeoLikesIE(VimeoChannelIE):
+    _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P<id>[^/]+)/likes/?(?:$|[?#]|sort:)'
+    IE_NAME = 'vimeo:likes'
+    IE_DESC = 'Vimeo user likes'
+    _TESTS = [{
+        'url': 'https://vimeo.com/user755559/likes/',
+        'playlist_mincount': 293,
+        'info_dict': {
+            'id': 'user755559',
+            'title': 'urza’s Likes',
+        },
+    }, {
+        'url': 'https://vimeo.com/stormlapse/likes',
+        'only_matching': True,
+    }]
+
+    def _page_url(self, base_url, pagenum):
+        return '%s/page:%d/' % (base_url, pagenum)
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        return self._extract_videos(user_id, 'https://vimeo.com/%s/likes' % user_id)
+
+
+class VHXEmbedIE(VimeoBaseInfoExtractor):
+    IE_NAME = 'vhx:embed'
+    _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        config_url = self._parse_json(self._search_regex(
+            r'window\.OTTData\s*=\s*({.+})', webpage,
+            'ott data'), video_id, js_to_json)['config_url']
+        config = self._download_json(config_url, video_id)
+        info = self._parse_config(config, video_id)
+        self._vimeo_sort_formats(info['formats'])
+        return info
diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py
new file mode 100644 (file)
index 0000000..c74b437
--- /dev/null
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class SprutoBaseIE(InfoExtractor):
+    def _extract_spruto(self, spruto, video_id):
+        playlist = spruto['playlist'][0]
+        title = playlist['title']
+        video_id = playlist.get('videoId') or video_id
+        thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
+        duration = int_or_none(playlist.get('duration'))
+
+        formats = [{
+            'url': f['url'],
+        } for f in playlist['video']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
+
+
+class VimpleIE(SprutoBaseIE):
+    IE_DESC = 'Vimple - one-click video hosting'
+    _VALID_URL = r'https?://(?:player\.vimple\.(?:ru|co)/iframe|vimple\.(?:ru|co))/(?P<id>[\da-f-]{32,36})'
+    _TESTS = [{
+        'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf',
+        'md5': '2e750a330ed211d3fd41821c6ad9a279',
+        'info_dict': {
+            'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf',
+            'ext': 'mp4',
+            'title': 'Sunset',
+            'duration': 20,
+            'thumbnail': r're:https?://.*?\.jpg',
+        },
+    }, {
+        'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9',
+        'only_matching': True,
+    }, {
+        'url': 'http://vimple.co/04506a053f124483b8fb05ed73899f19',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://player.vimple.ru/iframe/%s' % video_id, video_id)
+
+        spruto = self._parse_json(
+            self._search_regex(
+                r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'),
+            video_id)
+
+        return self._extract_spruto(spruto, video_id)
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
new file mode 100644 (file)
index 0000000..80b896b
--- /dev/null
@@ -0,0 +1,154 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class VineIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://vine.co/v/b9KOOWX7HUx',
+        'md5': '2f36fed6235b16da96ce9b4dc890940d',
+        'info_dict': {
+            'id': 'b9KOOWX7HUx',
+            'ext': 'mp4',
+            'title': 'Chicken.',
+            'alt_title': 'Vine by Jack',
+            'timestamp': 1368997951,
+            'upload_date': '20130519',
+            'uploader': 'Jack',
+            'uploader_id': '76',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+        },
+    }, {
+        'url': 'https://vine.co/v/e192BnZnZ9V',
+        'info_dict': {
+            'id': 'e192BnZnZ9V',
+            'ext': 'mp4',
+            'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries  #lovesickseason2',
+            'alt_title': 'Vine by Pimry_zaa',
+            'timestamp': 1436057405,
+            'upload_date': '20150705',
+            'uploader': 'Pimry_zaa',
+            'uploader_id': '1135760698325307392',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://vine.co/v/MYxVapFvz2z',
+        'only_matching': True,
+    }, {
+        'url': 'https://vine.co/v/bxVjBbZlPUH',
+        'only_matching': True,
+    }, {
+        'url': 'https://vine.co/oembed/MYxVapFvz2z.json',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data = self._download_json(
+            'https://archive.vine.co/posts/%s.json' % video_id, video_id)
+
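+        # the archive JSON is inconsistent about URL key casing, so try both
+        # the 'Url' and 'URL' suffixes for each kind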
+        def video_url(kind):
+            for url_suffix in ('Url', 'URL'):
+                format_url = data.get('video%s%s' % (kind, url_suffix))
+                if format_url:
+                    return format_url
+
+        formats = []
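+        # candidate kinds are listed in ascending quality order, so the
+        # enumerate index doubles as the quality ranking ('' is the plain
+        # videoUrl key)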
+        for quality, format_id in enumerate(('low', '', 'dash')):
+            format_url = video_url(format_id.capitalize())
+            if not format_url:
+                continue
+            # the DASH link sometimes points at a plain mp4 rather than a
+            # manifest, so only treat it as MPD when the extension says so
+            if format_id == 'dash' and determine_ext(format_url) == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id or 'standard',
+                    'quality': quality,
+                })
+        self._sort_formats(formats)
+
+        username = data.get('username')
+
+        alt_title = 'Vine by %s' % username if username else None
+
+        return {
+            'id': video_id,
+            'title': data.get('description') or alt_title or 'Vine video',
+            'alt_title': alt_title,
+            'thumbnail': data.get('thumbnailUrl'),
+            'timestamp': unified_timestamp(data.get('created')),
+            'uploader': username,
+            'uploader_id': data.get('userIdStr'),
+            'view_count': int_or_none(data.get('loops')),
+            'like_count': int_or_none(data.get('likes')),
+            'comment_count': int_or_none(data.get('comments')),
+            'repost_count': int_or_none(data.get('reposts')),
+            'formats': formats,
+        }
+
+
+class VineUserIE(InfoExtractor):
+    IE_NAME = 'vine:user'
+    _VALID_URL = r'https?://vine\.co/(?P<u>u/)?(?P<user>[^/]+)'
+    _VINE_BASE_URL = 'https://vine.co/'
+    _TESTS = [{
+        'url': 'https://vine.co/itsruthb',
+        'info_dict': {
+            'id': 'itsruthb',
+            'title': 'Ruth B',
+            'description': '| Instagram/Twitter: itsruthb | still a lost boy from neverland',
+        },
+        'playlist_mincount': 611,
+    }, {
+        'url': 'https://vine.co/u/942914934646415360',
+        'only_matching': True,
+    }]
+
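+    # single-video URLs also match _VALID_URL above, so defer them to VineIE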
+    @classmethod
+    def suitable(cls, url):
+        return False if VineIE.suitable(url) else super(VineUserIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user = mobj.group('user')
+        u = mobj.group('u')
+
+        profile_url = '%sapi/users/profiles/%s%s' % (
+            self._VINE_BASE_URL, 'vanity/' if not u else '', user)
+        profile_data = self._download_json(
+            profile_url, user, note='Downloading user profile data')
+
+        data = profile_data['data']
+        user_id = data.get('userId') or data['userIdStr']
+        profile = self._download_json(
+            'https://archive.vine.co/profiles/%s.json' % user_id, user_id)
+        entries = [
+            self.url_result(
+                'https://vine.co/v/%s' % post_id, ie='Vine', video_id=post_id)
+            for post_id in profile['posts']
+            if post_id and isinstance(post_id, compat_str)]
+        return self.playlist_result(
+            entries, user, profile.get('username'), profile.get('description'))
diff --git a/youtube_dl/extractor/viqeo.py b/youtube_dl/extractor/viqeo.py
new file mode 100644 (file)
index 0000000..be7dfa8
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    str_or_none,
+    url_or_none,
+)
+
+
+class ViqeoIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            viqeo:|
+                            https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=|
+                            https?://api\.viqeo\.tv/v\d+/data/startup\?.*?\bvideo(?:%5B%5D|\[\])=
+                        )
+                        (?P<id>[\da-f]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837',
+        'md5': 'a169dd1a6426b350dca4296226f21e76',
+        'info_dict': {
+            'id': 'cde96f09d25f39bee837',
+            'ext': 'mp4',
+            'title': 'cde96f09d25f39bee837',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 76,
+        },
+    }, {
+        'url': 'viqeo:cde96f09d25f39bee837',
+        'only_matching': True,
+    }, {
+        'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1',
+                webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id)
+
+        data = self._parse_json(
+            self._search_regex(
+                r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'),
+            video_id)
+
+        formats = []
+        thumbnails = []
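+        # mediaFiles mixes images with audio/video renditions; split them by
+        # the MIME type prefix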
+        for media_file in data['mediaFiles']:
+            if not isinstance(media_file, dict):
+                continue
+            media_url = url_or_none(media_file.get('url'))
+            if not media_url or not media_url.startswith(('http', '//')):
+                continue
+            media_type = str_or_none(media_file.get('type'))
+            if not media_type:
+                continue
+            media_kind = media_type.split('/')[0].lower()
+            f = {
+                'url': media_url,
+                'width': int_or_none(media_file.get('width')),
+                'height': int_or_none(media_file.get('height')),
+            }
+            format_id = str_or_none(media_file.get('quality'))
+            if media_kind == 'image':
+                f['id'] = format_id
+                thumbnails.append(f)
+            elif media_kind in ('video', 'audio'):
+                is_audio = media_kind == 'audio'
+                f.update({
+                    'format_id': 'audio' if is_audio else format_id,
+                    'fps': int_or_none(media_file.get('fps')),
+                    'vcodec': 'none' if is_audio else None,
+                })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        duration = int_or_none(data.get('duration'))
+
+        return {
+            'id': video_id,
+            'title': video_id,
+            'duration': duration,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py
new file mode 100644 (file)
index 0000000..3bd3752
--- /dev/null
@@ -0,0 +1,272 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_kwargs,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class ViuBaseIE(InfoExtractor):
+    def _real_initialize(self):
+        viu_auth_res = self._request_webpage(
+            'https://www.viu.com/api/apps/v2/authenticate', None,
+            'Requesting Viu auth', query={
+                'acct': 'test',
+                'appid': 'viu_desktop',
+                'fmt': 'json',
+                'iid': 'guest',
+                'languageid': 'default',
+                'platform': 'desktop',
+                'userid': 'guest',
+                'useridtype': 'guest',
+                'ver': '1.0'
+            }, headers=self.geo_verification_headers())
+        self._auth_token = viu_auth_res.info()['X-VIU-AUTH']
+
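+    # every API request must carry the X-VIU-AUTH token obtained in
+    # _real_initialize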
+    def _call_api(self, path, *args, **kwargs):
+        headers = self.geo_verification_headers()
+        headers.update({
+            'X-VIU-AUTH': self._auth_token
+        })
+        headers.update(kwargs.get('headers', {}))
+        kwargs['headers'] = headers
+        response = self._download_json(
+            'https://www.viu.com/api/' + path, *args,
+            **compat_kwargs(kwargs))['response']
+        if response.get('status') != 'success':
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, response['message']), expected=True)
+        return response
+
+
+class ViuIE(ViuBaseIE):
+    _VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059',
+        'info_dict': {
+            'id': '1116705532',
+            'ext': 'mp4',
+            'title': 'Citizen Khan - Ep 1',
+            'description': 'md5:d7ea1604f49e5ba79c212c551ce2110e',
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        },
+        'skip': 'Geo-restricted to India',
+    }, {
+        'url': 'https://www.viu.com/en/media/1130599965',
+        'info_dict': {
+            'id': '1130599965',
+            'ext': 'mp4',
+            'title': 'Jealousy Incarnate - Episode 1',
+            'description': 'md5:d3d82375cab969415d2720b6894361e9',
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        },
+        'skip': 'Geo-restricted to Indonesia',
+    }, {
+        'url': 'https://india.viu.com/en/media/1126286865',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video_data = self._call_api(
+            'clip/load', video_id, 'Downloading video data', query={
+                'appid': 'viu_desktop',
+                'fmt': 'json',
+                'id': video_id
+            })['item'][0]
+
+        title = video_data['title']
+
+        m3u8_url = None
+        url_path = video_data.get('urlpathd') or video_data.get('urlpath')
+        tdirforwhole = video_data.get('tdirforwhole')
+        # #EXT-X-BYTERANGE is not supported by the native HLS downloader
+        # or by ffmpeg (#10955)
+        # hls_file = video_data.get('hlsfile')
+        hls_file = video_data.get('jwhlsfile')
+        if url_path and tdirforwhole and hls_file:
+            m3u8_url = '%s/%s/%s' % (url_path, tdirforwhole, hls_file)
+        else:
+            # m3u8_url = re.sub(
+            #     r'(/hlsc_)[a-z]+(\d+\.m3u8)',
+            #     r'\1whe\2', video_data['href'])
+            m3u8_url = video_data['href']
+        formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for key, value in video_data.items():
+            mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key)
+            if not mobj:
+                continue
+            subtitles.setdefault(mobj.group('lang'), []).append({
+                'url': value,
+                'ext': mobj.group('ext')
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'series': video_data.get('moviealbumshowname'),
+            'episode': title,
+            'episode_number': int_or_none(video_data.get('episodeno')),
+            'duration': int_or_none(video_data.get('duration')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class ViuPlaylistIE(ViuBaseIE):
+    IE_NAME = 'viu:playlist'
+    _VALID_URL = r'https?://www\.viu\.com/[^/]+/listing/playlist-(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.viu.com/en/listing/playlist-22461380',
+        'info_dict': {
+            'id': '22461380',
+            'title': 'The Good Wife',
+        },
+        'playlist_count': 16,
+        'skip': 'Geo-restricted to Indonesia',
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        playlist_data = self._call_api(
+            'container/load', playlist_id,
+            'Downloading playlist info', query={
+                'appid': 'viu_desktop',
+                'fmt': 'json',
+                'id': 'playlist-' + playlist_id
+            })['container']
+
+        entries = []
+        for item in playlist_data.get('item', []):
+            item_id = item.get('id')
+            if not item_id:
+                continue
+            item_id = compat_str(item_id)
+            entries.append(self.url_result(
+                'viu:' + item_id, 'Viu', item_id))
+
+        return self.playlist_result(
+            entries, playlist_id, playlist_data.get('title'))
+
+
+class ViuOTTIE(InfoExtractor):
+    IE_NAME = 'viu:ott'
+    _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P<country_code>[a-z]{2})/[a-z]{2}-[a-z]{2}/vod/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I',
+        'info_dict': {
+            'id': '3421',
+            'ext': 'mp4',
+            'title': 'A New Beginning',
+            'description': 'md5:1e7486a619b6399b25ba6a41c0fe5b2c',
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        },
+        'skip': 'Geo-restricted to Singapore',
+    }, {
+        'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90',
+        'info_dict': {
+            'id': '7123',
+            'ext': 'mp4',
+            'title': '這就是我的生活之道',
+            'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f',
+        },
+        'params': {
+            'skip_download': 'm3u8 download',
+        },
+        'skip': 'Geo-restricted to Hong Kong',
+    }]
+
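+    # numeric area ids expected by the vod/ajax-detail endpoint, keyed by
+    # the country code taken from the URL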
+    _AREA_ID = {
+        'HK': 1,
+        'SG': 2,
+        'TH': 4,
+        'PH': 5,
+    }
+
+    def _real_extract(self, url):
+        country_code, video_id = re.match(self._VALID_URL, url).groups()
+
+        query = {
+            'r': 'vod/ajax-detail',
+            'platform_flag_label': 'web',
+            'product_id': video_id,
+        }
+
+        area_id = self._AREA_ID.get(country_code.upper())
+        if area_id:
+            query['area_id'] = area_id
+
+        product_data = self._download_json(
+            'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
+            'Downloading video info', query=query)['data']
+
+        video_data = product_data.get('current_product')
+        if not video_data:
+            raise ExtractorError('This video is not available in your region.', expected=True)
+
+        stream_data = self._download_json(
+            'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
+            video_id, 'Downloading stream info', query={
+                'ccs_product_id': video_data['ccs_product_id'],
+            }, headers={
+                'Referer': url,
+                'Origin': re.search(r'https?://[^/]+', url).group(0),
+            })['data']['stream']
+
+        stream_sizes = stream_data.get('size', {})
+        formats = []
+        for vid_format, stream_url in stream_data.get('url', {}).items():
+            height = int_or_none(self._search_regex(
+                r's(\d+)p', vid_format, 'height', default=None))
+            formats.append({
+                'format_id': vid_format,
+                'url': stream_url,
+                'height': height,
+                'ext': 'mp4',
+                'filesize': int_or_none(stream_sizes.get(vid_format))
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for sub in video_data.get('subtitle', []):
+            sub_url = sub.get('url')
+            if not sub_url:
+                continue
+            subtitles.setdefault(sub.get('name'), []).append({
+                'url': sub_url,
+                'ext': 'srt',
+            })
+
+        title = video_data['synopsis'].strip()
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'series': product_data.get('series', {}).get('name'),
+            'episode': title,
+            'episode_number': int_or_none(video_data.get('number')),
+            'duration': int_or_none(stream_data.get('duration')),
+            'thumbnail': video_data.get('cover_image_url'),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
new file mode 100644 (file)
index 0000000..00ec006
--- /dev/null
@@ -0,0 +1,678 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import collections
+import functools
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    get_element_by_class,
+    int_or_none,
+    OnDemandPagedList,
+    orderedSet,
+    str_or_none,
+    str_to_int,
+    unescapeHTML,
+    unified_timestamp,
+    url_or_none,
+    urlencode_postdata,
+)
+from .dailymotion import DailymotionIE
+from .odnoklassniki import OdnoklassnikiIE
+from .pladform import PladformIE
+from .vimeo import VimeoIE
+from .youtube import YoutubeIE
+
+
+class VKBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'vk'
+
+    def _login(self):
+        username, password = self._get_login_info()
+        if username is None:
+            return
+
+        login_page, url_handle = self._download_webpage_handle(
+            'https://vk.com', None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
+        })
+
+        # vk serves two identical remixlhk cookies in the Set-Cookie header
+        # and expects the first one to actually be set
+        self._apply_first_set_cookie_header(url_handle, 'remixlhk')
+
+        login_page = self._download_webpage(
+            'https://login.vk.com/?act=login', None,
+            note='Logging in',
+            data=urlencode_postdata(login_form))
+
+        if re.search(r'onLoginFailed', login_page):
+            raise ExtractorError(
+                'Unable to login, incorrect username and/or password', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _download_payload(self, path, video_id, data, fatal=True):
+        data['al'] = 1
+        code, payload = self._download_json(
+            'https://vk.com/%s.php' % path, video_id,
+            data=urlencode_postdata(data), fatal=fatal,
+            headers={'X-Requested-With': 'XMLHttpRequest'})['payload']
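+        # payload code '3' means login is required; '8' wraps an error
+        # message in its first element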
+        if code == '3':
+            self.raise_login_required()
+        elif code == '8':
+            raise ExtractorError(clean_html(payload[0][1:-1]), expected=True)
+        return payload
+
+
+class VKIE(VKBaseIE):
+    IE_NAME = 'vk'
+    IE_DESC = 'VK'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:
+                                (?:(?:m|new)\.)?vk\.com/video_|
+                                (?:www\.)?daxab\.com/
+                            )
+                            ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
+                            (?:
+                                (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                                (?:www\.)?daxab\.com/embed/
+                            )
+                            (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
+                        )
+                    '''
+    _TESTS = [
+        {
+            'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
+            'md5': '7babad3b85ea2e91948005b1b8b0cb84',
+            'info_dict': {
+                'id': '-77521_162222515',
+                'ext': 'mp4',
+                'title': 'ProtivoGunz - Хуёвая песня',
+                'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
+                'uploader_id': '-77521',
+                'duration': 195,
+                'timestamp': 1329049880,
+                'upload_date': '20120212',
+            },
+        },
+        {
+            'url': 'http://vk.com/video205387401_165548505',
+            'info_dict': {
+                'id': '205387401_165548505',
+                'ext': 'mp4',
+                'title': 'No name',
+                'uploader': 'Tom Cruise',
+                'uploader_id': '205387401',
+                'duration': 9,
+                'timestamp': 1374364108,
+                'upload_date': '20130720',
+            }
+        },
+        {
+            'note': 'Embedded video',
+            'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa',
+            'md5': '7babad3b85ea2e91948005b1b8b0cb84',
+            'info_dict': {
+                'id': '-77521_162222515',
+                'ext': 'mp4',
+                'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
+                'title': 'ProtivoGunz - Хуёвая песня',
+                'duration': 195,
+                'upload_date': '20120212',
+                'timestamp': 1329049880,
+                'uploader_id': '-77521',
+            },
+        },
+        {
+            # VIDEO NOW REMOVED
+            # please update if you find a video whose URL follows the same pattern
+            'url': 'http://vk.com/video-8871596_164049491',
+            'md5': 'a590bcaf3d543576c9bd162812387666',
+            'note': 'Only available for registered users',
+            'info_dict': {
+                'id': '-8871596_164049491',
+                'ext': 'mp4',
+                'uploader': 'Триллеры',
+                'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]',
+                'duration': 8352,
+                'upload_date': '20121218',
+                'view_count': int,
+            },
+            'skip': 'Removed',
+        },
+        {
+            'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
+            'info_dict': {
+                'id': '-43215063_168067957',
+                'ext': 'mp4',
+                'uploader': 'Bro Mazter',
+                'title': ' ',
+                'duration': 7291,
+                'upload_date': '20140328',
+                'uploader_id': '223413403',
+                'timestamp': 1396018030,
+            },
+            'skip': 'Requires vk account credentials',
+        },
+        {
+            'url': 'http://m.vk.com/video-43215063_169084319?list=125c627d1aa1cebb83&from=wall-43215063_2566540',
+            'md5': '0c45586baa71b7cb1d0784ee3f4e00a6',
+            'note': 'ivi.ru embed',
+            'info_dict': {
+                'id': '-43215063_169084319',
+                'ext': 'mp4',
+                'title': 'Книга Илая',
+                'duration': 6771,
+                'upload_date': '20140626',
+                'view_count': int,
+            },
+            'skip': 'Removed',
+        },
+        {
+            # video (removed?) only available with list id
+            'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4',
+            'md5': '091287af5402239a1051c37ec7b92913',
+            'info_dict': {
+                'id': '30481095_171201961',
+                'ext': 'mp4',
+                'title': 'ТюменцевВВ_09.07.2015',
+                'uploader': 'Anton Ivanov',
+                'duration': 109,
+                'upload_date': '20150709',
+                'view_count': int,
+            },
+            'skip': 'Removed',
+        },
+        {
+            # youtube embed
+            'url': 'https://vk.com/video276849682_170681728',
+            'info_dict': {
+                'id': 'V3K4mi0SYkc',
+                'ext': 'mp4',
+                'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
+                'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+                'duration': 178,
+                'upload_date': '20130116',
+                'uploader': "Children's Joy Foundation Inc.",
+                'uploader_id': 'thecjf',
+                'view_count': int,
+            },
+        },
+        {
+            # dailymotion embed
+            'url': 'https://vk.com/video-37468416_456239855',
+            'info_dict': {
+                'id': 'k3lz2cmXyRuJQSjGHUv',
+                'ext': 'mp4',
+                'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
+                'description': 'md5:424b8e88cc873217f520e582ba28bb36',
+                'uploader': 'AniLibria.Tv',
+                'upload_date': '20160914',
+                'uploader_id': 'x1p5vl5',
+                'timestamp': 1473877246,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # video key is extra_data not url\d+
+            'url': 'http://vk.com/video-110305615_171782105',
+            'md5': 'e13fcda136f99764872e739d13fac1d1',
+            'info_dict': {
+                'id': '-110305615_171782105',
+                'ext': 'mp4',
+                'title': 'S-Dance, репетиции к The way show',
+                'uploader': 'THE WAY SHOW | 17 апреля',
+                'uploader_id': '-110305615',
+                'timestamp': 1454859345,
+                'upload_date': '20160207',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # finished live stream, postlive_mp4
+            'url': 'https://vk.com/videos-387766?z=video-387766_456242764%2Fpl_-387766_-2',
+            'info_dict': {
+                'id': '-387766_456242764',
+                'ext': 'mp4',
+                'title': 'ИгроМир 2016 День 1 — Игромания Утром',
+                'uploader': 'Игромания',
+                'duration': 5239,
+                # TODO: use act=show to extract view_count
+                # 'view_count': int,
+                'upload_date': '20160929',
+                'uploader_id': '-387766',
+                'timestamp': 1475137527,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # live stream with hls and rtmp links; it has most likely already
+            # finished by the time you are reading this comment
+            'url': 'https://vk.com/video-140332_456239111',
+            'only_matching': True,
+        },
+        {
+            # removed video, just testing that we match the pattern
+            'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
+            'only_matching': True,
+        },
+        {
+            # age restricted video, requires vk account credentials
+            'url': 'https://vk.com/video205387401_164765225',
+            'only_matching': True,
+        },
+        {
+            # pladform embed
+            'url': 'https://vk.com/video-76116461_171554880',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://new.vk.com/video205387401_165548505',
+            'only_matching': True,
+        },
+        {
+            # This video is no longer available, because its author has been blocked.
+            'url': 'https://vk.com/video-10639516_456240611',
+            'only_matching': True,
+        },
+        {
+            # The video is not available in your region.
+            'url': 'https://vk.com/video-51812607_171445436',
+            'only_matching': True,
+        }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+
+        mv_data = {}
+        if video_id:
+            data = {
+                'act': 'show_inline',
+                'video': video_id,
+            }
+            # Some videos (possibly removed ones) can only be downloaded with a list id specified
+            list_id = mobj.group('list_id')
+            if list_id:
+                data['list'] = list_id
+
+            payload = self._download_payload('al_video', video_id, data)
+            info_page = payload[1]
+            opts = payload[-1]
+            mv_data = opts.get('mvData') or {}
+            player = opts.get('player') or {}
+        else:
+            video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
+
+            info_page = self._download_webpage(
+                'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id)
+
+            error_message = self._html_search_regex(
+                [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+                    r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
+                info_page, 'error message', default=None)
+            if error_message:
+                raise ExtractorError(error_message, expected=True)
+
+            if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
+                raise ExtractorError(
+                    'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
+                    expected=True)
+
+            ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.'
+
+            ERRORS = {
+                r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
+                ERROR_COPYRIGHT,
+
+                r'>The video .*? was removed from public access by request of the copyright holder.<':
+                ERROR_COPYRIGHT,
+
+                r'<!>Please log in or <':
+                'Video %s is only available for registered users, '
+                'use --username and --password options to provide account credentials.',
+
+                r'<!>Unknown error':
+                'Video %s does not exist.',
+
+                r'<!>Видео временно недоступно':
+                'Video %s is temporarily unavailable.',
+
+                r'<!>Access denied':
+                'Access denied to video %s.',
+
+                r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
+                'Video %s is no longer available, because its author has been blocked.',
+
+                r'<!>This video is no longer available, because its author has been blocked.':
+                'Video %s is no longer available, because its author has been blocked.',
+
+                r'<!>This video is no longer available, because it has been deleted.':
+                'Video %s is no longer available, because it has been deleted.',
+
+                r'<!>The video .+? is not available in your region.':
+                'Video %s is not available in your region.',
+            }
+
+            for error_re, error_msg in ERRORS.items():
+                if re.search(error_re, info_page):
+                    raise ExtractorError(error_msg % video_id, expected=True)
+
+            player = self._parse_json(self._search_regex(
+                r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n',
+                info_page, 'player params'), video_id)
+
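+        # vk pages frequently embed third-party players; hand those off to
+        # the dedicated extractors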
+        youtube_url = YoutubeIE._extract_url(info_page)
+        if youtube_url:
+            return self.url_result(youtube_url, YoutubeIE.ie_key())
+
+        vimeo_url = VimeoIE._extract_url(url, info_page)
+        if vimeo_url is not None:
+            return self.url_result(vimeo_url, VimeoIE.ie_key())
+
+        pladform_url = PladformIE._extract_url(info_page)
+        if pladform_url:
+            return self.url_result(pladform_url, PladformIE.ie_key())
+
+        m_rutube = re.search(
+            r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page)
+        if m_rutube is not None:
+            rutube_url = self._proto_relative_url(
+                m_rutube.group(1).replace('\\', ''))
+            return self.url_result(rutube_url)
+
+        dailymotion_urls = DailymotionIE._extract_urls(info_page)
+        if dailymotion_urls:
+            return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
+
+        odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page)
+        if odnoklassniki_url:
+            return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
+
+        m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
+        if m_opts:
+            m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
+            if m_opts_url:
+                opts_url = m_opts_url.group(1)
+                if opts_url.startswith('//'):
+                    opts_url = 'http:' + opts_url
+                return self.url_result(opts_url)
+
+        data = player['params'][0]
+        title = unescapeHTML(data['md_title'])
+
+        # 2 = live
+        # 3 = post live (finished live)
+        is_live = data.get('live') == 2
+        if is_live:
+            title = self._live_title(title)
+
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page,
+            'upload date', default=None)) or int_or_none(data.get('date'))
+
+        view_count = str_to_int(self._search_regex(
+            r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)',
+            info_page, 'view count', default=None))
+
+        formats = []
+        for format_id, format_url in data.items():
+            format_url = url_or_none(format_url)
+            if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
+                continue
+            if (format_id.startswith(('url', 'cache'))
+                    or format_id in ('extra_data', 'live_mp4', 'postlive_mp4')):
+                height = int_or_none(self._search_regex(
+                    r'^(?:url|cache)(\d+)', format_id, 'height', default=None))
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                    'height': height,
+                })
+            elif format_id == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id=format_id, fatal=False, live=is_live))
+            elif format_id == 'rtmp':
+                formats.append({
+                    'format_id': format_id,
+                    'url': format_url,
+                    'ext': 'flv',
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'thumbnail': data.get('jpg'),
+            'uploader': data.get('md_author'),
+            'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')),
+            'duration': int_or_none(data.get('duration') or mv_data.get('duration')),
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'like_count': int_or_none(mv_data.get('likes')),
+            'comment_count': int_or_none(mv_data.get('commcount')),
+            'is_live': is_live,
+        }
+
+
+class VKUserVideosIE(VKBaseIE):
+    IE_NAME = 'vk:uservideos'
+    IE_DESC = "VK - User's Videos"
+    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
+    _TEMPLATE_URL = 'https://vk.com/videos'
+    _TESTS = [{
+        'url': 'https://vk.com/videos-767561',
+        'info_dict': {
+            'id': '-767561_all',
+        },
+        'playlist_mincount': 1150,
+    }, {
+        'url': 'https://vk.com/videos-767561?section=uploaded',
+        'info_dict': {
+            'id': '-767561_uploaded',
+        },
+        'playlist_mincount': 425,
+    }, {
+        'url': 'http://vk.com/videos205387401',
+        'only_matching': True,
+    }, {
+        'url': 'http://vk.com/videos-77521',
+        'only_matching': True,
+    }, {
+        'url': 'http://vk.com/videos-97664626?section=all',
+        'only_matching': True,
+    }, {
+        'url': 'http://m.vk.com/videos205387401',
+        'only_matching': True,
+    }, {
+        'url': 'http://new.vk.com/videos205387401',
+        'only_matching': True,
+    }]
+    _PAGE_SIZE = 1000
+    _VIDEO = collections.namedtuple('Video', ['owner_id', 'id'])
+
+    def _fetch_page(self, page_id, section, page):
+        video_list = self._download_payload('al_video', page_id, {
+            'act': 'load_videos_silent',
+            'offset': page * self._PAGE_SIZE,
+            'oid': page_id,
+            'section': section,
+        })[0][section]['list']
+
+        for video in video_list:
+            v = self._VIDEO._make(video[:2])
+            video_id = '%d_%d' % (v.owner_id, v.id)
+            yield self.url_result(
+                'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
+
+    def _real_extract(self, url):
+        page_id, section = re.match(self._VALID_URL, url).groups()
+        if not section:
+            section = 'all'
+
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, page_id, section),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(entries, '%s_%s' % (page_id, section))
+
+
+class VKWallPostIE(VKBaseIE):
+    IE_NAME = 'vk:wallpost'
+    _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))'
+    _TESTS = [{
+        # public page URL, audio playlist
+        'url': 'https://vk.com/bs.official?w=wall-23538238_35',
+        'info_dict': {
+            'id': '-23538238_35',
+            'title': 'Black Shadow - Wall post -23538238_35',
+            'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
+        },
+        'playlist': [{
+            'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
+            'info_dict': {
+                'id': '135220665_111806521',
+                'ext': 'mp4',
+                'title': 'Black Shadow - Слепое Верование',
+                'duration': 370,
+                'uploader': 'Black Shadow',
+                'artist': 'Black Shadow',
+                'track': 'Слепое Верование',
+            },
+        }, {
+            'md5': '4cc7e804579122b17ea95af7834c9233',
+            'info_dict': {
+                'id': '135220665_111802303',
+                'ext': 'mp4',
+                'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
+                'duration': 423,
+                'uploader': 'Black Shadow',
+                'artist': 'Black Shadow',
+                'track': 'Война - Негасимое Бездны Пламя!',
+            },
+        }],
+        'params': {
+            'skip_download': True,
+            'usenetrc': True,
+        },
+        'skip': 'Requires vk account credentials',
+    }, {
+        # single YouTube embed, no leading -
+        'url': 'https://vk.com/wall85155021_6319',
+        'info_dict': {
+            'id': '85155021_6319',
+            'title': 'Сергей Горбунов - Wall post 85155021_6319',
+        },
+        'playlist_count': 1,
+        'params': {
+            'usenetrc': True,
+        },
+        'skip': 'Requires vk account credentials',
+    }, {
+        # wall page URL
+        'url': 'https://vk.com/wall-23538238_35',
+        'only_matching': True,
+    }, {
+        # mobile wall page URL
+        'url': 'https://m.vk.com/wall-23538238_35',
+        'only_matching': True,
+    }]
+    _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='
+    _AUDIO = collections.namedtuple('Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads'])
+
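+    # vk obfuscates audio URLs with a scrambled base64 alphabet
+    # (note the swapped '0' and 'O' above)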
+    def _decode(self, enc):
+        dec = ''
+        e = n = 0
+        for c in enc:
+            r = self._BASE64_CHARS.index(c)
+            cond = n % 4
+            e = 64 * e + r if cond else r
+            n += 1
+            if cond:
+                dec += chr(255 & e >> (-2 * n & 6))
+        return dec
+
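+    # masked URLs carry two encoded blobs after '?extra='; decoding the
+    # second yields a numeric seed that, XORed with the account's vk_id,
+    # drives the index permutation undoing the character shuffle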
+    def _unmask_url(self, mask_url, vk_id):
+        if 'audio_api_unavailable' in mask_url:
+            extra = mask_url.split('?extra=')[1].split('#')
+            func, base = self._decode(extra[1]).split(chr(11))
+            mask_url = list(self._decode(extra[0]))
+            url_len = len(mask_url)
+            indexes = [None] * url_len
+            index = int(base) ^ vk_id
+            for n in range(url_len - 1, -1, -1):
+                index = (url_len * (n + 1) ^ index + n) % url_len
+                indexes[n] = index
+            for n in range(1, url_len):
+                c = mask_url[n]
+                index = indexes[url_len - 1 - n]
+                mask_url[n] = mask_url[index]
+                mask_url[index] = c
+            mask_url = ''.join(mask_url)
+        return mask_url
+
+    def _real_extract(self, url):
+        post_id = self._match_id(url)
+
+        webpage = self._download_payload('wkview', post_id, {
+            'act': 'show',
+            'w': 'wall' + post_id,
+        })[1]
+
+        description = clean_html(get_element_by_class('wall_post_text', webpage))
+        uploader = clean_html(get_element_by_class('author', webpage))
+
+        entries = []
+
+        for audio in re.findall(r'data-audio="([^"]+)', webpage):
+            audio = self._parse_json(unescapeHTML(audio), post_id)
+            a = self._AUDIO._make(audio[:16])
+            if not a.url:
+                continue
+            title = unescapeHTML(a.title)
+            performer = unescapeHTML(a.performer)
+            entries.append({
+                'id': '%s_%s' % (a.owner_id, a.id),
+                'url': self._unmask_url(a.url, a.ads['vk_id']),
+                'title': '%s - %s' % (performer, title) if performer else title,
+                'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None,
+                'duration': int_or_none(a.duration),
+                'uploader': uploader,
+                'artist': performer,
+                'track': title,
+                'ext': 'mp4',
+                'protocol': 'm3u8',
+            })
+
+        for video in re.finditer(
+                r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):
+            entries.append(self.url_result(
+                compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key()))
+
+        title = 'Wall post %s' % post_id
+
+        return self.playlist_result(
+            orderedSet(entries), post_id,
+            '%s - %s' % (uploader, title) if uploader else title,
+            description)
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
new file mode 100644 (file)
index 0000000..f79531e
--- /dev/null
@@ -0,0 +1,367 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+import itertools
+
+from .common import InfoExtractor
+from .naver import NaverBaseIE
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    merge_dicts,
+    remove_start,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class VLiveIE(NaverBaseIE):
+    IE_NAME = 'vlive'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
+    _NETRC_MACHINE = 'vlive'
+    _TESTS = [{
+        'url': 'http://www.vlive.tv/video/1326',
+        'md5': 'cc7314812855ce56de70a06a27314983',
+        'info_dict': {
+            'id': '1326',
+            'ext': 'mp4',
+            'title': "[V LIVE] Girl's Day's Broadcast",
+            'creator': "Girl's Day",
+            'view_count': int,
+            'uploader_id': 'muploader_a',
+        },
+    }, {
+        'url': 'http://www.vlive.tv/video/16937',
+        'info_dict': {
+            'id': '16937',
+            'ext': 'mp4',
+            'title': '[V LIVE] 첸백시 걍방',
+            'creator': 'EXO',
+            'view_count': int,
+            'subtitles': 'mincount:12',
+            'uploader_id': 'muploader_j',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.vlive.tv/video/129100',
+        'md5': 'ca2569453b79d66e5b919e5d308bff6b',
+        'info_dict': {
+            'id': '129100',
+            'ext': 'mp4',
+            'title': '[V LIVE] [BTS+] Run BTS! 2019 - EP.71 :: Behind the scene',
+            'creator': 'BTS+',
+            'view_count': int,
+            'subtitles': 'mincount:10',
+        },
+        'skip': 'This video is only available for CH+ subscribers',
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        email, password = self._get_login_info()
+        if None in (email, password):
+            return
+
+        def is_logged_in():
+            login_info = self._download_json(
+                'https://www.vlive.tv/auth/loginInfo', None,
+                note='Downloading login info',
+                headers={'Referer': 'https://www.vlive.tv/home'})
+            return try_get(
+                login_info, lambda x: x['message']['login'], bool) or False
+
+        LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
+        self._request_webpage(
+            LOGIN_URL, None, note='Downloading login cookies')
+
+        self._download_webpage(
+            LOGIN_URL, None, note='Logging in',
+            data=urlencode_postdata({'email': email, 'pwd': password}),
+            headers={
+                'Referer': LOGIN_URL,
+                'Content-Type': 'application/x-www-form-urlencoded'
+            })
+
+        if not is_logged_in():
+            raise ExtractorError('Unable to log in', expected=True)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://www.vlive.tv/video/%s' % video_id, video_id)
+
+        VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
+        VIDEO_PARAMS_FIELD = 'video params'
+
+        params = self._parse_json(self._search_regex(
+            VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
+            transform_source=lambda s: '[' + s + ']', fatal=False)
+
+        if not params or len(params) < 7:
+            params = self._search_regex(
+                VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
+            params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]
+
+        status, long_video_id, key = params[2], params[5], params[6]
+        status = remove_start(status, 'PRODUCT_')
+
+        if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
+            return self._live(video_id, webpage)
+        elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
+            return self._replay(video_id, webpage, long_video_id, key)
+
+        if status == 'LIVE_END':
+            raise ExtractorError('Uploading for replay. Please wait...',
+                                 expected=True)
+        elif status == 'COMING_SOON':
+            raise ExtractorError('Coming soon!', expected=True)
+        elif status == 'CANCELED':
+            raise ExtractorError('We are sorry, '
+                                 'but the live broadcast has been canceled.',
+                                 expected=True)
+        elif status == 'ONLY_APP':
+            raise ExtractorError('Unsupported video type', expected=True)
+        else:
+            raise ExtractorError('Unknown status %s' % status)
+
+    def _get_common_fields(self, webpage):
+        title = self._og_search_title(webpage)
+        creator = self._html_search_regex(
+            r'<div[^>]+class="info_area"[^>]*>\s*(?:<em[^>]*>.*?</em\s*>\s*)?<a\s+[^>]*>([^<]+)',
+            webpage, 'creator', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+        return {
+            'title': title,
+            'creator': creator,
+            'thumbnail': thumbnail,
+        }
+
+    def _live(self, video_id, webpage):
+        init_page = self._download_init_page(video_id)
+
+        live_params = self._search_regex(
+            r'"liveStreamInfo"\s*:\s*(".*"),',
+            init_page, 'live stream info')
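+        # liveStreamInfo is a JSON string embedded inside JSON, hence the
+        # double parse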
+        live_params = self._parse_json(live_params, video_id)
+        live_params = self._parse_json(live_params, video_id)
+
+        formats = []
+        for vid in live_params.get('resolutions', []):
+            formats.extend(self._extract_m3u8_formats(
+                vid['cdnUrl'], video_id, 'mp4',
+                m3u8_id=vid.get('name'),
+                fatal=False, live=True))
+        self._sort_formats(formats)
+
+        info = self._get_common_fields(webpage)
+        info.update({
+            'title': self._live_title(info['title']),
+            'id': video_id,
+            'formats': formats,
+            'is_live': True,
+        })
+        return info
+
+    def _replay(self, video_id, webpage, long_video_id, key):
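+        # empty ids mean the watch page exposed no stream info; fall back to
+        # the init endpoint (which also reveals CH+ restrictions)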
+        if '' in (long_video_id, key):
+            init_page = self._download_init_page(video_id)
+            video_info = self._parse_json(self._search_regex(
+                (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script',
+                 r'(?s)oVideoStatus\s*=\s*({.+})'), init_page, 'video info'),
+                video_id)
+            if video_info.get('status') == 'NEED_CHANNEL_PLUS':
+                self.raise_login_required(
+                    'This video is only available for CH+ subscribers')
+            long_video_id, key = video_info['vid'], video_info['inkey']
+
+        return merge_dicts(
+            self._get_common_fields(webpage),
+            self._extract_video_info(video_id, long_video_id, key))
+
+    def _download_init_page(self, video_id):
+        return self._download_webpage(
+            'https://www.vlive.tv/video/init/view',
+            video_id, note='Downloading live webpage',
+            data=urlencode_postdata({'videoSeq': video_id}),
+            headers={
+                'Referer': 'https://www.vlive.tv/video/%s' % video_id,
+                'Content-Type': 'application/x-www-form-urlencoded'
+            })
+
+
+class VLiveChannelIE(InfoExtractor):
+    IE_NAME = 'vlive:channel'
+    _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
+    _TEST = {
+        'url': 'http://channels.vlive.tv/FCD4B',
+        'info_dict': {
+            'id': 'FCD4B',
+            'title': 'MAMAMOO',
+        },
+        'playlist_mincount': 110
+    }
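+    # fallback app id, used when it cannot be scraped from the channel's app.js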
+    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+
+    def _real_extract(self, url):
+        channel_code = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
+
+        app_id = None
+
+        app_js_url = self._search_regex(
+            r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
+            webpage, 'app js', default=None, group='url')
+
+        if app_js_url:
+            app_js = self._download_webpage(
+                app_js_url, channel_code, 'Downloading app JS', fatal=False)
+            if app_js:
+                app_id = self._search_regex(
+                    r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
+                    app_js, 'app id', default=None)
+
+        app_id = app_id or self._APP_ID
+
+        channel_info = self._download_json(
+            'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
+            channel_code, note='Downloading decode channel code',
+            query={
+                'app_id': app_id,
+                'channelCode': channel_code,
+                '_': int(time.time())
+            })
+
+        channel_seq = channel_info['result']['channelSeq']
+        channel_name = None
+        entries = []
+
+        for page_num in itertools.count(1):
+            video_list = self._download_json(
+                'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
+                channel_code, note='Downloading channel list page #%d' % page_num,
+                query={
+                    'app_id': app_id,
+                    'channelSeq': channel_seq,
+                    # Large values of maxNumOfRows (~300 or above) may cause
+                    # empty responses (see [1]), e.g. this happens for [2] that
+                    # has more than 300 videos.
+                    # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
+                    # 2. http://channels.vlive.tv/EDBF.
+                    'maxNumOfRows': 100,
+                    '_': int(time.time()),
+                    'pageNo': page_num
+                }
+            )
+
+            if not channel_name:
+                channel_name = try_get(
+                    video_list,
+                    lambda x: x['result']['channelInfo']['channelName'],
+                    compat_str)
+
+            videos = try_get(
+                video_list, lambda x: x['result']['videoList'], list)
+            if not videos:
+                break
+
+            for video in videos:
+                video_id = video.get('videoSeq')
+                if not video_id:
+                    continue
+                video_id = compat_str(video_id)
+                entries.append(
+                    self.url_result(
+                        'http://www.vlive.tv/video/%s' % video_id,
+                        ie=VLiveIE.ie_key(), video_id=video_id))
+
+        return self.playlist_result(
+            entries, channel_code, channel_name)
+
+
+class VLivePlaylistIE(InfoExtractor):
+    IE_NAME = 'vlive:playlist'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
+    _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
+    _TESTS = [{
+        # regular working playlist
+        'url': 'https://www.vlive.tv/video/117956/playlist/117963',
+        'info_dict': {
+            'id': '117963',
+            'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들'
+        },
+        'playlist_mincount': 10
+    }, {
+        # playlist with no playlistVideoSeqs
+        'url': 'http://www.vlive.tv/video/22867/playlist/22912',
+        'info_dict': {
+            'id': '22867',
+            'ext': 'mp4',
+            'title': '[V LIVE] Valentine Day Message from MINA',
+            'creator': 'TWICE',
+            'view_count': int
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
+
+    def _build_video_result(self, video_id, message):
+        self.to_screen(message)
+        return self.url_result(
+            self._VIDEO_URL_TEMPLATE % video_id,
+            ie=VLiveIE.ie_key(), video_id=video_id)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, playlist_id = mobj.group('video_id', 'id')
+
+        if self._downloader.params.get('noplaylist'):
+            return self._build_video_result(
+                video_id,
+                'Downloading just video %s because of --no-playlist'
+                % video_id)
+
+        self.to_screen(
+            'Downloading playlist %s - add --no-playlist to just download video'
+            % playlist_id)
+
+        webpage = self._download_webpage(
+            'http://www.vlive.tv/video/%s/playlist/%s'
+            % (video_id, playlist_id), playlist_id)
+
+        raw_item_ids = self._search_regex(
+            r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
+            'playlist video seqs', default=None)
+
+        if not raw_item_ids:
+            return self._build_video_result(
+                video_id,
+                'Downloading just video %s because no playlist was found'
+                % video_id)
+
+        item_ids = self._parse_json(raw_item_ids, playlist_id)
+
+        entries = [
+            self.url_result(
+                self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
+                video_id=compat_str(item_id))
+            for item_id in item_ids]
+
+        playlist_name = self._html_search_regex(
+            r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
+            webpage, 'playlist title', fatal=False)
+
+        return self.playlist_result(entries, playlist_id, playlist_name)
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
new file mode 100644 (file)
index 0000000..02c9617
--- /dev/null
@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    NO_DEFAULT,
+    sanitized_Request,
+    urlencode_postdata,
+)
+
+
+class VodlockerIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vodlocker\.(?:com|city)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
+
+    _TESTS = [{
+        'url': 'http://vodlocker.com/e8wvyzz4sl42',
+        'md5': 'ce0c2d18fa0735f1bd91b69b0e54aacf',
+        'info_dict': {
+            'id': 'e8wvyzz4sl42',
+            'ext': 'mp4',
+            'title': 'Germany vs Brazil',
+            'thumbnail': r're:http://.*\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        if any(p in webpage for p in (
+                '>THIS FILE WAS DELETED<',
+                '>File Not Found<',
+                'The file you were looking for could not be found, sorry for any inconvenience.<',
+                '>The file was removed')):
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        fields = self._hidden_inputs(webpage)
+
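+        # The landing page can serve a confirmation form first; re-POSTing
+        # its hidden inputs (op == 'download1') returns the real video page.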
+        if fields['op'] == 'download1':
+        self._sleep(3, video_id)  # the site detects and blocks requests that come in too fast
+            post = urlencode_postdata(fields)
+            req = sanitized_Request(url, post)
+            req.add_header('Content-type', 'application/x-www-form-urlencoded')
+            webpage = self._download_webpage(
+                req, video_id, 'Downloading video page')
+
+        def extract_file_url(html, default=NO_DEFAULT):
+            return self._search_regex(
+                r'file:\s*"(http[^\"]+)",', html, 'file url', default=default)
+
+        video_url = extract_file_url(webpage, default=None)
+
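+        # Some pages only expose the file URL on the embed page; fall back
+        # to extracting it (and the thumbnail) from there.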
+        if not video_url:
+            embed_url = self._search_regex(
+                r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?vodlocker\.(?:com|city)/embed-.+?)\1',
+                webpage, 'embed url', group='url')
+            embed_webpage = self._download_webpage(
+                embed_url, video_id, 'Downloading embed webpage')
+            video_url = extract_file_url(embed_webpage)
+            thumbnail_webpage = embed_webpage
+        else:
+            thumbnail_webpage = webpage
+
+        title = self._search_regex(
+            r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title')
+        thumbnail = self._search_regex(
+            r'image:\s*"(http[^\"]+)",', thumbnail_webpage, 'thumbnail', fatal=False)
+
+        formats = [{
+            'format_id': 'sd',
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/vodpl.py b/youtube_dl/extractor/vodpl.py
new file mode 100644 (file)
index 0000000..9e91970
--- /dev/null
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .onet import OnetBaseIE
+
+
+class VODPlIE(OnetBaseIE):
+    _VALID_URL = r'https?://vod\.pl/(?:[^/]+/)+(?P<id>[0-9a-zA-Z]+)'
+
+    _TESTS = [{
+        'url': 'https://vod.pl/filmy/chlopaki-nie-placza/3ep3jns',
+        'md5': 'a7dc3b2f7faa2421aefb0ecaabf7ec74',
+        'info_dict': {
+            'id': '3ep3jns',
+            'ext': 'mp4',
+            'title': 'Chłopaki nie płaczą',
+            'description': 'md5:f5f03b84712e55f5ac9f0a3f94445224',
+            'timestamp': 1463415154,
+            'duration': 5765,
+            'upload_date': '20160516',
+        },
+    }, {
+        'url': 'https://vod.pl/seriale/belfer-na-planie-praca-kamery-online/2c10heh',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        info_dict = self._extract_from_id(self._search_mvp_id(webpage), webpage)
+        info_dict['id'] = video_id
+        return info_dict
diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py
new file mode 100644 (file)
index 0000000..74d2257
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unescapeHTML
+
+
+class VODPlatformIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar
+        'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw',
+        'md5': '1db2b7249ce383d6be96499006e951fc',
+        'info_dict': {
+            'id': 'RufMcytHDolTH1MuKHY9Fw',
+            'ext': 'mp4',
+            'title': 'LBCi News_ النصرة في ضيافة الـ "سي.أن.أن"',
+        }
+    }, {
+        'url': 'http://embed.kwikmotion.com/embed/RufMcytHDolTH1MuKHY9Fw',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = unescapeHTML(self._og_search_title(webpage))
+        hidden_inputs = self._hidden_inputs(webpage)
+
+        formats = self._extract_wowza_formats(
+            hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'],
+            video_id, skip_protocols=['f4m', 'smil'])
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': hidden_inputs.get('HiddenThumbnail') or self._og_search_thumbnail(webpage),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
new file mode 100644 (file)
index 0000000..a52e40a
--- /dev/null
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    determine_ext,
+    int_or_none,
+    urljoin,
+)
+
+
+class VoiceRepublicIE(InfoExtractor):
+    _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
+    _TESTS = [{
+        'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
+        'md5': 'b9174d651323f17783000876347116e3',
+        'info_dict': {
+            'id': '2296',
+            'display_id': 'watching-the-watchers-building-a-sousveillance-state',
+            'ext': 'm4a',
+            'title': 'Watching the Watchers: Building a Sousveillance State',
+            'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.',
+            'duration': 1556,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        if '>Queued for processing, please stand by...<' in webpage:
+            raise ExtractorError(
+                'Audio is still queued for processing', expected=True)
+
+        talk = self._parse_json(self._search_regex(
+            r'initialSnapshot\s*=\s*({.+?});',
+            webpage, 'talk'), display_id)['talk']
+        title = talk['title']
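+        # Each media_links entry maps a format id to a (possibly relative)
+        # URL; talks are audio-only, hence vcodec 'none'.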
+        formats = [{
+            'url': urljoin(url, talk_url),
+            'format_id': format_id,
+            'ext': determine_ext(talk_url) or format_id,
+            'vcodec': 'none',
+        } for format_id, talk_url in talk['media_links'].items()]
+        self._sort_formats(formats)
+
+        return {
+            'id': compat_str(talk.get('id') or display_id),
+            'display_id': display_id,
+            'title': title,
+            'description': talk.get('teaser'),
+            'thumbnail': talk.get('image_url'),
+            'duration': int_or_none(talk.get('archived_duration')),
+            'view_count': int_or_none(talk.get('play_count')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py
new file mode 100644 (file)
index 0000000..751b21e
--- /dev/null
@@ -0,0 +1,100 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class VootIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)'
+    _GEO_COUNTRIES = ['IN']
+    _TESTS = [{
+        'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353',
+        'info_dict': {
+            'id': '0_8ledb18o',
+            'ext': 'mp4',
+            'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340',
+            'description': 'md5:06291fbbbc4dcbe21235c40c262507c1',
+            'timestamp': 1472162937,
+            'upload_date': '20160825',
+            'duration': 1146,
+            'series': 'Ishq Ka Rang Safed',
+            'season_number': 1,
+            'episode': 'Is this the end of Kamini?',
+            'episode_number': 340,
+            'view_count': int,
+            'like_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.voot.com/movies/pandavas-5/424627',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        media_info = self._download_json(
+            'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id,
+            query={
+                'platform': 'Web',
+                'pId': 2,
+                'mediaId': video_id,
+            })
+
+        status_code = try_get(media_info, lambda x: x['status']['code'], int)
+        if status_code != 0:
+            raise ExtractorError(media_info['status']['message'], expected=True)
+
+        media = media_info['assets']
+
+        entry_id = media['EntryId']
+        title = media['MediaName']
+        formats = self._extract_m3u8_formats(
+            'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id,
+            video_id, 'mp4', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description, series, season_number, episode, episode_number = [None] * 5
+
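+        # 'Metas' holds a list of {Key, Value} pairs; map the known keys
+        # onto the matching info_dict fields.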
+        for meta in try_get(media, lambda x: x['Metas'], list) or []:
+            key, value = meta.get('Key'), meta.get('Value')
+            if not key or not value:
+                continue
+            if key == 'ContentSynopsis':
+                description = value
+            elif key == 'RefSeriesTitle':
+                series = value
+            elif key == 'RefSeriesSeason':
+                season_number = int_or_none(value)
+            elif key == 'EpisodeMainTitle':
+                episode = value
+            elif key == 'EpisodeNo':
+                episode_number = int_or_none(value)
+
+        return {
+            'extractor_key': 'Kaltura',
+            'id': entry_id,
+            'title': title,
+            'description': description,
+            'series': series,
+            'season_number': season_number,
+            'episode': episode,
+            'episode_number': episode_number,
+            'timestamp': unified_timestamp(media.get('CreationDate')),
+            'duration': int_or_none(media.get('Duration')),
+            'view_count': int_or_none(media.get('ViewCounter')),
+            'like_count': int_or_none(media.get('like_counter')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py
new file mode 100644 (file)
index 0000000..b318e15
--- /dev/null
@@ -0,0 +1,215 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .once import OnceIE
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class VoxMediaVolumeIE(OnceIE):
+    _VALID_URL = r'https?://volume\.vox-cdn\.com/embed/(?P<id>[0-9a-f]{9})'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        setup = self._parse_json(self._search_regex(
+            r'setup\s*=\s*({.+});', webpage, 'setup'), video_id)
+        video_data = setup.get('video') or {}
+        info = {
+            'id': video_id,
+            'title': video_data.get('title_short'),
+            'description': video_data.get('description_long') or video_data.get('description_short'),
+            'thumbnail': video_data.get('brightcove_thumbnail')
+        }
+        asset = setup.get('asset') or setup.get('params') or {}
+
+        formats = []
+        hls_url = asset.get('hls_url')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        mp4_url = asset.get('mp4_url')
+        if mp4_url:
+            tbr = self._search_regex(r'-(\d+)k\.', mp4_url, 'bitrate', default=None)
+            format_id = 'http'
+            if tbr:
+                format_id += '-' + tbr
+            formats.append({
+                'format_id': format_id,
+                'url': mp4_url,
+                'tbr': int_or_none(tbr),
+            })
+        if formats:
+            self._sort_formats(formats)
+            info['formats'] = formats
+            return info
+
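+        # No direct HLS/MP4 asset was found; fall back to the first available
+        # third-party provider (Ooyala, YouTube or Brightcove Once, in that order).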
+        for provider_video_type in ('ooyala', 'youtube', 'brightcove'):
+            provider_video_id = video_data.get('%s_id' % provider_video_type)
+            if not provider_video_id:
+                continue
+            if provider_video_type == 'brightcove':
+                info['formats'] = self._extract_once_formats(provider_video_id)
+                self._sort_formats(info['formats'])
+            else:
+                info.update({
+                    '_type': 'url_transparent',
+                    'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id),
+                    'ie_key': provider_video_type.capitalize(),
+                })
+            return info
+        raise ExtractorError('Unable to find provider video id')
+
+
+class VoxMediaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked|funnyordie)\.com|recode\.net)/(?:[^/]+/)*(?P<id>[^/?]+)'
+    _TESTS = [{
+        # Volume embed, Youtube
+        'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of',
+        'info_dict': {
+            'id': 'j4mLW6x17VM',
+            'ext': 'mp4',
+            'title': 'Material world: how Google discovered what software is made of',
+            'description': 'md5:dfc17e7715e3b542d66e33a109861382',
+            'upload_date': '20190710',
+            'uploader_id': 'TheVerge',
+            'uploader': 'The Verge',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        # Volume embed, Youtube
+        'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
+        'md5': '4c8f4a0937752b437c3ebc0ed24802b5',
+        'info_dict': {
+            'id': 'Gy8Md3Eky38',
+            'ext': 'mp4',
+            'title': 'The Nexus 6: hands-on with Google\'s phablet',
+            'description': 'md5:d9f0216e5fb932dd2033d6db37ac3f1d',
+            'uploader_id': 'TheVerge',
+            'upload_date': '20141021',
+            'uploader': 'The Verge',
+        },
+        'add_ie': ['Youtube'],
+        'skip': 'similar to the previous test',
+    }, {
+        # Volume embed, Youtube
+        'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
+        'info_dict': {
+            'id': 'YCjDnX-Xzhg',
+            'ext': 'mp4',
+            'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination",
+            'description': 'md5:fc1317922057de31cd74bce91eb1c66c',
+            'uploader_id': 'voxdotcom',
+            'upload_date': '20150915',
+            'uploader': 'Vox',
+        },
+        'add_ie': ['Youtube'],
+        'skip': 'similar to the previous test',
+    }, {
+        # youtube embed
+        'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance',
+        'md5': '83b3080489fb103941e549352d3e0977',
+        'info_dict': {
+            'id': 'FcNHTJU1ufM',
+            'ext': 'mp4',
+            'title': 'How "the robot" became the greatest novelty dance of all time',
+            'description': 'md5:b081c0d588b8b2085870cda55e6da176',
+            'upload_date': '20160324',
+            'uploader_id': 'voxdotcom',
+            'uploader': 'Vox',
+        },
+        'add_ie': ['Youtube'],
+        'skip': 'Page no longer contain videos',
+    }, {
+        # SBN.VideoLinkset.entryGroup multiple ooyala embeds
+        'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
+        'info_dict': {
+            'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
+            'title': '25 lies you will tell yourself on National Signing Day',
+            'description': 'It\'s the most self-delusional time of the year, and everyone\'s gonna tell the same lies together!',
+        },
+        'playlist': [{
+            'md5': '721fededf2ab74ae4176c8c8cbfe092e',
+            'info_dict': {
+                'id': 'p3cThlMjE61VDi_SD9JlIteSNPWVDBB9',
+                'ext': 'mp4',
+                'title': 'Buddy Hield vs Steph Curry (and the world)',
+                'description': 'Let’s dissect only the most important Final Four storylines.',
+            },
+        }, {
+            'md5': 'bf0c5cc115636af028be1bab79217ea9',
+            'info_dict': {
+                'id': 'BmbmVjMjE6esPHxdALGubTrouQ0jYLHj',
+                'ext': 'mp4',
+                'title': 'Chasing Cinderella 2016: Syracuse basketball',
+                'description': 'md5:e02d56b026d51aa32c010676765a690d',
+            },
+        }],
+        'skip': 'Page no longer contain videos',
+    }, {
+        # volume embed, Brightcove Once
+        'url': 'https://www.recode.net/2014/6/17/11628066/post-post-pc-ceo-the-full-code-conference-video-of-microsofts-satya',
+        'md5': '2dbc77b8b0bff1894c2fce16eded637d',
+        'info_dict': {
+            'id': '1231c973d',
+            'ext': 'mp4',
+            'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella',
+            'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.',
+        },
+        'add_ie': ['VoxMediaVolume'],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = compat_urllib_parse_unquote(self._download_webpage(url, display_id))
+
+        def create_entry(provider_video_id, provider_video_type, title=None, description=None):
+            video_url = {
+                'youtube': '%s',
+                'ooyala': 'ooyala:%s',
+                'volume': 'http://volume.vox-cdn.com/embed/%s',
+            }[provider_video_type] % provider_video_id
+            return {
+                '_type': 'url_transparent',
+                'url': video_url,
+                'title': title or self._og_search_title(webpage),
+                'description': description or self._og_search_description(webpage),
+            }
+
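+        # The page can embed video metadata in any of several JS structures;
+        # try each known pattern, then fall back to the data attributes below.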
+        entries = []
+        entries_data = self._search_regex([
+            r'Chorus\.VideoContext\.addVideo\((\[{.+}\])\);',
+            r'var\s+entry\s*=\s*({.+});',
+            r'SBN\.VideoLinkset\.entryGroup\(\s*(\[.+\])',
+        ], webpage, 'video data', default=None)
+        if entries_data:
+            entries_data = self._parse_json(entries_data, display_id)
+            if isinstance(entries_data, dict):
+                entries_data = [entries_data]
+            for video_data in entries_data:
+                provider_video_id = video_data.get('provider_video_id')
+                provider_video_type = video_data.get('provider_video_type')
+                if provider_video_id and provider_video_type:
+                    entries.append(create_entry(
+                        provider_video_id, provider_video_type,
+                        video_data.get('title'), video_data.get('description')))
+
+        provider_video_id = self._search_regex(
+            r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None)
+        if provider_video_id:
+            entries.append(create_entry(provider_video_id, 'ooyala'))
+
+        volume_uuid = self._search_regex(
+            r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid', default=None)
+        if volume_uuid:
+            entries.append(create_entry(volume_uuid, 'volume'))
+
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            return self.playlist_result(entries, display_id, self._og_search_title(webpage), self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/vrak.py b/youtube_dl/extractor/vrak.py
new file mode 100644 (file)
index 0000000..daa247c
--- /dev/null
@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveNewIE
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    smuggle_url,
+    unescapeHTML,
+)
+
+
+class VrakIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P<id>[\d.]+)'
+    _TEST = {
+        'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721',
+        'info_dict': {
+            'id': '5345661243001',
+            'ext': 'mp4',
+            'title': 'Obésité, film de hockey et Roseline Filion',
+            'timestamp': 1488492126,
+            'upload_date': '20170302',
+            'uploader_id': '2890187628001',
+            'creator': 'VRAK.TV',
+            'age_limit': 8,
+            'series': 'ALT (Actualité Légèrement Tordue)',
+            'episode': 'Obésité, film de hockey et Roseline Filion',
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h\d\b[^>]+\bclass=["\']videoTitle["\'][^>]*>([^<]+)',
+            webpage, 'title', default=None) or self._og_search_title(webpage)
+
+        content = self._parse_json(
+            self._search_regex(
+                r'data-player-options-content=(["\'])(?P<content>{.+?})\1',
+                webpage, 'content', default='{}', group='content'),
+            video_id, transform_source=unescapeHTML)
+
+        ref_id = content.get('refId') or self._search_regex(
+            r'refId&quot;:&quot;([^&]+)&quot;', webpage, 'ref id')
+
+        brightcove_id = self._search_regex(
+            r'''(?x)
+                java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s
+                [^>]*
+                java\.lang\.String\s+value\s*=\s*["'](\d+)
+            ''' % re.escape(ref_id), webpage, 'brightcove id')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': BrightcoveNewIE.ie_key(),
+            'url': smuggle_url(
+                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+                {'geo_countries': ['CA']}),
+            'id': brightcove_id,
+            'description': content.get('description'),
+            'creator': content.get('brand'),
+            'age_limit': parse_age_limit(content.get('rating')),
+            'series': content.get('showName') or content.get(
+                'episodeName'),  # intentional: the series name may be stored under episodeName
+            'season_number': int_or_none(content.get('seasonNumber')),
+            'episode': title,
+            'episode_number': int_or_none(content.get('episodeNumber')),
+            'tags': content.get('tags', []),
+        }
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
new file mode 100644 (file)
index 0000000..4220252
--- /dev/null
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    float_or_none,
+    get_element_by_class,
+    strip_or_none,
+    unified_timestamp,
+)
+
+
+class VRTIE(InfoExtractor):
+    IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza'
+    _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)'
+    _TESTS = [{
+        'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/',
+        'md5': 'e1663accf5cf13f375f3cd0d10476669',
+        'info_dict': {
+            'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd',
+            'ext': 'mp4',
+            'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand',
+            'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.',
+            'timestamp': 1557924660,
+            'upload_date': '20190515',
+            'duration': 31.2,
+        },
+    }, {
+        'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/',
+        'md5': '910bba927566e9ab992278f647eb4b75',
+        'info_dict': {
+            'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818',
+            'ext': 'mp4',
+            'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters',
+            'timestamp': 1557923760,
+            'upload_date': '20190515',
+            'duration': 115.17,
+        },
+    }, {
+        'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/',
+        'only_matching': True,
+    }]
+    _CLIENT_MAP = {
+        'vrt.be/vrtnws': 'vrtnieuws',
+        'sporza.be': 'sporza',
+    }
+
+    def _real_extract(self, url):
+        site, display_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, display_id)
+        attrs = extract_attributes(self._search_regex(
+            r'(<[^>]+class="vrtvideo"[^>]*>)', webpage, 'vrt video'))
+
+        asset_id = attrs['data-videoid']
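+        # Mediazone asset ids take the form '<publication id>$<video id>'
+        # when a publication id is present.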
+        publication_id = attrs.get('data-publicationid')
+        if publication_id:
+            asset_id = publication_id + '$' + asset_id
+        client = attrs.get('data-client') or self._CLIENT_MAP[site]
+
+        title = strip_or_none(get_element_by_class(
+            'vrt-title', webpage) or self._html_search_meta(
+            ['og:title', 'twitter:title', 'name'], webpage))
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'], webpage)
+        if description == '…':
+            description = None
+        timestamp = unified_timestamp(self._html_search_meta(
+            'article:published_time', webpage))
+
+        return {
+            '_type': 'url_transparent',
+            'id': asset_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': attrs.get('data-posterimage'),
+            'timestamp': timestamp,
+            'duration': float_or_none(attrs.get('data-duration'), 1000),
+            'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id),
+            'ie_key': 'Canvas',
+        }
diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py
new file mode 100644 (file)
index 0000000..6e51469
--- /dev/null
@@ -0,0 +1,277 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import json
+import hashlib
+import hmac
+import random
+import string
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse_urlencode,
+    compat_urllib_parse,
+)
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+)
+
+
+class VRVBaseIE(InfoExtractor):
+    _API_DOMAIN = None
+    _API_PARAMS = {}
+    _CMS_SIGNING = {}
+    _TOKEN = None
+    _TOKEN_SECRET = ''
+
+    def _call_api(self, path, video_id, note, data=None):
+        # https://tools.ietf.org/html/rfc5849#section-3
+        base_url = self._API_DOMAIN + '/core/' + path
+        query = [
+            ('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
+            ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])),
+            ('oauth_signature_method', 'HMAC-SHA1'),
+            ('oauth_timestamp', int(time.time())),
+        ]
+        if self._TOKEN:
+            query.append(('oauth_token', self._TOKEN))
+        encoded_query = compat_urllib_parse_urlencode(query)
+        headers = self.geo_verification_headers()
+        if data:
+            data = json.dumps(data).encode()
+            headers['Content-Type'] = 'application/json'
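+            # Build the OAuth 1.0 signature base string (method & encoded URL
+            # & encoded query) and sign it with HMAC-SHA1 (RFC 5849, 3.4).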
+        base_string = '&'.join([
+            'POST' if data else 'GET',
+            compat_urllib_parse.quote(base_url, ''),
+            compat_urllib_parse.quote(encoded_query, '')])
+        oauth_signature = base64.b64encode(hmac.new(
+            (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'),
+            base_string.encode(), hashlib.sha1).digest()).decode()
+        encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '')
+        try:
+            return self._download_json(
+                '?'.join([base_url, encoded_query]), video_id,
+                note='Downloading %s JSON metadata' % note, headers=headers, data=data)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True)
+            raise
+
+    def _call_cms(self, path, video_id, note):
+        if not self._CMS_SIGNING:
+            index = self._call_api('index', video_id, 'CMS Signing')
+            self._CMS_SIGNING = index.get('cms_signing') or {}
+            if not self._CMS_SIGNING:
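+                # Some index responses expose the CMS signing keys as a list
+                # of per-path 'signing_policies' instead of a 'cms_signing' dict.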
+                for signing_policy in index.get('signing_policies', []):
+                    signing_path = signing_policy.get('path')
+                    if signing_path and signing_path.startswith('/cms/'):
+                        name, value = signing_policy.get('name'), signing_policy.get('value')
+                        if name and value:
+                            self._CMS_SIGNING[name] = value
+        return self._download_json(
+            self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
+            note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())
+
+    def _get_cms_resource(self, resource_key, video_id):
+        return self._call_api(
+            'cms_resource', video_id, 'resource path', data={
+                'resource_key': resource_key,
+            })['__links__']['cms_resource']['href']
+
+    def _real_initialize(self):
+        webpage = self._download_webpage(
+            'https://vrv.co/', None, headers=self.geo_verification_headers())
+        self._API_PARAMS = self._parse_json(self._search_regex(
+            [
+                r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:</script>|;)',
+                r'window\.__APP_CONFIG__\s*=\s*({.+})'
+            ], webpage, 'app config'), None)['cxApiParams']
+        self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
+
+
+class VRVIE(VRVBaseIE):
+    IE_NAME = 'vrv'
+    _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
+    _TESTS = [{
+        'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
+        'info_dict': {
+            'id': 'GR9PNZ396',
+            'ext': 'mp4',
+            'title': 'BOSTON: WHERE THE PAST IS THE PRESENT',
+            'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f',
+            'uploader_id': 'seeso',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # movie listing
+        'url': 'https://vrv.co/watch/G6NQXZ1J6/Lily-CAT',
+        'info_dict': {
+            'id': 'G6NQXZ1J6',
+            'title': 'Lily C.A.T',
+            'description': 'md5:988b031e7809a6aeb60968be4af7db07',
+        },
+        'playlist_count': 2,
+    }]
+    _NETRC_MACHINE = 'vrv'
+
+    def _real_initialize(self):
+        super(VRVIE, self)._real_initialize()
+
+        email, password = self._get_login_info()
+        if email is None:
+            return
+
+        token_credentials = self._call_api(
+            'authenticate/by:credentials', None, 'Token Credentials', data={
+                'email': email,
+                'password': password,
+            })
+        self._TOKEN = token_credentials['oauth_token']
+        self._TOKEN_SECRET = token_credentials['oauth_token_secret']
+
+    def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
+        if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'):
+            return []
+        stream_id_list = []
+        if audio_lang:
+            stream_id_list.append('audio-%s' % audio_lang)
+        if hardsub_lang:
+            stream_id_list.append('hardsub-%s' % hardsub_lang)
+        format_id = stream_format
+        if stream_id_list:
+            format_id += '-' + '-'.join(stream_id_list)
+        if 'hls' in stream_format:
+            adaptive_formats = self._extract_m3u8_formats(
+                url, video_id, 'mp4', m3u8_id=format_id,
+                note='Downloading %s information' % format_id,
+                fatal=False)
+        elif stream_format == 'dash':
+            adaptive_formats = self._extract_mpd_formats(
+                url, video_id, mpd_id=format_id,
+                note='Downloading %s information' % format_id,
+                fatal=False)
+        if audio_lang:
+            for f in adaptive_formats:
+                if f.get('acodec') != 'none':
+                    f['language'] = audio_lang
+        return adaptive_formats
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        object_data = self._call_cms(self._get_cms_resource(
+            'cms:/objects/' + video_id, video_id), video_id, 'object')['items'][0]
+        resource_path = object_data['__links__']['resource']['href']
+        video_data = self._call_cms(resource_path, video_id, 'video')
+        title = video_data['title']
+        description = video_data.get('description')
+
+        if video_data.get('__class__') == 'movie_listing':
+            items = self._call_cms(
+                video_data['__links__']['movie_listing/movies']['href'],
+                video_id, 'movie listing').get('items') or []
+            if len(items) != 1:
+                entries = []
+                for item in items:
+                    item_id = item.get('id')
+                    if not item_id:
+                        continue
+                    entries.append(self.url_result(
+                        'https://vrv.co/watch/' + item_id,
+                        self.ie_key(), item_id, item.get('title')))
+                return self.playlist_result(entries, video_id, title, description)
+            video_data = items[0]
+
+        streams_path = video_data['__links__'].get('streams', {}).get('href')
+        if not streams_path:
+            self.raise_login_required()
+        streams_json = self._call_cms(streams_path, video_id, 'streams')
+
+        audio_locale = streams_json.get('audio_locale')
+        formats = []
+        for stream_type, streams in streams_json.get('streams', {}).items():
+            if stream_type in ('adaptive_hls', 'adaptive_dash'):
+                for stream in streams.values():
+                    formats.extend(self._extract_vrv_formats(
+                        stream.get('url'), video_id, stream_type.split('_')[1],
+                        audio_locale, stream.get('hardsub_locale')))
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for k in ('captions', 'subtitles'):
+            for subtitle in streams_json.get(k, {}).values():
+                subtitle_url = subtitle.get('url')
+                if not subtitle_url:
+                    continue
+                subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
+                    'url': subtitle_url,
+                    'ext': subtitle.get('format', 'ass'),
+                })
+
+        thumbnails = []
+        for thumbnail in video_data.get('images', {}).get('thumbnails', []):
+            thumbnail_url = thumbnail.get('source')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int_or_none(thumbnail.get('width')),
+                'height': int_or_none(thumbnail.get('height')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnails': thumbnails,
+            'description': description,
+            'duration': float_or_none(video_data.get('duration_ms'), 1000),
+            'uploader_id': video_data.get('channel_id'),
+            'series': video_data.get('series_title'),
+            'season': video_data.get('season_title'),
+            'season_number': int_or_none(video_data.get('season_number')),
+            'season_id': video_data.get('season_id'),
+            'episode': title,
+            'episode_number': int_or_none(video_data.get('episode_number')),
+            'episode_id': video_data.get('production_episode_id'),
+        }
+
+
+class VRVSeriesIE(VRVBaseIE):
+    IE_NAME = 'vrv:series'
+    _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)'
+    _TEST = {
+        'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider',
+        'info_dict': {
+            'id': 'G68VXG3G6',
+        },
+        'playlist_mincount': 11,
+    }
+
+    def _real_extract(self, url):
+        series_id = self._match_id(url)
+
+        seasons_path = self._get_cms_resource(
+            'cms:/seasons?series_id=' + series_id, series_id)
+        seasons_data = self._call_cms(seasons_path, series_id, 'seasons')
+
+        entries = []
+        for season in seasons_data.get('items', []):
+            episodes_path = season['__links__']['season/episodes']['href']
+            episodes = self._call_cms(episodes_path, series_id, 'episodes')
+            for episode in episodes.get('items', []):
+                episode_id = episode['id']
+                entries.append(self.url_result(
+                    'https://vrv.co/watch/' + episode_id,
+                    'VRV', episode_id, episode.get('title')))
+
+        return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py
new file mode 100644 (file)
index 0000000..c631ac1
--- /dev/null
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_chr
+from ..utils import (
+    decode_packed_codes,
+    ExtractorError,
+)
+
+
+class VShareIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://vshare.io/d/0f64ce6',
+        'md5': '17b39f55b5497ae8b59f5fbce8e35886',
+        'info_dict': {
+            'id': '0f64ce6',
+            'title': 'vl14062007715967',
+            'ext': 'mp4',
+        }
+    }, {
+        'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)',
+            webpage)
+
+    def _extract_packed(self, webpage):
+        packed = self._search_regex(
+            r'(eval\(function.+)', webpage, 'packed code')
+        unpacked = decode_packed_codes(packed)
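+        # The unpacked script holds a list of character codes offset by a
+        # fixed key; subtracting the key yields the player's <source> markup.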
+        digits = self._search_regex(r'\[((?:\d+,?)+)\]', unpacked, 'digits')
+        digits = [int(digit) for digit in digits.split(',')]
+        key_digit = self._search_regex(
+            r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit')
+        chars = [compat_chr(d - int(key_digit)) for d in digits]
+        return ''.join(chars)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
+            video_id, headers={'Referer': url})
+
+        title = self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        title = title.split(' - ')[0]
+
+        error = self._html_search_regex(
+            r'(?s)<div[^>]+\bclass=["\']xxx-error[^>]+>(.+?)</div', webpage,
+            'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        info = self._parse_html5_media_entries(
+            url, '<video>%s</video>' % self._extract_packed(webpage),
+            video_id)[0]
+
+        self._sort_formats(info['formats'])
+
+        info.update({
+            'id': video_id,
+            'title': title,
+        })
+
+        return info
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
new file mode 100644 (file)
index 0000000..8ce3a6b
--- /dev/null
@@ -0,0 +1,172 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+)
+from ..utils import (
+    int_or_none,
+    ExtractorError,
+)
+
+
+class VubeIE(InfoExtractor):
+    IE_NAME = 'vube'
+    IE_DESC = 'Vube.com'
+    _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b'
+
+    _TESTS = [
+        {
+            'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s',
+            'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42',
+            'info_dict': {
+                'id': 'Y8NUZ69Tf7',
+                'ext': 'mp4',
+                'title': 'Best Drummer Ever [HD]',
+                'description': 'md5:2d63c4b277b85c2277761c2cf7337d71',
+                'thumbnail': r're:^https?://.*\.jpg',
+                'uploader': 'William',
+                'timestamp': 1406876915,
+                'upload_date': '20140801',
+                'duration': 258.051,
+                'like_count': int,
+                'dislike_count': int,
+                'comment_count': int,
+                'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'],
+            },
+            'skip': 'Not accessible from Travis CI server',
+        }, {
+            'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
+            'md5': 'db7aba89d4603dadd627e9d1973946fe',
+            'info_dict': {
+                'id': 'YL2qNPkqon',
+                'ext': 'mp4',
+                'title': 'Chiara Grispo - Price Tag by Jessie J',
+                'description': 'md5:8ea652a1f36818352428cb5134933313',
+                'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$',
+                'uploader': 'Chiara.Grispo',
+                'timestamp': 1388743358,
+                'upload_date': '20140103',
+                'duration': 170.56,
+                'like_count': int,
+                'dislike_count': int,
+                'comment_count': int,
+                'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'],
+            },
+            'skip': 'Removed due to DMCA',
+        },
+        {
+            'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1',
+            'md5': '5d4a52492d76f72712117ce6b0d98d08',
+            'info_dict': {
+                'id': 'UeBhTudbfS',
+                'ext': 'mp4',
+                'title': 'My 7 year old Sister and I singing "Alive" by Krewella',
+                'description': 'md5:40bcacb97796339f1690642c21d56f4a',
+                'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$',
+                'uploader': 'Seraina',
+                'timestamp': 1396492438,
+                'upload_date': '20140403',
+                'duration': 240.107,
+                'like_count': int,
+                'dislike_count': int,
+                'comment_count': int,
+                'categories': ['seraina', 'jessica', 'krewella', 'alive'],
+            },
+            'skip': 'Removed due to DMCA',
+        }, {
+            'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s',
+            'md5': '0584fc13b50f887127d9d1007589d27f',
+            'info_dict': {
+                'id': '0nmsMY5vEq',
+                'ext': 'mp4',
+                'title': 'Frozen - Let It Go Cover by Siren Gene',
+                'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.',
+                'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$',
+                'uploader': 'Siren',
+                'timestamp': 1395448018,
+                'upload_date': '20140322',
+                'duration': 221.788,
+                'like_count': int,
+                'dislike_count': int,
+                'comment_count': int,
+                'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'],
+            },
+            'skip': 'Removed due to DMCA',
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        video = self._download_json(
+            'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON')
+
+        public_id = video['public_id']
+
+        formats = []
+
+        for media in video['media'].get('video', []) + video['media'].get('audio', []):
+            if media['transcoding_status'] != 'processed':
+                continue
+            fmt = {
+                'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id),
+                'abr': int(media['audio_bitrate']),
+                'format_id': compat_str(media['media_resolution_id']),
+            }
+            vbr = int(media['video_bitrate'])
+            if vbr:
+                fmt.update({
+                    'vbr': vbr,
+                    'height': int(media['height']),
+                })
+            formats.append(fmt)
+
+        # Check for a DMCA takedown before _sort_formats, which would raise
+        # a generic 'No video formats found' error on an empty list.
+        if not formats and video.get('vst') == 'dmca':
+            raise ExtractorError(
+                'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.',
+                expected=True)
+
+        self._sort_formats(formats)
+
+        title = video['title']
+        description = video.get('description')
+        thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:')
+        uploader = video.get('user_alias') or video.get('channel')
+        timestamp = int_or_none(video.get('upload_time'))
+        duration = video['duration']
+        view_count = video.get('raw_view_count')
+        like_count = video.get('total_likes')
+        dislike_count = video.get('total_hates')
+
+        comments = video.get('comments')
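+        # The video JSON may omit the comment list; in that case fall back
+        # to the comment endpoint for the total.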
+        comment_count = None
+        if comments is None:
+            comment_data = self._download_json(
+                'http://vube.com/api/video/%s/comment' % video_id,
+                video_id, 'Downloading video comment JSON', fatal=False)
+            if comment_data is not None:
+                comment_count = int_or_none(comment_data.get('total'))
+        else:
+            comment_count = len(comments)
+
+        categories = [tag['text'] for tag in video['tags']]
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
+            'categories': categories,
+        }
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
new file mode 100644 (file)
index 0000000..55e087b
--- /dev/null
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+    remove_end,
+)
+
+
+class VuClipIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247&section=recommend',
+        'info_dict': {
+            'id': '1129900602',
+            'ext': '3gp',
+            'title': 'Top 10 TV Convicts',
+            'duration': 733,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
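+        # An interstitial ad page may be served first; follow the 'No'
+        # button's link to reach the ad-free video page.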
+        ad_m = re.search(
+            r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
+        if ad_m:
+            urlr = compat_urllib_parse_urlparse(url)
+            adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1)
+            webpage = self._download_webpage(
+                adfree_url, video_id, note='Download post-ad page')
+
+        error_msg = self._html_search_regex(
+            r'<p class="message">(.*?)</p>', webpage, 'error message',
+            default=None)
+        if error_msg:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+
+        # The site alternates between two page types
+        video_url = self._search_regex(
+            r'<a[^>]+href="([^"]+)"[^>]*><img[^>]+src="[^"]*/play\.gif',
+            webpage, 'video URL', default=None)
+        if video_url:
+            formats = [{
+                'url': video_url,
+            }]
+        else:
+            formats = self._parse_html5_media_entries(url, webpage, video_id)[0]['formats']
+
+        title = remove_end(self._html_search_regex(
+            r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip(), ' - Video')
+
+        duration = parse_duration(self._html_search_regex(
+            r'[(>]([0-9]+:[0-9]+)(?:<span|\))', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py
new file mode 100644 (file)
index 0000000..6906cd2
--- /dev/null
@@ -0,0 +1,158 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_or_none,
+)
+
+
+class VVVVIDIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)'
+    _TESTS = [{
+        # video_type == 'video/vvvvid'
+        'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong',
+        'md5': 'b8d3cecc2e981adc3835adf07f6df91b',
+        'info_dict': {
+            'id': '489048',
+            'ext': 'mp4',
+            'title': 'Ping Pong',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # video_type == 'video/rcs'
+        'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01',
+        'md5': '33e0edfba720ad73a8782157fdebc648',
+        'info_dict': {
+            'id': '482493',
+            'ext': 'mp4',
+            'title': 'Episodio 01',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+    _conn_id = None
+
+    def _real_initialize(self):
+        self._conn_id = self._download_json(
+            'https://www.vvvvid.it/user/login',
+            None, headers=self.geo_verification_headers())['data']['conn_id']
+
+    def _real_extract(self, url):
+        show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
+        response = self._download_json(
+            'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id),
+            video_id, headers=self.geo_verification_headers(), query={
+                'conn_id': self._conn_id,
+            })
+        if response['result'] == 'error':
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, response['message']), expected=True)
+
+        vid = int(video_id)
+        video_data = list(filter(
+            lambda episode: episode.get('video_id') == vid, response['data']))[0]
+        formats = []
+
+        # The vvvvid embed_info decryption algorithm is reverse-engineered from the function $ds(h) in vvvvid.js
+        def ds(h):
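+            # g is the obfuscation's shuffled base64-style alphabet; each
+            # input character is first mapped back to its alphabet index.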
+            g = "MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij"
+
+            def f(m):
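+                # Regroup 6-bit alphabet indices into 8-bit bytes,
+                # base64-style: four input values yield up to three bytes.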
+                l = []
+                o = 0
+                b = False
+                m_len = len(m)
+                while ((not b) and o < m_len):
+                    n = m[o] << 2
+                    o += 1
+                    k = -1
+                    j = -1
+                    if o < m_len:
+                        n += m[o] >> 4
+                        o += 1
+                        if o < m_len:
+                            k = (m[o - 1] << 4) & 255
+                            k += m[o] >> 2
+                            o += 1
+                            if o < m_len:
+                                j = (m[o - 1] << 6) & 255
+                                j += m[o]
+                                o += 1
+                            else:
+                                b = True
+                        else:
+                            b = True
+                    else:
+                        b = True
+                    l.append(n)
+                    if k != -1:
+                        l.append(k)
+                    if j != -1:
+                        l.append(j)
+                return l
+
+            c = []
+            for e in h:
+                c.append(g.index(e))
+
+            c_len = len(c)
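+            # Walk the buffer backwards twice, XORing each element with its
+            # right neighbour (mod length) to undo the scrambling.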
+            for e in range(c_len * 2 - 1, -1, -1):
+                a = c[e % c_len] ^ c[(e + 1) % c_len]
+                c[e % c_len] = a
+
+            c = f(c)
+            d = ''
+            for e in c:
+                d += chr(e)
+
+            return d
+
+        for quality in ('_sd', ''):
+            embed_code = video_data.get('embed_info' + quality)
+            if not embed_code:
+                continue
+            embed_code = ds(embed_code)
+            video_type = video_data.get('video_type')
+            if video_type in ('video/rcs', 'video/kenc'):
+                embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8')
+                if video_type == 'video/kenc':
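+                    # 'kenc' manifests additionally need a token (presumably
+                    # a decryption key) from the /kenc endpoint, appended to
+                    # the manifest URL as a query string.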
+                    kenc = self._download_json(
+                        'https://www.vvvvid.it/kenc', video_id, query={
+                            'action': 'kt',
+                            'conn_id': self._conn_id,
+                            'url': embed_code,
+                        }, fatal=False) or {}
+                    kenc_message = kenc.get('message')
+                    if kenc_message:
+                        embed_code += '?' + ds(kenc_message)
+                formats.extend(self._extract_m3u8_formats(
+                    embed_code, video_id, 'mp4',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.extend(self._extract_wowza_formats(
+                    'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_data['title'],
+            'formats': formats,
+            'thumbnail': video_data.get('thumbnail'),
+            'duration': int_or_none(video_data.get('length')),
+            'series': video_data.get('show_title'),
+            'season_id': season_id,
+            'season_number': video_data.get('season_number'),
+            'episode_id': str_or_none(video_data.get('id')),
+            'episode_number': int_or_none(video_data.get('number')),
+            'episode_title': video_data['title'],
+            'view_count': int_or_none(video_data.get('views')),
+            'like_count': int_or_none(video_data.get('video_likes')),
+        }
diff --git a/youtube_dl/extractor/vyborymos.py b/youtube_dl/extractor/vyborymos.py
new file mode 100644 (file)
index 0000000..9e703c4
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+
+
+class VyboryMosIE(InfoExtractor):
+    _VALID_URL = r'https?://vybory\.mos\.ru/(?:#precinct/|account/channels\?.*?\bstation_id=)(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://vybory.mos.ru/#precinct/13636',
+        'info_dict': {
+            'id': '13636',
+            'ext': 'mp4',
+            'title': 're:^Участковая избирательная комиссия №2231 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'Россия, Москва, улица Введенского, 32А',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://vybory.mos.ru/account/channels?station_id=13636',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        station_id = self._match_id(url)
+
+        channels = self._download_json(
+            'http://vybory.mos.ru/account/channels?station_id=%s' % station_id,
+            station_id, 'Downloading channels JSON')
+
+        formats = []
+        for cam_num, (sid, hosts, name, _) in enumerate(channels, 1):
+            for num, host in enumerate(hosts, 1):
+                formats.append({
+                    'url': 'http://%s/master.m3u8?sid=%s' % (host, sid),
+                    'ext': 'mp4',
+                    'format_id': 'camera%d-host%d' % (cam_num, num),
+                    'format_note': '%s, %s' % (name, host),
+                })
+
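+        # station metadata is sharded by the first three digits of the station id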
+        info = self._download_json(
+            'http://vybory.mos.ru/json/voting_stations/%s/%s.json'
+            % (compat_str(station_id)[:3], station_id),
+            station_id, 'Downloading station JSON', fatal=False)
+
+        return {
+            'id': station_id,
+            'title': self._live_title(info['name'] if info else station_id),
+            'description': info.get('address'),
+            'is_live': True,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py
new file mode 100644 (file)
index 0000000..b7d02fc
--- /dev/null
@@ -0,0 +1,112 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    unified_timestamp,
+    url_or_none,
+)
+
+
+class VzaarIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)'
+    _TESTS = [{
+        # HTTP and HLS
+        'url': 'https://vzaar.com/videos/1152805',
+        'md5': 'bde5ddfeb104a6c56a93a06b04901dbf',
+        'info_dict': {
+            'id': '1152805',
+            'ext': 'mp4',
+            'title': 'sample video (public)',
+        },
+    }, {
+        'url': 'https://view.vzaar.com/27272/player',
+        'md5': '3b50012ac9bbce7f445550d54e0508f2',
+        'info_dict': {
+            'id': '27272',
+            'ext': 'mp3',
+            'title': 'MP3',
+        },
+    }, {
+        # hlsAes = true
+        'url': 'https://view.vzaar.com/11379930/player',
+        'info_dict': {
+            'id': '11379930',
+            'ext': 'mp4',
+            'title': 'Videoaula',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # with null videoTitle
+        'url': 'https://view.vzaar.com/20313539/download',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'http://view.vzaar.com/v2/%s/video' % video_id, video_id)
+
+        title = video_data.get('videoTitle') or video_id
+
+        formats = []
+
+        source_url = url_or_none(video_data.get('sourceUrl'))
+        if source_url:
+            f = {
+                'url': source_url,
+                'format_id': 'http',
+                'preference': 1,
+            }
+            if 'audio' in source_url:
+                f.update({
+                    'vcodec': 'none',
+                    'ext': 'mp3',
+                })
+            else:
+                f.update({
+                    'width': int_or_none(video_data.get('width')),
+                    'height': int_or_none(video_data.get('height')),
+                    'ext': 'mp4',
+                    'fps': float_or_none(video_data.get('fps')),
+                })
+            formats.append(f)
+
+        video_guid = video_data.get('guid')
+        usp = video_data.get('usp')
+        if video_data.get('uspEnabled') and isinstance(video_guid, compat_str) and isinstance(usp, dict):
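+            # the manifest is requested from the 'fable' subdomain; for AES-encrypted
+            # streams the decryption key is apparently served by 'goose' with the same
+            # signed query string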
+            hls_aes = video_data.get('hlsAes')
+            qs = '&'.join('%s=%s' % (k, v) for k, v in usp.items())
+            url_templ = 'http://%%s.vzaar.com/v5/usp%s/%s/%s.ism%%s?' % ('aes' if hls_aes else '', video_guid, video_id)
+            m3u8_formats = self._extract_m3u8_formats(
+                url_templ % ('fable', '/.m3u8') + qs, video_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False)
+            if hls_aes:
+                for f in m3u8_formats:
+                    f['_decryption_key_url'] = url_templ % ('goose', '') + qs
+            formats.extend(m3u8_formats)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': self._proto_relative_url(video_data.get('poster')),
+            'duration': float_or_none(video_data.get('videoDuration')),
+            'timestamp': unified_timestamp(video_data.get('ts')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/wakanim.py b/youtube_dl/extractor/wakanim.py
new file mode 100644 (file)
index 0000000..f9a2395
--- /dev/null
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    merge_dicts,
+    urljoin,
+)
+
+
+class WakanimIE(InfoExtractor):
+    _VALID_URL = r'https://(?:www\.)?wakanim\.tv/[^/]+/v2/catalogue/episode/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/2997/the-asterisk-war-omu-staffel-1-episode-02-omu',
+        'info_dict': {
+            'id': '2997',
+            'ext': 'mp4',
+            'title': 'Episode 02',
+            'description': 'md5:2927701ea2f7e901de8bfa8d39b2852d',
+            'series': 'The Asterisk War  (OmU.)',
+            'season_number': 1,
+            'episode': 'Episode 02',
+            'episode_number': 2,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        # DRM Protected
+        'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        m3u8_url = urljoin(url, self._search_regex(
+            r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url',
+            group='url'))
+        # https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
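+        # the manifest URL embeds the Azure Media Services protection scheme as an
+        # 'encryption=' query parameter; cenc/cbcs-aapl indicate DRM-protected streams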
+        encryption = self._search_regex(
+            r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
+            m3u8_url, 'encryption', default=None)
+        if encryption in ('cenc', 'cbcs-aapl'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        title = self._search_regex(
+            (r'<h1[^>]+\bclass=["\']episode_h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<span[^>]+\bclass=["\']episode_title["\'][^>]*>(?P<title>[^<]+)'),
+            webpage, 'title', default=None, group='title')
+
+        return merge_dicts(info, {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        })
diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py
new file mode 100644 (file)
index 0000000..cbb5486
--- /dev/null
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    int_or_none,
+)
+
+
+class WallaIE(InfoExtractor):
+    _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
+    _TEST = {
+        'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
+        'info_dict': {
+            'id': '2642630',
+            'display_id': 'one-direction-all-for-one',
+            'ext': 'flv',
+            'title': 'וואן דיירקשן: ההיסטריה',
+            'description': 'md5:de9e2512a92442574cdb0913c49bc4d8',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 3600,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }
+
+    _SUBTITLE_LANGS = {
+        'עברית': 'heb',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        video = self._download_xml(
+            'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id,
+            display_id)
+
+        item = video.find('./items/item')
+
+        title = xpath_text(item, './title', 'title')
+        description = xpath_text(item, './synopsis', 'description')
+        thumbnail = xpath_text(item, './preview_pic', 'thumbnail')
+        duration = int_or_none(xpath_text(item, './duration', 'duration'))
+
+        subtitles = {}
+        for subtitle in item.findall('./subtitles/subtitle'):
+            lang = xpath_text(subtitle, './title')
+            subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
+                'ext': 'srt',
+                'url': xpath_text(subtitle, './src'),
+            }]
+
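+        # every quality is served from the same RTMP endpoint; only the play path differs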
+        formats = []
+        for quality in item.findall('./qualities/quality'):
+            format_id = xpath_text(quality, './title')
+            fmt = {
+                'url': 'rtmp://wafla.walla.co.il/vod',
+                'play_path': xpath_text(quality, './src'),
+                'player_url': 'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf',
+                'page_url': url,
+                'ext': 'flv',
+                'format_id': format_id,
+            }
+            m = re.search(r'^(?P<height>\d+)[Pp]', format_id)
+            if m:
+                fmt['height'] = int(m.group('height'))
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
new file mode 100644 (file)
index 0000000..625d0a1
--- /dev/null
@@ -0,0 +1,183 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    strip_jsonp,
+)
+
+
+class WashingtonPostIE(InfoExtractor):
+    IE_NAME = 'washingtonpost'
+    _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+    _TEST = {
+        'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+        'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
+        'info_dict': {
+            'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+            'ext': 'mp4',
+            'title': 'Egypt finds belongings, debris from plane crash',
+            'description': 'md5:a17ceee432f215a5371388c1f680bd86',
+            'upload_date': '20160520',
+            'uploader': 'Reuters',
+            'timestamp': 1463778452,
+        },
+    }
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id,
+            video_id, transform_source=strip_jsonp)[0]['contentConfig']
+        title = video_data['title']
+
+        urls = []
+        formats = []
+        for s in video_data.get('streams', []):
+            s_url = s.get('url')
+            if not s_url or s_url in urls:
+                continue
+            urls.append(s_url)
+            video_type = s.get('type')
+            if video_type == 'smil':
+                continue
+            elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url):
+                m3u8_formats = self._extract_m3u8_formats(
+                    s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+                for m3u8_format in m3u8_formats:
+                    width = m3u8_format.get('width')
+                    if not width:
+                        continue
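+                    # HLS variant URLs embed the bitrate as <width>_<height>_<vbr>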
+                    vbr = self._search_regex(
+                        r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None)
+                    if vbr:
+                        m3u8_format.update({
+                            'vbr': int_or_none(vbr),
+                        })
+                formats.extend(m3u8_formats)
+            else:
+                width = int_or_none(s.get('width'))
+                vbr = int_or_none(s.get('bitrate'))
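+                # a width of 0 apparently marks an audio-only stream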
+                has_width = width != 0
+                formats.append({
+                    'format_id': (
+                        '%s-%d-%d' % (video_type, width, vbr)
+                        if width
+                        else video_type),
+                    'vbr': vbr if has_width else None,
+                    'width': width,
+                    'height': int_or_none(s.get('height')),
+                    'acodec': s.get('audioCodec'),
+                    'vcodec': s.get('videoCodec') if has_width else 'none',
+                    'filesize': int_or_none(s.get('fileSize')),
+                    'url': s_url,
+                    'ext': 'mp4',
+                    'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None,
+                })
+        source_media_url = video_data.get('sourceMediaURL')
+        if source_media_url:
+            formats.append({
+                'format_id': 'source_media',
+                'url': source_media_url,
+            })
+        self._sort_formats(
+            formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('blurb'),
+            'uploader': video_data.get('credits', {}).get('source'),
+            'formats': formats,
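+            # videoDuration is in centiseconds, dateFirstPublished in milliseconds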
+            'duration': int_or_none(video_data.get('videoDuration'), 100),
+            'timestamp': int_or_none(
+                video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000),
+        }
+
+
+class WashingtonPostArticleIE(InfoExtractor):
+    IE_NAME = 'washingtonpost:article'
+    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+        'info_dict': {
+            'id': 'sinkhole-of-bureaucracy',
+            'title': 'Sinkhole of bureaucracy',
+        },
+        'playlist': [{
+            'md5': 'b9be794ceb56c7267d410a13f99d801a',
+            'info_dict': {
+                'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'Breaking Points: The Paper Mine',
+                'duration': 1290,
+                'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
+                'uploader': 'The Washington Post',
+                'timestamp': 1395527908,
+                'upload_date': '20140322',
+            },
+        }, {
+            'md5': '1fff6a689d8770966df78c8cb6c8c17c',
+            'info_dict': {
+                'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'The town bureaucracy sustains',
+                'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
+                'duration': 2220,
+                'timestamp': 1395528005,
+                'upload_date': '20140322',
+                'uploader': 'The Washington Post',
+            },
+        }],
+    }, {
+        'url': 'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/',
+        'info_dict': {
+            'id': 'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear',
+            'title': 'One airline figured out how to make sure its airplanes never disappear',
+        },
+        'playlist': [{
+            'md5': 'a7c1b5634ba5e57a6a82cdffa5b1e0d0',
+            'info_dict': {
+                'id': '0e4bb54c-9065-11e4-a66f-0ca5037a597d',
+                'ext': 'mp4',
+                'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.',
+                'upload_date': '20141230',
+                'uploader': 'The Washington Post',
+                'timestamp': 1419974765,
+                'title': 'Why black boxes don’t transmit data in real time',
+            }
+        }]
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        title = self._og_search_title(webpage)
+
+        uuids = re.findall(r'''(?x)
+            (?:
+                <div\s+class="posttv-video-embed[^>]*?data-uuid=|
+                data-video-uuid=
+            )"([^"]+)"''', webpage)
+        entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': page_id,
+            'title': title,
+        }
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
new file mode 100644 (file)
index 0000000..8ef3e09
--- /dev/null
@@ -0,0 +1,157 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+    HEADRequest,
+    int_or_none,
+)
+
+
+class WatIE(InfoExtractor):
+    _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
+    IE_NAME = 'wat.tv'
+    _TESTS = [
+        {
+            'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
+            'info_dict': {
+                'id': '11713067',
+                'ext': 'mp4',
+                'title': 'Soupe de figues à l\'orange et aux épices',
+                'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
+                'upload_date': '20140819',
+                'duration': 120,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 404'],
+        },
+        {
+            'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
+            'md5': 'b16574df2c3cd1a36ca0098f2a791925',
+            'info_dict': {
+                'id': '11713075',
+                'ext': 'mp4',
+                'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
+                'upload_date': '20140816',
+            },
+            'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
+        },
+    ]
+
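+    # (vbr, width, height) ladder used by the progressive MP4 fallback below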
+    _FORMATS = (
+        (200, 416, 234),
+        (400, 480, 270),
+        (600, 640, 360),
+        (1200, 640, 360),
+        (1800, 960, 540),
+        (2500, 1280, 720),
+    )
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
+
+        # The website uses 'contentv4', but that endpoint also returns related
+        # videos, which we don't need
+        video_data = self._download_json(
+            'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
+        video_info = video_data['media']
+
+        error_desc = video_info.get('error_desc')
+        if error_desc:
+            self.report_warning(
+                '%s returned error: %s' % (self.IE_NAME, error_desc))
+
+        chapters = video_info['chapters']
+        if chapters:
+            first_chapter = chapters[0]
+
+            def video_id_for_chapter(chapter):
+                return chapter['tc_start'].split('-')[0]
+
+            if video_id_for_chapter(first_chapter) != video_id:
+                self.to_screen('Multipart video detected')
+                entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
+                return self.playlist_result(entries, video_id, video_info['title'])
+            # Otherwise we can continue and extract just one part; we have to use
+            # the video id to get the video url
+        else:
+            first_chapter = video_info
+
+        title = first_chapter['title']
+
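+        # resolve a www.wat.tv/get/ path via a HEAD request and return the redirect target, if any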
+        def extract_url(path_template, url_type):
+            req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
+            head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False)
+            if head:
+                red_url = head.geturl()
+                if req_url != red_url:
+                    return red_url
+            return None
+
+        def remove_bitrate_limit(manifest_url):
+            return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url)
+
+        formats = []
+        try:
+            def alt_urls(manifest_url):
+                return [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url)
+                        for repl in ('', 'ssm')]
+            manifest_urls = self._download_json(
+                'http://www.wat.tv/get/webhtml/' + video_id, video_id)
+            m3u8_url = manifest_urls.get('hls')
+            if m3u8_url:
+                m3u8_url = remove_bitrate_limit(m3u8_url)
+                for m3u8_alt_url in alt_urls(m3u8_url):
+                    formats.extend(self._extract_m3u8_formats(
+                        m3u8_alt_url, video_id, 'mp4',
+                        'm3u8_native', m3u8_id='hls', fatal=False))
+                    formats.extend(self._extract_f4m_formats(
+                        m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'),
+                        video_id, f4m_id='hds', fatal=False))
+            mpd_url = manifest_urls.get('mpd')
+            if mpd_url:
+                mpd_url = remove_bitrate_limit(mpd_url)
+                for mpd_alt_url in alt_urls(mpd_url):
+                    formats.extend(self._extract_mpd_formats(
+                        mpd_alt_url, video_id, mpd_id='dash', fatal=False))
+            self._sort_formats(formats)
+        except ExtractorError:
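+            # the manifest endpoint failed; fall back to guessing progressive MP4
+            # URLs from the known bitrate ladder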
+            abr = 64
+            for vbr, width, height in self._FORMATS:
+                tbr = vbr + abr
+                format_id = 'http-%s' % tbr
+                fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
+                if self._is_valid_url(fmt_url, video_id, format_id):
+                    formats.append({
+                        'format_id': format_id,
+                        'url': fmt_url,
+                        'vbr': vbr,
+                        'abr': abr,
+                        'width': width,
+                        'height': height,
+                    })
+
+        date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4')
+        upload_date = unified_strdate(date_diffusion) if date_diffusion else None
+        duration = None
+        files = video_info['files']
+        if files:
+            duration = int_or_none(files[0].get('duration'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': first_chapter.get('preview'),
+            'description': first_chapter.get('description'),
+            'view_count': int_or_none(video_info.get('views')),
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py
new file mode 100644 (file)
index 0000000..5a4e46e
--- /dev/null
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    strip_or_none,
+    try_get,
+    unescapeHTML,
+    unified_timestamp,
+)
+
+
+class WatchBoxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)'
+    _TESTS = [{
+        # film
+        'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html',
+        'info_dict': {
+            'id': '341368',
+            'ext': 'mp4',
+            'title': 'Free Jimmy',
+            'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 4890,
+            'age_limit': 16,
+            'release_year': 2009,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        # episode
+        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html',
+        'info_dict': {
+            'id': '328286',
+            'ext': 'mp4',
+            'title': 'S01 E01 - Date in der Hölle',
+            'description': 'md5:2f31c74a8186899f33cb5114491dae2b',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1291,
+            'age_limit': 12,
+            'release_year': 2010,
+            'series': 'Ugly Americans',
+            'season_number': 1,
+            'episode': 'Date in der Hölle',
+            'episode_number': 1,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        kind, video_id = mobj.group('kind', 'id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        player_config = self._parse_json(
+            self._search_regex(
+                r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage,
+                'player config', default='{}', group='data'),
+            video_id, transform_source=unescapeHTML, fatal=False)
+
+        if not player_config:
+            player_config = self._parse_json(
+                self._search_regex(
+                    r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config',
+                    default='{}'),
+                video_id, transform_source=js_to_json, fatal=False) or {}
+
+        source = player_config.get('source') or {}
+
+        video_id = compat_str(source.get('videoId') or video_id)
+
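+        # the device API, queried with the public 'hbbtv' key, returns the direct stream URLs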
+        devapi = self._download_json(
+            'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={
+                'format': 'json',
+                'apikey': 'hbbtv',
+            }, fatal=False)
+
+        item = try_get(devapi, lambda x: x['items'][0], dict) or {}
+
+        title = item.get('title') or try_get(
+            item, lambda x: x['movie']['headline_movie'],
+            compat_str) or source['title']
+
+        formats = []
+        hls_url = item.get('media_videourl_hls') or source.get('hls')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        dash_url = item.get('media_videourl_wv') or source.get('dash')
+        if dash_url:
+            formats.extend(self._extract_mpd_formats(
+                dash_url, video_id, mpd_id='dash', fatal=False))
+        mp4_url = item.get('media_videourl')
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'mp4',
+                'width': int_or_none(item.get('width')),
+                'height': int_or_none(item.get('height')),
+                'tbr': int_or_none(item.get('bitrate')),
+            })
+        self._sort_formats(formats)
+
+        description = strip_or_none(item.get('descr'))
+        thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail')
+        duration = int_or_none(item.get('media_length') or source.get('length'))
+        timestamp = unified_timestamp(item.get('pubDate'))
+        view_count = int_or_none(item.get('media_views'))
+        age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk']))
+        release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year']))
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'age_limit': age_limit,
+            'release_year': release_year,
+            'formats': formats,
+        }
+
+        if kind.lower() == 'serien':
+            series = try_get(
+                item, lambda x: x['special']['title'],
+                compat_str) or source.get('format')
+            season_number = int_or_none(self._search_regex(
+                r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number',
+                default=None) or self._search_regex(
+                    r'/staffel-(\d+)/', url, 'season number', default=None))
+            episode = source.get('title')
+            episode_number = int_or_none(self._search_regex(
+                r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number',
+                default=None))
+            info.update({
+                'series': series,
+                'season_number': season_number,
+                'episode': episode,
+                'episode_number': episode_number,
+            })
+
+        return info
diff --git a/youtube_dl/extractor/watchindianporn.py b/youtube_dl/extractor/watchindianporn.py
new file mode 100644 (file)
index 0000000..fadc539
--- /dev/null
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class WatchIndianPornIE(InfoExtractor):
+    IE_DESC = 'Watch Indian Porn'
+    _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html'
+    _TEST = {
+        'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html',
+        'md5': '249589a164dde236ec65832bfce17440',
+        'info_dict': {
+            'id': 'RZa2avywNPa',
+            'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera',
+            'ext': 'mp4',
+            'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 226,
+            'view_count': int,
+            'categories': list,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
+
+        title = self._html_search_regex((
+            r'<title>(.+?)\s*-\s*Indian\s+Porn</title>',
+            r'<h4>(.+?)</h4>'
+        ), webpage, 'title')
+
+        duration = parse_duration(self._search_regex(
+            r'Time:\s*<strong>\s*(.+?)\s*</strong>',
+            webpage, 'duration', fatal=False))
+
+        # int_or_none avoids a TypeError when the non-fatal regex search returns None
+        view_count = int_or_none(self._search_regex(
+            r'(?s)Time:\s*<strong>.*?</strong>.*?<strong>\s*(\d+)\s*</strong>',
+            webpage, 'view count', fatal=False))
+
+        categories = re.findall(
+            r'<a[^>]+class=[\'"]categories[\'"][^>]*>\s*([^<]+)\s*</a>',
+            webpage)
+
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+            'http_headers': {
+                'Referer': url,
+            },
+            'title': title,
+            'duration': duration,
+            'view_count': view_count,
+            'categories': categories,
+            'age_limit': 18,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
new file mode 100644 (file)
index 0000000..cf6f7c7
--- /dev/null
@@ -0,0 +1,330 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    js_to_json,
+    strip_jsonp,
+    try_get,
+    unified_strdate,
+    update_url_query,
+    urlhandle_detect_ext,
+)
+
+
+class WDRIE(InfoExtractor):
+    _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js'
+    _GEO_COUNTRIES = ['DE']
+    _TEST = {
+        'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js',
+        'info_dict': {
+            'id': 'mdb-1557833',
+            'ext': 'mp4',
+            'title': 'Biathlon-Staffel verpasst Podest bei Olympia-Generalprobe',
+            'upload_date': '20180112',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        metadata = self._download_json(
+            url, video_id, transform_source=strip_jsonp)
+
+        is_live = metadata.get('mediaType') == 'live'
+
+        tracker_data = metadata['trackerData']
+        media_resource = metadata['mediaResource']
+
+        formats = []
+
+        # check if the metadata contains a direct URL to a file
+        for kind, media_resource in media_resource.items():
+            if kind not in ('dflt', 'alt'):
+                continue
+
+            for tag_name, medium_url in media_resource.items():
+                if tag_name not in ('videoURL', 'audioURL'):
+                    continue
+
+                ext = determine_ext(medium_url)
+                if ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        medium_url, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls'))
+                elif ext == 'f4m':
+                    manifest_url = update_url_query(
+                        medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, video_id, f4m_id='hds', fatal=False))
+                elif ext == 'smil':
+                    formats.extend(self._extract_smil_formats(
+                        medium_url, 'stream', fatal=False))
+                else:
+                    a_format = {
+                        'url': medium_url
+                    }
+                    if ext == 'unknown_video':
+                        urlh = self._request_webpage(
+                            medium_url, video_id, note='Determining extension')
+                        ext = urlhandle_detect_ext(urlh)
+                        a_format['ext'] = ext
+                    formats.append(a_format)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
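+        # note: media_resource was rebound by the loop above, so captionURL is
+        # looked up in the last 'dflt'/'alt' entry that was iterated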
+        caption_url = media_resource.get('captionURL')
+        if caption_url:
+            subtitles['de'] = [{
+                'url': caption_url,
+                'ext': 'ttml',
+            }]
+
+        title = tracker_data['trackerClipTitle']
+
+        return {
+            'id': tracker_data.get('trackerClipId', video_id),
+            'title': self._live_title(title) if is_live else title,
+            'alt_title': tracker_data.get('trackerClipSubcategory'),
+            'formats': formats,
+            'subtitles': subtitles,
+            'upload_date': unified_strdate(tracker_data.get('trackerClipAirTime')),
+            'is_live': is_live,
+        }
+
+
+class WDRPageIE(InfoExtractor):
+    _CURRENT_MAUS_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
+    _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
+    _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
+
+    _TESTS = [
+        {
+            'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
+            # HDS download, MD5 is unstable
+            'info_dict': {
+                'id': 'mdb-1058683',
+                'ext': 'flv',
+                'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
+                'title': 'Geheimnis Aachener Dom',
+                'alt_title': 'Doku am Freitag',
+                'upload_date': '20160304',
+                'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
+                'is_live': False,
+                'subtitles': {'de': [{
+                    'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml',
+                    'ext': 'ttml',
+                }]},
+            },
+            'skip': 'HTTP Error 404: Not Found',
+        },
+        {
+            'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
+            'md5': 'f4c1f96d01cf285240f53ea4309663d8',
+            'info_dict': {
+                'id': 'mdb-1072000',
+                'ext': 'mp3',
+                'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
+                'title': 'Schriftstellerin Juli Zeh',
+                'alt_title': 'WDR 3 Gespräch am Samstag',
+                'upload_date': '20160312',
+                'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
+                'is_live': False,
+                'subtitles': {}
+            },
+            'skip': 'HTTP Error 404: Not Found',
+        },
+        {
+            'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
+            'info_dict': {
+                'id': 'mdb-1406149',
+                'ext': 'mp4',
+                'title': r're:^WDR Fernsehen im Livestream \(nur in Deutschland erreichbar\) [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'alt_title': 'WDR Fernsehen Live',
+                'upload_date': '20150101',
+                'is_live': True,
+            },
+            'params': {
+                'skip_download': True,  # m3u8 download
+            },
+        },
+        {
+            'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
+            'playlist_mincount': 7,
+            'info_dict': {
+                'id': 'aktuelle-stunde-120',
+            },
+        },
+        {
+            'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
+            'info_dict': {
+                'id': 'mdb-1552552',
+                'ext': 'mp4',
+                'upload_date': 're:^[0-9]{8}$',
+                'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
+            },
+            'skip': 'The id changes from week to week because of the new episode'
+        },
+        {
+            'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5',
+            'md5': '803138901f6368ee497b4d195bb164f2',
+            'info_dict': {
+                'id': 'mdb-186083',
+                'ext': 'mp4',
+                'upload_date': '20130919',
+                'title': 'Sachgeschichte - Achterbahn ',
+            },
+        },
+        {
+            'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
+            # Live stream, MD5 unstable
+            'info_dict': {
+                'id': 'mdb-869971',
+                'ext': 'mp4',
+                'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+                'upload_date': '20160101',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 download
+            }
+        },
+        {
+            'url': 'http://www.sportschau.de/handballem2018/handball-nationalmannschaft-em-stolperstein-vorrunde-100.html',
+            'info_dict': {
+                'id': 'mdb-1556012',
+                'ext': 'mp4',
+                'title': 'DHB-Vizepräsident Bob Hanning - "Die Weltspitze ist extrem breit"',
+                'upload_date': '20180111',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
+        webpage = self._download_webpage(url, display_id)
+
+        entries = []
+
+        # Article with several videos
+
+        # for wdr.de, the data-extension attribute is on a tag with the class
+        # "mediaLink"; for wdr.de radio players, on a tag with the class
+        # "wdrrPlayerPlayBtn"; for wdrmaus, on a tag with the class "videoButton"
+        # (previously a link to the page in a multiline "videoLink" tag)
+        for mobj in re.finditer(
+            r'''(?sx)class=
+                    (?:
+                        (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+|
+                        (["\'])videoLink\b.*?\2[\s]*>\n[^\n]*
+                    )data-extension=(["\'])(?P<data>(?:(?!\3).)+)\3
+                    ''', webpage):
+            media_link_obj = self._parse_json(
+                mobj.group('data'), display_id, transform_source=js_to_json,
+                fatal=False)
+            if not media_link_obj:
+                continue
+            jsonp_url = try_get(
+                media_link_obj, lambda x: x['mediaObj']['url'], compat_str)
+            if jsonp_url:
+                entries.append(self.url_result(jsonp_url, ie=WDRIE.ie_key()))
+
+        # Playlist (e.g. https://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html)
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(url, mobj.group('href')),
+                    ie=WDRPageIE.ie_key())
+                for mobj in re.finditer(
+                    r'<a[^>]+\bhref=(["\'])(?P<href>(?:(?!\1).)+)\1[^>]+\bdata-extension=',
+                    webpage) if re.match(self._PAGE_REGEX, mobj.group('href'))
+            ]
+
+        return self.playlist_result(entries, playlist_id=display_id)
+
+
+class WDRElefantIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/elefantenseite/#(?P<id>.+)'
+    _TEST = {
+        'url': 'http://www.wdrmaus.de/elefantenseite/#folge_ostern_2015',
+        'info_dict': {
+            'title': 'Folge Oster-Spezial 2015',
+            'id': 'mdb-1088195',
+            'ext': 'mp4',
+            'age_limit': None,
+            'upload_date': '20150406'
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        # Table of Contents seems to always be at this address, so fetch it directly.
+        # The website fetches configurationJS.php5, which links to tableOfContentsJS.php5.
+        table_of_contents = self._download_json(
+            'https://www.wdrmaus.de/elefantenseite/data/tableOfContentsJS.php5',
+            display_id)
+        if display_id not in table_of_contents:
+            raise ExtractorError(
+                'No entry in site\'s table of contents for this URL. '
+                'Is the fragment part of the URL (after the #) correct?',
+                expected=True)
+        xml_metadata_path = table_of_contents[display_id]['xmlPath']
+        xml_metadata = self._download_xml(
+            'https://www.wdrmaus.de/elefantenseite/' + xml_metadata_path,
+            display_id)
+        zmdb_url_element = xml_metadata.find('./movie/zmdb_url')
+        if zmdb_url_element is None:
+            raise ExtractorError(
+                '%s is not a video' % display_id, expected=True)
+        return self.url_result(zmdb_url_element.text, ie=WDRIE.ie_key())
+
+
+class WDRMobileIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://mobile-ondemand\.wdr\.de/
+        .*?/fsk(?P<age_limit>[0-9]+)
+        /[0-9]+/[0-9]+/
+        (?P<id>[0-9]+)_(?P<title>[0-9]+)'''
+    IE_NAME = 'wdr:mobile'
+    _TEST = {
+        'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
+        'info_dict': {
+            'title': '4283021',
+            'id': '421735',
+            'ext': 'mp4',
+            'age_limit': 0,
+        },
+        'skip': 'Problems with loading data.'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        return {
+            'id': mobj.group('id'),
+            'title': mobj.group('title'),
+            'age_limit': int(mobj.group('age_limit')),
+            'url': url,
+            'http_headers': {
+                'User-Agent': 'mobile',
+            },
+        }
diff --git a/youtube_dl/extractor/webcaster.py b/youtube_dl/extractor/webcaster.py
new file mode 100644 (file)
index 0000000..e4b65f5
--- /dev/null
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    xpath_text,
+)
+
+
+class WebcasterIE(InfoExtractor):
+    _VALID_URL = r'https?://bl\.webcaster\.pro/(?:quote|media)/start/free_(?P<id>[^/]+)'
+    _TESTS = [{
+        # http://video.khl.ru/quotes/393859
+        'url': 'http://bl.webcaster.pro/quote/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104?sr%3D105%26fa%3D1%26type_id%3D18',
+        'md5': '0c162f67443f30916ff1c89425dcd4cd',
+        'info_dict': {
+            'id': 'c8cefd240aa593681c8d068cff59f407_hd',
+            'ext': 'mp4',
+            'title': 'Сибирь - Нефтехимик. Лучшие моменты первого периода',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://bl.webcaster.pro/media/start/free_6246c7a4453ac4c42b4398f840d13100_hd/2_2991109016/e8d0d82587ef435480118f9f9c41db41/4635726126',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        video = self._download_xml(url, video_id)
+
+        title = xpath_text(video, './/event_name', 'event name', fatal=True)
+
+        def make_id(parts, separator):
+            return separator.join(filter(None, parts))
+
+        formats = []
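+        # besides the default track, a 'noise' variant (presumably the raw feed
+        # without commentary) may be available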
+        for format_id in (None, 'noise'):
+            track_tag = make_id(('track', format_id), '_')
+            for track in video.findall('.//iphone/%s' % track_tag):
+                track_url = track.text
+                if not track_url:
+                    continue
+                if determine_ext(track_url) == 'm3u8':
+                    m3u8_formats = self._extract_m3u8_formats(
+                        track_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native',
+                        m3u8_id=make_id(('hls', format_id), '-'), fatal=False)
+                    for f in m3u8_formats:
+                        f.update({
+                            'source_preference': 0 if format_id == 'noise' else 1,
+                            'format_note': track.get('title'),
+                        })
+                    formats.extend(m3u8_formats)
+        self._sort_formats(formats)
+
+        thumbnail = xpath_text(video, './/image', 'thumbnail')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
+
+
+class WebcasterFeedIE(InfoExtractor):
+    _VALID_URL = r'https?://bl\.webcaster\.pro/feed/start/free_(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://bl.webcaster.pro/feed/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104',
+        'only_matching': True,
+    }
+
+    @staticmethod
+    def _extract_url(ie, webpage):
+        mobj = re.search(
+            r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+        for secure in (True, False):
+            video_url = ie._og_search_video_url(
+                webpage, secure=secure, default=None)
+            if video_url:
+                mobj = re.search(
+                    r'config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_[^?&=]+)',
+                    video_url)
+                if mobj:
+                    return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        feed = self._download_xml(url, video_id)
+
+        video_url = xpath_text(
+            feed, ('video_hd', 'video'), 'video url', fatal=True)
+
+        return self.url_result(video_url, WebcasterIE.ie_key())
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
new file mode 100644 (file)
index 0000000..f2b8d19
--- /dev/null
@@ -0,0 +1,160 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    orderedSet,
+)
+
+
+class WebOfStoriesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)'
+    _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
+    _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
+    _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
+    _TESTS = [{
+        'url': 'http://www.webofstories.com/play/hans.bethe/71',
+        'md5': '373e4dd915f60cfe3116322642ddf364',
+        'info_dict': {
+            'id': '4536',
+            'ext': 'mp4',
+            'title': 'The temperature of the sun',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Hans Bethe talks about calculating the temperature of the sun',
+            'duration': 238,
+        }
+    }, {
+        'url': 'http://www.webofstories.com/play/55908',
+        'md5': '2985a698e1fe3211022422c4b5ed962c',
+        'info_dict': {
+            'id': '55908',
+            'ext': 'mp4',
+            'title': 'The story of Gemmata obscuriglobus',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+            'duration': 169,
+        },
+        'skip': 'notfound',
+    }, {
+        # malformed og:title meta
+        'url': 'http://www.webofstories.com/play/54215?o=MS',
+        'info_dict': {
+            'id': '54215',
+            'ext': 'mp4',
+            'title': '"A Leg to Stand On"',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+            'duration': 97,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        # Sometimes og:title meta is malformed
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+            r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
+        description = self._html_search_meta('description', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        embed_params = [s.strip(" \r\n\t'") for s in self._search_regex(
+            r'(?s)\$\("#embedCode"\).html\(getEmbedCode\((.*?)\)',
+            webpage, 'embed params').split(',')]
+
+        (
+            _, speaker_id, story_id, story_duration,
+            speaker_type, great_life, _thumbnail, _has_subtitles,
+            story_filename, _story_order) = embed_params
+
+        is_great_life_series = great_life == 'true'
+        duration = int_or_none(story_duration)
+
+        # URL building, see: http://www.webofstories.com/scripts/player.js
+        ms_prefix = ''
+        if speaker_type.lower() == 'ms':
+            ms_prefix = 'mini_sites/'
+
+        if is_great_life_series:
+            mp4_url = '{0}lives/{1}/{2}.mp4'.format(
+                self._VIDEO_DOMAIN, speaker_id, story_filename)
+            rtmp_ext = 'flv'
+            streamer = self._GREAT_LIFE_STREAMER
+            play_path = 'stories/{0}/{1}'.format(
+                speaker_id, story_filename)
+        else:
+            mp4_url = '{0}{1}{2}/{3}.mp4'.format(
+                self._VIDEO_DOMAIN, ms_prefix, speaker_id, story_filename)
+            rtmp_ext = 'mp4'
+            streamer = self._USER_STREAMER
+            play_path = 'mp4:{0}{1}/{2}.mp4'.format(
+                ms_prefix, speaker_id, story_filename)
+
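+        # Illustrative example of the URL building above (hypothetical
+        # values): with speaker_id='hans.bethe',
+        # story_filename='BetheH_0071_T' and great_life='true' this yields
+        #   mp4_url   = 'http://eu-mobile.webofstories.com/lives/hans.bethe/BetheH_0071_T.mp4'
+        #   play_path = 'stories/hans.bethe/BetheH_0071_T'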
+        formats = [{
+            'format_id': 'mp4_sd',
+            'url': mp4_url,
+        }, {
+            'format_id': 'rtmp_sd',
+            'page_url': url,
+            'url': streamer,
+            'ext': rtmp_ext,
+            'play_path': play_path,
+        }]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': story_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'description': description,
+            'duration': duration,
+        }
+
+
+class WebOfStoriesPlaylistIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://www.webofstories.com/playAll/donald.knuth',
+        'info_dict': {
+            'id': 'donald.knuth',
+            'title': 'Donald Knuth (Scientist)',
+        },
+        'playlist_mincount': 97,
+    }
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result(
+                'http://www.webofstories.com/play/%s' % video_id,
+                'WebOfStories', video_id=video_id)
+            for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))
+        ]
+
+        title = self._search_regex(
+            r'<div id="speakerName">\s*<span>([^<]+)</span>',
+            webpage, 'speaker', default=None)
+        if title:
+            field = self._search_regex(
+                r'<span id="primaryField">([^<]+)</span>',
+                webpage, 'field', default=None)
+            if field:
+                title += ' (%s)' % field
+
+        if not title:
+            title = self._search_regex(
+                r'<title>Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories</title>',
+                webpage, 'title')
+
+        return self.playlist_result(entries, playlist_id, title)
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
new file mode 100644 (file)
index 0000000..621df5b
--- /dev/null
@@ -0,0 +1,140 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+import json
+import random
+import re
+
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+)
+from ..utils import (
+    js_to_json,
+    strip_jsonp,
+    urlencode_postdata,
+)
+
+
+class WeiboIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)'
+    _TEST = {
+        'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment',
+        'info_dict': {
+            'id': 'Fp6RGfbff',
+            'ext': 'mp4',
+            'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        # to get Referer url for genvisitor
+        webpage, urlh = self._download_webpage_handle(url, video_id)
+
+        visitor_url = urlh.geturl()
+
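+        # Weibo gates anonymous visitors behind passport.weibo.com: POST a
+        # browser fingerprint to /visitor/genvisitor to obtain a tid, confirm
+        # it via /visitor/visitor?a=incarnate to receive the visitor cookies,
+        # then re-request the original page.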
+        if 'passport.weibo.com' in visitor_url:
+            # first visit
+            visitor_data = self._download_json(
+                'https://passport.weibo.com/visitor/genvisitor', video_id,
+                note='Generating first-visit data',
+                transform_source=strip_jsonp,
+                headers={'Referer': visitor_url},
+                data=urlencode_postdata({
+                    'cb': 'gen_callback',
+                    'fp': json.dumps({
+                        'os': '2',
+                        'browser': 'Gecko57,0,0,0',
+                        'fonts': 'undefined',
+                        'screenInfo': '1440*900*24',
+                        'plugins': '',
+                    }),
+                }))
+
+            tid = visitor_data['data']['tid']
+            cnfd = '%03d' % visitor_data['data']['confidence']
+
+            self._download_webpage(
+                'https://passport.weibo.com/visitor/visitor', video_id,
+                note='Running first-visit callback',
+                query={
+                    'a': 'incarnate',
+                    't': tid,
+                    'w': 2,
+                    'c': cnfd,
+                    'cb': 'cross_domain',
+                    'from': 'weibo',
+                    '_rand': random.random(),
+                })
+
+            webpage = self._download_webpage(
+                url, video_id, note='Revisiting webpage')
+
+        title = self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title')
+
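+        # video-sources is a query-string-like attribute mapping resolutions
+        # to URL lists, e.g. '480=https%3A%2F%2F...&720=https%3A%2F%2F...'
+        # (illustrative); compat_parse_qs turns it into {'480': [...], ...}.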
+        video_formats = compat_parse_qs(self._search_regex(
+            r'video-sources=\\\"(.+?)\"', webpage, 'video_sources'))
+
+        formats = []
+        supported_resolutions = (480, 720)
+        for res in supported_resolutions:
+            vid_urls = video_formats.get(compat_str(res))
+            if not vid_urls or not isinstance(vid_urls, list):
+                continue
+
+            vid_url = vid_urls[0]
+            formats.append({
+                'url': vid_url,
+                'height': res,
+            })
+
+        self._sort_formats(formats)
+
+        uploader = self._og_search_property(
+            'nick-name', webpage, 'uploader', default=None)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'formats': formats
+        }
+
+
+class WeiboMobileIE(InfoExtractor):
+    _VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(?:\?.+)?'
+    _TEST = {
+        'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0',
+        'info_dict': {
+            'id': '4189191225395228',
+            'ext': 'mp4',
+            'title': '午睡当然是要甜甜蜜蜜的啦',
+            'uploader': '柴犬柴犬'
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id, note='Visiting webpage')
+
+        weibo_info = self._parse_json(self._search_regex(
+            r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};',
+            webpage, 'js_code', flags=re.DOTALL),
+            video_id, transform_source=js_to_json)
+
+        status_data = weibo_info.get('status', {})
+        page_info = status_data.get('page_info')
+        title = status_data['status_title']
+        uploader = status_data.get('user', {}).get('screen_name')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'url': page_info['media_info']['stream_url']
+        }
diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py
new file mode 100644 (file)
index 0000000..7e0befd
--- /dev/null
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class WeiqiTVIE(InfoExtractor):
+    IE_DESC = 'WQTV'
+    _VALID_URL = r'https?://(?:www\.)?weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)'
+
+    _TESTS = [{
+        'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3',
+        'md5': '26450599afd64c513bc77030ad15db44',
+        'info_dict': {
+            'id': '53c744f09874f0e76a8b46f3',
+            'ext': 'mp4',
+            'title': '2013年度盘点',
+        },
+    }, {
+        'url': 'http://www.weiqitv.com/index/video_play?videoId=567379a2d4c36cca518b4569',
+        'info_dict': {
+            'id': '567379a2d4c36cca518b4569',
+            'ext': 'mp4',
+            'title': '民国围棋史',
+        },
+    }, {
+        'url': 'http://www.weiqitv.com/index/video_play?videoId=5430220a9874f088658b4567',
+        'info_dict': {
+            'id': '5430220a9874f088658b4567',
+            'ext': 'mp4',
+            'title': '二路托过的手段和运用',
+        },
+    }]
+
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        page = self._download_webpage(url, media_id)
+
+        info_json_str = self._search_regex(
+            r'var\s+video\s*=\s*(.+});', page, 'info json str')
+        info_json = self._parse_json(info_json_str, media_id)
+
+        letvcloud_url = self._search_regex(
+            r'var\s+letvurl\s*=\s*"([^"]+)', page, 'letvcloud url')
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'LetvCloud',
+            'url': letvcloud_url,
+            'title': info_json['name'],
+            'id': media_id,
+        }
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
new file mode 100644 (file)
index 0000000..77febd2
--- /dev/null
@@ -0,0 +1,162 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    unescapeHTML,
+)
+
+
+class WistiaIE(InfoExtractor):
+    _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]{10})'
+    _EMBED_BASE_URL = 'http://fast.wistia.com/embed/'
+
+    _TESTS = [{
+        'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
+        'md5': 'cafeb56ec0c53c18c97405eecb3133df',
+        'info_dict': {
+            'id': 'sh7fpupwlt',
+            'ext': 'mov',
+            'title': 'Being Resourceful',
+            'description': 'a Clients From Hell Video Series video from worldwidewebhosting',
+            'upload_date': '20131204',
+            'timestamp': 1386185018,
+            'duration': 117,
+        },
+    }, {
+        'url': 'wistia:sh7fpupwlt',
+        'only_matching': True,
+    }, {
+        # with hls video
+        'url': 'wistia:807fafadvk',
+        'only_matching': True,
+    }, {
+        'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
+        'only_matching': True,
+    }, {
+        'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
+        'only_matching': True,
+    }]
+
+    # https://wistia.com/support/embed-and-share/video-on-your-website
+    @staticmethod
+    def _extract_url(webpage):
+        urls = WistiaIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @staticmethod
+    def _extract_urls(webpage):
+        urls = []
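+        # Three embed flavours are matched below (illustrative snippets):
+        #   <iframe src="//fast.wistia.net/embed/iframe/sh7fpupwlt">
+        #   <div class="wistia_embed wistia_async_sh7fpupwlt">
+        #   <a data-wistia-id="sh7fpupwlt"> or Wistia.embed('sh7fpupwlt')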
+        for match in re.finditer(
+                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):
+            urls.append(unescapeHTML(match.group('url')))
+        for match in re.finditer(
+                r'''(?sx)
+                    <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
+                ''', webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        return urls
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        data_json = self._download_json(
+            self._EMBED_BASE_URL + 'medias/%s.json' % video_id, video_id,
+            # Some videos require the Referer to match the embedding page.
+            headers={
+                'Referer': url if url.startswith('http') else self._EMBED_BASE_URL + 'iframe/' + video_id,
+            })
+
+        if data_json.get('error'):
+            raise ExtractorError(
+                'Error while getting the playlist', expected=True)
+
+        data = data_json['media']
+        title = data['name']
+
+        formats = []
+        thumbnails = []
+        for a in data['assets']:
+            aurl = a.get('url')
+            if not aurl:
+                continue
+            astatus = a.get('status')
+            atype = a.get('type')
+            if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
+                continue
+            elif atype in ('still', 'still_image'):
+                thumbnails.append({
+                    'url': aurl,
+                    'width': int_or_none(a.get('width')),
+                    'height': int_or_none(a.get('height')),
+                    'filesize': int_or_none(a.get('size')),
+                })
+            else:
+                aext = a.get('ext')
+                display_name = a.get('display_name')
+                format_id = atype
+                if atype and atype.endswith('_video') and display_name:
+                    format_id = '%s-%s' % (atype[:-6], display_name)
+                f = {
+                    'format_id': format_id,
+                    'url': aurl,
+                    'tbr': int_or_none(a.get('bitrate')) or None,
+                    'preference': 1 if atype == 'original' else None,
+                }
+                if display_name == 'Audio':
+                    f.update({
+                        'vcodec': 'none',
+                    })
+                else:
+                    f.update({
+                        'width': int_or_none(a.get('width')),
+                        'height': int_or_none(a.get('height')),
+                        'vcodec': a.get('codec'),
+                    })
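+                # The same asset may also be listed as an HLS playlist whose
+                # URL ends in .bin; swapping that for .ts appears to expose a
+                # progressive transport-stream download, so both variants are
+                # kept.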
+                if a.get('container') == 'm3u8' or aext == 'm3u8':
+                    ts_f = f.copy()
+                    ts_f.update({
+                        'ext': 'ts',
+                        'format_id': f['format_id'].replace('hls-', 'ts-'),
+                        'url': f['url'].replace('.bin', '.ts'),
+                    })
+                    formats.append(ts_f)
+                    f.update({
+                        'ext': 'mp4',
+                        'protocol': 'm3u8_native',
+                    })
+                else:
+                    f.update({
+                        'container': a.get('container'),
+                        'ext': aext,
+                        'filesize': int_or_none(a.get('size')),
+                    })
+                formats.append(f)
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for caption in data.get('captions', []):
+            language = caption.get('language')
+            if not language:
+                continue
+            subtitles[language] = [{
+                'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language,
+            }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': data.get('seoDescription'),
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'duration': float_or_none(data.get('duration')),
+            'timestamp': int_or_none(data.get('createdAt')),
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
new file mode 100644 (file)
index 0000000..82587b4
--- /dev/null
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class WorldStarHipHopIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?.*?\bv=(?P<id>[^&]+)'
+    _TESTS = [{
+        'url': 'http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO',
+        'md5': '9d04de741161603bf7071bbf4e883186',
+        'info_dict': {
+            'id': 'wshh6a7q1ny0G34ZwuIO',
+            'ext': 'mp4',
+            'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!'
+        }
+    }, {
+        'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        entries = self._parse_html5_media_entries(url, webpage, video_id)
+
+        if not entries:
+            return self.url_result(url, 'Generic')
+
+        title = self._html_search_regex(
+            [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
+             r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
+            webpage, 'title')
+
+        info = entries[0]
+        info.update({
+            'id': video_id,
+            'title': title,
+        })
+        return info
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
new file mode 100644 (file)
index 0000000..67236f3
--- /dev/null
@@ -0,0 +1,123 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    unified_strdate,
+)
+
+
+class WSJIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+                            https?://(?:www\.)?(?:wsj|barrons)\.com/video/(?:[^/]+/)+|
+                            wsj:
+                        )
+                        (?P<id>[a-fA-F0-9-]{36})
+                    '''
+    IE_DESC = 'Wall Street Journal'
+    _TESTS = [{
+        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
+        'md5': 'e230a5bb249075e40793b655a54a02e4',
+        'info_dict': {
+            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
+            'ext': 'mp4',
+            'upload_date': '20150202',
+            'uploader_id': 'jdesai',
+            'creator': 'jdesai',
+            'categories': list,  # a long list
+            'duration': 90,
+            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
+        },
+    }, {
+        'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.barrons.com/video/capitalism-deserves-more-respect-from-millennials/F301217E-6F46-43AE-B8D2-B7180D642EE9.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.wsj.com/video/series/a-brief-history-of/the-modern-cell-carrier-how-we-got-here/980E2187-401D-48A1-B82B-1486CEE06CB9',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+            query={
+                'type': 'guid',
+                'count': 1,
+                'query': video_id,
+                'fields': ','.join((
+                    'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+                    'description', 'name', 'duration', 'videoURL', 'titletag',
+                    'formattedCreationDate', 'keywords', 'editor')),
+            })['items'][0]
+        title = info.get('name', info.get('titletag'))
+
+        formats = []
+
+        f4m_url = info.get('videoURL')
+        if f4m_url:
+            formats.extend(self._extract_f4m_formats(
+                f4m_url, video_id, f4m_id='hds', fatal=False))
+
+        m3u8_url = info.get('hls')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, ext='mp4',
+                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+        for v in info.get('videoMP4List', []):
+            mp4_url = v.get('url')
+            if not mp4_url:
+                continue
+            tbr = int_or_none(v.get('bitrate'))
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'http' + ('-%d' % tbr if tbr else ''),
+                'tbr': tbr,
+                'width': int_or_none(v.get('width')),
+                'height': int_or_none(v.get('height')),
+                'fps': float_or_none(v.get('fps')),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            # Thumbnails are conveniently in the correct format already
+            'thumbnails': info.get('thumbnailList'),
+            'creator': info.get('author'),
+            'uploader_id': info.get('editor'),
+            'duration': int_or_none(info.get('duration')),
+            'upload_date': unified_strdate(info.get(
+                'formattedCreationDate'), day_first=False),
+            'title': title,
+            'categories': info.get('keywords'),
+        }
+
+
+class WSJArticleIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+        'info_dict': {
+            'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+            'ext': 'mp4',
+            'upload_date': '20170221',
+            'uploader_id': 'ralcaraz',
+            'title': 'Bao Bao the Panda Leaves for China',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        video_id = self._search_regex(
+            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+        return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
diff --git a/youtube_dl/extractor/wwe.py b/youtube_dl/extractor/wwe.py
new file mode 100644 (file)
index 0000000..bebc77b
--- /dev/null
@@ -0,0 +1,140 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    try_get,
+    unescapeHTML,
+    url_or_none,
+    urljoin,
+)
+
+
+class WWEBaseIE(InfoExtractor):
+    _SUBTITLE_LANGS = {
+        'English': 'en',
+        'Deutsch': 'de',
+    }
+
+    def _extract_entry(self, data, url, video_id=None):
+        video_id = compat_str(video_id or data['nid'])
+        title = data['title']
+
+        formats = self._extract_m3u8_formats(
+            data['file'], video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+
+        description = data.get('description')
+        thumbnail = urljoin(url, data.get('image'))
+        series = data.get('show_name')
+        episode = data.get('episode_name')
+
+        subtitles = {}
+        tracks = data.get('tracks')
+        if isinstance(tracks, list):
+            for track in tracks:
+                if not isinstance(track, dict):
+                    continue
+                if track.get('kind') != 'captions':
+                    continue
+                track_file = url_or_none(track.get('file'))
+                if not track_file:
+                    continue
+                label = track.get('label')
+                lang = self._SUBTITLE_LANGS.get(label, label) or 'en'
+                subtitles.setdefault(lang, []).append({
+                    'url': track_file,
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'series': series,
+            'episode': episode,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class WWEIE(WWEBaseIE):
+    _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018',
+        'md5': '92811c6a14bfc206f7a6a9c5d9140184',
+        'info_dict': {
+            'id': '40048199',
+            'ext': 'mp4',
+            'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018',
+            'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }, {
+        'url': 'https://de.wwe.com/videos/gran-metalik-vs-tony-nese-wwe-205-live-sept-4-2018',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        landing = self._parse_json(
+            self._html_search_regex(
+                r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;',
+                webpage, 'drupal settings'),
+            display_id)['WWEVideoLanding']
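+        # The page inlines its configuration as Drupal.settings JSON; its
+        # WWEVideoLanding entry carries the initial playlist and video id.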
+
+        data = landing['initialVideo']['playlist'][0]
+        video_id = landing.get('initialVideoId')
+
+        info = self._extract_entry(data, url, video_id)
+        info['display_id'] = display_id
+        return info
+
+
+class WWEPlaylistIE(WWEBaseIE):
+    _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.wwe.com/shows/raw/2018-11-12',
+        'info_dict': {
+            'id': '2018-11-12',
+        },
+        'playlist_mincount': 11,
+    }, {
+        'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if WWEIE.suitable(url) else super(WWEPlaylistIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        entries = []
+        for mobj in re.finditer(
+                r'data-video\s*=\s*(["\'])(?P<data>{.+?})\1', webpage):
+            video = self._parse_json(
+                mobj.group('data'), display_id, transform_source=unescapeHTML,
+                fatal=False)
+            if not video:
+                continue
+            data = try_get(video, lambda x: x['playlist'][0], dict)
+            if not data:
+                continue
+            try:
+                entry = self._extract_entry(data, url)
+            except Exception:
+                continue
+            entry['extractor_key'] = WWEIE.ie_key()
+            entries.append(entry)
+
+        return self.playlist_result(entries, display_id)
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
new file mode 100644 (file)
index 0000000..4c41e98
--- /dev/null
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class XBefIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
+        'md5': 'a478b565baff61634a98f5e5338be995',
+        'info_dict': {
+            'id': '5119',
+            'ext': 'mp4',
+            'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
+            'age_limit': 18,
+            'thumbnail': r're:^http://.*\.jpg',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
+
+        config_url_enc = self._download_webpage(
+            'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
+            note='Retrieving config URL')
+        config_url = compat_urllib_parse_unquote(config_url_enc)
+        config = self._download_xml(
+            config_url, video_id, note='Retrieving config')
+
+        video_url = config.find('./file').text
+        thumbnail = config.find('./image').text
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py
new file mode 100644 (file)
index 0000000..d9c277b
--- /dev/null
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_filesize,
+    unified_strdate,
+)
+
+
+class XboxClipsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
+    _TEST = {
+        'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
+        'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
+        'info_dict': {
+            'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
+            'ext': 'mp4',
+            'title': 'Iabdulelah playing Titanfall',
+            'filesize_approx': 26800000,
+            'upload_date': '20140807',
+            'duration': 56,
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._html_search_regex(
+            r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL')
+        title = self._html_search_regex(
+            r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
+        upload_date = unified_strdate(self._html_search_regex(
+            r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
+        filesize = parse_filesize(self._html_search_regex(
+            r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
+        duration = int_or_none(self._html_search_regex(
+            r'>Duration: (\d+) Seconds<', webpage, 'duration', fatal=False))
+        view_count = int_or_none(self._html_search_regex(
+            r'>Views: (\d+)<', webpage, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'upload_date': upload_date,
+            'filesize_approx': filesize,
+            'duration': duration,
+            'view_count': view_count,
+        }
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
new file mode 100644 (file)
index 0000000..48ef07e
--- /dev/null
@@ -0,0 +1,193 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_chr
+from ..utils import (
+    decode_packed_codes,
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    urlencode_postdata,
+)
+
+
+# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58
+def aa_decode(aa_code):
+    symbol_table = [
+        ('7', '((゚ー゚) + (o^_^o))'),
+        ('6', '((o^_^o) +(o^_^o))'),
+        ('5', '((゚ー゚) + (゚Θ゚))'),
+        ('2', '((o^_^o) - (゚Θ゚))'),
+        ('4', '(゚ー゚)'),
+        ('3', '(o^_^o)'),
+        ('1', '(゚Θ゚)'),
+        ('0', '(c^_^o)'),
+    ]
+    delim = '(゚Д゚)[゚ε゚]+'
+    ret = ''
+    for aa_char in aa_code.split(delim):
+        for val, pat in symbol_table:
+            aa_char = aa_char.replace(pat, val)
+        aa_char = aa_char.replace('+ ', '')
+        m = re.match(r'^\d+', aa_char)
+        if m:
+            ret += compat_chr(int(m.group(0), 8))
+        else:
+            m = re.match(r'^u([\da-f]+)', aa_char)
+            if m:
+                ret += compat_chr(int(m.group(1), 16))
+    return ret
+
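+# Worked example (illustrative): within one delimiter-separated chunk,
+# '((゚ー゚) + (゚Θ゚))+ (゚Θ゚)' reduces to the octal digits '51', and
+# compat_chr(int('51', 8)) yields ')'.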
+
+class XFileShareIE(InfoExtractor):
+    _SITES = (
+        (r'clipwatching\.com', 'ClipWatching'),
+        (r'gounlimited\.to', 'GoUnlimited'),
+        (r'govid\.me', 'GoVid'),
+        (r'holavid\.com', 'HolaVid'),
+        (r'streamty\.com', 'Streamty'),
+        (r'thevideobee\.to', 'TheVideoBee'),
+        (r'uqload\.com', 'Uqload'),
+        (r'vidbom\.com', 'VidBom'),
+        (r'vidlo\.us', 'vidlo'),
+        (r'vidlocker\.xyz', 'VidLocker'),
+        (r'vidshare\.tv', 'VidShare'),
+        (r'vup\.to', 'VUp'),
+        (r'xvideosharing\.com', 'XVideoSharing'),
+    )
+
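+    # zip(*_SITES) transposes the (host pattern, name) pairs: index 0 feeds
+    # the host alternation in _VALID_URL, index 1 the IE_DESC site list.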
+    IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
+    _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+                  % '|'.join(site for site in list(zip(*_SITES))[0]))
+
+    _FILE_NOT_FOUND_REGEXES = (
+        r'>(?:404 - )?File Not Found<',
+        r'>The file was removed by administrator<',
+    )
+
+    _TESTS = [{
+        'url': 'http://xvideosharing.com/fq65f94nd2ve',
+        'md5': '4181f63957e8fe90ac836fa58dc3c8a6',
+        'info_dict': {
+            'id': 'fq65f94nd2ve',
+            'ext': 'mp4',
+            'title': 'sample',
+            'thumbnail': r're:http://.*\.jpg',
+        },
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1'
+                % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]),
+                webpage)]
+
+    def _real_extract(self, url):
+        host, video_id = re.match(self._VALID_URL, url).groups()
+
+        url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
+        webpage = self._download_webpage(url, video_id)
+
+        if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        fields = self._hidden_inputs(webpage)
+
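+        # Some sites gate the video page behind a hidden op=download1 form
+        # plus a JS countdown; honour the countdown, then re-POST the hidden
+        # fields to reach the page that carries the sources.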
+        if fields.get('op') == 'download1':
+            countdown = int_or_none(self._search_regex(
+                r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>',
+                webpage, 'countdown', default=None))
+            if countdown:
+                self._sleep(countdown, video_id)
+
+            webpage = self._download_webpage(
+                url, video_id, 'Downloading video page',
+                data=urlencode_postdata(fields), headers={
+                    'Referer': url,
+                    'Content-type': 'application/x-www-form-urlencoded',
+                })
+
+        title = (self._search_regex(
+            (r'style="z-index: [0-9]+;">([^<]+)</span>',
+             r'<td nowrap>([^<]+)</td>',
+             r'h4-fine[^>]*>([^<]+)<',
+             r'>Watch (.+)[ <]',
+             r'<h2 class="video-page-head">([^<]+)</h2>',
+             r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<',  # streamin.to
+             r'title\s*:\s*"([^"]+)"'),  # govid.me
+            webpage, 'title', default=None) or self._og_search_title(
+            webpage, default=None) or video_id).strip()
+
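+        # De-obfuscate inline players: P.A.C.K.E.R.-packed JS is unpacked
+        # with decode_packed_codes and AAencoded JS with aa_decode, and the
+        # decoded text is substituted back before looking for sources.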
+        for regex, func in (
+                (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes),
+                (r'(゚.+)', aa_decode)):
+            obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
+            if obf_code:
+                webpage = webpage.replace(obf_code, func(obf_code))
+
+        formats = []
+
+        jwplayer_data = self._search_regex(
+            [
+                r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);',
+                r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);',
+            ], webpage,
+            'jwplayer data', default=None)
+        if jwplayer_data:
+            jwplayer_data = self._parse_json(
+                jwplayer_data.replace(r"\'", "'"), video_id, js_to_json)
+            if jwplayer_data:
+                formats = self._parse_jwplayer_data(
+                    jwplayer_data, video_id, False,
+                    m3u8_id='hls', mpd_id='dash')['formats']
+
+        if not formats:
+            urls = []
+            for regex in (
+                    r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
+                    r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
+                    r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
+                    r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
+                for mobj in re.finditer(regex, webpage):
+                    video_url = mobj.group('url')
+                    if video_url not in urls:
+                        urls.append(video_url)
+
+            sources = self._search_regex(
+                r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None)
+            if sources:
+                urls.extend(self._parse_json(sources, video_id))
+
+            formats = []
+            for video_url in urls:
+                if determine_ext(video_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        fatal=False))
+                else:
+                    formats.append({
+                        'url': video_url,
+                        'format_id': 'sd',
+                    })
+        self._sort_formats(formats)
+
+        thumbnail = self._search_regex(
+            [
+                r'<video[^>]+poster="([^"]+)"',
+                r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],',
+            ], webpage, 'thumbnail', default=None)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
new file mode 100644 (file)
index 0000000..902a3ed
--- /dev/null
@@ -0,0 +1,393 @@
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    clean_html,
+    determine_ext,
+    dict_get,
+    extract_attributes,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    try_get,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class XHamsterIE(InfoExtractor):
+    _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:.+?\.)?%s/
+                        (?:
+                            movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html|
+                            videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+)
+                        )
+                    ''' % _DOMAINS
+    _TESTS = [{
+        'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+        'md5': '98b4687efb1ffd331c4197854dc09e8f',
+        'info_dict': {
+            'id': '1509445',
+            'display_id': 'femaleagent-shy-beauty-takes-the-bait',
+            'ext': 'mp4',
+            'title': 'FemaleAgent Shy beauty takes the bait',
+            'timestamp': 1350194821,
+            'upload_date': '20121014',
+            'uploader': 'Ruseful2011',
+            'duration': 893,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'https://xhamster.com/videos/britney-spears-sexy-booty-2221348?hd=',
+        'info_dict': {
+            'id': '2221348',
+            'display_id': 'britney-spears-sexy-booty',
+            'ext': 'mp4',
+            'title': 'Britney Spears  Sexy Booty',
+            'timestamp': 1379123460,
+            'upload_date': '20130914',
+            'uploader': 'jojo747400',
+            'duration': 200,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # empty seo, unavailable via new URL schema
+        'url': 'http://xhamster.com/movies/5667973/.html',
+        'info_dict': {
+            'id': '5667973',
+            'ext': 'mp4',
+            'title': '....',
+            'timestamp': 1454948101,
+            'upload_date': '20160208',
+            'uploader': 'parejafree',
+            'duration': 72,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # mobile site
+        'url': 'https://m.xhamster.com/videos/cute-teen-jacqueline-solo-masturbation-8559111',
+        'only_matching': True,
+    }, {
+        'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
+        'only_matching': True,
+    }, {
+        # This video is visible for marcoalfa123456's friends only
+        'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',
+        'only_matching': True,
+    }, {
+        # new URL schema
+        'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',
+        'only_matching': True,
+    }, {
+        'url': 'https://xhamster.one/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+        'only_matching': True,
+    }, {
+        'url': 'https://xhamster.desi/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+        'only_matching': True,
+    }, {
+        'url': 'https://xhamster2.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+        'only_matching': True,
+    }, {
+        'url': 'https://xhamster11.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+        'only_matching': True,
+    }, {
+        'url': 'https://xhamster26.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+        'only_matching': True,
+    }, {
+        'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
+        'only_matching': True,
+    }, {
+        'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('id_2')
+        display_id = mobj.group('display_id') or mobj.group('display_id_2')
+
+        desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url)
+        webpage, urlh = self._download_webpage_handle(desktop_url, video_id)
+
+        error = self._html_search_regex(
+            r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
+            webpage, 'error', default=None)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        age_limit = self._rta_search(webpage)
+
+        def get_height(s):
+            return int_or_none(self._search_regex(
+                r'^(\d+)[pP]', s, 'height', default=None))
+
+        initials = self._parse_json(
+            self._search_regex(
+                r'window\.initials\s*=\s*({.+?})\s*;', webpage, 'initials',
+                default='{}'),
+            video_id, fatal=False)
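+        # window.initials, when present, holds a 'videoModel' object whose
+        # 'sources' map a format id ('mp4', 'hls', 'download', ...) to
+        # quality -> URL entries; 'download' entries are dicts carrying
+        # 'link' and 'size' instead of plain URLs.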
+        if initials:
+            video = initials['videoModel']
+            title = video['title']
+            formats = []
+            for format_id, formats_dict in video['sources'].items():
+                if not isinstance(formats_dict, dict):
+                    continue
+                for quality, format_item in formats_dict.items():
+                    if format_id == 'download':
+                        # Download link takes some time to be generated,
+                        # skipping for now
+                        continue
+                        if not isinstance(format_item, dict):
+                            continue
+                        format_url = format_item.get('link')
+                        filesize = int_or_none(
+                            format_item.get('size'), invscale=1000000)
+                    else:
+                        format_url = format_item
+                        filesize = None
+                    format_url = url_or_none(format_url)
+                    if not format_url:
+                        continue
+                    formats.append({
+                        'format_id': '%s-%s' % (format_id, quality),
+                        'url': format_url,
+                        'ext': determine_ext(format_url, 'mp4'),
+                        'height': get_height(quality),
+                        'filesize': filesize,
+                        'http_headers': {
+                            'Referer': urlh.geturl(),
+                        },
+                    })
+            self._sort_formats(formats)
+
+            categories_list = video.get('categories')
+            if isinstance(categories_list, list):
+                categories = []
+                for c in categories_list:
+                    if not isinstance(c, dict):
+                        continue
+                    c_name = c.get('name')
+                    if isinstance(c_name, compat_str):
+                        categories.append(c_name)
+            else:
+                categories = None
+
+            return {
+                'id': video_id,
+                'display_id': display_id,
+                'title': title,
+                'description': video.get('description'),
+                'timestamp': int_or_none(video.get('created')),
+                'uploader': try_get(
+                    video, lambda x: x['author']['name'], compat_str),
+                'thumbnail': video.get('thumbURL'),
+                'duration': int_or_none(video.get('duration')),
+                'view_count': int_or_none(video.get('views')),
+                'like_count': int_or_none(try_get(
+                    video, lambda x: x['rating']['likes'], int)),
+                'dislike_count': int_or_none(try_get(
+                    video, lambda x: x['rating']['dislikes'], int)),
+                # assumed: the comment total lives under video['comments']
+                'comment_count': int_or_none(try_get(
+                    video, lambda x: x['comments']['total'], int)),
+                'age_limit': age_limit,
+                'categories': categories,
+                'formats': formats,
+            }
+
+        # Old layout fallback
+
+        title = self._html_search_regex(
+            [r'<h1[^>]*>([^<]+)</h1>',
+             r'<meta[^>]+itemprop=".*?caption.*?"[^>]+content="(.+?)"',
+             r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
+            webpage, 'title')
+
+        formats = []
+        format_urls = set()
+
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
+                default='{}'),
+            video_id, fatal=False) or {}
+        for format_id, format_url in sources.items():
+            format_url = url_or_none(format_url)
+            if not format_url:
+                continue
+            if format_url in format_urls:
+                continue
+            format_urls.add(format_url)
+            formats.append({
+                'format_id': format_id,
+                'url': format_url,
+                'height': get_height(format_id),
+            })
+
+        video_url = self._search_regex(
+            [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+             r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+             r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+            webpage, 'video url', group='mp4', default=None)
+        if video_url and video_url not in format_urls:
+            formats.append({
+                'url': video_url,
+            })
+
+        self._sort_formats(formats)
+
+        # Only a few videos have a description
+        mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
+        description = mobj.group(1) if mobj else None
+
+        upload_date = unified_strdate(self._search_regex(
+            r'hint=["\'](\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}',
+            webpage, 'upload date', fatal=False))
+
+        uploader = self._html_search_regex(
+            r'<span[^>]+itemprop=["\']author[^>]+><a[^>]+><span[^>]+>([^<]+)',
+            webpage, 'uploader', default='anonymous')
+
+        thumbnail = self._search_regex(
+            [r'''["']thumbUrl["']\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''',
+             r'''<video[^>]+"poster"=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''],
+            webpage, 'thumbnail', fatal=False, group='thumbnail')
+
+        duration = parse_duration(self._search_regex(
+            [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
+             r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
+            'duration', fatal=False))
+
+        view_count = int_or_none(self._search_regex(
+            r'content=["\']User(?:View|Play)s:(\d+)',
+            webpage, 'view count', fatal=False))
+
+        mobj = re.search(r'hint=[\'"](?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes', webpage)
+        (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
+
+        mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
+        comment_count = mobj.group('commentcount') if mobj else 0
+
+        categories_html = self._search_regex(
+            r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
+            'categories', default=None)
+        categories = [clean_html(category) for category in re.findall(
+            r'<a[^>]+>(.+?)</a>', categories_html)] if categories_html else None
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': int_or_none(like_count),
+            'dislike_count': int_or_none(dislike_count),
+            'comment_count': int_or_none(comment_count),
+            'age_limit': age_limit,
+            'categories': categories,
+            'formats': formats,
+        }
+
+
+class XHamsterEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS
+    _TEST = {
+        'url': 'http://xhamster.com/xembed.php?video=3328539',
+        'info_dict': {
+            'id': '3328539',
+            'ext': 'mp4',
+            'title': 'Pen Masturbation',
+            'timestamp': 1406581861,
+            'upload_date': '20140728',
+            'uploader': 'ManyakisArt',
+            'duration': 5,
+            'age_limit': 18,
+        }
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'href="(https?://xhamster\.com/(?:movies/{0}/[^"]*\.html|videos/[^/]*-{0})[^"]*)"'.format(video_id),
+            webpage, 'xhamster url', default=None)
+
+        if not video_url:
+            player_vars = self._parse_json(
+                self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'),
+                video_id)
+            video_url = dict_get(player_vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl'))
+
+        return self.url_result(video_url, 'XHamster')
+
+
+class XHamsterUserIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P<id>[^/?#&]+)' % XHamsterIE._DOMAINS
+    _TESTS = [{
+        # Paginated user profile
+        'url': 'https://xhamster.com/users/netvideogirls/videos',
+        'info_dict': {
+            'id': 'netvideogirls',
+        },
+        'playlist_mincount': 267,
+    }, {
+        # Non-paginated user profile
+        'url': 'https://xhamster.com/users/firatkaan/videos',
+        'info_dict': {
+            'id': 'firatkaan',
+        },
+        'playlist_mincount': 1,
+    }]
+
+    def _entries(self, user_id):
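+        # User profiles are paginated; each page is scanned for video-thumb
+        # anchors and the <a data-page="next"> link drives the crawl until
+        # it disappears.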
+        next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id
+        for pagenum in itertools.count(1):
+            page = self._download_webpage(
+                next_page_url, user_id, 'Downloading page %s' % pagenum)
+            for video_tag in re.findall(
+                    r'(<a[^>]+class=["\'].*?\bvideo-thumb__image-container[^>]+>)',
+                    page):
+                video = extract_attributes(video_tag)
+                video_url = url_or_none(video.get('href'))
+                if not video_url or not XHamsterIE.suitable(video_url):
+                    continue
+                video_id = XHamsterIE._match_id(video_url)
+                yield self.url_result(
+                    video_url, ie=XHamsterIE.ie_key(), video_id=video_id)
+            mobj = re.search(r'<a[^>]+data-page=["\']next[^>]+>', page)
+            if not mobj:
+                break
+            next_page = extract_attributes(mobj.group(0))
+            next_page_url = url_or_none(next_page.get('href'))
+            if not next_page_url:
+                break
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+        return self.playlist_result(self._entries(user_id), user_id)
diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py
new file mode 100644 (file)
index 0000000..618da83
--- /dev/null
@@ -0,0 +1,201 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import int_or_none
+
+
+class XiamiBaseIE(InfoExtractor):
+    _API_BASE_URL = 'https://emumo.xiami.com/song/playlist/cat/json/id'
+
+    def _download_webpage_handle(self, *args, **kwargs):
+        res = super(XiamiBaseIE, self)._download_webpage_handle(*args, **kwargs)
+        # res is a (webpage, urlh) tuple on success and False when a
+        # non-fatal download fails, so test the page text, not the tuple
+        if res is not False:
+            webpage, _ = res
+            if '>Xiami is currently not available in your country.<' in webpage:
+                self.raise_geo_restricted('Xiami is currently not available in your country')
+        return res
+
+    def _extract_track(self, track, track_id=None):
+        track_name = track.get('songName') or track.get('name') or track['subName']
+        artist = track.get('artist') or track.get('artist_name') or track.get('singers')
+        title = '%s - %s' % (artist, track_name) if artist else track_name
+        track_url = self._decrypt(track['location'])
+
+        subtitles = {}
+        lyrics_url = track.get('lyric_url') or track.get('lyric')
+        if lyrics_url and lyrics_url.startswith('http'):
+            subtitles['origin'] = [{'url': lyrics_url}]
+
+        return {
+            'id': track.get('song_id') or track_id,
+            'url': track_url,
+            'title': title,
+            'thumbnail': track.get('pic') or track.get('album_pic'),
+            'duration': int_or_none(track.get('length')),
+            'creator': track.get('artist', '').split(';')[0],
+            'track': track_name,
+            'track_number': int_or_none(track.get('track')),
+            'album': track.get('album_name') or track.get('title'),
+            'artist': artist,
+            'subtitles': subtitles,
+        }
+
+    def _extract_tracks(self, item_id, referer, typ=None):
+        playlist = self._download_json(
+            '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''),
+            item_id, headers={
+                'Referer': referer,
+            })
+        return [
+            self._extract_track(track, item_id)
+            for track in playlist['data']['trackList']]
+
+    @staticmethod
+    def _decrypt(origin):
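+        # Columnar transposition: the first character is the column count n;
+        # the remainder is split into n columns and read back row by row.
+        # Worked example (illustrative): origin = '3abcdefgh' -> n = 3,
+        # columns 'abc', 'def', 'gh'; read row-wise -> 'adgbehcf', which is
+        # then percent-decoded with '^' mapped back to '0'.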
+        n = int(origin[0])
+        origin = origin[1:]
+        short_length = len(origin) // n
+        long_num = len(origin) - short_length * n
+        columns = []
+        for i in range(n):
+            length = short_length + 1 if i < long_num else short_length
+            columns.append(origin[:length])
+            origin = origin[length:]
+        ans = ''
+        for i in range(short_length + 1):
+            for column in columns:
+                if len(column) > i:
+                    ans += column[i]
+        return compat_urllib_parse_unquote(ans).replace('^', '0')
+
+
+class XiamiSongIE(XiamiBaseIE):
+    IE_NAME = 'xiami:song'
+    IE_DESC = '虾米音乐'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.xiami.com/song/1775610518',
+        'md5': '521dd6bea40fd5c9c69f913c232cb57e',
+        'info_dict': {
+            'id': '1775610518',
+            'ext': 'mp3',
+            'title': 'HONNE - Woman',
+            'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+            'duration': 265,
+            'creator': 'HONNE',
+            'track': 'Woman',
+            'album': 'Woman',
+            'artist': 'HONNE',
+            'subtitles': {
+                'origin': [{
+                    'ext': 'lrc',
+                }],
+            },
+        },
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://www.xiami.com/song/1775256504',
+        'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
+        'info_dict': {
+            'id': '1775256504',
+            'ext': 'mp3',
+            'title': '戴荃 - 悟空',
+            'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+            'duration': 200,
+            'creator': '戴荃',
+            'track': '悟空',
+            'album': '悟空',
+            'artist': '戴荃',
+            'subtitles': {
+                'origin': [{
+                    'ext': 'lrc',
+                }],
+            },
+        },
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://www.xiami.com/song/1775953850',
+        'info_dict': {
+            'id': '1775953850',
+            'ext': 'mp3',
+            'title': 'До Скону - Чума Пожирает Землю',
+            'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+            'duration': 683,
+            'creator': 'До Скону',
+            'track': 'Чума Пожирает Землю',
+            'track_number': 7,
+            'album': 'Ад',
+            'artist': 'До Скону',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.xiami.com/song/xLHGwgd07a1',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        return self._extract_tracks(self._match_id(url), url)[0]
+
+
+class XiamiPlaylistBaseIE(XiamiBaseIE):
+    def _real_extract(self, url):
+        item_id = self._match_id(url)
+        return self.playlist_result(self._extract_tracks(item_id, url, self._TYPE), item_id)
+
+
+class XiamiAlbumIE(XiamiPlaylistBaseIE):
+    IE_NAME = 'xiami:album'
+    IE_DESC = '虾米音乐 - 专辑'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[^/?#&]+)'
+    _TYPE = '1'
+    _TESTS = [{
+        'url': 'http://www.xiami.com/album/2100300444',
+        'info_dict': {
+            'id': '2100300444',
+        },
+        'playlist_count': 10,
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.xiami.com/album/URVDji2a506',
+        'only_matching': True,
+    }]
+
+
+class XiamiArtistIE(XiamiPlaylistBaseIE):
+    IE_NAME = 'xiami:artist'
+    IE_DESC = '虾米音乐 - 歌手'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[^/?#&]+)'
+    _TYPE = '2'
+    _TESTS = [{
+        'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
+        'info_dict': {
+            'id': '2132',
+        },
+        'playlist_count': 20,
+        'skip': 'Georestricted',
+    }, {
+        'url': 'http://www.xiami.com/artist/bC5Tk2K6eb99',
+        'only_matching': True,
+    }]
+
+
+class XiamiCollectionIE(XiamiPlaylistBaseIE):
+    IE_NAME = 'xiami:collection'
+    IE_DESC = '虾米音乐 - 精选集'
+    _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[^/?#&]+)'
+    _TYPE = '3'
+    _TEST = {
+        'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
+        'info_dict': {
+            'id': '156527391',
+        },
+        'playlist_mincount': 29,
+        'skip': 'Georestricted',
+    }
diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
new file mode 100644 (file)
index 0000000..a912e54
--- /dev/null
@@ -0,0 +1,233 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+
+
+class XimalayaBaseIE(InfoExtractor):
+    _GEO_COUNTRIES = ['CN']
+
+
+class XimalayaIE(XimalayaBaseIE):
+    IE_NAME = 'ximalaya'
+    IE_DESC = '喜马拉雅FM'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)'
+    _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/'
+    _TESTS = [
+        {
+            'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
+            'info_dict': {
+                'id': '47740352',
+                'ext': 'm4a',
+                'uploader': '小彬彬爱听书',
+                'uploader_id': 61425525,
+                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
+                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
+                'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': r're:^https?://.*\.jpg$',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': r're:^https?://.*\.jpg$',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['renwen', '人文'],
+                'duration': 93,
+                'view_count': int,
+                'like_count': int,
+            }
+        },
+        {
+            'url': 'http://m.ximalaya.com/61425525/sound/47740352/',
+            'info_dict': {
+                'id': '47740352',
+                'ext': 'm4a',
+                'uploader': '小彬彬爱听书',
+                'uploader_id': 61425525,
+                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
+                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
+                'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。",
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': r're:^https?://.*\.jpg$',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': r're:^https?://.*\.jpg$',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['renwen', '人文'],
+                'duration': 93,
+                'view_count': int,
+                'like_count': int,
+            }
+        },
+        {
+            'url': 'https://www.ximalaya.com/11045267/sound/15705996/',
+            'info_dict': {
+                'id': '15705996',
+                'ext': 'm4a',
+                'uploader': '李延隆老师',
+                'uploader_id': 11045267,
+                'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/',
+                'title': 'Lesson 1 Excuse me!',
+                'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n"
+                               "听录音,然后回答问题,这是谁的手袋?",
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': r're:^https?://.*\.jpg$',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': r're:^https?://.*\.jpg$',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['train', '外语'],
+                'duration': 40,
+                'view_count': int,
+                'like_count': int,
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+
+        is_m = 'm.ximalaya' in url
+        scheme = 'https' if url.startswith('https') else 'http'
+
+        audio_id = self._match_id(url)
+        webpage = self._download_webpage(url, audio_id,
+                                         note='Download sound page for %s' % audio_id,
+                                         errnote='Unable to get sound page')
+
+        audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
+        audio_info = self._download_json(audio_info_file, audio_id,
+                                         'Downloading info json %s' % audio_info_file,
+                                         'Unable to download info file')
+
+        formats = []
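+        # the track JSON exposes two fixed-bitrate renditions; the
+        # format_id labels the approximate audio bitrate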
+        for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')):
+            if audio_info.get(k):
+                formats.append({
+                    'format_id': bps,
+                    'url': audio_info[k],
+                })
+
+        thumbnails = []
+        for k in audio_info:
+            # cover pic keys look like 'cover_url', 'cover_url_142'
+            if k.startswith('cover_url'):
+                thumbnail = {'name': k, 'url': audio_info[k]}
+                if k == 'cover_url_142':
+                    thumbnail['width'] = 180
+                    thumbnail['height'] = 180
+                thumbnails.append(thumbnail)
+
+        audio_uploader_id = audio_info.get('uid')
+
+        if is_m:
+            audio_description = self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
+                                                        webpage, 'audio_description', fatal=False)
+        else:
+            audio_description = self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
+                                                        webpage, 'audio_description', fatal=False)
+
+        if not audio_description:
+            audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
+            audio_description = self._download_webpage(audio_description_file, audio_id,
+                                                       note='Downloading description file %s' % audio_description_file,
+                                                       errnote='Unable to download description file',
+                                                       fatal=False)
+            audio_description = audio_description.strip() if audio_description else None
+
+        return {
+            'id': audio_id,
+            'uploader': audio_info.get('nickname'),
+            'uploader_id': audio_uploader_id,
+            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
+            'title': audio_info['title'],
+            'thumbnails': thumbnails,
+            'description': audio_description,
+            'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
+            'duration': audio_info.get('duration'),
+            'view_count': audio_info.get('play_count'),
+            'like_count': audio_info.get('favorites_count'),
+            'formats': formats,
+        }
+
+
+class XimalayaAlbumIE(XimalayaBaseIE):
+    IE_NAME = 'ximalaya:album'
+    IE_DESC = '喜马拉雅FM 专辑'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
+    _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
+    _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
+    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
+    _TESTS = [{
+        'url': 'http://www.ximalaya.com/61425525/album/5534601/',
+        'info_dict': {
+            'title': '唐诗三百首(含赏析)',
+            'id': '5534601',
+        },
+        'playlist_count': 312,
+    }, {
+        'url': 'http://m.ximalaya.com/61425525/album/5534601',
+        'info_dict': {
+            'title': '唐诗三百首(含赏析)',
+            'id': '5534601',
+        },
+        'playlist_count': 312,
+    }]
+
+    def _real_extract(self, url):
+        self.scheme = scheme = 'https' if url.startswith('https') else 'http'
+
+        mobj = re.match(self._VALID_URL, url)
+        uid, playlist_id = mobj.group('uid'), mobj.group('id')
+
+        webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id,
+                                         note='Download album page for %s' % playlist_id,
+                                         errnote='Unable to get album info')
+
+        title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
+                                        webpage, 'title', fatal=False)
+
+        return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
+
+    def _entries(self, page, playlist_id, uid):
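+        # walk the album's sound list page by page, following the
+        # rel="next" link until none is advertised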
+        html = page
+        for page_num in itertools.count(1):
+            for entry in self._process_page(html, uid):
+                yield entry
+
+            next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3',
+                                          html, 'list_next_url', default=None, group='more')
+            if not next_url:
+                break
+
+            next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url)
+            html = self._download_webpage(next_full_url, playlist_id)
+
+    def _process_page(self, html, uid):
+        find_from = html.index('album_soundlist')
+        for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
+            yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')),
+                                  XimalayaIE.ie_key(),
+                                  mobj.group('id'),
+                                  mobj.group('title'))
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
new file mode 100644 (file)
index 0000000..36e5ead
--- /dev/null
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_ord,
+)
+from ..utils import (
+    int_or_none,
+    parse_duration,
+)
+
+
+class XMinusIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?x-minus\.org/track/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://x-minus.org/track/4542/%D0%BF%D0%B5%D1%81%D0%B5%D0%BD%D0%BA%D0%B0-%D1%88%D0%BE%D1%84%D0%B5%D1%80%D0%B0.html',
+        'md5': '401a15f2d2dcf6d592cb95528d72a2a8',
+        'info_dict': {
+            'id': '4542',
+            'ext': 'mp3',
+            'title': 'Леонид Агутин-Песенка шофёра',
+            'duration': 156,
+            'tbr': 320,
+            'filesize_approx': 5900000,
+            'view_count': int,
+            'description': 'md5:03238c5b663810bc79cf42ef3c03e371',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        artist = self._html_search_regex(
+            r'<a[^>]+href="/artist/\d+">([^<]+)</a>', webpage, 'artist')
+        title = artist + '-' + self._html_search_regex(
+            r'<span[^>]+class="minustrack-full-title(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'title')
+        duration = parse_duration(self._html_search_regex(
+            r'<span[^>]+class="player-duration(?:\s+[^"]+)?"[^>]*>([^<]+)',
+            webpage, 'duration', fatal=False))
+        mobj = re.search(
+            r'<div[^>]+class="dw-info(?:\s+[^"]+)?"[^>]*>(?P<tbr>\d+)\s*кбит/c\s+(?P<filesize>[0-9.]+)\s*мб</div>',
+            webpage)
+        tbr = filesize_approx = None
+        if mobj:
+            filesize_approx = float(mobj.group('filesize')) * 1000000
+            tbr = float(mobj.group('tbr'))
+        view_count = int_or_none(self._html_search_regex(
+            r'<span><[^>]+class="icon-chart-bar".*?>(\d+)</span>',
+            webpage, 'view count', fatal=False))
+        description = self._html_search_regex(
+            r'(?s)<pre[^>]+id="lyrics-original"[^>]*>(.*?)</pre>',
+            webpage, 'song lyrics', fatal=False)
+        if description:
+            description = re.sub(' *\r *', '\n', description)
+
+        k = self._search_regex(
+            r'<div[^>]+id="player-bottom"[^>]+data-k="([^"]+)">', webpage,
+            'encoded data')
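+        # the download token is computed client-side from the sum of the
+        # character codes of data-k plus the track id, offset by the
+        # current time in hours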
+        h = time.time() / 3600
+        a = sum(compat_ord(c) for c in k) + int(video_id) + h
+        video_url = 'http://x-minus.me/dl/minus?id=%s&tkn2=%df%d' % (video_id, a, h)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            # The extension is unknown until actual downloading
+            'ext': 'mp3',
+            'duration': duration,
+            'filesize_approx': filesize_approx,
+            'tbr': tbr,
+            'view_count': view_count,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
new file mode 100644 (file)
index 0000000..ac1ccc4
--- /dev/null
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    NO_DEFAULT,
+    str_to_int,
+)
+
+
+class XNXXIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+    _TESTS = [{
+        'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
+        'md5': '7583e96c15c0f21e9da3453d9920fbba',
+        'info_dict': {
+            'id': '55awb78',
+            'ext': 'mp4',
+            'title': 'Skyrim Test Video',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 469,
+            'view_count': int,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.xnxx.com/video-55awb78/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        def get(meta, default=NO_DEFAULT, fatal=True):
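+            # player metadata is exposed as inline set<Name>('value')
+            # calls; pull the quoted argument out of the matching setter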
+            return self._search_regex(
+                r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % meta,
+                webpage, meta, default=default, fatal=fatal, group='value')
+
+        title = self._og_search_title(
+            webpage, default=None) or get('VideoTitle')
+
+        formats = []
+        for mobj in re.finditer(
+                r'setVideo(?:Url(?P<id>Low|High)|HLS)\s*\(\s*(?P<q>["\'])(?P<url>(?:https?:)?//.+?)(?P=q)', webpage):
+            format_url = mobj.group('url')
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    preference=1, m3u8_id='hls', fatal=False))
+            else:
+                format_id = mobj.group('id')
+                if format_id:
+                    format_id = format_id.lower()
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'quality': -1 if format_id == 'low' else 0,
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage, default=None) or get(
+            'ThumbUrl', fatal=False) or get('ThumbUrl169', fatal=False)
+        duration = int_or_none(self._og_search_property(
+            'duration', webpage, default=None))
+        view_count = str_to_int(self._search_regex(
+            r'id=["\']nb-views-number[^>]+>([\d,.]+)', webpage, 'view count',
+            default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'age_limit': 18,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py
new file mode 100644 (file)
index 0000000..76c91bd
--- /dev/null
@@ -0,0 +1,119 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    xpath_with_ns,
+    xpath_text,
+    find_xpath_attr,
+)
+
+
+class XstreamIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    (?:
+                        xstream:|
+                        https?://frontend\.xstream\.(?:dk|net)/
+                    )
+                    (?P<partner_id>[^/]+)
+                    (?:
+                        :|
+                        /feed/video/\?.*?\bid=
+                    )
+                    (?P<id>\d+)
+                    '''
+    _TESTS = [{
+        'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588',
+        'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
+        'info_dict': {
+            'id': '86588',
+            'ext': 'mov',
+            'title': 'Otto Wollertsen',
+            'description': 'Vestlendingen Otto Fredrik Wollertsen',
+            'timestamp': 1430473209,
+            'upload_date': '20150501',
+        },
+    }, {
+        'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039',
+        'only_matching': True,
+    }]
+
+    def _extract_video_info(self, partner_id, video_id):
+        data = self._download_xml(
+            'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s'
+            % (partner_id, video_id),
+            video_id)
+
+        NS_MAP = {
+            'atom': 'http://www.w3.org/2005/Atom',
+            'xt': 'http://xstream.dk/',
+            'media': 'http://search.yahoo.com/mrss/',
+        }
+
+        entry = data.find(xpath_with_ns('./atom:entry', NS_MAP))
+
+        title = xpath_text(
+            entry, xpath_with_ns('./atom:title', NS_MAP), 'title')
+        description = xpath_text(
+            entry, xpath_with_ns('./atom:summary', NS_MAP), 'description')
+        timestamp = parse_iso8601(xpath_text(
+            entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date'))
+
+        formats = []
+        media_group = entry.find(xpath_with_ns('./media:group', NS_MAP))
+        for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)):
+            media_url = media_content.get('url')
+            if not media_url:
+                continue
+            tbr = int_or_none(media_content.get('bitrate'))
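+            # RTMP URLs have to be split into base URL, app and play path
+            # for the RTMP downloader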
+            mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', media_url)
+            if mobj:
+                formats.append({
+                    'url': mobj.group('url'),
+                    'play_path': 'mp4:%s' % mobj.group('playpath'),
+                    'app': mobj.group('app'),
+                    'ext': 'flv',
+                    'tbr': tbr,
+                    'format_id': 'rtmp-%d' % tbr if tbr else 'rtmp',
+                })
+            else:
+                formats.append({
+                    'url': media_url,
+                    'tbr': tbr,
+                })
+        self._sort_formats(formats)
+
+        link = find_xpath_attr(
+            entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original')
+        if link is not None:
+            formats.append({
+                'url': link.get('href'),
+                'format_id': link.get('rel'),
+                'preference': 1,
+            })
+
+        thumbnails = [{
+            'url': splash.get('url'),
+            'width': int_or_none(splash.get('width')),
+            'height': int_or_none(splash.get('height')),
+        } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        partner_id = mobj.group('partner_id')
+        video_id = mobj.group('id')
+
+        return self._extract_video_info(partner_id, video_id)
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
new file mode 100644 (file)
index 0000000..01b253d
--- /dev/null
@@ -0,0 +1,200 @@
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    orderedSet,
+    parse_duration,
+    sanitized_Request,
+    str_to_int,
+)
+
+
+class XTubeIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            xtube:|
+                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-)
+                        )
+                        (?P<id>[^/?&#]+)
+                    '''
+
+    _TESTS = [{
+        # old URL schema
+        'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
+        'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
+        'info_dict': {
+            'id': 'kVTUy_G222_',
+            'ext': 'mp4',
+            'title': 'strange erotica',
+            'description': 'contains:an ET kind of thing',
+            'uploader': 'greenshowers',
+            'duration': 450,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        # FLV videos with duplicated formats
+        'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+        'md5': 'a406963eb349dd43692ec54631efd88b',
+        'info_dict': {
+            'id': '9299752',
+            'display_id': 'A-Super-Run-Part-1-YT',
+            'ext': 'flv',
+            'title': 'A Super Run - Part 1 (YT)',
+            'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616',
+            'uploader': 'tshirtguy59',
+            'duration': 579,
+            'view_count': int,
+            'comment_count': int,
+            'age_limit': 18,
+        },
+    }, {
+        # new URL schema
+        'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
+        'only_matching': True,
+    }, {
+        'url': 'xtube:625837',
+        'only_matching': True,
+    }, {
+        'url': 'xtube:kVTUy_G222_',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        if not display_id:
+            display_id = video_id
+
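+        # short numeric ids use the new video-watch URL schema, all other
+        # ids the legacy watch.php one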
+        if video_id.isdigit() and len(video_id) < 11:
+            url_pattern = 'http://www.xtube.com/video-watch/-%s'
+        else:
+            url_pattern = 'http://www.xtube.com/watch.php?v=%s'
+
+        webpage = self._download_webpage(
+            url_pattern % video_id, display_id, headers={
+                'Cookie': 'age_verified=1; cookiesAccepted=1',
+            })
+
+        # sources must be pre-initialized as well: the isinstance() check
+        # below runs even when playerConf is missing or has no mainRoll
+        title, thumbnail, duration, sources = [None] * 4
+
+        config = self._parse_json(self._search_regex(
+            r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config',
+            default='{}'), video_id, transform_source=js_to_json, fatal=False)
+        if config:
+            config = config.get('mainRoll')
+            if isinstance(config, dict):
+                title = config.get('title')
+                thumbnail = config.get('poster')
+                duration = int_or_none(config.get('duration'))
+                sources = config.get('sources') or config.get('format')
+
+        if not isinstance(sources, dict):
+            sources = self._parse_json(self._search_regex(
+                r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+                webpage, 'sources', group='sources'), video_id,
+                transform_source=js_to_json)
+
+        formats = []
+        for format_id, format_url in sources.items():
+            formats.append({
+                'url': format_url,
+                'format_id': format_id,
+                'height': int_or_none(format_id),
+            })
+        self._remove_duplicate_formats(formats)
+        self._sort_formats(formats)
+
+        if not title:
+            title = self._search_regex(
+                (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+                webpage, 'title', group='title')
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:description', webpage, default=None) or self._search_regex(
+            r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
+        uploader = self._search_regex(
+            (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
+             r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
+            webpage, 'uploader', fatal=False)
+        if not duration:
+            duration = parse_duration(self._search_regex(
+                r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>',
+                webpage, 'duration', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            (r'["\']viewsCount["\'][^>]*>(\d+)\s+views',
+             r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'),
+            webpage, 'view count', fatal=False))
+        comment_count = str_to_int(self._html_search_regex(
+            r'>Comments? \(([\d,\.]+)\)<',
+            webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'age_limit': 18,
+            'formats': formats,
+        }
+
+
+class XTubeUserIE(InfoExtractor):
+    IE_DESC = 'XTube user profile'
+    _VALID_URL = r'https?://(?:www\.)?xtube\.com/profile/(?P<id>[^/]+-\d+)'
+    _TEST = {
+        'url': 'http://www.xtube.com/profile/greenshowers-4056496',
+        'info_dict': {
+            'id': 'greenshowers-4056496',
+            'age_limit': 18,
+        },
+        'playlist_mincount': 154,
+    }
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        entries = []
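+        # profile videos are paginated as JSON-wrapped HTML; collect ids
+        # from data-plid attributes until pageCount is reached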
+        for pagenum in itertools.count(1):
+            request = sanitized_Request(
+                'http://www.xtube.com/profile/%s/videos/%d' % (user_id, pagenum),
+                headers={
+                    'Cookie': 'popunder=4',
+                    'X-Requested-With': 'XMLHttpRequest',
+                    'Referer': url,
+                })
+
+            page = self._download_json(
+                request, user_id, 'Downloading videos JSON page %d' % pagenum)
+
+            html = page.get('html')
+            if not html:
+                break
+
+            for video_id in orderedSet([video_id for _, video_id in re.findall(
+                    r'data-plid=(["\'])(.+?)\1', html)]):
+                entries.append(self.url_result('xtube:%s' % video_id, XTubeIE.ie_key()))
+
+            page_count = int_or_none(page.get('pageCount'))
+            if not page_count or pagenum == page_count:
+                break
+
+        playlist = self.playlist_result(entries, user_id)
+        playlist['age_limit'] = 18
+        return playlist
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
new file mode 100644 (file)
index 0000000..0276c0d
--- /dev/null
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    get_element_by_attribute,
+    parse_iso8601,
+    remove_end,
+)
+
+
+class XuiteIE(InfoExtractor):
+    IE_DESC = '隨意窩Xuite影音'
+    _REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
+    _VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P<id>%s)' % _REGEX_BASE64
+    _TESTS = [{
+        # Audio
+        'url': 'http://vlog.xuite.net/play/RGkzc1ZULTM4NjA5MTQuZmx2',
+        'md5': 'e79284c87b371424885448d11f6398c8',
+        'info_dict': {
+            'id': '3860914',
+            'ext': 'mp3',
+            'title': '孤單南半球-歐德陽',
+            'description': '孤單南半球-歐德陽',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 247.246,
+            'timestamp': 1314932940,
+            'upload_date': '20110902',
+            'uploader': '阿能',
+            'uploader_id': '15973816',
+            'categories': ['個人短片'],
+        },
+    }, {
+        # Video with only one format
+        'url': 'http://vlog.xuite.net/play/WUxxR2xCLTI1OTI1MDk5LmZsdg==',
+        'md5': '21f7b39c009b5a4615b4463df6eb7a46',
+        'info_dict': {
+            'id': '25925099',
+            'ext': 'mp4',
+            'title': 'BigBuckBunny_320x180',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 596.458,
+            'timestamp': 1454242500,
+            'upload_date': '20160131',
+            'uploader': '屁姥',
+            'uploader_id': '12158353',
+            'categories': ['個人短片'],
+            'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4',
+        },
+    }, {
+        # Video with two formats
+        'url': 'http://vlog.xuite.net/play/bWo1N1pLLTIxMzAxMTcwLmZsdg==',
+        'md5': '1166e0f461efe55b62e26a2d2a68e6de',
+        'info_dict': {
+            'id': '21301170',
+            'ext': 'mp4',
+            'title': '暗殺教室 02',
+            'description': '字幕:【極影字幕社】',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1384.907,
+            'timestamp': 1421481240,
+            'upload_date': '20150117',
+            'uploader': '我只是想認真點',
+            'uploader_id': '242127761',
+            'categories': ['電玩動漫'],
+        },
+        'skip': 'Video removed',
+    }, {
+        # Video with encoded media id
+        # from http://forgetfulbc.blogspot.com/2016/06/date.html
+        'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
+        'info_dict': {
+            'id': '27447336',
+            'ext': 'mp4',
+            'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
+            'description': 'md5:1223810fa123b179083a3aed53574706',
+            'timestamp': 1466160960,
+            'upload_date': '20160617',
+            'uploader': 'B.C. & Lowy',
+            'uploader_id': '232279340',
+        },
+    }, {
+        'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        # /play/ URLs provide embedded video URL and more metadata
+        url = url.replace('/embed/', '/play/')
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        error_msg = self._search_regex(
+            r'<div id="error-message-content">([^<]+)',
+            webpage, 'error message', default=None)
+        if error_msg:
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_msg),
+                expected=True)
+
+        media_info = self._parse_json(self._search_regex(
+            r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id)
+
+        video_id = media_info['MEDIA_ID']
+
+        formats = []
+        for key in ('html5Url', 'html5HQUrl'):
+            video_url = media_info.get(key)
+            if not video_url:
+                continue
+            format_id = self._search_regex(
+                r'\bq=(.+?)\b', video_url, 'format id', default=None)
+            # format_id may be missing from the URL; guard before calling
+            # isnumeric() on it
+            is_numeric = format_id is not None and format_id.isnumeric()
+            formats.append({
+                'url': video_url,
+                'ext': 'mp4' if is_numeric else format_id,
+                'format_id': format_id,
+                'height': int(format_id) if is_numeric else None,
+            })
+        self._sort_formats(formats)
+
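+        # PUBLISH_DATETIME is site-local (UTC+8) wall clock time, hence
+        # the explicit +0800 offset and the space delimiter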
+        timestamp = media_info.get('PUBLISH_DATETIME')
+        if timestamp:
+            timestamp = parse_iso8601(timestamp + ' +0800', ' ')
+
+        category = media_info.get('catName')
+        categories = [category] if category else []
+
+        uploader = media_info.get('NICKNAME')
+        uploader_url = None
+
+        author_div = get_element_by_attribute('itemprop', 'author', webpage)
+        if author_div:
+            uploader = uploader or self._html_search_meta('name', author_div)
+            uploader_url = self._html_search_regex(
+                r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div,
+                'uploader URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': media_info['TITLE'],
+            'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'),
+            'thumbnail': media_info.get('ogImageUrl'),
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': media_info.get('MEMBER_ID'),
+            'uploader_url': uploader_url,
+            'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000),
+            'categories': categories,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
new file mode 100644 (file)
index 0000000..8fc6491
--- /dev/null
@@ -0,0 +1,147 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+    clean_html,
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+)
+
+
+class XVideosIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:[^/]+\.)?xvideos2?\.com/video|
+                            (?:www\.)?xvideos\.es/video|
+                            flashservice\.xvideos\.com/embedframe/|
+                            static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video=
+                        )
+                        (?P<id>[0-9]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
+        'md5': '14cea69fcb84db54293b1e971466c2e1',
+        'info_dict': {
+            'id': '4588838',
+            'ext': 'mp4',
+            'title': 'Biker Takes his Girl',
+            'duration': 108,
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'https://flashservice.xvideos.com/embedframe/4588838',
+        'only_matching': True,
+    }, {
+        'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838',
+        'only_matching': True,
+    }, {
+        'url': 'http://xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'https://xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'https://xvideos.es/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.xvideos.es/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'http://xvideos.es/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.xvideos.es/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'http://fr.xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'https://fr.xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'http://it.xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'https://it.xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'http://de.xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }, {
+        'url': 'https://de.xvideos.com/video4588838/biker_takes_his_girl',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://www.xvideos.com/video%s/' % video_id, video_id)
+
+        mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
+        if mobj:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
+
+        title = self._html_search_regex(
+            (r'<title>(?P<title>.+?)\s+-\s+XVID',
+             r'setVideoTitle\s*\(\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
+            webpage, 'title', default=None,
+            group='title') or self._og_search_title(webpage)
+
+        thumbnails = []
+        for preference, thumbnail in enumerate(('', '169')):
+            thumbnail_url = self._search_regex(
+                r'setThumbUrl%s\(\s*(["\'])(?P<thumbnail>(?:(?!\1).)+)\1' % thumbnail,
+                webpage, 'thumbnail', default=None, group='thumbnail')
+            if thumbnail_url:
+                thumbnails.append({
+                    'url': thumbnail_url,
+                    'preference': preference,
+                })
+
+        duration = int_or_none(self._og_search_property(
+            'duration', webpage, default=None)) or parse_duration(
+            self._search_regex(
+                r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+                webpage, 'duration', fatal=False))
+
+        formats = []
+
+        video_url = compat_urllib_parse_unquote(self._search_regex(
+            r'flv_url=(.+?)&', webpage, 'video URL', default=''))
+        if video_url:
+            formats.append({
+                'url': video_url,
+                'format_id': 'flv',
+            })
+
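+        # the remaining formats are exposed as setVideoUrlLow/
+        # setVideoUrlHigh/setVideoHLS player calls embedded in the page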
+        for kind, _, format_url in re.findall(
+                r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage):
+            format_id = kind.lower()
+            if format_id == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            elif format_id in ('urllow', 'urlhigh'):
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]),
+                    'quality': -2 if format_id.endswith('low') else None,
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'duration': duration,
+            'thumbnails': thumbnails,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py
new file mode 100644 (file)
index 0000000..e34ebe3
--- /dev/null
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    int_or_none,
+)
+
+
+class XXXYMoviesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?xxxymovies\.com/videos/(?P<id>\d+)/(?P<display_id>[^/]+)'
+    _TEST = {
+        'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/',
+        'md5': '810b1bdbbffff89dd13bdb369fe7be4b',
+        'info_dict': {
+            'id': '138669',
+            'display_id': 'ecstatic-orgasm-sofcore',
+            'ext': 'mp4',
+            'title': 'Ecstatic Orgasm Sofcore',
+            'duration': 931,
+            'categories': list,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_url = self._search_regex(
+            r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
+
+        title = self._html_search_regex(
+            [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<',
+             r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'],
+            webpage, 'title')
+
+        thumbnail = self._search_regex(
+            r"preview_url\s*:\s*'([^']+)'",
+            webpage, 'thumbnail', fatal=False)
+
+        categories = self._html_search_meta(
+            'keywords', webpage, 'categories', default='').split(',')
+
+        duration = parse_duration(self._search_regex(
+            r'<span>Duration:</span>\s*(\d+:\d+)',
+            webpage, 'duration', fatal=False))
+
+        view_count = int_or_none(self._html_search_regex(
+            r'<div class="video_views">\s*(\d+)',
+            webpage, 'view count', fatal=False))
+        like_count = int_or_none(self._search_regex(
+            r'>\s*Likes? <b>\((\d+)\)',
+            webpage, 'like count', fatal=False))
+        dislike_count = int_or_none(self._search_regex(
+            r'>\s*Dislike <b>\((\d+)\)</b>',
+            webpage, 'dislike count', fatal=False))
+
+        age_limit = self._rta_search(webpage)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'categories': categories,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
new file mode 100644 (file)
index 0000000..e461537
--- /dev/null
@@ -0,0 +1,569 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import itertools
+import re
+
+from .common import InfoExtractor, SearchInfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse,
+)
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    smuggle_url,
+    try_get,
+    url_or_none,
+)
+
+from .brightcove import BrightcoveNewIE
+
+
+class YahooIE(InfoExtractor):
+    IE_DESC = 'Yahoo screen and movies'
+    _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)'
+    _TESTS = [{
+        'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
+        'info_dict': {
+            'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
+            'ext': 'mp4',
+            'title': 'Julian Smith & Travis Legg Watch Julian Smith',
+            'description': 'Julian and Travis watch Julian Smith',
+            'duration': 6863,
+            'timestamp': 1369812016,
+            'upload_date': '20130529',
+        },
+    }, {
+        'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
+        'md5': '7993e572fac98e044588d0b5260f4352',
+        'info_dict': {
+            'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
+            'ext': 'mp4',
+            'title': "Yahoo Saves 'Community'",
+            'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
+            'duration': 170,
+            'timestamp': 1406838636,
+            'upload_date': '20140731',
+        },
+    }, {
+        'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
+        'md5': '71298482f7c64cbb7fa064e4553ff1c1',
+        'info_dict': {
+            'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
+            'ext': 'webm',
+            'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
+            'description': 'md5:f66c890e1490f4910a9953c941dee944',
+            'duration': 97,
+            'timestamp': 1414489862,
+            'upload_date': '20141028',
+        }
+    }, {
+        'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
+        'md5': '88e209b417f173d86186bef6e4d1f160',
+        'info_dict': {
+            'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
+            'ext': 'mp4',
+            'title': 'China Moses Is Crazy About the Blues',
+            'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
+            'duration': 128,
+            'timestamp': 1385722202,
+            'upload_date': '20131129',
+        }
+    }, {
+        'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
+        'md5': '2a9752f74cb898af5d1083ea9f661b58',
+        'info_dict': {
+            'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
+            'ext': 'mp4',
+            'title': '\'True Story\' Trailer',
+            'description': 'True Story',
+            'duration': 150,
+            'timestamp': 1418919206,
+            'upload_date': '20141218',
+        },
+    }, {
+        'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
+        'only_matching': True,
+    }, {
+        'note': 'NBC Sports embeds',
+        'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
+        'info_dict': {
+            'id': '9CsDKds0kvHI',
+            'ext': 'flv',
+            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+            'upload_date': '20150313',
+            'uploader': 'NBCU-SPORTS',
+            'timestamp': 1426270238,
+        },
+    }, {
+        'url': 'https://tw.news.yahoo.com/-100120367.html',
+        'only_matching': True,
+    }, {
+        # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
+        'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
+        'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
+        'info_dict': {
+            'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
+            'ext': 'mp4',
+            'title': 'Communitary - Community Episode 1: Ladders',
+            'description': 'md5:8fc39608213295748e1e289807838c97',
+            'duration': 1646,
+            'timestamp': 1440436550,
+            'upload_date': '20150824',
+            'series': 'Communitary',
+            'season_number': 6,
+            'episode_number': 1,
+        },
+    }, {
+        # ytwnews://cavideo/
+        'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html',
+        'info_dict': {
+            'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff',
+            'ext': 'mp4',
+            'title': '單車天使 - 中文版預',
+            'description': '中文版預',
+            'timestamp': 1476696196,
+            'upload_date': '20161017',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # Contains both a Yahoo hosted video and multiple Youtube embeds
+        'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html',
+        'info_dict': {
+            'id': '46c5d95a-528f-3d03-b732-732fcadd51de',
+            'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead',
+            'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.',
+        },
+        'playlist': [{
+            'info_dict': {
+                'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6',
+                'ext': 'mp4',
+                'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs',
+                'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor Swift which Sia hit was almost hers.',
+                'timestamp': 1572406500,
+                'upload_date': '20191030',
+            },
+        }, {
+            'info_dict': {
+                'id': '352CFDOQrKg',
+                'ext': 'mp4',
+                'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019',
+                'description': 'md5:35b61e94c2ae214bc965ff4245f80d11',
+                'uploader': 'The Voice',
+                'uploader_id': 'NBCTheVoice',
+                'upload_date': '20191029',
+            },
+        }],
+        'params': {
+            'playlistend': 2,
+        },
+        'expected_warnings': ['HTTP Error 404'],
+    }, {
+        'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        url, country, display_id = re.match(self._VALID_URL, url).groups()
+        if not country:
+            country = 'us'
+        else:
+            country = country.split('-')[0]
+        api_base = 'https://%s.yahoo.com/_td/api/resource/' % country
+
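+        # resolve the page to a content item: first by canonical URL,
+        # then by ymedia-alias; only the second lookup is fatal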
+        for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]):
+            content = self._download_json(
+                api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid,
+                display_id, 'Downloading content JSON metadata', fatal=i == 1)
+            if content:
+                item = content['items'][0]
+                break
+
+        if item.get('type') != 'video':
+            entries = []
+
+            cover = item.get('cover') or {}
+            if cover.get('type') == 'yvideo':
+                cover_url = cover.get('url')
+                if cover_url:
+                    entries.append(self.url_result(
+                        cover_url, 'Yahoo', cover.get('uuid')))
+
+            for e in item.get('body', []):
+                if e.get('type') == 'videoIframe':
+                    iframe_url = e.get('url')
+                    if not iframe_url:
+                        continue
+                    entries.append(self.url_result(iframe_url))
+
+            return self.playlist_result(
+                entries, item.get('uuid'),
+                item.get('title'), item.get('summary'))
+
+        video_id = item['uuid']
+        video = self._download_json(
+            api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id,
+            video_id, 'Downloading video JSON metadata')[0]
+        title = video['title']
+
+        if country == 'malaysia':
+            country = 'my'
+
+        is_live = video.get('live_state') == 'live'
+        fmts = ('m3u8',) if is_live else ('webm', 'mp4')
+
+        urls = []
+        formats = []
+        subtitles = {}
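+        # query the streaming API once per container preference; live
+        # streams only expose HLS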
+        for fmt in fmts:
+            media_obj = self._download_json(
+                'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id,
+                video_id, 'Downloading %s JSON metadata' % fmt,
+                headers=self.geo_verification_headers(), query={
+                    'format': fmt,
+                    'region': country.upper(),
+                })['query']['results']['mediaObj'][0]
+            msg = media_obj.get('status', {}).get('msg')
+
+            for s in media_obj.get('streams', []):
+                host = s.get('host')
+                path = s.get('path')
+                if not host or not path:
+                    continue
+                s_url = host + path
+                if s.get('format') == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        s_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    continue
+                tbr = int_or_none(s.get('bitrate'))
+                formats.append({
+                    'url': s_url,
+                    'format_id': fmt + ('-%d' % tbr if tbr else ''),
+                    'width': int_or_none(s.get('width')),
+                    'height': int_or_none(s.get('height')),
+                    'tbr': tbr,
+                    'fps': int_or_none(s.get('framerate')),
+                })
+
+            for cc in media_obj.get('closedcaptions', []):
+                cc_url = cc.get('url')
+                if not cc_url or cc_url in urls:
+                    continue
+                urls.append(cc_url)
+                subtitles.setdefault(cc.get('lang') or 'en-US', []).append({
+                    'url': cc_url,
+                    'ext': mimetype2ext(cc.get('content_type')),
+                })
+
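+        # For VOD, the top-level streaming_url (if present) provides an extra
+        # HLS manifest besides the per-format streams above; if nothing was
+        # extracted and the last status message was 'geo restricted', raise.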
+        streaming_url = video.get('streaming_url')
+        if streaming_url and not is_live:
+            formats.extend(self._extract_m3u8_formats(
+                streaming_url, video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+
+        if not formats and msg == 'geo restricted':
+            self.raise_geo_restricted()
+
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for thumb in video.get('thumbnails', []):
+            thumb_url = thumb.get('url')
+            if not thumb_url:
+                continue
+            thumbnails.append({
+                'id': thumb.get('tag'),
+                'url': thumb_url,
+                'width': int_or_none(thumb.get('width')),
+                'height': int_or_none(thumb.get('height')),
+            })
+
+        series_info = video.get('series_info') or {}
+
+        return {
+            'id': video_id,
+            'title': self._live_title(title) if is_live else title,
+            'formats': formats,
+            'display_id': display_id,
+            'thumbnails': thumbnails,
+            'description': clean_html(video.get('description')),
+            'timestamp': parse_iso8601(video.get('publish_time')),
+            'subtitles': subtitles,
+            'duration': int_or_none(video.get('duration')),
+            'view_count': int_or_none(video.get('view_count')),
+            'is_live': is_live,
+            'series': video.get('show_name'),
+            'season_number': int_or_none(series_info.get('season_number')),
+            'episode_number': int_or_none(series_info.get('episode_number')),
+        }
+
+
+class YahooSearchIE(SearchInfoExtractor):
+    IE_DESC = 'Yahoo screen search'
+    _MAX_RESULTS = 1000
+    IE_NAME = 'screen.yahoo:search'
+    _SEARCH_KEY = 'yvsearch'
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+        entries = []
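+        # Results are paginated 30 per page; pagination stops once n results
+        # have been collected or the API reports the last page.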
+        for pagenum in itertools.count(0):
+            result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
+            info = self._download_json(result_url, query,
+                                       note='Downloading results page ' + str(pagenum + 1))
+            m = info['m']
+            results = info['results']
+            if not results:
+                break
+
+            for (i, r) in enumerate(results):
+                if (pagenum * 30) + i >= n:
+                    break
+                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
+                if not mobj:
+                    continue
+                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
+                entries.append(e)
+            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
+                break
+
+        return {
+            '_type': 'playlist',
+            'id': query,
+            'entries': entries,
+        }
+
+
+class YahooGyaOPlayerIE(InfoExtractor):
+    IE_NAME = 'yahoo:gyao:player'
+    _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode/[^/]+)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TESTS = [{
+        'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/',
+        'info_dict': {
+            'id': '5993125228001',
+            'ext': 'mp4',
+            'title': 'フューリー 【字幕版】',
+            'description': 'md5:21e691c798a15330eda4db17a8fe45a5',
+            'uploader_id': '4235717419001',
+            'upload_date': '20190124',
+            'timestamp': 1548294365,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/',
+        'only_matching': True,
+    }, {
+        'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
+
+    def _real_extract(self, url):
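+        # GyaO content IDs appear in URLs as slash-separated segments, but
+        # the playback API expects them colon-separated.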
+        video_id = self._match_id(url).replace('/', ':')
+        headers = self.geo_verification_headers()
+        headers['Accept'] = 'application/json'
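+        # The appId below appears to be a static public key embedded in the
+        # GyaO web player; the GraphQL query resolves the site content ID to
+        # a Brightcove delivery ID.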
+        resp = self._download_json(
+            'https://gyao.yahoo.co.jp/apis/playback/graphql', video_id, query={
+                'appId': 'dj00aiZpPUNJeDh2cU1RazU3UCZzPWNvbnN1bWVyc2VjcmV0Jng9NTk-',
+                'query': '''{
+  content(parameter: {contentId: "%s", logicaAgent: PC_WEB}) {
+    video {
+      delivery {
+        id
+      }
+      title
+    }
+  }
+}''' % video_id,
+            }, headers=headers)
+        content = resp['data']['content']
+        if not content:
+            msg = resp['errors'][0]['message']
+            if msg == 'not in japan':
+                self.raise_geo_restricted(countries=['JP'])
+            raise ExtractorError(msg)
+        video = content['video']
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': video['title'],
+            'url': smuggle_url(
+                'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['delivery']['id'],
+                {'geo_countries': ['JP']}),
+            'ie_key': BrightcoveNewIE.ie_key(),
+        }
+
+
+class YahooGyaOIE(InfoExtractor):
+    IE_NAME = 'yahoo:gyao'
+    _VALID_URL = r'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title(?:/[^/]+)?)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+    _TESTS = [{
+        'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/',
+        'info_dict': {
+            'id': '00449:v03102',
+        },
+        'playlist_count': 2,
+    }, {
+        'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/',
+        'only_matching': True,
+    }, {
+        'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf',
+        'only_matching': True,
+    }, {
+        'url': 'https://gyao.yahoo.co.jp/title/5b025a49-b2e5-4dc7-945c-09c6634afacf',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        program_id = self._match_id(url).replace('/', ':')
+        videos = self._download_json(
+            'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id, program_id)['videos']
+        entries = []
+        for video in videos:
+            video_id = video.get('id')
+            if not video_id:
+                continue
+            entries.append(self.url_result(
+                'https://gyao.yahoo.co.jp/player/%s/' % video_id.replace(':', '/'),
+                YahooGyaOPlayerIE.ie_key(), video_id))
+        return self.playlist_result(entries, program_id)
+
+
+class YahooJapanNewsIE(InfoExtractor):
+    IE_NAME = 'yahoo:japannews'
+    IE_DESC = 'Yahoo! Japan News'
+    _VALID_URL = r'https?://(?P<host>(?:news|headlines)\.yahoo\.co\.jp)[^\d]*(?P<id>\d[\d-]*\d)?'
+    _GEO_COUNTRIES = ['JP']
+    _TESTS = [{
+        'url': 'https://headlines.yahoo.co.jp/videonews/ann?a=20190716-00000071-ann-int',
+        'info_dict': {
+            'id': '1736242',
+            'ext': 'mp4',
+            'title': 'ムン大統領が対日批判を強化“現金化”効果は?(テレビ朝日系(ANN)) - Yahoo!ニュース',
+            'description': '韓国の元徴用工らを巡る裁判の原告が弁護士が差し押さえた三菱重工業の資産を売却して - Yahoo!ニュース(テレビ朝日系(ANN))',
+            'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # geo restricted
+        'url': 'https://headlines.yahoo.co.jp/hl?a=20190721-00000001-oxv-l04',
+        'only_matching': True,
+    }, {
+        'url': 'https://headlines.yahoo.co.jp/videonews/',
+        'only_matching': True,
+    }, {
+        'url': 'https://news.yahoo.co.jp',
+        'only_matching': True,
+    }, {
+        'url': 'https://news.yahoo.co.jp/byline/hashimotojunji/20190628-00131977/',
+        'only_matching': True,
+    }, {
+        'url': 'https://news.yahoo.co.jp/feature/1356',
+        'only_matching': True,
+    }]
+
+    def _extract_formats(self, json_data, content_id):
+        formats = []
+
+        video_data = try_get(
+            json_data,
+            lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'],
+            list)
+        for vid in video_data or []:
+            delivery = vid.get('delivery')
+            url = url_or_none(vid.get('Url'))
+            if not delivery or not url:
+                continue
+            if delivery == 'hls':
+                formats.extend(
+                    self._extract_m3u8_formats(
+                        url, content_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': url,
+                    'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')),
+                    'height': int_or_none(vid.get('height')),
+                    'width': int_or_none(vid.get('width')),
+                    'tbr': int_or_none(vid.get('bitrate')),
+                })
+        self._remove_duplicate_formats(formats)
+        self._sort_formats(formats)
+
+        return formats
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        display_id = mobj.group('id') or host
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title', default=None
+        ) or self._html_search_regex('<title>([^<]+)</title>', webpage, 'title')
+
+        if display_id == host:
+            # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...)
+            stream_plists = re.findall(r'plist=(\d+)', webpage) or re.findall(r'plist["\']:\s*["\']([^"\']+)', webpage)
+            entries = [
+                self.url_result(
+                    smuggle_url(
+                        'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=%s' % plist_id,
+                        {'geo_countries': ['JP']}),
+                    ie='BrightcoveNew', video_id=plist_id)
+                for plist_id in stream_plists]
+            return self.playlist_result(entries, playlist_title=title)
+
+        # Article page
+        description = self._html_search_meta(
+            ['og:description', 'description', 'twitter:description'],
+            webpage, 'description', default=None)
+        thumbnail = self._og_search_thumbnail(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:image', webpage, 'thumbnail', default=None)
+        space_id = self._search_regex([
+            r'<script[^>]+class=["\']yvpub-player["\'][^>]+spaceid=([^&"\']+)',
+            r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)',
+            r'<!--\s+SpaceID=(\d+)'
+        ], webpage, 'spaceid')
+
+        content_id = self._search_regex(
+            r'<script[^>]+class=["\']yvpub-player["\'][^>]+contentid=(?P<contentid>[^&"\']+)',
+            webpage, 'contentid', group='contentid')
+
+        json_data = self._download_json(
+            'https://feapi-yvpub.yahooapis.jp/v1/content/%s' % content_id,
+            content_id,
+            query={
+                'appid': 'dj0zaiZpPVZMTVFJR0FwZWpiMyZzPWNvbnN1bWVyc2VjcmV0Jng9YjU-',
+                'output': 'json',
+                'space_id': space_id,
+                'domain': host,
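+                # 'ak' appears to be an access key the feapi-yvpub endpoint
+                # expects: the md5 of "<space_id>_<host>"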
+                'ak': hashlib.md5('_'.join((space_id, host)).encode()).hexdigest(),
+                'device_type': '1100',
+            })
+        formats = self._extract_formats(json_data, content_id)
+
+        return {
+            'id': content_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py
new file mode 100644 (file)
index 0000000..e8f6ae1
--- /dev/null
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class YandexDiskIE(InfoExtractor):
+    _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'
+
+    _TESTS = [{
+        'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
+        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'info_dict': {
+            'id': 'VdOeDou8eZs6Y',
+            'ext': 'mp4',
+            'title': '4.mp4',
+            'duration': 168.6,
+            'uploader': 'y.botova',
+            'uploader_id': '300043621',
+            'view_count': int,
+        },
+    }, {
+        'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        status = self._download_webpage(
+            'https://disk.yandex.com/auth/status', video_id, query={
+                'urlOrigin': url,
+                'source': 'public',
+                'md5': 'false',
+            })
+
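+        # The 'sk' (or 'skExternal') session token from the auth status page
+        # is required by the disk.yandex.com models API below.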
+        sk = self._search_regex(
+            r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2',
+            status, 'sk', group='value')
+
+        webpage = self._download_webpage(url, video_id)
+
+        models = self._parse_json(
+            self._search_regex(
+                r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script',
+                webpage, 'video JSON'),
+            video_id)
+
+        data = next(
+            model['data'] for model in models
+            if model.get('model') == 'resource')
+
+        video_hash = data['id']
+        title = data['name']
+
+        models = self._download_json(
+            'https://disk.yandex.com/models/', video_id,
+            data=urlencode_postdata({
+                '_model.0': 'videoInfo',
+                'id.0': video_hash,
+                '_model.1': 'do-get-resource-url',
+                'id.1': video_hash,
+                'version': '13.6',
+                'sk': sk,
+            }), query={'_m': 'videoInfo'})['models']
+
+        videos = try_get(models, lambda x: x[0]['data']['videos'], list) or []
+        source_url = try_get(
+            models, lambda x: x[1]['data']['file'], compat_str)
+
+        formats = []
+        if source_url:
+            formats.append({
+                'url': source_url,
+                'format_id': 'source',
+                'ext': determine_ext(title, 'mp4'),
+                'quality': 1,
+            })
+        for video in videos:
+            format_url = video.get('url')
+            if not format_url:
+                continue
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'url': format_url,
+                })
+        self._sort_formats(formats)
+
+        duration = float_or_none(try_get(
+            models, lambda x: x[0]['data']['duration']), 1000)
+        uploader = try_get(
+            data, lambda x: x['user']['display_name'], compat_str)
+        uploader_id = try_get(
+            data, lambda x: x['user']['uid'], compat_str)
+        view_count = int_or_none(try_get(
+            data, lambda x: x['meta']['views_counter']))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': duration,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
new file mode 100644 (file)
index 0000000..4358bc8
--- /dev/null
@@ -0,0 +1,313 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import hashlib
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    float_or_none,
+    try_get,
+)
+
+
+class YandexMusicBaseIE(InfoExtractor):
+    @staticmethod
+    def _handle_error(response):
+        if isinstance(response, dict):
+            error = response.get('error')
+            if error:
+                raise ExtractorError(error, expected=True)
+            if response.get('type') == 'captcha' or 'captcha' in response:
+                YandexMusicBaseIE._raise_captcha()
+
+    @staticmethod
+    def _raise_captcha():
+        raise ExtractorError(
+            'YandexMusic has considered youtube-dlc requests automated and '
+            'asks you to solve a CAPTCHA. You can either wait for some '
+            'time until unblocked and optionally use --sleep-interval '
+            'in future or alternatively you can go to https://music.yandex.ru/ '
+            'solve CAPTCHA, then export cookies and pass cookie file to '
+            'youtube-dlc with --cookies',
+            expected=True)
+
+    def _download_webpage_handle(self, *args, **kwargs):
+        # The parent method returns a (webpage, url_handle) tuple, so the
+        # CAPTCHA marker ("we are very sorry, but requests from your IP
+        # address look automated") must be searched in the webpage element,
+        # not in the tuple itself.
+        webpage = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs)
+        if webpage and 'Нам очень жаль, но&nbsp;запросы, поступившие с&nbsp;вашего IP-адреса, похожи на&nbsp;автоматические.' in webpage[0]:
+            self._raise_captcha()
+        return webpage
+
+    def _download_json(self, *args, **kwargs):
+        response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
+        self._handle_error(response)
+        return response
+
+
+class YandexMusicTrackIE(YandexMusicBaseIE):
+    IE_NAME = 'yandexmusic:track'
+    IE_DESC = 'Яндекс.Музыка - Трек'
+    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://music.yandex.ru/album/540508/track/4878838',
+        'md5': 'f496818aa2f60b6c0062980d2e00dc20',
+        'info_dict': {
+            'id': '4878838',
+            'ext': 'mp3',
+            'title': 'Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1',
+            'filesize': 4628061,
+            'duration': 193.04,
+            'track': 'Gypsy Eyes 1',
+            'album': 'Gypsy Soul',
+            'album_artist': 'Carlo Ambrosio',
+            'artist': 'Carlo Ambrosio & Fabio Di Bari',
+            'release_year': 2009,
+        },
+        'skip': 'Travis CI servers blocked by YandexMusic',
+    }, {
+        # multiple disks
+        'url': 'http://music.yandex.ru/album/3840501/track/705105',
+        'md5': 'ebe7b4e2ac7ac03fe11c19727ca6153e',
+        'info_dict': {
+            'id': '705105',
+            'ext': 'mp3',
+            'title': 'Hooverphonic - Sometimes',
+            'filesize': 5743386,
+            'duration': 239.27,
+            'track': 'Sometimes',
+            'album': 'The Best of Hooverphonic',
+            'album_artist': 'Hooverphonic',
+            'artist': 'Hooverphonic',
+            'release_year': 2016,
+            'genre': 'pop',
+            'disc_number': 2,
+            'track_number': 9,
+        },
+        'skip': 'Travis CI servers blocked by YandexMusic',
+    }]
+
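+    # Download flow: track.jsx supplies track metadata, the download/m
+    # handler returns a 'src' URL pointing to JSON with host/path/ts/s
+    # values, and the final get-mp3 URL is signed with an MD5 over those
+    # values (see below).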
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        album_id, track_id = mobj.group('album_id'), mobj.group('id')
+
+        track = self._download_json(
+            'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id),
+            track_id, 'Downloading track JSON')['track']
+        track_title = track['title']
+
+        download_data = self._download_json(
+            'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id),
+            track_id, 'Downloading track location url JSON',
+            headers={'X-Retpath-Y': url})
+
+        fd_data = self._download_json(
+            download_data['src'], track_id,
+            'Downloading track location JSON',
+            query={'format': 'json'})
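+        # The direct URL is signed with an MD5 over a hard-coded salt, the
+        # storage path and the 's' value returned by the location JSON.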
+        key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest()
+        storage = track['storageDir'].split('.')
+        f_url = 'http://%s/get-mp3/%s/%s?track-id=%s' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1])
+
+        thumbnail = None
+        cover_uri = track.get('albums', [{}])[0].get('coverUri')
+        if cover_uri:
+            thumbnail = cover_uri.replace('%%', 'orig')
+            if not thumbnail.startswith('http'):
+                thumbnail = 'http://' + thumbnail
+
+        track_info = {
+            'id': track_id,
+            'ext': 'mp3',
+            'url': f_url,
+            'filesize': int_or_none(track.get('fileSize')),
+            'duration': float_or_none(track.get('durationMs'), 1000),
+            'thumbnail': thumbnail,
+            'track': track_title,
+            'acodec': download_data.get('codec'),
+            'abr': int_or_none(download_data.get('bitrate')),
+        }
+
+        def extract_artist_name(artist):
+            decomposed = artist.get('decomposed')
+            if not isinstance(decomposed, list):
+                return artist['name']
+            parts = [artist['name']]
+            for element in decomposed:
+                if isinstance(element, dict) and element.get('name'):
+                    parts.append(element['name'])
+                elif isinstance(element, compat_str):
+                    parts.append(element)
+            return ''.join(parts)
+
+        def extract_artist(artist_list):
+            if artist_list and isinstance(artist_list, list):
+                artists_names = [extract_artist_name(a) for a in artist_list if a.get('name')]
+                if artists_names:
+                    return ', '.join(artists_names)
+
+        albums = track.get('albums')
+        if albums and isinstance(albums, list):
+            album = albums[0]
+            if isinstance(album, dict):
+                year = album.get('year')
+                disc_number = int_or_none(try_get(
+                    album, lambda x: x['trackPosition']['volume']))
+                track_number = int_or_none(try_get(
+                    album, lambda x: x['trackPosition']['index']))
+                track_info.update({
+                    'album': album.get('title'),
+                    'album_artist': extract_artist(album.get('artists')),
+                    'release_year': int_or_none(year),
+                    'genre': album.get('genre'),
+                    'disc_number': disc_number,
+                    'track_number': track_number,
+                })
+
+        track_artist = extract_artist(track.get('artists'))
+        if track_artist:
+            track_info.update({
+                'artist': track_artist,
+                'title': '%s - %s' % (track_artist, track_title),
+            })
+        else:
+            track_info['title'] = track_title
+
+        return track_info
+
+
+class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
+    def _build_playlist(self, tracks):
+        return [
+            self.url_result(
+                'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id']))
+            for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)]
+
+
+class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
+    IE_NAME = 'yandexmusic:album'
+    IE_DESC = 'Яндекс.Музыка - Альбом'
+    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)'
+
+    _TESTS = [{
+        'url': 'http://music.yandex.ru/album/540508',
+        'info_dict': {
+            'id': '540508',
+            'title': 'Carlo Ambrosio - Gypsy Soul (2009)',
+        },
+        'playlist_count': 50,
+        'skip': 'Travis CI servers blocked by YandexMusic',
+    }, {
+        'url': 'https://music.yandex.ru/album/3840501',
+        'info_dict': {
+            'id': '3840501',
+            'title': 'Hooverphonic - The Best of Hooverphonic (2016)',
+        },
+        'playlist_count': 33,
+        'skip': 'Travis CI servers blocked by YandexMusic',
+    }]
+
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+
+        album = self._download_json(
+            'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id,
+            album_id, 'Downloading album JSON')
+
+        entries = self._build_playlist([track for volume in album['volumes'] for track in volume])
+
+        title = '%s - %s' % (album['artists'][0]['name'], album['title'])
+        year = album.get('year')
+        if year:
+            title += ' (%s)' % year
+
+        return self.playlist_result(entries, compat_str(album['id']), title)
+
+
+class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
+    IE_NAME = 'yandexmusic:playlist'
+    IE_DESC = 'Яндекс.Музыка - Плейлист'
+    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
+        'info_dict': {
+            'id': '1245',
+            'title': 'Что слушают Enter Shikari',
+            'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
+        },
+        'playlist_count': 6,
+        'skip': 'Travis CI servers blocked by YandexMusic',
+    }, {
+        # playlist exceeding the limit of 150 tracks shipped with webpage (see
+        # https://github.com/ytdl-org/youtube-dl/issues/6666)
+        'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036',
+        'info_dict': {
+            'id': '1036',
+            'title': 'Музыка 90-х',
+        },
+        'playlist_mincount': 300,
+        'skip': 'Travis CI servers blocked by YandexMusic',
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        user = mobj.group('user')
+        playlist_id = mobj.group('id')
+
+        playlist = self._download_json(
+            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+            playlist_id, 'Downloading missing tracks JSON',
+            fatal=False,
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+                'X-Retpath-Y': url,
+            },
+            query={
+                'owner': user,
+                'kinds': playlist_id,
+                'light': 'true',
+                'lang': tld,
+                'external-domain': 'music.yandex.%s' % tld,
+                'overembed': 'false',
+            })['playlist']
+
+        tracks = playlist['tracks']
+        track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
+
+        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
+        # missing tracks should be retrieved manually.
+        if len(tracks) < len(track_ids):
+            present_track_ids = set([
+                compat_str(track['id'])
+                for track in tracks if track.get('id')])
+            missing_track_ids = [
+                track_id for track_id in track_ids
+                if track_id not in present_track_ids]
+            missing_tracks = self._download_json(
+                'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+                playlist_id, 'Downloading missing tracks JSON',
+                fatal=False,
+                headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                },
+                query={
+                    'entries': ','.join(missing_track_ids),
+                    'lang': tld,
+                    'external-domain': 'music.yandex.%s' % tld,
+                    'overembed': 'false',
+                    'strict': 'true',
+                })
+            if missing_tracks:
+                tracks.extend(missing_tracks)
+
+        return self.playlist_result(
+            self._build_playlist(tracks),
+            compat_str(playlist_id),
+            playlist.get('title'), playlist.get('description'))
diff --git a/youtube_dl/extractor/yandexvideo.py b/youtube_dl/extractor/yandexvideo.py
new file mode 100644 (file)
index 0000000..46529be
--- /dev/null
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    url_or_none,
+)
+
+
+class YandexVideoIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=|
+                            frontend\.vh\.yandex\.ru/player/
+                        )
+                        (?P<id>[\da-f]+)
+                    '''
+    _TESTS = [{
+        'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
+        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'info_dict': {
+            'id': '4dbb262b4fe5cf15a215de4f34eee34d',
+            'ext': 'mp4',
+            'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону',
+            'description': '',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'timestamp': 0,
+            'duration': 30,
+            'age_limit': 18,
+        },
+    }, {
+        'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda',
+        'only_matching': True,
+    }, {
+        'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
+        'only_matching': True,
+    }, {
+        'url': 'https://frontend.vh.yandex.ru/player/4dbb262b4fe5cf15a215de4f34eee34d?from=morda',
+        'only_matching': True,
+    }, {
+        # vod-episode, series episode
+        'url': 'https://yandex.ru/portal/video?stream_id=45b11db6e4b68797919c93751a938cee',
+        'only_matching': True,
+    }, {
+        # episode, sports
+        'url': 'https://yandex.ru/?stream_channel=1538487871&stream_id=4132a07f71fb0396be93d74b3477131d',
+        'only_matching': True,
+    }, {
+        # DASH with DRM
+        'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        content = self._download_json(
+            'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id,
+            video_id, query={
+                'stream_options': 'hires',
+                'disable_trackings': 1,
+            })['content']
+
+        content_url = url_or_none(content.get('content_url')) or url_or_none(
+            content['streams'][0]['url'])
+        title = content.get('title') or content.get('computed_title')
+
+        ext = determine_ext(content_url)
+
+        if ext == 'm3u8':
+            formats = self._extract_m3u8_formats(
+                content_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls')
+        elif ext == 'mpd':
+            formats = self._extract_mpd_formats(
+                content_url, video_id, mpd_id='dash')
+        else:
+            formats = [{'url': content_url}]
+
+        self._sort_formats(formats)
+
+        description = content.get('description')
+        thumbnail = content.get('thumbnail')
+        timestamp = (int_or_none(content.get('release_date'))
+                     or int_or_none(content.get('release_date_ut'))
+                     or int_or_none(content.get('start_time')))
+        duration = int_or_none(content.get('duration'))
+        series = content.get('program_title')
+        age_limit = int_or_none(content.get('restriction_age'))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'duration': duration,
+            'series': series,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py
new file mode 100644 (file)
index 0000000..cfb368d
--- /dev/null
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+    unescapeHTML,
+    url_or_none,
+)
+
+
+class YapFilesIE(InfoExtractor):
+    _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)'
+    _VALID_URL = r'https?:%s' % _YAPFILES_URL
+    _TESTS = [{
+        # with hd
+        'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413',
+        'md5': '2db19e2bfa2450568868548a1aa1956c',
+        'info_dict': {
+            'id': 'vMDE1NjcyNDUt0413',
+            'ext': 'mp4',
+            'title': 'Самый худший пароль WIFI',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 72,
+        },
+    }, {
+        # without hd
+        'url': 'https://api.yapfiles.ru/get_player/?uid=video_player_1872528&plroll=1&adv=1&v=vMDE4NzI1Mjgt690b',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return [unescapeHTML(mobj.group('url')) for mobj in re.finditer(
+            r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1'
+            % YapFilesIE._YAPFILES_URL, webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id, fatal=False)
+
+        player_url = None
+        query = {}
+        if webpage:
+            player_url = self._search_regex(
+                r'player\.init\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+                'player url', default=None, group='url')
+
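+        # If the page does not expose a player.init() URL, fall back to the
+        # load API; the md5 value below appears to be a static client token.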
+        if not player_url:
+            player_url = 'http://api.yapfiles.ru/load/%s/' % video_id
+            query = {
+                'md5': 'ded5f369be61b8ae5f88e2eeb2f3caff',
+                'type': 'json',
+                'ref': url,
+            }
+
+        player = self._download_json(
+            player_url, video_id, query=query)['player']
+
+        playlist_url = player['playlist']
+        title = player['title']
+        thumbnail = player.get('poster')
+
+        # 'Ролик удален' means 'Video has been removed'
+        if title == 'Ролик удален' or 'deleted.jpg' in (thumbnail or ''):
+            raise ExtractorError(
+                'Video %s has been removed' % video_id, expected=True)
+
+        playlist = self._download_json(
+            playlist_url, video_id)['player']['main']
+
+        hd_height = int_or_none(player.get('hd'))
+
+        QUALITIES = ('sd', 'hd')
+        quality_key = qualities(QUALITIES)
+        formats = []
+        for format_id in QUALITIES:
+            is_hd = format_id == 'hd'
+            format_url = url_or_none(playlist.get(
+                'file%s' % ('_hd' if is_hd else '')))
+            if not format_url:
+                continue
+            formats.append({
+                'url': format_url,
+                'format_id': format_id,
+                'quality': quality_key(format_id),
+                'height': hd_height if is_hd else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': int_or_none(player.get('length')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/yesjapan.py b/youtube_dl/extractor/yesjapan.py
new file mode 100644 (file)
index 0000000..681338c
--- /dev/null
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    HEADRequest,
+    get_element_by_attribute,
+    parse_iso8601,
+)
+
+
+class YesJapanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?yesjapan\.com/video/(?P<slug>[A-Za-z0-9\-]*)_(?P<id>[A-Za-z0-9]+)\.html'
+    _TEST = {
+        'url': 'http://www.yesjapan.com/video/japanese-in-5-20-wa-and-ga-particle-usages_726497834.html',
+        'md5': 'f0be416314e5be21a12b499b330c21cf',
+        'info_dict': {
+            'id': '726497834',
+            'title': 'Japanese in 5! #20 - WA And GA Particle Usages',
+            'description': 'This should clear up some issues most students of Japanese encounter with WA and GA....',
+            'ext': 'mp4',
+            'timestamp': 1416391590,
+            'upload_date': '20141119',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+        video_url = self._og_search_video_url(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        timestamp = None
+        submit_info = get_element_by_attribute('class', 'pm-submit-data', webpage)
+        if submit_info:
+            timestamp = parse_iso8601(self._search_regex(
+                r'datetime="([^"]+)"', submit_info, 'upload date', fatal=False, default=None))
+
+        # attempt to resolve the final URL in order to get a proper extension
+        redirect_req = HEADRequest(video_url)
+        req = self._request_webpage(
+            redirect_req, video_id, note='Resolving final URL', errnote='Could not resolve final URL', fatal=False)
+        if req:
+            video_url = req.geturl()
+
+        formats = [{
+            'format_id': 'sd',
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'timestamp': timestamp,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py
new file mode 100644 (file)
index 0000000..1fd8d35
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class YinYueTaiIE(InfoExtractor):
+    IE_NAME = 'yinyuetai:video'
+    IE_DESC = '音悦Tai'
+    _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://v.yinyuetai.com/video/2322376',
+        'md5': '6e3abe28d38e3a54b591f9f040595ce0',
+        'info_dict': {
+            'id': '2322376',
+            'ext': 'mp4',
+            'title': '少女时代_PARTY_Music Video Teaser',
+            'creator': '少女时代',
+            'duration': 25,
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://v.yinyuetai.com/video/h5/2322376',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        info = self._download_json(
+            'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, video_id,
+            'Downloading mv info')['videoInfo']['coreVideoInfo']
+
+        if info['error']:
+            raise ExtractorError(info['errorMsg'], expected=True)
+
+        formats = [{
+            'url': format_info['videoUrl'],
+            'format_id': format_info['qualityLevel'],
+            'format': format_info.get('qualityLevelName'),
+            'filesize': format_info.get('fileSize'),
+            # though URLs ends with .flv, the downloaded files are in fact mp4
+            'ext': 'mp4',
+            'tbr': format_info.get('bitrate'),
+        } for format_info in info['videoUrlModels']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': info['videoName'],
+            'thumbnail': info.get('bigHeadImage'),
+            'creator': info.get('artistNames'),
+            'duration': info.get('duration'),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
new file mode 100644 (file)
index 0000000..c4ae4d8
--- /dev/null
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+
+
+class YnetIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html'
+    _TESTS = [
+        {
+            'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
+            'info_dict': {
+                'id': 'L-11659-99244',
+                'ext': 'flv',
+                'title': 'איש לא יודע מאיפה באנו',
+                'thumbnail': r're:^https?://.*\.jpg',
+            }
+        }, {
+            'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html',
+            'info_dict': {
+                'id': 'L-8859-84418',
+                'ext': 'flv',
+                'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין",
+                'thumbnail': r're:^https?://.*\.jpg',
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
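+        # The og:video URL embeds a URL-encoded flashvars-style blob whose
+        # trailing config={...} JSON carries the f4m manifest URL.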
+        content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage))
+        config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config'))
+        f4m_url = config['clip']['url']
+        title = self._og_search_title(webpage)
+        m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title)
+        if m:
+            title = m.group('title')
+        formats = self._extract_f4m_formats(f4m_url, video_id)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
new file mode 100644 (file)
index 0000000..88aabd2
--- /dev/null
@@ -0,0 +1,95 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_duration,
+    url_or_none,
+)
+
+
+class YouJizzIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
+    _TESTS = [{
+        'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
+        'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
+        'info_dict': {
+            'id': '2189178',
+            'ext': 'mp4',
+            'title': 'Zeichentrick 1',
+            'age_limit': 18,
+            'duration': 2874,
+        }
+    }, {
+        'url': 'http://www.youjizz.com/videos/-2189178.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youjizz.com/videos/embed/31991001',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('embed_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title')
+
+        formats = []
+
+        encodings = self._parse_json(
+            self._search_regex(
+                r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
+                default='[]'),
+            video_id, fatal=False)
+        # encodings may be None if the JSON failed to parse (fatal=False)
+        for encoding in encodings or []:
+            if not isinstance(encoding, dict):
+                continue
+            format_url = url_or_none(encoding.get('filename'))
+            if not format_url:
+                continue
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                format_id = encoding.get('name') or encoding.get('quality')
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]', format_id, 'height', default=None))
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'height': height,
+                })
+
+        if formats:
+            info_dict = {
+                'formats': formats,
+            }
+        else:
+            # YouJizz's HTML5 player has invalid HTML
+            webpage = webpage.replace('"controls', '" controls')
+            info_dict = self._parse_html5_media_entries(
+                url, webpage, video_id)[0]
+
+        duration = parse_duration(self._search_regex(
+            r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
+            default=None))
+        uploader = self._search_regex(
+            r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
+            default=None)
+
+        info_dict.update({
+            'id': video_id,
+            'title': title,
+            'age_limit': self._rta_search(webpage),
+            'duration': duration,
+            'uploader': uploader,
+        })
+
+        return info_dict
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
new file mode 100644 (file)
index 0000000..61d1ab2
--- /dev/null
@@ -0,0 +1,309 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+import string
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    str_or_none,
+    strip_jsonp,
+)
+
+
+class YoukuIE(InfoExtractor):
+    IE_NAME = 'youku'
+    IE_DESC = '优酷'
+    _VALID_URL = r'''(?x)
+        (?:
+            https?://(
+                (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
+                video\.tudou\.com/v/)|
+            youku:)
+        (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
+    '''
+
+    _TESTS = [{
+        # MD5 is unstable
+        'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
+        'info_dict': {
+            'id': 'XMTc1ODE5Njcy',
+            'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
+            'ext': 'mp4',
+            'duration': 74.73,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '。躲猫猫、',
+            'uploader_id': '36017967',
+            'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4',
+            'tags': list,
+        }
+    }, {
+        'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
+        'only_matching': True,
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
+        'info_dict': {
+            'id': 'XODgxNjg1Mzk2',
+            'ext': 'mp4',
+            'title': '武媚娘传奇 85',
+            'duration': 1999.61,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '疯狂豆花',
+            'uploader_id': '62583473',
+            'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky',
+            'tags': list,
+        },
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
+        'info_dict': {
+            'id': 'XMTI1OTczNDM5Mg',
+            'ext': 'mp4',
+            'title': '花千骨 04',
+            'duration': 2363,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '放剧场-花千骨',
+            'uploader_id': '772849359',
+            'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==',
+            'tags': list,
+        },
+    }, {
+        'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
+        'note': 'Video protected with password',
+        'info_dict': {
+            'id': 'XNjA1NzA2Njgw',
+            'ext': 'mp4',
+            'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起',
+            'duration': 7264.5,
+            'thumbnail': r're:^https?://.*',
+            'uploader': 'FoxJin1006',
+            'uploader_id': '322014285',
+            'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==',
+            'tags': list,
+        },
+        'params': {
+            'videopassword': '100600',
+        },
+    }, {
+        # /play/get.json contains streams with "channel_type":"tail"
+        'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html',
+        'info_dict': {
+            'id': 'XOTUxMzg4NDMy',
+            'ext': 'mp4',
+            'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft',
+            'duration': 702.08,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '明月庄主moon',
+            'uploader_id': '38465621',
+            'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0',
+            'tags': list,
+        },
+    }, {
+        'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805',
+        'info_dict': {
+            'id': 'XMjIyNzAzMTQ4NA',
+            'ext': 'mp4',
+            'title': '卡马乔国足开大脚长传冲吊集锦',
+            'duration': 289,
+            'thumbnail': r're:^https?://.*',
+            'uploader': '阿卜杜拉之星',
+            'uploader_id': '2382249',
+            'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==',
+            'tags': list,
+        },
+    }, {
+        'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def get_ysuid():
+        return '%d%s' % (int(time.time()), ''.join([
+            random.choice(string.ascii_letters) for _ in range(3)]))
+
+    def get_format_name(self, fm):
+        _dict = {
+            '3gp': 'h6',
+            '3gphd': 'h5',
+            'flv': 'h4',
+            'flvhd': 'h4',
+            'mp4': 'h3',
+            'mp4hd': 'h3',
+            'mp4hd2': 'h4',
+            'mp4hd3': 'h4',
+            'hd2': 'h2',
+            'hd3': 'h1',
+        }
+        return _dict.get(fm)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
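+        # Set the session cookies Youku's APIs appear to require before any
+        # request is made.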
+        self._set_cookie('youku.com', '__ysuid', self.get_ysuid())
+        self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
+
+        _, urlh = self._download_webpage_handle(
+            'https://log.mmstat.com/eg.js', video_id, 'Retrieving cna info')
+        # The etag header is '"foobar"'; let's remove the double quotes
+        cna = urlh.headers['etag'][1:-1]
+
+        # request basic data
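+        # 'ccode' appears to identify the client type; 'client_ip' is a
+        # placeholder value and 'utid' carries the cna token retrieved above.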
+        basic_data_params = {
+            'vid': video_id,
+            'ccode': '0590',
+            'client_ip': '192.168.1.1',
+            'utid': cna,
+            'client_ts': time.time() / 1000,
+        }
+
+        video_password = self._downloader.params.get('videopassword')
+        if video_password:
+            basic_data_params['password'] = video_password
+
+        headers = {
+            'Referer': url,
+        }
+        headers.update(self.geo_verification_headers())
+        data = self._download_json(
+            'https://ups.youku.com/ups/get.json', video_id,
+            'Downloading JSON metadata',
+            query=basic_data_params, headers=headers)['data']
+
+        error = data.get('error')
+        if error:
+            error_note = error.get('note')
+            if error_note is not None and '因版权原因无法观看此视频' in error_note:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is available in China only', expected=True)
+            elif error_note and '该视频被设为私密' in error_note:
+                raise ExtractorError(
+                    'Youku said: Sorry, this video is private', expected=True)
+            else:
+                msg = 'Youku server reported error %i' % error.get('code')
+                if error_note is not None:
+                    msg += ': ' + error_note
+                raise ExtractorError(msg)
+
+        # get video title
+        video_data = data['video']
+        title = video_data['title']
+
+        formats = [{
+            'url': stream['m3u8_url'],
+            'format_id': self.get_format_name(stream.get('stream_type')),
+            'ext': 'mp4',
+            'protocol': 'm3u8_native',
+            # int_or_none avoids a TypeError when 'size' is missing
+            'filesize': int_or_none(stream.get('size')),
+            'width': stream.get('width'),
+            'height': stream.get('height'),
+        } for stream in data['stream'] if stream.get('channel_type') != 'tail']
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'duration': video_data.get('seconds'),
+            'thumbnail': video_data.get('logo'),
+            'uploader': video_data.get('username'),
+            'uploader_id': str_or_none(video_data.get('userid')),
+            'uploader_url': data.get('uploader', {}).get('homepage'),
+            'tags': video_data.get('tags'),
+        }
+
+
+class YoukuShowIE(InfoExtractor):
+    _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
+    IE_NAME = 'youku:show'
+
+    _TESTS = [{
+        'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
+        'info_dict': {
+            'id': 'zc7c670be07ff11e48b3f',
+            'title': '花千骨 DVD版',
+            'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
+        },
+        'playlist_count': 50,
+    }, {
+        # Episode number not starting from 1
+        'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
+        'info_dict': {
+            'id': 'zefbfbd70efbfbd780bef',
+            'title': '超级飞侠3',
+            'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
+        },
+        'playlist_count': 24,
+    }, {
+        # Ongoing playlist. The initial page is the last one
+        'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+        'only_matching': True,
+    }, {
+        #  No data-id value.
+        'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html',
+        'only_matching': True,
+    }, {
+        #  Wrong number of reload_id.
+        'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html',
+        'only_matching': True,
+    }]
+
+    def _extract_entries(self, playlist_data_url, show_id, note, query):
+        query['callback'] = 'cb'
+        playlist_data = self._download_json(
+            playlist_data_url, show_id, query=query, note=note,
+            transform_source=lambda s: js_to_json(strip_jsonp(s))).get('html')
+        if playlist_data is None:
+            return [None, None]
+        drama_list = (get_element_by_class('p-drama-grid', playlist_data)
+                      or get_element_by_class('p-drama-half-row', playlist_data))
+        if drama_list is None:
+            raise ExtractorError('No episodes found')
+        video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+        return playlist_data, [
+            self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+            for video_url in video_urls]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+
+        entries = []
+        page_config = self._parse_json(self._search_regex(
+            r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
+            show_id, transform_source=js_to_json)
+        first_page, initial_entries = self._extract_entries(
+            'http://list.youku.com/show/module', show_id,
+            note='Downloading initial playlist data page',
+            query={
+                'id': page_config['showid'],
+                'tab': 'showInfo',
+            })
+        first_page_reload_id = self._html_search_regex(
+            r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+        # The first reload_id has the same items as first_page
+        reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+        entries.extend(initial_entries)
+        for idx, reload_id in enumerate(reload_ids):
+            if reload_id == first_page_reload_id:
+                continue
+            _, new_entries = self._extract_entries(
+                'http://list.youku.com/show/episode', show_id,
+                note='Downloading playlist data page %d' % (idx + 1),
+                query={
+                    'id': page_config['showid'],
+                    'stage': reload_id,
+                })
+            if new_entries is not None:
+                entries.extend(new_entries)
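+        # The page's meta description is assumed to begin with the show
+        # title; everything after the first comma is dropped.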
+        desc = self._html_search_meta('description', webpage, fatal=False)
+        playlist_title = desc.split(',')[0] if desc else None
+        detail_li = get_element_by_class('p-intro', webpage)
+        playlist_description = get_element_by_class(
+            'intro-more', detail_li) if detail_li else None
+
+        return self.playlist_result(
+            entries, show_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py
new file mode 100644 (file)
index 0000000..04dbc87
--- /dev/null
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+)
+
+CDN_API_BASE = 'https://cdn.younow.com/php/api'
+MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE
+
+
+class YouNowLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.younow.com/AmandaPadeezy',
+        'info_dict': {
+            'id': 'AmandaPadeezy',
+            'ext': 'mp4',
+            'is_live': True,
+            'title': 'March 26, 2017',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'tags': ['girls'],
+            'categories': ['girls'],
+            'uploader': 'AmandaPadeezy',
+            'uploader_id': '6716501',
+            'uploader_url': 'https://www.younow.com/AmandaPadeezy',
+            'creator': 'AmandaPadeezy',
+        },
+        'skip': True,
+    }
+
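+    # The generic /<user> pattern would also match channel and moment
+    # URLs, so defer to those more specific extractors.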
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url)
+                else super(YouNowLiveIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        username = self._match_id(url)
+
+        data = self._download_json(
+            'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s'
+            % username, username)
+
+        if data.get('errorCode') != 0:
+            raise ExtractorError(data['errorMsg'], expected=True)
+
+        uploader = try_get(
+            data, lambda x: x['user']['profileUrlString'],
+            compat_str) or username
+
+        return {
+            'id': uploader,
+            'is_live': True,
+            'title': self._live_title(uploader),
+            'thumbnail': data.get('awsUrl'),
+            'tags': data.get('tags'),
+            'categories': data.get('tags'),
+            'uploader': uploader,
+            'uploader_id': data.get('userId'),
+            'uploader_url': 'https://www.younow.com/%s' % username,
+            'creator': uploader,
+            'view_count': int_or_none(data.get('viewers')),
+            'like_count': int_or_none(data.get('likes')),
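+            # Live HLS stream: protocol 'm3u8' selects the ffmpeg-based
+            # downloader rather than the native HLS downloader.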
+            'formats': [{
+                'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s'
+                       % (CDN_API_BASE, data['broadcastId'], data['userId']),
+                'ext': 'mp4',
+                'protocol': 'm3u8',
+            }],
+        }
+
+
+def _extract_moment(item, fatal=True):
+    moment_id = item.get('momentId')
+    if not moment_id:
+        if not fatal:
+            return
+        raise ExtractorError('Unable to extract moment id')
+
+    moment_id = compat_str(moment_id)
+
+    title = item.get('text')
+    if not title:
+        title = 'YouNow %s' % (
+            item.get('momentType') or item.get('titleType') or 'moment')
+
+    uploader = try_get(item, lambda x: x['owner']['name'], compat_str)
+    uploader_id = try_get(item, lambda x: x['owner']['userId'])
+    uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None
+
+    entry = {
+        'extractor_key': 'YouNowMoment',
+        'id': moment_id,
+        'title': title,
+        'view_count': int_or_none(item.get('views')),
+        'like_count': int_or_none(item.get('likes')),
+        'timestamp': int_or_none(item.get('created')),
+        'creator': uploader,
+        'uploader': uploader,
+        'uploader_id': uploader_id,
+        'uploader_url': uploader_url,
+        'formats': [{
+            'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8'
+                   % (moment_id, moment_id),
+            'ext': 'mp4',
+            'protocol': 'm3u8_native',
+        }],
+    }
+
+    return entry
+
+
+class YouNowChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)/channel'
+    _TEST = {
+        'url': 'https://www.younow.com/its_Kateee_/channel',
+        'info_dict': {
+            'id': '14629760',
+            'title': 'its_Kateee_ moments'
+        },
+        'playlist_mincount': 8,
+    }
+
+    def _entries(self, username, channel_id):
+        created_before = 0
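+        # 'createdBefore' acts as a pagination cursor: 0 requests the
+        # newest moments, and each page advances it to the 'created'
+        # timestamp of the last item seen; iteration stops on an empty
+        # page or when the cursor can no longer be derived.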
+        for page_num in itertools.count(1):
+            if created_before is None:
+                break
+            info = self._download_json(
+                '%s/moment/profile/channelId=%s/createdBefore=%d/records=20'
+                % (CDN_API_BASE, channel_id, created_before), username,
+                note='Downloading moments page %d' % page_num)
+            items = info.get('items')
+            if not items or not isinstance(items, list):
+                break
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                item_type = item.get('type')
+                if item_type == 'moment':
+                    entry = _extract_moment(item, fatal=False)
+                    if entry:
+                        yield entry
+                elif item_type == 'collection':
+                    moments = item.get('momentsIds')
+                    if isinstance(moments, list):
+                        for moment_id in moments:
+                            m = self._download_json(
+                                MOMENT_URL_FORMAT % moment_id, username,
+                                note='Downloading %s moment JSON' % moment_id,
+                                fatal=False)
+                            if m and isinstance(m, dict) and m.get('item'):
+                                entry = _extract_moment(m['item'])
+                                if entry:
+                                    yield entry
+                created_before = int_or_none(item.get('created'))
+
+    def _real_extract(self, url):
+        username = self._match_id(url)
+        channel_id = compat_str(self._download_json(
+            'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s'
+            % username, username, note='Downloading user information')['userId'])
+        return self.playlist_result(
+            self._entries(username, channel_id), channel_id,
+            '%s moments' % username)
+
+
+class YouNowMomentIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m',
+        'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807',
+        'info_dict': {
+            'id': '20712117',
+            'ext': 'mp4',
+            'title': 'YouNow capture',
+            'view_count': int,
+            'like_count': int,
+            'timestamp': 1490432040,
+            'upload_date': '20170325',
+            'uploader': 'GABO...',
+            'uploader_id': 35917228,
+        },
+    }
+
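+    # Channel URLs also match this pattern; let YouNowChannelIE take
+    # precedence.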
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if YouNowChannelIE.suitable(url)
+                else super(YouNowMomentIE, cls).suitable(url))
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id)
+        return _extract_moment(item['item'])
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
new file mode 100644 (file)
index 0000000..e7fca22
--- /dev/null
@@ -0,0 +1,203 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    str_to_int,
+    unescapeHTML,
+    unified_strdate,
+    url_or_none,
+)
+from ..aes import aes_decrypt_text
+
+
+class YouPornIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+    _TESTS = [{
+        'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+        'md5': '3744d24c50438cf5b6f6d59feb5055c2',
+        'info_dict': {
+            'id': '505835',
+            'display_id': 'sex-ed-is-it-safe-to-masturbate-daily',
+            'ext': 'mp4',
+            'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
+            'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Ask Dan And Jennifer',
+            'upload_date': '20101217',
+            'average_rating': int,
+            'view_count': int,
+            'comment_count': int,
+            'categories': list,
+            'tags': list,
+            'age_limit': 18,
+        },
+    }, {
+        # Unknown uploader
+        'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
+        'info_dict': {
+            'id': '561726',
+            'display_id': 'big-tits-awesome-brunette-on-amazing-webcam-show',
+            'ext': 'mp4',
+            'title': 'Big Tits Awesome Brunette On amazing webcam show',
+            'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'Unknown',
+            'upload_date': '20110418',
+            'average_rating': int,
+            'view_count': int,
+            'comment_count': int,
+            'categories': list,
+            'tags': list,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.youporn.com/watch/505835',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(
+            r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)',
+            webpage)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(
+            'http://www.youporn.com/watch/%s' % video_id, display_id,
+            headers={'Cookie': 'age_verified=1'})
+
+        title = self._html_search_regex(
+            r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
+            webpage, 'title', default=None) or self._og_search_title(
+            webpage, default=None) or self._html_search_meta(
+            'title', webpage, fatal=True)
+
+        links = []
+
+        # Main source
+        definitions = self._parse_json(
+            self._search_regex(
+                r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
+                'media definitions', default='[]'),
+            video_id, fatal=False)
+        if definitions:
+            for definition in definitions:
+                if not isinstance(definition, dict):
+                    continue
+                video_url = url_or_none(definition.get('videoUrl'))
+                if video_url:
+                    links.append(video_url)
+
+        # Fallback #1; this also contains an extra low-quality 180p format
+        for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
+            links.append(link)
+
+        # Fallback #2 (unavailable as of 2017-06-22)
+        sources = self._search_regex(
+            r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+        if sources:
+            for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
+                links.append(link)
+
+        # Fallback #3 (unavailable as of 2017-06-22)
+        for _, link in re.findall(
+                r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
+            links.append(link)
+
+        # Fallback #4, encrypted links (unavailable as of 2017-06-22);
+        # the links are AES-decrypted with a key derived from the video
+        # title (32-byte key size)
+        for _, encrypted_link in re.findall(
+                r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
+            links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))
+
+        formats = []
+        for video_url in set(unescapeHTML(link) for link in links):
+            f = {
+                'url': video_url,
+            }
+            # Video URL's path looks like this:
+            #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+            #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+            # We can extract some metadata (height, bitrate) from it
+            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+            if mobj:
+                height = int(mobj.group('height'))
+                bitrate = int(mobj.group('bitrate'))
+                f.update({
+                    'format_id': '%dp-%dk' % (height, bitrate),
+                    'height': height,
+                    'tbr': bitrate,
+                })
+            formats.append(f)
+        self._sort_formats(formats)
+
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>',
+            webpage, 'description',
+            default=None) or self._og_search_description(
+            webpage, default=None)
+        thumbnail = self._search_regex(
+            r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
+            webpage, 'thumbnail', fatal=False, group='thumbnail')
+
+        uploader = self._html_search_regex(
+            r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
+            webpage, 'uploader', fatal=False)
+        upload_date = unified_strdate(self._html_search_regex(
+            [r'Date\s+[Aa]dded:\s*<span>([^<]+)',
+             r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
+            webpage, 'upload date', fatal=False))
+
+        age_limit = self._rta_search(webpage)
+
+        average_rating = int_or_none(self._search_regex(
+            r'<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
+            webpage, 'average rating', fatal=False))
+
+        view_count = str_to_int(self._search_regex(
+            r'(?s)<div[^>]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P<count>[\d,.]+)<',
+            webpage, 'view count', fatal=False, group='count'))
+        comment_count = str_to_int(self._search_regex(
+            r'>All [Cc]omments? \(([\d,.]+)\)',
+            webpage, 'comment count', fatal=False))
+
+        def extract_tag_box(regex, title):
+            tag_box = self._search_regex(regex, webpage, title, default=None)
+            if not tag_box:
+                return []
+            return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box)
+
+        categories = extract_tag_box(
+            r'(?s)Categories:.*?</[^>]+>(.+?)</div>', 'categories')
+        tags = extract_tag_box(
+            r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>',
+            'tags')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'average_rating': average_rating,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'categories': categories,
+            'tags': tags,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/yourporn.py b/youtube_dl/extractor/yourporn.py
new file mode 100644 (file)
index 0000000..9834749
--- /dev/null
@@ -0,0 +1,67 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    parse_duration,
+    urljoin,
+)
+
+
+class YourPornIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?sxyprn\.com/post/(?P<id>[^/?#&.]+)'
+    _TESTS = [{
+        'url': 'https://sxyprn.com/post/57ffcb2e1179b.html',
+        'md5': '6f8682b6464033d87acaa7a8ff0c092e',
+        'info_dict': {
+            'id': '57ffcb2e1179b',
+            'ext': 'mp4',
+            'title': 'md5:c9f43630bd968267672651ba905a7d35',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 165,
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://sxyprn.com/post/57ffcb2e1179b.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        parts = self._parse_json(
+            self._search_regex(
+                r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info',
+                group='data'),
+            video_id)[video_id].split('/')
+
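+        # De-obfuscate the real video path: sum every decimal digit that
+        # appears in parts[6] and parts[7], subtract that sum from the
+        # numeric component in parts[5], and append '8' to parts[1]
+        # (the scheme the site used at the time of writing).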
+        num = sum(int(c) for c in parts[6] + parts[7] if c.isnumeric())
+        parts[5] = compat_str(int(parts[5]) - num)
+        parts[1] += '8'
+        video_url = urljoin(url, '/'.join(parts))
+
+        title = (self._search_regex(
+            r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title',
+            default=None) or self._og_search_description(webpage)).strip()
+        thumbnail = self._og_search_thumbnail(webpage)
+        duration = parse_duration(self._search_regex(
+            r'duration\s*:\s*<[^>]+>([\d:]+)', webpage, 'duration',
+            default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': 18,
+            'ext': 'mp4',
+        }
diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py
new file mode 100644 (file)
index 0000000..9fa7728
--- /dev/null
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import urljoin
+
+
+class YourUploadIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?(?:yourupload\.com/(?:watch|embed)|embed\.yourupload\.com)/(?P<id>[A-Za-z0-9]+)'
+    _TESTS = [{
+        'url': 'http://yourupload.com/watch/14i14h',
+        'md5': '5e2c63385454c557f97c4c4131a393cd',
+        'info_dict': {
+            'id': '14i14h',
+            'ext': 'mp4',
+            'title': 'BigBuckBunny_320x180.mp4',
+            'thumbnail': r're:^https?://.*\.jpe?g',
+        }
+    }, {
+        'url': 'http://www.yourupload.com/embed/14i14h',
+        'only_matching': True,
+    }, {
+        'url': 'http://embed.yourupload.com/14i14h',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        embed_url = 'http://www.yourupload.com/embed/%s' % video_id
+
+        webpage = self._download_webpage(embed_url, video_id)
+
+        title = self._og_search_title(webpage)
+        video_url = urljoin(embed_url, self._og_search_video_url(webpage))
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
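+            # The media host appears to require the embed page as Referer.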
+            'http_headers': {
+                'Referer': embed_url,
+            },
+        }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
new file mode 100644 (file)
index 0000000..70a5bd3
--- /dev/null
@@ -0,0 +1,3445 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+
+import itertools
+import json
+import os.path
+import random
+import re
+import time
+import traceback
+
+from .common import InfoExtractor, SearchInfoExtractor
+from ..jsinterp import JSInterpreter
+from ..swfinterp import SWFInterpreter
+from ..compat import (
+    compat_chr,
+    compat_HTTPError,
+    compat_kwargs,
+    compat_parse_qs,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
+    compat_urllib_parse_urlencode,
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
+    compat_str,
+)
+from ..utils import (
+    bool_or_none,
+    clean_html,
+    error_to_compat_str,
+    extract_attributes,
+    ExtractorError,
+    float_or_none,
+    get_element_by_attribute,
+    get_element_by_id,
+    int_or_none,
+    mimetype2ext,
+    orderedSet,
+    parse_codecs,
+    parse_duration,
+    remove_quotes,
+    remove_start,
+    smuggle_url,
+    str_or_none,
+    str_to_int,
+    try_get,
+    unescapeHTML,
+    unified_strdate,
+    unsmuggle_url,
+    uppercase_escape,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class YoutubeBaseInfoExtractor(InfoExtractor):
+    """Provide base functions for Youtube extractors"""
+    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
+    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
+
+    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+
+    _NETRC_MACHINE = 'youtube'
+    # If True, an error is raised when no login info is provided
+    _LOGIN_REQUIRED = False
+
+    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+
+    _YOUTUBE_CLIENT_HEADERS = {
+        'x-youtube-client-name': '1',
+        'x-youtube-client-version': '1.20200609.04.02',
+    }
+
+    def _set_language(self):
+        self._set_cookie(
+            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
+            # YouTube sets the expire time to about two months
+            expire_time=time.time() + 2 * 30 * 24 * 3600)
+
+    def _ids_to_results(self, ids):
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
+
+    def _login(self):
+        """
+        Attempt to log in to YouTube.
+        Returns True if login succeeded or was skipped (no credentials
+        supplied); False if it failed.
+
+        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
+        """
+        username, password = self._get_login_info()
+        # No authentication to be performed
+        if username is None:
+            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
+                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return True
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None,
+            note='Downloading login page',
+            errnote='unable to fetch login page', fatal=False)
+        if login_page is False:
+            return
+
+        login_form = self._hidden_inputs(login_page)
+
+        def req(url, f_req, note, errnote):
+            data = login_form.copy()
+            data.update({
+                'pstMsg': 1,
+                'checkConnection': 'youtube',
+                'checkedDomains': 'youtube',
+                'hl': 'en',
+                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
+                'f.req': json.dumps(f_req),
+                'flowName': 'GlifWebSignIn',
+                'flowEntry': 'ServiceLogin',
+                # TODO: reverse actual botguard identifier generation algo
+                'bgRequest': '["identifier",""]',
+            })
+            return self._download_json(
+                url, None, note=note, errnote=errnote,
+                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
+                fatal=False,
+                data=urlencode_postdata(data), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
+                    'Google-Accounts-XSRF': 1,
+                })
+
+        def warn(message):
+            self._downloader.report_warning(message)
+
+        lookup_req = [
+            username,
+            None, [], None, 'US', None, None, 2, False, True,
+            [
+                None, None,
+                [2, 1, None, 1,
+                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+                 None, [], 4],
+                1, [None, None, []], None, None, None, True
+            ],
+            username,
+        ]
+
+        lookup_results = req(
+            self._LOOKUP_URL, lookup_req,
+            'Looking up account info', 'Unable to look up account info')
+
+        if lookup_results is False:
+            return False
+
+        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+        if not user_hash:
+            warn('Unable to extract user hash')
+            return False
+
+        challenge_req = [
+            user_hash,
+            None, 1, None, [1, None, None, None, [password, None, True]],
+            [
+                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+                1, [None, None, []], None, None, None, True
+            ]]
+
+        challenge_results = req(
+            self._CHALLENGE_URL, challenge_req,
+            'Logging in', 'Unable to log in')
+
+        if challenge_results is False:
+            return
+
+        login_res = try_get(challenge_results, lambda x: x[0][5], list)
+        if login_res:
+            login_msg = try_get(login_res, lambda x: x[5], compat_str)
+            warn('Unable to log in: %s' % (
+                'Invalid password'
+                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
+            return False
+
+        res = try_get(challenge_results, lambda x: x[0][-1], list)
+        if not res:
+            warn('Unable to extract result entry')
+            return False
+
+        login_challenge = try_get(res, lambda x: x[0][0], list)
+        if login_challenge:
+            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
+            if challenge_str == 'TWO_STEP_VERIFICATION':
+                # SEND_SUCCESS - TFA code has been successfully sent to phone
+                # QUOTA_EXCEEDED - reached the limit of TFA codes
+                status = try_get(login_challenge, lambda x: x[5], compat_str)
+                if status == 'QUOTA_EXCEEDED':
+                    warn('Exceeded the limit of TFA codes, try again later')
+                    return False
+
+                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+                if not tl:
+                    warn('Unable to extract TL')
+                    return False
+
+                tfa_code = self._get_tfa_info('2-step verification code')
+
+                if not tfa_code:
+                    warn(
+                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
+                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+                    return False
+
+                tfa_code = remove_start(tfa_code, 'G-')
+
+                tfa_req = [
+                    user_hash, None, 2, None,
+                    [
+                        9, None, None, None, None, None, None, None,
+                        [None, tfa_code, True, 2]
+                    ]]
+
+                tfa_results = req(
+                    self._TFA_URL.format(tl), tfa_req,
+                    'Submitting TFA code', 'Unable to submit TFA code')
+
+                if tfa_results is False:
+                    return False
+
+                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+                if tfa_res:
+                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+                    warn('Unable to finish TFA: %s' % (
+                        'Invalid TFA code'
+                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
+                    return False
+
+                check_cookie_url = try_get(
+                    tfa_results, lambda x: x[0][-1][2], compat_str)
+            else:
+                CHALLENGES = {
+                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
+                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
+                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
+                }
+                challenge = CHALLENGES.get(
+                    challenge_str,
+                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
+                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
+                return False
+        else:
+            check_cookie_url = try_get(res, lambda x: x[2], compat_str)
+
+        if not check_cookie_url:
+            warn('Unable to extract CheckCookie URL')
+            return False
+
+        check_cookie_results = self._download_webpage(
+            check_cookie_url, None, 'Checking cookie', fatal=False)
+
+        if check_cookie_results is False:
+            return False
+
+        if 'https://myaccount.google.com/' not in check_cookie_results:
+            warn('Unable to log in')
+            return False
+
+        return True
+
+    def _download_webpage_handle(self, *args, **kwargs):
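+        # Force the legacy (non-Polymer) page layout on every request;
+        # the HTML-scraping logic in this module targets the old markup.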
+        query = kwargs.get('query', {}).copy()
+        query['disable_polymer'] = 'true'
+        kwargs['query'] = query
+        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
+            *args, **compat_kwargs(kwargs))
+
+    def _real_initialize(self):
+        if self._downloader is None:
+            return
+        self._set_language()
+        if not self._login():
+            return
+
+
+class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
+    # Extract entries from page with "Load more" button
+    def _entries(self, page, playlist_id):
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            for entry in self._process_page(content_html):
+                yield entry
+
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            count = 0
+            retries = 3
+            while count <= retries:
+                try:
+                    # Downloading page may result in intermittent 5xx HTTP error
+                    # that is usually worked around with a retry
+                    more = self._download_json(
+                        'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
+                        'Downloading page #%s%s'
+                        % (page_num, ' (retry #%d)' % count if count else ''),
+                        transform_source=uppercase_escape,
+                        headers=self._YOUTUBE_CLIENT_HEADERS)
+                    break
+                except ExtractorError as e:
+                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
+                        count += 1
+                        if count <= retries:
+                            continue
+                    raise
+
+            content_html = more['content_html']
+            if not content_html.strip():
+                # Some webpages show a "Load more" button even though they
+                # have no more videos
+                break
+            more_widget_html = more['load_more_widget_html']
+
+
+class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+    def _process_page(self, content):
+        for video_id, video_title in self.extract_videos_from_page(content):
+            yield self.url_result(video_id, 'Youtube', video_id, video_title)
+
+    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
+        for mobj in re.finditer(video_re, page):
+            # The link with index 0 is not the first video of the playlist (not sure if this is still the case)
+            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+                continue
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(
+                mobj.group('title')) if 'title' in mobj.groupdict() else None
+            if video_title:
+                video_title = video_title.strip()
+            if video_title == '► Play all':
+                video_title = None
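+            # Deduplicate by video id, keeping the first non-empty title
+            # seen for each id.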
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+
+    def extract_videos_from_page(self, page):
+        ids_in_page = []
+        titles_in_page = []
+        self.extract_videos_from_page_impl(
+            self._VIDEO_RE, page, ids_in_page, titles_in_page)
+        return zip(ids_in_page, titles_in_page)
+
+
+class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
+    def _process_page(self, content):
+        for playlist_id in orderedSet(re.findall(
+                r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
+                content)):
+            yield self.url_result(
+                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        webpage = self._download_webpage(url, playlist_id)
+        title = self._og_search_title(webpage, fatal=False)
+        return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+
+
+class YoutubeIE(YoutubeBaseInfoExtractor):
+    IE_DESC = 'YouTube.com'
+    _VALID_URL = r"""(?x)^
+                     (
+                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
+                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
+                            (?:www\.)?deturl\.com/www\.youtube\.com/|
+                            (?:www\.)?pwnyoutube\.com/|
+                            (?:www\.)?hooktube\.com/|
+                            (?:www\.)?yourepeat\.com/|
+                            tube\.majestyc\.net/|
+                            # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
+                            (?:(?:www|dev)\.)?invidio\.us/|
+                            (?:(?:www|no)\.)?invidiou\.sh/|
+                            (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
+                            (?:www\.)?invidious\.kabi\.tk/|
+                            (?:www\.)?invidious\.13ad\.de/|
+                            (?:www\.)?invidious\.mastodon\.host/|
+                            (?:www\.)?invidious\.nixnet\.xyz/|
+                            (?:www\.)?invidious\.drycat\.fr/|
+                            (?:www\.)?tube\.poal\.co/|
+                            (?:www\.)?vid\.wxzm\.sx/|
+                            (?:www\.)?yewtu\.be/|
+                            (?:www\.)?yt\.elukerio\.org/|
+                            (?:www\.)?yt\.lelux\.fi/|
+                            (?:www\.)?invidious\.ggc-project\.de/|
+                            (?:www\.)?yt\.maisputain\.ovh/|
+                            (?:www\.)?invidious\.toot\.koeln/|
+                            (?:www\.)?invidious\.fdn\.fr/|
+                            (?:www\.)?watch\.nettohikari\.com/|
+                            (?:www\.)?kgg2m7yk5aybusll\.onion/|
+                            (?:www\.)?qklhadlycap4cnod\.onion/|
+                            (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
+                            (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
+                            (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
+                            (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
+                            (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
+                            (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
+                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
+                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
+                         (?:                                                  # the various things that can precede the ID:
+                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
+                             |(?:                                             # or the v= param in all its forms
+                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
+                                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
+                                 v=
+                             )
+                         ))
+                         |(?:
+                            youtu\.be|                                        # just youtu.be/xxxx
+                            vid\.plus|                                        # or vid.plus/xxxx
+                            zwearz\.com/watch|                                # or zwearz.com/watch/xxxx
+                         )/
+                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
+                         )
+                     )?                                                       # all until now is optional -> you can pass the naked ID
+                     ([0-9A-Za-z_-]{11})                                      # here it is: the YouTube video ID
+                     (?!.*?\blist=
+                        (?:
+                            %(playlist_id)s|                                  # combined list/video URLs are handled by the playlist IE
+                            WL                                                # WL are handled by the watch later IE
+                        )
+                     )
+                     (?(1).+)?                                                # if we found the ID, everything can follow
+                     $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
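+    # Patterns used to extract the player id (and extension) from the
+    # player JS/SWF URL, e.g. .../<id>/player_ias.vflset/en_US/base.js
+    # or a legacy URL carrying a vfl... token.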
+    _PLAYER_INFO_RE = (
+        r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
+        r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
+    )
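+    # Static itag -> format metadata table; entries supplement whatever
+    # the stream listings themselves report.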
+    _formats = {
+        '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+        '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+        '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
+        '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
+        '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
+        '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+        '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+        '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+        # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+        '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
+        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+        '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+        '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+        '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+        '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+        '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+        '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+
+        # 3D videos
+        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
+        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+
+        # Apple HTTP Live Streaming
+        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
+
+        # DASH mp4 video
+        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'},  # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
+        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
+
+        # Dash mp4 audio
+        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+        '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+        '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+        '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+        '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
+
+        # Dash webm
+        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
+        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+        '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+
+        # Dash webm audio
+        '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+        '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
+
+        # Dash webm audio with opus inside
+        '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+        '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+        '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
+
+        # RTMP (unnamed)
+        '_rtmp': {'protocol': 'rtmp'},
+
+        # av01 video only formats sometimes served with "unknown" codecs
+        '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+        '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+    }
+    _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
+
+    _GEO_BYPASS = False
+
+    IE_NAME = 'youtube'
+    _TESTS = [
+        {
+            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
+            'info_dict': {
+                'id': 'BaW_jenozKc',
+                'ext': 'mp4',
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'uploader': 'Philipp Hagemeister',
+                'uploader_id': 'phihag',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+                'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
+                'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
+                'upload_date': '20121002',
+                'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+                'categories': ['Science & Technology'],
+                'tags': ['youtube-dl'],
+                'duration': 10,
+                'view_count': int,
+                'like_count': int,
+                'dislike_count': int,
+                'start_time': 1,
+                'end_time': 9,
+            }
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
+            'note': 'Test generic use_cipher_signature video (#897)',
+            'info_dict': {
+                'id': 'UxxajLWwzqY',
+                'ext': 'mp4',
+                'upload_date': '20120506',
+                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
+                'alt_title': 'I Love It (feat. Charli XCX)',
+                'description': 'md5:19a2f98d9032b9311e686ed039564f63',
+                'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
+                         'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
+                         'iconic ep', 'iconic', 'love', 'it'],
+                'duration': 180,
+                'uploader': 'Icona Pop',
+                'uploader_id': 'IconaPop',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
+                'creator': 'Icona Pop',
+                'track': 'I Love It (feat. Charli XCX)',
+                'artist': 'Icona Pop',
+            }
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
+            'note': 'Test VEVO video with age protection (#956)',
+            'info_dict': {
+                'id': '07FYdnEawAQ',
+                'ext': 'mp4',
+                'upload_date': '20130703',
+                'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
+                'alt_title': 'Tunnel Vision',
+                'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
+                'duration': 419,
+                'uploader': 'justintimberlakeVEVO',
+                'uploader_id': 'justintimberlakeVEVO',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
+                'creator': 'Justin Timberlake',
+                'track': 'Tunnel Vision',
+                'artist': 'Justin Timberlake',
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
+            'note': 'Embed-only video (#1746)',
+            'info_dict': {
+                'id': 'yZIXLfi8CZQ',
+                'ext': 'mp4',
+                'upload_date': '20120608',
+                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
+                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
+                'uploader': 'SET India',
+                'uploader_id': 'setindia',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
+            'note': 'Use the first video ID in the URL',
+            'info_dict': {
+                'id': 'BaW_jenozKc',
+                'ext': 'mp4',
+                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
+                'uploader': 'Philipp Hagemeister',
+                'uploader_id': 'phihag',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
+                'upload_date': '20121002',
+                'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
+                'categories': ['Science & Technology'],
+                'tags': ['youtube-dl'],
+                'duration': 10,
+                'view_count': int,
+                'like_count': int,
+                'dislike_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
+            'note': '256k DASH audio (format 141) via DASH manifest',
+            'info_dict': {
+                'id': 'a9LDPn-MO4I',
+                'ext': 'm4a',
+                'upload_date': '20121002',
+                'uploader_id': '8KVIDEO',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
+                'description': '',
+                'uploader': '8KVIDEO',
+                'title': 'UHDTV TEST 8K VIDEO.mp4'
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '141',
+            },
+            'skip': 'format 141 not served anymore',
+        },
+        # DASH manifest with encrypted signature
+        {
+            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+            'info_dict': {
+                'id': 'IB3lcPjvWLA',
+                'ext': 'm4a',
+                'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
+                'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
+                'duration': 244,
+                'uploader': 'AfrojackVEVO',
+                'uploader_id': 'AfrojackVEVO',
+                'upload_date': '20131011',
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '141/bestaudio[ext=m4a]',
+            },
+        },
+        # JS player signature function name containing $
+        {
+            'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
+            'info_dict': {
+                'id': 'nfWlot6h_JM',
+                'ext': 'm4a',
+                'title': 'Taylor Swift - Shake It Off',
+                'description': 'md5:307195cd21ff7fa352270fe884570ef0',
+                'duration': 242,
+                'uploader': 'TaylorSwiftVEVO',
+                'uploader_id': 'TaylorSwiftVEVO',
+                'upload_date': '20140818',
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '141/bestaudio[ext=m4a]',
+            },
+        },
+        # Controversial video
+        {
+            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
+            'info_dict': {
+                'id': 'T4XJQO3qol8',
+                'ext': 'mp4',
+                'duration': 219,
+                'upload_date': '20100909',
+                'uploader': 'Amazing Atheist',
+                'uploader_id': 'TheAmazingAtheist',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
+                'title': 'Burning Everyone\'s Koran',
+                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
+            }
+        },
+        # Normal age-gated video (no VEVO, embed allowed)
+        {
+            'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
+            'info_dict': {
+                'id': 'HtVdAasjOgU',
+                'ext': 'mp4',
+                'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
+                'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
+                'duration': 142,
+                'uploader': 'The Witcher',
+                'uploader_id': 'WitcherGame',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
+                'upload_date': '20140605',
+                'age_limit': 18,
+            },
+        },
+        # Age-gate video with encrypted signature
+        {
+            'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
+            'info_dict': {
+                'id': '6kLq3WMV1nU',
+                'ext': 'mp4',
+                'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
+                'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
+                'duration': 246,
+                'uploader': 'LloydVEVO',
+                'uploader_id': 'LloydVEVO',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
+                'upload_date': '20110629',
+                'age_limit': 18,
+            },
+        },
+        # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
+        # YouTube Red ad is not captured for creator
+        {
+            'url': '__2ABJjxzNo',
+            'info_dict': {
+                'id': '__2ABJjxzNo',
+                'ext': 'mp4',
+                'duration': 266,
+                'upload_date': '20100430',
+                'uploader_id': 'deadmau5',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
+                'creator': 'Dada Life, deadmau5',
+                'description': 'md5:12c56784b8032162bb936a5f76d55360',
+                'uploader': 'deadmau5',
+                'title': 'Deadmau5 - Some Chords (HD)',
+                'alt_title': 'This Machine Kills Some Chords',
+            },
+            'expected_warnings': [
+                'DASH manifest missing',
+            ]
+        },
+        # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
+        {
+            'url': 'lqQg6PlCWgI',
+            'info_dict': {
+                'id': 'lqQg6PlCWgI',
+                'ext': 'mp4',
+                'duration': 6085,
+                'upload_date': '20150827',
+                'uploader_id': 'olympic',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
+                'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
+                'uploader': 'Olympic',
+                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games',
+            },
+            'params': {
+                'skip_download': 'requires avconv',
+            }
+        },
+        # Non-square pixels
+        {
+            'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
+            'info_dict': {
+                'id': '_b-2C3KPAM0',
+                'ext': 'mp4',
+                'stretched_ratio': 16 / 9.,
+                'duration': 85,
+                'upload_date': '20110310',
+                'uploader_id': 'AllenMeow',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
+                'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
+                'uploader': '孫ᄋᄅ',
+                'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
+            },
+        },
+        # url_encoded_fmt_stream_map is an empty string
+        {
+            'url': 'qEJwOuvDf7I',
+            'info_dict': {
+                'id': 'qEJwOuvDf7I',
+                'ext': 'webm',
+                'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
+                'description': '',
+                'upload_date': '20150404',
+                'uploader_id': 'spbelect',
+                'uploader': 'Наблюдатели Петербурга',
+            },
+            'params': {
+                'skip_download': 'requires avconv',
+            },
+            'skip': 'This live event has ended.',
+        },
+        # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
+        {
+            'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
+            'info_dict': {
+                'id': 'FIl7x6_3R5Y',
+                'ext': 'webm',
+                'title': 'md5:7b81415841e02ecd4313668cde88737a',
+                'description': 'md5:116377fd2963b81ec4ce64b542173306',
+                'duration': 220,
+                'upload_date': '20150625',
+                'uploader_id': 'dorappi2000',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
+                'uploader': 'dorappi2000',
+                'formats': 'mincount:31',
+            },
+            'skip': 'no longer relevant',
+        },
+        # DASH manifest with segment_list
+        {
+            'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
+            'md5': '8ce563a1d667b599d21064e982ab9e31',
+            'info_dict': {
+                'id': 'CsmdDsKjzN8',
+                'ext': 'mp4',
+                'upload_date': '20150501',  # According to '<meta itemprop="datePublished"', but in other places it's 20150510
+                'uploader': 'Airtek',
+                'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
+                'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+                'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
+            },
+            'params': {
+                'youtube_include_dash_manifest': True,
+                'format': '135',  # bestvideo
+            },
+            'skip': 'This live event has ended.',
+        },
+        {
+            # Multifeed videos (multiple cameras), URL is for Main Camera
+            'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+            'info_dict': {
+                'id': 'jqWvoWXjCVs',
+                'title': 'teamPGP: Rocket League Noob Stream',
+                'description': 'md5:dc7872fb300e143831327f1bae3af010',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'id': 'jqWvoWXjCVs',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'duration': 7335,
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                    'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
+                },
+            }, {
+                'info_dict': {
+                    'id': '6h8e8xoXJzg',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'duration': 7337,
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                    'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
+                },
+            }, {
+                'info_dict': {
+                    'id': 'PUOgX5z9xZw',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'duration': 7337,
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                    'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
+                },
+            }, {
+                'info_dict': {
+                    'id': 'teuwxikvS5k',
+                    'ext': 'mp4',
+                    'title': 'teamPGP: Rocket League Noob Stream (zim)',
+                    'description': 'md5:dc7872fb300e143831327f1bae3af010',
+                    'duration': 7334,
+                    'upload_date': '20150721',
+                    'uploader': 'Beer Games Beer',
+                    'uploader_id': 'beergamesbeer',
+                    'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+                    'license': 'Standard YouTube License',
+                },
+            }],
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'This video is not available.',
+        },
+        {
+            # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
+            'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
+            'info_dict': {
+                'id': 'gVfLd0zydlo',
+                'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
+            },
+            'playlist_count': 2,
+            'skip': 'Not multifeed anymore',
+        },
+        {
+            'url': 'https://vid.plus/FlRa-iH7PGw',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
+            'only_matching': True,
+        },
+        {
+            # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
+            # Also tests cut-off URL expansion in video description (see
+            # https://github.com/ytdl-org/youtube-dl/issues/1892,
+            # https://github.com/ytdl-org/youtube-dl/issues/8164)
+            'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
+            'info_dict': {
+                'id': 'lsguqyKfVQg',
+                'ext': 'mp4',
+                'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
+                'alt_title': 'Dark Walk - Position Music',
+                'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
+                'duration': 133,
+                'upload_date': '20151119',
+                'uploader_id': 'IronSoulElf',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
+                'uploader': 'IronSoulElf',
+                'creator': 'Todd Haberman,  Daniel Law Heath and Aaron Kaplan',
+                'track': 'Dark Walk - Position Music',
+                'artist': 'Todd Haberman,  Daniel Law Heath and Aaron Kaplan',
+                'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
+            'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
+            'only_matching': True,
+        },
+        {
+            # Video with yt:stretch=17:0
+            'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
+            'info_dict': {
+                'id': 'Q39EVAstoRM',
+                'ext': 'mp4',
+                'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
+                'description': 'md5:ee18a25c350637c8faff806845bddee9',
+                'upload_date': '20151107',
+                'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
+                'uploader': 'CH GAMER DROID',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'This video does not exist.',
+        },
+        {
+            # Video licensed under Creative Commons
+            'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
+            'info_dict': {
+                'id': 'M4gD1WSo5mA',
+                'ext': 'mp4',
+                'title': 'md5:e41008789470fc2533a3252216f1c1d1',
+                'description': 'md5:a677553cf0840649b731a3024aeff4cc',
+                'duration': 721,
+                'upload_date': '20150127',
+                'uploader_id': 'BerkmanCenter',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
+                'uploader': 'The Berkman Klein Center for Internet & Society',
+                'license': 'Creative Commons Attribution license (reuse allowed)',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Channel-like uploader_url
+            'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
+            'info_dict': {
+                'id': 'eQcmzGIKrzg',
+                'ext': 'mp4',
+                'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
+                'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+                'duration': 4060,
+                'upload_date': '20151119',
+                'uploader': 'Bernie Sanders',
+                'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+                'license': 'Creative Commons Attribution license (reuse allowed)',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
+            'only_matching': True,
+        },
+        {
+            # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
+            'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
+            'only_matching': True,
+        },
+        {
+            # Rental video preview
+            'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
+            'info_dict': {
+                'id': 'uGpuVWrhIzE',
+                'ext': 'mp4',
+                'title': 'Piku - Trailer',
+                'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
+                'upload_date': '20150811',
+                'uploader': 'FlixMatrix',
+                'uploader_id': 'FlixMatrixKaravan',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
+                'license': 'Standard YouTube License',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'This video is not available.',
+        },
+        {
+            # YouTube Red video with episode data
+            'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
+            'info_dict': {
+                'id': 'iqKdEhx-dD4',
+                'ext': 'mp4',
+                'title': 'Isolation - Mind Field (Ep 1)',
+                'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
+                'duration': 2085,
+                'upload_date': '20170118',
+                'uploader': 'Vsauce',
+                'uploader_id': 'Vsauce',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
+                'series': 'Mind Field',
+                'season_number': 1,
+                'episode_number': 1,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': [
+                'Skipping DASH manifest',
+            ],
+        },
+        {
+            # The following content has been identified by the YouTube community
+            # as inappropriate or offensive to some audiences.
+            'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
+            'info_dict': {
+                'id': '6SJNVb0GnPI',
+                'ext': 'mp4',
+                'title': 'Race Differences in Intelligence',
+                'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
+                'duration': 965,
+                'upload_date': '20140124',
+                'uploader': 'New Century Foundation',
+                'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # itag 212
+            'url': '1t24XAntNCY',
+            'only_matching': True,
+        },
+        {
+            # geo restricted to JP
+            'url': 'sJL6WA-aGkQ',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://invidio.us/watch?v=BaW_jenozKc',
+            'only_matching': True,
+        },
+        {
+            # DRM protected
+            'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
+            'only_matching': True,
+        },
+        {
+            # Video with unsupported adaptive stream type formats
+            'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
+            'info_dict': {
+                'id': 'Z4Vy8R84T1U',
+                'ext': 'mp4',
+                'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 433,
+                'upload_date': '20130923',
+                'uploader': 'Amelia Putri Harwita',
+                'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
+                'formats': 'maxcount:10',
+            },
+            'params': {
+                'skip_download': True,
+                'youtube_include_dash_manifest': False,
+            },
+            'skip': 'no longer relevant',
+        },
+        {
+            # YouTube Music auto-generated description
+            'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
+            'info_dict': {
+                'id': 'MgNrAu2pzNs',
+                'ext': 'mp4',
+                'title': 'Voyeur Girl',
+                'description': 'md5:7ae382a65843d6df2685993e90a8628f',
+                'upload_date': '20190312',
+                'uploader': 'Stephen - Topic',
+                'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
+                'artist': 'Stephen',
+                'track': 'Voyeur Girl',
+                'album': 'it\'s too much love to know my dear',
+                'release_date': '20190313',
+                'release_year': 2019,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # YouTube Music auto-generated description
+            # Retrieve the 'artist' field from 'Artist:' in the video description
+            # when it is present on a YouTube Music video
+            'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
+            'info_dict': {
+                'id': 'k0jLE7tTwjY',
+                'ext': 'mp4',
+                'title': 'Latch Feat. Sam Smith',
+                'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
+                'upload_date': '20150110',
+                'uploader': 'Various Artists - Topic',
+                'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
+                'artist': 'Disclosure',
+                'track': 'Latch Feat. Sam Smith',
+                'album': 'Latch Featuring Sam Smith',
+                'release_date': '20121008',
+                'release_year': 2012,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # YouTube Music auto-generated description
+            # Handle multiple artists on a YouTube Music video
+            'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
+            'info_dict': {
+                'id': '74qn0eJSjpA',
+                'ext': 'mp4',
+                'title': 'Eastside',
+                'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
+                'upload_date': '20180710',
+                'uploader': 'Benny Blanco - Topic',
+                'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
+                'artist': 'benny blanco, Halsey, Khalid',
+                'track': 'Eastside',
+                'album': 'Eastside',
+                'release_date': '20180713',
+                'release_year': 2018,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # YouTube Music auto-generated description
+            # Handle a YouTube Music video with release_year and no release_date
+            'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
+            'info_dict': {
+                'id': '-hcAI0g-f5M',
+                'ext': 'mp4',
+                'title': 'Put It On Me',
+                'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
+                'upload_date': '20180426',
+                'uploader': 'Matt Maeson - Topic',
+                'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
+                'artist': 'Matt Maeson',
+                'track': 'Put It On Me',
+                'album': 'The Hearse',
+                'release_date': None,
+                'release_year': 2018,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
+            'only_matching': True,
+        },
+        {
+            # invalid -> valid video id redirection
+            'url': 'DJztXj2GPfl',
+            'info_dict': {
+                'id': 'DJztXj2GPfk',
+                'ext': 'mp4',
+                'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
+                'description': 'md5:bf577a41da97918e94fa9798d9228825',
+                'upload_date': '20090125',
+                'uploader': 'Prochorowka',
+                'uploader_id': 'Prochorowka',
+                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
+                'artist': 'Panjabi MC',
+                'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
+                'album': 'Beware of the Boys (Mundian To Bach Ke)',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        }
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super(YoutubeIE, self).__init__(*args, **kwargs)
+        self._player_cache = {}
+
+    def report_video_info_webpage_download(self, video_id):
+        """Report attempt to download video info webpage."""
+        self.to_screen('%s: Downloading video info webpage' % video_id)
+
+    def report_information_extraction(self, video_id):
+        """Report attempt to extract video information."""
+        self.to_screen('%s: Extracting video information' % video_id)
+
+    def report_unavailable_format(self, video_id, format):
+        """Report extracted video URL."""
+        self.to_screen('%s: Format %s not available' % (video_id, format))
+
+    def report_rtmp_download(self):
+        """Indicate the download will use the RTMP protocol."""
+        self.to_screen('RTMP download detected')
+
+    def _signature_cache_id(self, example_sig):
+        """ Return a string representation of a signature """
+        return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
+
+    @classmethod
+    def _extract_player_info(cls, player_url):
+        for player_re in cls._PLAYER_INFO_RE:
+            id_m = re.search(player_re, player_url)
+            if id_m:
+                break
+        else:
+            raise ExtractorError('Cannot identify player %r' % player_url)
+        return id_m.group('ext'), id_m.group('id')
+
+    def _extract_signature_function(self, video_id, player_url, example_sig):
+        player_type, player_id = self._extract_player_info(player_url)
+
+        # Read from filesystem cache
+        func_id = '%s_%s_%s' % (
+            player_type, player_id, self._signature_cache_id(example_sig))
+        assert os.path.basename(func_id) == func_id
+
+        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
+        if cache_spec is not None:
+            return lambda s: ''.join(s[i] for i in cache_spec)
+
+        download_note = (
+            'Downloading player %s' % player_url
+            if self._downloader.params.get('verbose') else
+            'Downloading %s player %s' % (player_type, player_id)
+        )
+        if player_type == 'js':
+            code = self._download_webpage(
+                player_url, video_id,
+                note=download_note,
+                errnote='Download of %s failed' % player_url)
+            res = self._parse_sig_js(code)
+        elif player_type == 'swf':
+            urlh = self._request_webpage(
+                player_url, video_id,
+                note=download_note,
+                errnote='Download of %s failed' % player_url)
+            code = urlh.read()
+            res = self._parse_sig_swf(code)
+        else:
+            assert False, 'Invalid player type %r' % player_type
+
+        test_string = ''.join(map(compat_chr, range(len(example_sig))))
+        cache_res = res(test_string)
+        cache_spec = [ord(c) for c in cache_res]
+
+        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
+        return res
+
+    def _print_sig_code(self, func, example_sig):
+        def gen_sig_code(idxs):
+            def _genslice(start, end, step):
+                starts = '' if start == 0 else str(start)
+                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
+                steps = '' if step == 1 else (':%d' % step)
+                return 's[%s%s%s]' % (starts, ends, steps)
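+                # For illustration (assumed examples, not from the original):
+                #   _genslice(2, 6, 1)  -> 's[2:7]'
+                #   _genslice(9, 3, -1) -> 's[9:2:-1]'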
+
+            step = None
+            # Squelch pyflakes warnings - start will be set when step is set
+            start = '(Never used)'
+            for i, prev in zip(idxs[1:], idxs[:-1]):
+                if step is not None:
+                    if i - prev == step:
+                        continue
+                    yield _genslice(start, prev, step)
+                    step = None
+                    continue
+                if i - prev in [-1, 1]:
+                    step = i - prev
+                    start = prev
+                    continue
+                else:
+                    yield 's[%d]' % prev
+            if step is None:
+                yield 's[%d]' % i
+            else:
+                yield _genslice(start, i, step)
+
+        test_string = ''.join(map(compat_chr, range(len(example_sig))))
+        cache_res = func(test_string)
+        cache_spec = [ord(c) for c in cache_res]
+        expr_code = ' + '.join(gen_sig_code(cache_spec))
+        signature_id_tuple = '(%s)' % (
+            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
+        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
+                '    return %s\n') % (signature_id_tuple, expr_code)
+        self.to_screen('Extracted signature function:\n' + code)
+
+    def _parse_sig_js(self, jscode):
+        funcname = self._search_regex(
+            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+             # Obsolete patterns
+             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
+            jscode, 'Initial JS player signature function name', group='sig')
+
+        jsi = JSInterpreter(jscode)
+        initial_function = jsi.extract_function(funcname)
+        return lambda s: initial_function([s])
+
+    def _parse_sig_swf(self, file_contents):
+        swfi = SWFInterpreter(file_contents)
+        TARGET_CLASSNAME = 'SignatureDecipher'
+        searched_class = swfi.extract_class(TARGET_CLASSNAME)
+        initial_function = swfi.extract_function(searched_class, 'decipher')
+        return lambda s: initial_function([s])
+
+    def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
+        """Turn the encrypted s field into a working signature"""
+
+        if player_url is None:
+            raise ExtractorError('Cannot decrypt signature without player_url')
+
+        if player_url.startswith('//'):
+            player_url = 'https:' + player_url
+        elif not re.match(r'https?://', player_url):
+            player_url = compat_urlparse.urljoin(
+                'https://www.youtube.com', player_url)
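+        # e.g. (illustrative) a relative player_url like '/s/player/xxx/base.js'
+        # is resolved to 'https://www.youtube.com/s/player/xxx/base.js'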
+        try:
+            player_id = (player_url, self._signature_cache_id(s))
+            if player_id not in self._player_cache:
+                func = self._extract_signature_function(
+                    video_id, player_url, s
+                )
+                self._player_cache[player_id] = func
+            func = self._player_cache[player_id]
+            if self._downloader.params.get('youtube_print_sig_code'):
+                self._print_sig_code(func, s)
+            return func(s)
+        except Exception as e:
+            tb = traceback.format_exc()
+            raise ExtractorError(
+                'Signature extraction failed: ' + tb, cause=e)
+
+    def _get_subtitles(self, video_id, webpage, has_live_chat_replay):
+        try:
+            subs_doc = self._download_xml(
+                'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
+                video_id, note=False)
+        except ExtractorError as err:
+            self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
+            return {}
+
+        sub_lang_list = {}
+        for track in subs_doc.findall('track'):
+            lang = track.attrib['lang_code']
+            if lang in sub_lang_list:
+                continue
+            sub_formats = []
+            for ext in self._SUBTITLE_FORMATS:
+                params = compat_urllib_parse_urlencode({
+                    'lang': lang,
+                    'v': video_id,
+                    'fmt': ext,
+                    'name': track.attrib['name'].encode('utf-8'),
+                })
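+                # The resulting subtitle URL looks like (illustrative, assumed values):
+                #   https://www.youtube.com/api/timedtext?lang=en&v=<video_id>&fmt=vtt&name=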
+                sub_formats.append({
+                    'url': 'https://www.youtube.com/api/timedtext?' + params,
+                    'ext': ext,
+                })
+            sub_lang_list[lang] = sub_formats
+        if has_live_chat_replay:
+            sub_lang_list['live_chat'] = [
+                {
+                    'video_id': video_id,
+                    'ext': 'json',
+                    'protocol': 'youtube_live_chat_replay',
+                },
+            ]
+        if not sub_lang_list:
+            self._downloader.report_warning('video doesn\'t have subtitles')
+            return {}
+        return sub_lang_list
+
+    def _get_ytplayer_config(self, video_id, webpage):
+        patterns = (
+            # User data may contain arbitrary character sequences that may affect
+            # JSON extraction with regex, e.g. if it contains '};' the second
+            # regex won't capture the whole JSON. Work around this by trying the
+            # more specific regex first; proper quoted string handling, to be
+            # implemented in the future, will replace this workaround (see
+            # https://github.com/ytdl-org/youtube-dl/issues/7468,
+            # https://github.com/ytdl-org/youtube-dl/pull/7599)
+            r';ytplayer\.config\s*=\s*({.+?});ytplayer',
+            r';ytplayer\.config\s*=\s*({.+?});',
+        )
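+        # For illustration (assumed shape, not from the original source), the
+        # webpage contains something like:
+        #   ;ytplayer.config = {"args": {...}, "sts": 12345};ytplayer.load();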
+        config = self._search_regex(
+            patterns, webpage, 'ytplayer.config', default=None)
+        if config:
+            return self._parse_json(
+                uppercase_escape(config), video_id, fatal=False)
+
+    def _get_yt_initial_data(self, video_id, webpage):
+        config = self._search_regex(
+            (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
+             r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
+            webpage, 'ytInitialData', default=None)
+        if config:
+            return self._parse_json(
+                uppercase_escape(config), video_id, fatal=False)
+
+    def _get_automatic_captions(self, video_id, webpage):
+        """We need the webpage for getting the captions url, pass it as an
+           argument to speed up the process."""
+        self.to_screen('%s: Looking for automatic captions' % video_id)
+        player_config = self._get_ytplayer_config(video_id, webpage)
+        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
+        if not player_config:
+            self._downloader.report_warning(err_msg)
+            return {}
+        try:
+            args = player_config['args']
+            caption_url = args.get('ttsurl')
+            if caption_url:
+                timestamp = args['timestamp']
+                # We get the available subtitles
+                list_params = compat_urllib_parse_urlencode({
+                    'type': 'list',
+                    'tlangs': 1,
+                    'asrs': 1,
+                })
+                list_url = caption_url + '&' + list_params
+                caption_list = self._download_xml(list_url, video_id)
+                original_lang_node = caption_list.find('track')
+                if original_lang_node is None:
+                    self._downloader.report_warning('Video doesn\'t have automatic captions')
+                    return {}
+                original_lang = original_lang_node.attrib['lang_code']
+                caption_kind = original_lang_node.attrib.get('kind', '')
+
+                sub_lang_list = {}
+                for lang_node in caption_list.findall('target'):
+                    sub_lang = lang_node.attrib['lang_code']
+                    sub_formats = []
+                    for ext in self._SUBTITLE_FORMATS:
+                        params = compat_urllib_parse_urlencode({
+                            'lang': original_lang,
+                            'tlang': sub_lang,
+                            'fmt': ext,
+                            'ts': timestamp,
+                            'kind': caption_kind,
+                        })
+                        sub_formats.append({
+                            'url': caption_url + '&' + params,
+                            'ext': ext,
+                        })
+                    sub_lang_list[sub_lang] = sub_formats
+                return sub_lang_list
+
+            def make_captions(sub_url, sub_langs):
+                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
+                caption_qs = compat_parse_qs(parsed_sub_url.query)
+                captions = {}
+                for sub_lang in sub_langs:
+                    sub_formats = []
+                    for ext in self._SUBTITLE_FORMATS:
+                        caption_qs.update({
+                            'tlang': [sub_lang],
+                            'fmt': [ext],
+                        })
+                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
+                            query=compat_urllib_parse_urlencode(caption_qs, True)))
+                        sub_formats.append({
+                            'url': sub_url,
+                            'ext': ext,
+                        })
+                    captions[sub_lang] = sub_formats
+                return captions
+
+            # New captions format as of 22.06.2017
+            player_response = args.get('player_response')
+            if player_response and isinstance(player_response, compat_str):
+                player_response = self._parse_json(
+                    player_response, video_id, fatal=False)
+                if player_response:
+                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+                    base_url = renderer['captionTracks'][0]['baseUrl']
+                    sub_lang_list = []
+                    for lang in renderer['translationLanguages']:
+                        lang_code = lang.get('languageCode')
+                        if lang_code:
+                            sub_lang_list.append(lang_code)
+                    return make_captions(base_url, sub_lang_list)
+
+            # Some videos don't provide ttsurl but rather caption_tracks and
+            # caption_translation_languages (e.g. 20LmZk1hakA)
+            # No longer used as of 22.06.2017
+            caption_tracks = args['caption_tracks']
+            caption_translation_languages = args['caption_translation_languages']
+            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+            sub_lang_list = []
+            for lang in caption_translation_languages.split(','):
+                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+                sub_lang = lang_qs.get('lc', [None])[0]
+                if sub_lang:
+                    sub_lang_list.append(sub_lang)
+            return make_captions(caption_url, sub_lang_list)
+        # An extractor error can be raised by the download process if there are
+        # no automatic captions but there are subtitles
+        except (KeyError, IndexError, ExtractorError):
+            self._downloader.report_warning(err_msg)
+            return {}
+
+    def _mark_watched(self, video_id, video_info, player_response):
+        playback_url = url_or_none(try_get(
+            player_response,
+            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
+            video_info, lambda x: x['videostats_playback_base_url'][0]))
+        if not playback_url:
+            return
+        parsed_playback_url = compat_urlparse.urlparse(playback_url)
+        qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+
+        # The cpn generation algorithm is reverse-engineered from base.js.
+        # In fact it works even with a dummy cpn.
+        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
+
+        qs.update({
+            'ver': ['2'],
+            'cpn': [cpn],
+        })
+        playback_url = compat_urlparse.urlunparse(
+            parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+        self._download_webpage(
+            playback_url, video_id, 'Marking watched',
+            'Unable to mark watched', fatal=False)
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Embedded YouTube player
+        entries = [
+            unescapeHTML(mobj.group('url'))
+            for mobj in re.finditer(r'''(?x)
+            (?:
+                <iframe[^>]+?src=|
+                data-video-url=|
+                <embed[^>]+?src=|
+                embedSWF\(?:\s*|
+                <object[^>]+data=|
+                new\s+SWFObject\(
+            )
+            (["\'])
+                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+                (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
+            \1''', webpage)]
+
+        # lazyYT YouTube embed
+        entries.extend(list(map(
+            unescapeHTML,
+            re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
+
+        # Wordpress "YouTube Video Importer" plugin
+        matches = re.findall(r'''(?x)<div[^>]+
+            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
+        entries.extend(m[-1] for m in matches)
+
+        return entries
+
+    @staticmethod
+    def _extract_url(webpage):
+        urls = YoutubeIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    @classmethod
+    def extract_id(cls, url):
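+        # Illustrative example (assumed, not from the original source):
+        #   extract_id('https://www.youtube.com/watch?v=BaW_jenozKc') == 'BaW_jenozKc'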
+        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        video_id = mobj.group(2)
+        return video_id
+
+    def _extract_chapters_from_json(self, webpage, video_id, duration):
+        if not webpage:
+            return
+        initial_data = self._parse_json(
+            self._search_regex(
+                r'window\["ytInitialData"\] = (.+);\n', webpage,
+                'player args', default='{}'),
+            video_id, fatal=False)
+        if not initial_data or not isinstance(initial_data, dict):
+            return
+        chapters_list = try_get(
+            initial_data,
+            lambda x: x['playerOverlays']
+                       ['playerOverlayRenderer']
+                       ['decoratedPlayerBarRenderer']
+                       ['decoratedPlayerBarRenderer']
+                       ['playerBar']
+                       ['chapteredPlayerBarRenderer']
+                       ['chapters'],
+            list)
+        if not chapters_list:
+            return
+
+        def chapter_time(chapter):
+            return float_or_none(
+                try_get(
+                    chapter,
+                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
+                    int),
+                scale=1000)
+        chapters = []
+        for next_num, chapter in enumerate(chapters_list, start=1):
+            start_time = chapter_time(chapter)
+            if start_time is None:
+                continue
+            end_time = (chapter_time(chapters_list[next_num])
+                        if next_num < len(chapters_list) else duration)
+            if end_time is None:
+                continue
+            title = try_get(
+                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
+                compat_str)
+            chapters.append({
+                'start_time': start_time,
+                'end_time': end_time,
+                'title': title,
+            })
+        return chapters
+
+    @staticmethod
+    def _extract_chapters_from_description(description, duration):
+        if not description:
+            return None
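+        # A matching description line looks like (assumed example, not from the
+        # original source):
+        #   <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00)">0:00</a> Intro<br />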
+        chapter_lines = re.findall(
+            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
+            description)
+        if not chapter_lines:
+            return None
+        chapters = []
+        for next_num, (chapter_line, time_point) in enumerate(
+                chapter_lines, start=1):
+            start_time = parse_duration(time_point)
+            if start_time is None:
+                continue
+            if start_time > duration:
+                break
+            end_time = (duration if next_num == len(chapter_lines)
+                        else parse_duration(chapter_lines[next_num][1]))
+            if end_time is None:
+                continue
+            if end_time > duration:
+                end_time = duration
+            if start_time > end_time:
+                break
+            chapter_title = re.sub(
+                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
+            chapter_title = re.sub(r'\s+', ' ', chapter_title)
+            chapters.append({
+                'start_time': start_time,
+                'end_time': end_time,
+                'title': chapter_title,
+            })
+        return chapters
+
+    def _extract_chapters(self, webpage, description, video_id, duration):
+        return (self._extract_chapters_from_json(webpage, video_id, duration)
+                or self._extract_chapters_from_description(description, duration))
+
+    def _real_extract(self, url):
+        url, smuggled_data = unsmuggle_url(url, {})
+
+        proto = (
+            'http' if self._downloader.params.get('prefer_insecure', False)
+            else 'https')
+
+        start_time = None
+        end_time = None
+        parsed_url = compat_urllib_parse_urlparse(url)
+        for component in [parsed_url.fragment, parsed_url.query]:
+            query = compat_parse_qs(component)
+            if start_time is None and 't' in query:
+                start_time = parse_duration(query['t'][0])
+            if start_time is None and 'start' in query:
+                start_time = parse_duration(query['start'][0])
+            if end_time is None and 'end' in query:
+                end_time = parse_duration(query['end'][0])
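+        # e.g. (illustrative) a URL fragment like '#t=1m30s' yields start_time = 90.0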
+
+        # Extract the original video URL from a redirecting URL (e.g. age verification) using the next_url parameter
+        mobj = re.search(self._NEXT_URL_RE, url)
+        if mobj:
+            url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
+        video_id = self.extract_id(url)
+
+        # Get video webpage
+        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
+        video_webpage, urlh = self._download_webpage_handle(url, video_id)
+
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
+        video_id = qs.get('v', [None])[0] or video_id
+
+        # Attempt to extract SWF player URL
+        mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
+        if mobj is not None:
+            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
+        else:
+            player_url = None
+
+        dash_mpds = []
+
+        def add_dash_mpd(video_info):
+            dash_mpd = video_info.get('dashmpd')
+            if dash_mpd and dash_mpd[0] not in dash_mpds:
+                dash_mpds.append(dash_mpd[0])
+
+        def add_dash_mpd_pr(pl_response):
+            dash_mpd = url_or_none(try_get(
+                pl_response, lambda x: x['streamingData']['dashManifestUrl'],
+                compat_str))
+            if dash_mpd and dash_mpd not in dash_mpds:
+                dash_mpds.append(dash_mpd)
+
+        is_live = None
+        view_count = None
+
+        def extract_view_count(v_info):
+            return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
+
+        def extract_player_response(player_response, video_id):
+            pl_response = str_or_none(player_response)
+            if not pl_response:
+                return
+            pl_response = self._parse_json(pl_response, video_id, fatal=False)
+            if isinstance(pl_response, dict):
+                add_dash_mpd_pr(pl_response)
+                return pl_response
+
+        player_response = {}
+
+        # Get video info
+        video_info = {}
+        embed_webpage = None
+        if self._html_search_meta('og:restrictions:age', video_webpage, default=None) == "18+":
+            age_gate = True
+            # We simulate accessing the video from www.youtube.com/v/{video_id};
+            # this can be viewed without logging into YouTube
+            url = proto + '://www.youtube.com/embed/%s' % video_id
+            embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
+            data = compat_urllib_parse_urlencode({
+                'video_id': video_id,
+                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                'sts': self._search_regex(
+                    r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
+            })
+            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
+            try:
+                video_info_webpage = self._download_webpage(
+                    video_info_url, video_id,
+                    note='Refetching age-gated info webpage',
+                    errnote='unable to download video info webpage')
+            except ExtractorError:
+                video_info_webpage = None
+            if video_info_webpage:
+                video_info = compat_parse_qs(video_info_webpage)
+                pl_response = video_info.get('player_response', [None])[0]
+                player_response = extract_player_response(pl_response, video_id)
+                add_dash_mpd(video_info)
+                view_count = extract_view_count(video_info)
+        else:
+            age_gate = False
+            # Try looking directly into the video webpage
+            ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
+            if ytplayer_config:
+                args = ytplayer_config['args']
+                if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
+                    # Convert to the same format returned by compat_parse_qs
+                    video_info = dict((k, [v]) for k, v in args.items())
+                    add_dash_mpd(video_info)
+                # The rental video is not rented but a preview is available (e.g.
+                # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
+                # https://github.com/ytdl-org/youtube-dl/issues/10532)
+                if not video_info and args.get('ypc_vid'):
+                    return self.url_result(
+                        args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
+                if args.get('livestream') == '1' or args.get('live_playback') == 1:
+                    is_live = True
+                if not player_response:
+                    player_response = extract_player_response(args.get('player_response'), video_id)
+            if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+                add_dash_mpd_pr(player_response)
+
+        def extract_unavailable_message():
+            messages = []
+            for tag, kind in (('h1', 'message'), ('div', 'submessage')):
+                msg = self._html_search_regex(
+                    r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
+                    video_webpage, 'unavailable %s' % kind, default=None)
+                if msg:
+                    messages.append(msg)
+            if messages:
+                return '\n'.join(messages)
+
+        if not video_info and not player_response:
+            unavailable_message = extract_unavailable_message()
+            if not unavailable_message:
+                unavailable_message = 'Unable to extract video data'
+            raise ExtractorError(
+                'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
+
+        if not isinstance(video_info, dict):
+            video_info = {}
+
+        video_details = try_get(
+            player_response, lambda x: x['videoDetails'], dict) or {}
+
+        microformat = try_get(
+            player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
+
+        video_title = video_info.get('title', [None])[0] or video_details.get('title')
+        if not video_title:
+            self._downloader.report_warning('Unable to extract video title')
+            video_title = '_'
+
+        description_original = video_description = get_element_by_id("eow-description", video_webpage)
+        if video_description:
+
+            def replace_url(m):
+                redir_url = compat_urlparse.urljoin(url, m.group(1))
+                parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
+                if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
+                    qs = compat_parse_qs(parsed_redir_url.query)
+                    q = qs.get('q')
+                    if q and q[0]:
+                        return q[0]
+                return redir_url
+
+            description_original = video_description = re.sub(r'''(?x)
+                <a\s+
+                    (?:[a-zA-Z-]+="[^"]*"\s+)*?
+                    (?:title|href)="([^"]+)"\s+
+                    (?:[a-zA-Z-]+="[^"]*"\s+)*?
+                    class="[^"]*"[^>]*>
+                [^<]+\.{3}\s*
+                </a>
+            ''', replace_url, video_description)
+            video_description = clean_html(video_description)
+        else:
+            video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage)
+
+        if not smuggled_data.get('force_singlefeed', False):
+            if not self._downloader.params.get('noplaylist'):
+                multifeed_metadata_list = try_get(
+                    player_response,
+                    lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
+                    compat_str) or try_get(
+                    video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
+                if multifeed_metadata_list:
+                    entries = []
+                    feed_ids = []
+                    for feed in multifeed_metadata_list.split(','):
+                        # Unquote only after splitting on comma (,): unquoted textual
+                        # fields may themselves contain commas (see
+                        # https://github.com/ytdl-org/youtube-dl/issues/8536)
+                        feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
+
+                        def feed_entry(name):
+                            return try_get(feed_data, lambda x: x[name][0], compat_str)
+
+                        feed_id = feed_entry('id')
+                        if not feed_id:
+                            continue
+                        feed_title = feed_entry('title')
+                        title = video_title
+                        if feed_title:
+                            title += ' (%s)' % feed_title
+                        entries.append({
+                            '_type': 'url_transparent',
+                            'ie_key': 'Youtube',
+                            'url': smuggle_url(
+                                '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+                                {'force_singlefeed': True}),
+                            'title': title,
+                        })
+                        feed_ids.append(feed_id)
+                    self.to_screen(
+                        'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+                        % (', '.join(feed_ids), video_id))
+                    return self.playlist_result(entries, video_id, video_title, video_description)
+            else:
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+        if view_count is None:
+            view_count = extract_view_count(video_info)
+        if view_count is None and video_details:
+            view_count = int_or_none(video_details.get('viewCount'))
+        if view_count is None and microformat:
+            view_count = int_or_none(microformat.get('viewCount'))
+
+        if is_live is None:
+            is_live = bool_or_none(video_details.get('isLive'))
+
+        has_live_chat_replay = False
+        if not is_live:
+            yt_initial_data = self._get_yt_initial_data(video_id, video_webpage)
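+            # Probe the initial data for a live chat replay continuation token;
+            # its presence alone indicates that a chat replay is available.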
+            try:
+                yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
+                has_live_chat_replay = True
+            except (KeyError, IndexError, TypeError):
+                pass
+
+        # Check for "rental" videos
+        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
+            raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
+
+        def _extract_filesize(media_url):
+            return int_or_none(self._search_regex(
+                r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
+
+        streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
+        streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
+
+        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
+            self.report_rtmp_download()
+            formats = [{
+                'format_id': '_rtmp',
+                'protocol': 'rtmp',
+                'url': video_info['conn'][0],
+                'player_url': player_url,
+            }]
+        elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
+            encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
+            if 'rtmpe%3Dyes' in encoded_url_map:
+                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
+            formats = []
+            formats_spec = {}
+            fmt_list = video_info.get('fmt_list', [''])[0]
+            if fmt_list:
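+                # Each fmt_list entry typically looks like '22/1280x720/9/0/115': itag, then WxH resolution.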
+                for fmt in fmt_list.split(','):
+                    spec = fmt.split('/')
+                    if len(spec) > 1:
+                        width_height = spec[1].split('x')
+                        if len(width_height) == 2:
+                            formats_spec[spec[0]] = {
+                                'resolution': spec[1],
+                                'width': int_or_none(width_height[0]),
+                                'height': int_or_none(width_height[1]),
+                            }
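+            # streamingData formats carry richer metadata; let their specs override the legacy fmt_list ones.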
+            for fmt in streaming_formats:
+                itag = str_or_none(fmt.get('itag'))
+                if not itag:
+                    continue
+                quality = fmt.get('quality')
+                quality_label = fmt.get('qualityLabel') or quality
+                formats_spec[itag] = {
+                    'asr': int_or_none(fmt.get('audioSampleRate')),
+                    'filesize': int_or_none(fmt.get('contentLength')),
+                    'format_note': quality_label,
+                    'fps': int_or_none(fmt.get('fps')),
+                    'height': int_or_none(fmt.get('height')),
+                    # bitrate for itag 43 is always 2147483647
+                    'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
+                    'width': int_or_none(fmt.get('width')),
+                }
+
+            for fmt in streaming_formats:
+                if fmt.get('drmFamilies') or fmt.get('drm_families'):
+                    continue
+                url = url_or_none(fmt.get('url'))
+
+                if not url:
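+                    # No direct URL: the stream URL and its encrypted signature are packed into a cipher query string.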
+                    cipher = fmt.get('cipher') or fmt.get('signatureCipher')
+                    if not cipher:
+                        continue
+                    url_data = compat_parse_qs(cipher)
+                    url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
+                    if not url:
+                        continue
+                else:
+                    cipher = None
+                    url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+
+                stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
+                # Unsupported FORMAT_STREAM_TYPE_OTF
+                if stream_type == 3:
+                    continue
+
+                format_id = fmt.get('itag') or url_data['itag'][0]
+                if not format_id:
+                    continue
+                format_id = compat_str(format_id)
+
+                if cipher:
+                    if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
+                        ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
+                        jsplayer_url_json = self._search_regex(
+                            ASSETS_RE,
+                            embed_webpage if age_gate else video_webpage,
+                            'JS player URL (1)', default=None)
+                        if not jsplayer_url_json and not age_gate:
+                            # We need the embed webpage after all
+                            if embed_webpage is None:
+                                embed_url = proto + '://www.youtube.com/embed/%s' % video_id
+                                embed_webpage = self._download_webpage(
+                                    embed_url, video_id, 'Downloading embed webpage')
+                            jsplayer_url_json = self._search_regex(
+                                ASSETS_RE, embed_webpage, 'JS player URL')
+
+                        player_url = json.loads(jsplayer_url_json)
+                        if player_url is None:
+                            player_url_json = self._search_regex(
+                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
+                                video_webpage, 'age gate player URL')
+                            player_url = json.loads(player_url_json)
+
+                    if 'sig' in url_data:
+                        url += '&signature=' + url_data['sig'][0]
+                    elif 's' in url_data:
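+                        # 's' carries an encrypted signature that must be decrypted with the JS player's algorithm.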
+                        encrypted_sig = url_data['s'][0]
+
+                        if self._downloader.params.get('verbose'):
+                            if player_url is None:
+                                player_desc = 'unknown'
+                            else:
+                                player_type, player_version = self._extract_player_info(player_url)
+                                player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
+                            parts_sizes = self._signature_cache_id(encrypted_sig)
+                            self.to_screen('{%s} signature length %s, %s' %
+                                           (format_id, parts_sizes, player_desc))
+
+                        signature = self._decrypt_signature(
+                            encrypted_sig, video_id, player_url, age_gate)
+                        sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
+                        url += '&%s=%s' % (sp, signature)
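+                # ratebypass=yes asks the server not to throttle the download to playback speed.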
+                if 'ratebypass' not in url:
+                    url += '&ratebypass=yes'
+
+                dct = {
+                    'format_id': format_id,
+                    'url': url,
+                    'player_url': player_url,
+                }
+                if format_id in self._formats:
+                    dct.update(self._formats[format_id])
+                if format_id in formats_spec:
+                    dct.update(formats_spec[format_id])
+
+                # Some itags are not included in the DASH manifest, so the corresponding
+                # formats lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
+                # Try to extract the metadata from the url_encoded_fmt_stream_map entry instead.
+                mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
+                width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+
+                if width is None:
+                    width = int_or_none(fmt.get('width'))
+                if height is None:
+                    height = int_or_none(fmt.get('height'))
+
+                filesize = int_or_none(url_data.get(
+                    'clen', [None])[0]) or _extract_filesize(url)
+
+                quality = url_data.get('quality', [None])[0] or fmt.get('quality')
+                quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
+
+                tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
+                       or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
+                fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
+
+                more_fields = {
+                    'filesize': filesize,
+                    'tbr': tbr,
+                    'width': width,
+                    'height': height,
+                    'fps': fps,
+                    'format_note': quality_label or quality,
+                }
+                for key, value in more_fields.items():
+                    if value:
+                        dct[key] = value
+                type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
+                if type_:
+                    type_split = type_.split(';')
+                    kind_ext = type_split[0].split('/')
+                    if len(kind_ext) == 2:
+                        kind, _ = kind_ext
+                        dct['ext'] = mimetype2ext(type_split[0])
+                        if kind in ('audio', 'video'):
+                            codecs = None
+                            for mobj in re.finditer(
+                                    r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
+                                if mobj.group('key') == 'codecs':
+                                    codecs = mobj.group('val')
+                                    break
+                            if codecs:
+                                dct.update(parse_codecs(codecs))
+                if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
+                    dct['downloader_options'] = {
+                        # Youtube throttles chunks >~10M
+                        'http_chunk_size': 10485760,
+                    }
+                formats.append(dct)
+        else:
+            manifest_url = (
+                url_or_none(try_get(
+                    player_response,
+                    lambda x: x['streamingData']['hlsManifestUrl'],
+                    compat_str))
+                or url_or_none(try_get(
+                    video_info, lambda x: x['hlsvp'][0], compat_str)))
+            if manifest_url:
+                formats = []
+                m3u8_formats = self._extract_m3u8_formats(
+                    manifest_url, video_id, 'mp4', fatal=False)
+                for a_format in m3u8_formats:
+                    itag = self._search_regex(
+                        r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
+                    if itag:
+                        a_format['format_id'] = itag
+                        if itag in self._formats:
+                            dct = self._formats[itag].copy()
+                            dct.update(a_format)
+                            a_format = dct
+                    a_format['player_url'] = player_url
+                    # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+                    a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
+                    formats.append(a_format)
+            else:
+                error_message = extract_unavailable_message()
+                if not error_message:
+                    error_message = clean_html(try_get(
+                        player_response, lambda x: x['playabilityStatus']['reason'],
+                        compat_str))
+                if not error_message:
+                    error_message = clean_html(
+                        try_get(video_info, lambda x: x['reason'][0], compat_str))
+                if error_message:
+                    raise ExtractorError(error_message, expected=True)
+                raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
+
+        # uploader
+        video_uploader = try_get(
+            video_info, lambda x: x['author'][0],
+            compat_str) or str_or_none(video_details.get('author'))
+        if video_uploader:
+            video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
+        else:
+            self._downloader.report_warning('unable to extract uploader name')
+
+        # uploader_id
+        video_uploader_id = None
+        video_uploader_url = None
+        mobj = re.search(
+            r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+            video_webpage)
+        if mobj is not None:
+            video_uploader_id = mobj.group('uploader_id')
+            video_uploader_url = mobj.group('uploader_url')
+        else:
+            owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
+            if owner_profile_url:
+                video_uploader_id = self._search_regex(
+                    r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
+                    default=None)
+                video_uploader_url = owner_profile_url
+
+        channel_id = (
+            str_or_none(video_details.get('channelId'))
+            or self._html_search_meta(
+                'channelId', video_webpage, 'channel id', default=None)
+            or self._search_regex(
+                r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+                video_webpage, 'channel id', default=None, group='id'))
+        channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
+
+        thumbnails = []
+        thumbnails_list = try_get(
+            video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
+        for t in thumbnails_list:
+            if not isinstance(t, dict):
+                continue
+            thumbnail_url = url_or_none(t.get('url'))
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int_or_none(t.get('width')),
+                'height': int_or_none(t.get('height')),
+            })
+
+        if not thumbnails:
+            video_thumbnail = None
+            # First try to get a high-quality image:
+            m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+                                video_webpage, re.DOTALL)
+            if m_thumb is not None:
+                video_thumbnail = m_thumb.group(1)
+            thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
+            if thumbnail_url:
+                video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
+            if video_thumbnail:
+                thumbnails.append({'url': video_thumbnail})
+
+        # upload date
+        upload_date = self._html_search_meta(
+            'datePublished', video_webpage, 'upload date', default=None)
+        if not upload_date:
+            upload_date = self._search_regex(
+                [r'(?s)id="eow-date.*?>(.*?)</span>',
+                 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
+                video_webpage, 'upload date', default=None)
+        if not upload_date:
+            upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
+        upload_date = unified_strdate(upload_date)
+
+        video_license = self._html_search_regex(
+            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+            video_webpage, 'license', default=None)
+
+        m_music = re.search(
+            r'''(?x)
+                <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+                <ul[^>]*>\s*
+                <li>(?P<title>.+?)
+                by (?P<creator>.+?)
+                (?:
+                    \(.+?\)|
+                    <a[^>]*
+                        (?:
+                            \bhref=["\']/red[^>]*>|             # drop possible
+                            >\s*Listen ad-free with YouTube Red # YouTube Red ad
+                        )
+                    .*?
+                )?</li
+            ''',
+            video_webpage)
+        if m_music:
+            video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
+            video_creator = clean_html(m_music.group('creator'))
+        else:
+            video_alt_title = video_creator = None
+
+        def extract_meta(field):
+            return self._html_search_regex(
+                r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
+                video_webpage, field, default=None)
+
+        track = extract_meta('Song')
+        artist = extract_meta('Artist')
+        album = extract_meta('Album')
+
+        # Youtube Music Auto-generated description
+        release_date = release_year = None
+        if video_description:
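+            # YouTube Music auto-generated descriptions follow a fixed layout:
+            # 'Provided to YouTube by <label>', then '<track> · <artist>', the album,
+            # optionally '℗ <year>' and 'Released on: <date>'.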
+            mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
+            if mobj:
+                if not track:
+                    track = mobj.group('track').strip()
+                if not artist:
+                    artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
+                if not album:
+                    album = mobj.group('album').strip()
+                release_year = mobj.group('release_year')
+                release_date = mobj.group('release_date')
+                if release_date:
+                    release_date = release_date.replace('-', '')
+                    if not release_year:
+                        release_year = int(release_date[:4])
+                if release_year:
+                    release_year = int(release_year)
+
+        m_episode = re.search(
+            r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
+            video_webpage)
+        if m_episode:
+            series = unescapeHTML(m_episode.group('series'))
+            season_number = int(m_episode.group('season'))
+            episode_number = int(m_episode.group('episode'))
+        else:
+            series = season_number = episode_number = None
+
+        m_cat_container = self._search_regex(
+            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
+            video_webpage, 'categories', default=None)
+        category = None
+        if m_cat_container:
+            category = self._html_search_regex(
+                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
+                default=None)
+        if not category:
+            category = try_get(
+                microformat, lambda x: x['category'], compat_str)
+        video_categories = None if category is None else [category]
+
+        video_tags = [
+            unescapeHTML(m.group('content'))
+            for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+        if not video_tags:
+            video_tags = try_get(video_details, lambda x: x['keywords'], list)
+
+        def _extract_count(count_name):
+            return str_to_int(self._search_regex(
+                r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+                % re.escape(count_name),
+                video_webpage, count_name, default=None))
+
+        like_count = _extract_count('like')
+        dislike_count = _extract_count('dislike')
+
+        if view_count is None:
+            view_count = str_to_int(self._search_regex(
+                r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
+                'view count', default=None))
+
+        average_rating = (
+            float_or_none(video_details.get('averageRating'))
+            or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
+
+        # subtitles
+        video_subtitles = self.extract_subtitles(
+            video_id, video_webpage, has_live_chat_replay)
+        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+
+        video_duration = try_get(
+            video_info, lambda x: int_or_none(x['length_seconds'][0]))
+        if not video_duration:
+            video_duration = int_or_none(video_details.get('lengthSeconds'))
+        if not video_duration:
+            video_duration = parse_duration(self._html_search_meta(
+                'duration', video_webpage, 'video duration'))
+
+        # annotations
+        video_annotations = None
+        if self._downloader.params.get('writeannotations', False):
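+            # Annotations live at a separate invideo URL and must be requested with the page's XSRF token.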
+            xsrf_token = self._search_regex(
+                r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
+                video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
+            invideo_url = try_get(
+                player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
+            if xsrf_token and invideo_url:
+                xsrf_field_name = self._search_regex(
+                    r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
+                    video_webpage, 'xsrf field name',
+                    group='xsrf_field_name', default='session_token')
+                video_annotations = self._download_webpage(
+                    self._proto_relative_url(invideo_url),
+                    video_id, note='Downloading annotations',
+                    errnote='Unable to download video annotations', fatal=False,
+                    data=urlencode_postdata({xsrf_field_name: xsrf_token}))
+
+        chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
+
+        # Look for the DASH manifest
+        if self._downloader.params.get('youtube_include_dash_manifest', True):
+            dash_mpd_fatal = True
+            for mpd_url in dash_mpds:
+                dash_formats = {}
+                try:
+                    def decrypt_sig(mobj):
+                        s = mobj.group(1)
+                        dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+                        return '/signature/%s' % dec_s
+
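+                    # The manifest URL may carry an encrypted signature as an /s/<sig> path segment; swap in the decrypted /signature/<sig>.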
+                    mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
+
+                    for df in self._extract_mpd_formats(
+                            mpd_url, video_id, fatal=dash_mpd_fatal,
+                            formats_dict=self._formats):
+                        if not df.get('filesize'):
+                            df['filesize'] = _extract_filesize(df['url'])
+                        # Do not overwrite a DASH format found in a previous DASH manifest
+                        if df['format_id'] not in dash_formats:
+                            dash_formats[df['format_id']] = df
+                        # Additional DASH manifests may result in HTTP Error 403, so allow
+                        # them to fail without a bug-report message once some DASH manifest
+                        # has already succeeded. This is a temporary workaround to reduce
+                        # the burst of bug reports until we figure out the reason and
+                        # whether it can be fixed at all.
+                        dash_mpd_fatal = False
+                except (ExtractorError, KeyError) as e:
+                    self.report_warning(
+                        'Skipping DASH manifest: %r' % e, video_id)
+                if dash_formats:
+                    # Remove the formats found through non-DASH sources: they contain
+                    # less info, and what they do contain can be wrong because we use
+                    # fixed values (for example the resolution). See
+                    # https://github.com/ytdl-org/youtube-dl/issues/5774 for an example.
+                    formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
+                    formats.extend(dash_formats.values())
+
+        # Check for malformed aspect ratio
+        stretched_m = re.search(
+            r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
+            video_webpage)
+        if stretched_m:
+            w = float(stretched_m.group('w'))
+            h = float(stretched_m.group('h'))
+            # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM the ratio is 17:0);
+            # only valid ratios are processed.
+            if w > 0 and h > 0:
+                ratio = w / h
+                for f in formats:
+                    if f.get('vcodec') != 'none':
+                        f['stretched_ratio'] = ratio
+
+        if not formats:
+            if 'reason' in video_info:
+                if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                    regions_allowed = self._html_search_meta(
+                        'regionsAllowed', video_webpage, default=None)
+                    countries = regions_allowed.split(',') if regions_allowed else None
+                    self.raise_geo_restricted(
+                        msg=video_info['reason'][0], countries=countries)
+                reason = video_info['reason'][0]
+                if 'Invalid parameters' in reason:
+                    unavailable_message = extract_unavailable_message()
+                    if unavailable_message:
+                        reason = unavailable_message
+                raise ExtractorError(
+                    'YouTube said: %s' % reason,
+                    expected=True, video_id=video_id)
+            if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
+                raise ExtractorError('This video is DRM protected.', expected=True)
+
+        self._sort_formats(formats)
+
+        self.mark_watched(video_id, video_info, player_response)
+
+        return {
+            'id': video_id,
+            'uploader': video_uploader,
+            'uploader_id': video_uploader_id,
+            'uploader_url': video_uploader_url,
+            'channel_id': channel_id,
+            'channel_url': channel_url,
+            'upload_date': upload_date,
+            'license': video_license,
+            'creator': video_creator or artist,
+            'title': video_title,
+            'alt_title': video_alt_title or track,
+            'thumbnails': thumbnails,
+            'description': video_description,
+            'categories': video_categories,
+            'tags': video_tags,
+            'subtitles': video_subtitles,
+            'automatic_captions': automatic_captions,
+            'duration': video_duration,
+            'age_limit': 18 if age_gate else 0,
+            'annotations': video_annotations,
+            'chapters': chapters,
+            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'average_rating': average_rating,
+            'formats': formats,
+            'is_live': is_live,
+            'start_time': start_time,
+            'end_time': end_time,
+            'series': series,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'track': track,
+            'artist': artist,
+            'album': album,
+            'release_date': release_date,
+            'release_year': release_year,
+        }
+
+
+class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
+    IE_DESC = 'YouTube.com playlists'
+    _VALID_URL = r"""(?x)(?:
+                        (?:https?://)?
+                        (?:\w+\.)?
+                        (?:
+                            (?:
+                                youtube(?:kids)?\.com|
+                                invidio\.us
+                            )
+                            /
+                            (?:
+                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
+                               \? (?:.*?[&;])*? (?:p|a|list)=
+                            |  p/
+                            )|
+                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
+                        )
+                        (
+                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
+                            # Top tracks; these ids can also include dots
+                            |(?:MC)[\w\.]*
+                        )
+                        .*
+                     |
+                        (%(playlist_id)s)
+                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
+    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
+    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
+    IE_NAME = 'youtube:playlist'
+    _TESTS = [{
+        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+        'info_dict': {
+            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+            'uploader': 'Sergey M.',
+            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+            'title': 'youtube-dl public playlist',
+        },
+        'playlist_count': 1,
+    }, {
+        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
+        'info_dict': {
+            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+            'uploader': 'Sergey M.',
+            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
+            'title': 'youtube-dl empty playlist',
+        },
+        'playlist_count': 0,
+    }, {
+        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
+        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+        'info_dict': {
+            'title': '29C3: Not my department',
+            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+            'uploader': 'Christiaan008',
+            'uploader_id': 'ChRiStIaAn008',
+        },
+        'playlist_count': 96,
+    }, {
+        'note': 'issue #673',
+        'url': 'PLBB231211A4F62143',
+        'info_dict': {
+            'title': '[OLD]Team Fortress 2 (Class-based LP)',
+            'id': 'PLBB231211A4F62143',
+            'uploader': 'Wickydoo',
+            'uploader_id': 'Wickydoo',
+        },
+        'playlist_mincount': 26,
+    }, {
+        'note': 'Large playlist',
+        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
+        'info_dict': {
+            'title': 'Uploads from Cauchemar',
+            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
+            'uploader': 'Cauchemar',
+            'uploader_id': 'Cauchemar89',
+        },
+        'playlist_mincount': 799,
+    }, {
+        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+        'info_dict': {
+            'title': 'YDL_safe_search',
+            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+        },
+        'playlist_count': 2,
+        'skip': 'This playlist is private',
+    }, {
+        'note': 'embedded',
+        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+        'playlist_count': 4,
+        'info_dict': {
+            'title': 'JODA15',
+            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+            'uploader': 'milan',
+            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
+        }
+    }, {
+        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+        'playlist_mincount': 485,
+        'info_dict': {
+            'title': '2018 Chinese New Singles (11/6 updated)',
+            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+            'uploader': 'LBK',
+            'uploader_id': 'sdragonfang',
+        }
+    }, {
+        'note': 'Embedded SWF player',
+        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
+        'playlist_count': 4,
+        'info_dict': {
+            'title': 'JODA7',
+            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
+        },
+        'skip': 'This playlist does not exist',
+    }, {
+        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
+        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
+        'info_dict': {
+            'title': 'Uploads from Interstellar Movie',
+            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
+            'uploader': 'Interstellar Movie',
+            'uploader_id': 'InterstellarMovie1',
+        },
+        'playlist_mincount': 21,
+    }, {
+        # Playlist URL that does not actually serve a playlist
+        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
+        'info_dict': {
+            'id': 'FqZTN594JQw',
+            'ext': 'webm',
+            'title': "Smiley's People 01 detective, Adventure Series, Action",
+            'uploader': 'STREEM',
+            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
+            'upload_date': '20150526',
+            'license': 'Standard YouTube License',
+            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
+            'categories': ['People & Blogs'],
+            'tags': list,
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'This video is not available.',
+        'add_ie': [YoutubeIE.ie_key()],
+    }, {
+        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
+        'info_dict': {
+            'id': 'yeWKywCrFtk',
+            'ext': 'mp4',
+            'title': 'Small Scale Baler and Braiding Rugs',
+            'uploader': 'Backus-Page House Museum',
+            'uploader_id': 'backuspagemuseum',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
+            'upload_date': '20161008',
+            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
+            'categories': ['Nonprofits & Activism'],
+            'tags': list,
+            'like_count': int,
+            'dislike_count': int,
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': True,
+        },
+    }, {
+        # https://github.com/ytdl-org/youtube-dl/issues/21844
+        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+        'info_dict': {
+            'title': 'Data Analysis with Dr Mike Pound',
+            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+            'uploader_id': 'Computerphile',
+            'uploader': 'Computerphile',
+        },
+        'playlist_mincount': 11,
+    }, {
+        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
+        'only_matching': True,
+    }, {
+        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
+        'only_matching': True,
+    }, {
+        # music album playlist
+        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
+        'only_matching': True,
+    }, {
+        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+        'only_matching': True,
+    }]
+
+    def _real_initialize(self):
+        self._login()
+
+    def extract_videos_from_page(self, page):
+        ids_in_page = []
+        titles_in_page = []
+
+        for item in re.findall(
+                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
+            attrs = extract_attributes(item)
+            video_id = attrs['data-video-id']
+            video_title = unescapeHTML(attrs.get('data-title'))
+            if video_title:
+                video_title = video_title.strip()
+            ids_in_page.append(video_id)
+            titles_in_page.append(video_title)
+
+        # Fallback with old _VIDEO_RE
+        self.extract_videos_from_page_impl(
+            self._VIDEO_RE, page, ids_in_page, titles_in_page)
+
+        # Relaxed fallbacks
+        self.extract_videos_from_page_impl(
+            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
+            ids_in_page, titles_in_page)
+        self.extract_videos_from_page_impl(
+            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
+            ids_in_page, titles_in_page)
+
+        return zip(ids_in_page, titles_in_page)
+
+    def _extract_mix(self, playlist_id):
+        # Mixes are generated from a single video;
+        # the playlist id is just 'RD' + video_id
+        ids = []
+        last_id = playlist_id[-11:]
+        for n in itertools.count(1):
+            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+            webpage = self._download_webpage(
+                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
+            new_ids = orderedSet(re.findall(
+                r'''(?xs)data-video-username=".*?".*?
+                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
+                webpage))
+            # Fetch new pages until all the videos repeat; it seems that
+            # there are always 51 unique videos.
+            new_ids = [_id for _id in new_ids if _id not in ids]
+            if not new_ids:
+                break
+            ids.extend(new_ids)
+            last_id = ids[-1]
+
+        url_results = self._ids_to_results(ids)
+
+        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+        title_span = (
+            search_title('playlist-title')
+            or search_title('title long-title')
+            or search_title('title'))
+        title = clean_html(title_span)
+
+        return self.playlist_result(url_results, playlist_id, title)
+
+    def _extract_playlist(self, playlist_id):
+        url = self._TEMPLATE_URL % playlist_id
+        page = self._download_webpage(url, playlist_id)
+
+        # The yt-alert-message element now has a tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
+        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
+            match = match.strip()
+            # Check if the playlist exists or is private
+            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
+            if mobj:
+                reason = mobj.group('reason')
+                message = 'This playlist %s' % reason
+                if 'private' in reason:
+                    message += ', use --username or --netrc to access it'
+                message += '.'
+                raise ExtractorError(message, expected=True)
+            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
+                raise ExtractorError(
+                    'Invalid parameters. Maybe URL is incorrect.',
+                    expected=True)
+            elif re.match(r'[^<]*Choose your language[^<]*', match):
+                continue
+            else:
+                self.report_warning('Youtube gives an alert message: ' + match)
+
+        playlist_title = self._html_search_regex(
+            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
+            page, 'title', default=None)
+
+        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
+        uploader = self._html_search_regex(
+            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
+            page, 'uploader', default=None)
+        mobj = re.search(
+            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
+            page)
+        if mobj:
+            uploader_id = mobj.group('uploader_id')
+            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
+        else:
+            uploader_id = uploader_url = None
+
+        has_videos = True
+
+        if not playlist_title:
+            try:
+                # Some playlist URLs don't actually serve a playlist (e.g.
+                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
+                next(self._entries(page, playlist_id))
+            except StopIteration:
+                has_videos = False
+
+        playlist = self.playlist_result(
+            self._entries(page, playlist_id), playlist_id, playlist_title)
+        playlist.update({
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'uploader_url': uploader_url,
+        })
+
+        return has_videos, playlist
+
+    def _check_download_just_video(self, url, playlist_id):
+        # Check if it's a video-specific URL
+        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        video_id = query_dict.get('v', [None])[0] or self._search_regex(
+            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
+            'video id', default=None)
+        if video_id:
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+                return video_id, None
+        return None, None
+
+    def _real_extract(self, url):
+        # Extract playlist id
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        playlist_id = mobj.group(1) or mobj.group(2)
+
+        video_id, video = self._check_download_just_video(url, playlist_id)
+        if video:
+            return video
+
+        if playlist_id.startswith(('RD', 'UL', 'PU')):
+            # Mixes require a custom extraction process
+            return self._extract_mix(playlist_id)
+
+        has_videos, playlist = self._extract_playlist(playlist_id)
+        if has_videos or not video_id:
+            return playlist
+
+        # Some playlist URLs don't actually serve a playlist (see
+        # https://github.com/ytdl-org/youtube-dl/issues/10537).
+        # Fallback to plain video extraction if there is a video id
+        # along with playlist id.
+        return self.url_result(video_id, 'Youtube', video_id=video_id)
+
+
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
+    IE_DESC = 'YouTube.com channels'
+    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
+    IE_NAME = 'youtube:channel'
+    _TESTS = [{
+        'note': 'paginated channel',
+        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
+        'playlist_mincount': 91,
+        'info_dict': {
+            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
+            'title': 'Uploads from lex will',
+            'uploader': 'lex will',
+            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+        }
+    }, {
+        'note': 'Age restricted channel',
+        # from https://www.youtube.com/user/DeusExOfficial
+        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
+        'playlist_mincount': 64,
+        'info_dict': {
+            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
+            'title': 'Uploads from Deus Ex',
+            'uploader': 'Deus Ex',
+            'uploader_id': 'DeusExOfficial',
+        },
+    }, {
+        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
+                else super(YoutubeChannelIE, cls).suitable(url))
+
+    def _build_template_url(self, url, channel_id):
+        return self._TEMPLATE_URL % channel_id
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        url = self._build_template_url(url, channel_id)
+
+        # Channel-by-page listing is restricted to 35 pages of 30 items each,
+        # i.e. 1050 videos total (see #5778). Work around this by extracting as
+        # a playlist if we manage to obtain the channel playlist URL; otherwise
+        # fall back on channel-by-page extraction.
+        channel_page = self._download_webpage(
+            url + '?view=57', channel_id,
+            'Downloading channel page', fatal=False)
+        if channel_page is False:
+            channel_playlist_id = False
+        else:
+            channel_playlist_id = self._html_search_meta(
+                'channelId', channel_page, 'channel id', default=None)
+            if not channel_playlist_id:
+                channel_url = self._html_search_meta(
+                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
+                    channel_page, 'channel url', default=None)
+                if channel_url:
+                    channel_playlist_id = self._search_regex(
+                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
+                        channel_url, 'channel id', default=None)
+        if channel_playlist_id and channel_playlist_id.startswith('UC'):
+            playlist_id = 'UU' + channel_playlist_id[2:]
+            return self.url_result(
+                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
+
+        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
+        autogenerated = re.search(r'''(?x)
+                class="[^"]*?(?:
+                    channel-header-autogenerated-label|
+                    yt-channel-title-autogenerated
+                )[^"]*"''', channel_page) is not None
+
+        if autogenerated:
+            # The videos are contained in a single page;
+            # the ajax pages can't be used since they are empty
+            entries = [
+                self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+                for video_id, video_title in self.extract_videos_from_page(channel_page)]
+            return self.playlist_result(entries, channel_id)
+
+        try:
+            next(self._entries(channel_page, channel_id))
+        except StopIteration:
+            alert_message = self._html_search_regex(
+                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
+                channel_page, 'alert', default=None, group='alert')
+            if alert_message:
+                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
+
+        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
+
+
+class YoutubeUserIE(YoutubeChannelIE):
+    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
+    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
+    IE_NAME = 'youtube:user'
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
+        'playlist_mincount': 320,
+        'info_dict': {
+            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
+            'title': 'Uploads from The Linux Foundation',
+            'uploader': 'The Linux Foundation',
+            'uploader_id': 'TheLinuxFoundation',
+        }
+    }, {
+        # Only available via https://www.youtube.com/c/12minuteathlete/videos
+        # but not https://www.youtube.com/user/12minuteathlete/videos
+        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
+        'playlist_mincount': 249,
+        'info_dict': {
+            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
+            'title': 'Uploads from 12 Minute Athlete',
+            'uploader': '12 Minute Athlete',
+            'uploader_id': 'the12minuteathlete',
+        }
+    }, {
+        'url': 'ytuser:phihag',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/c/gametrailers',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/gametrailers',
+        'only_matching': True,
+    }, {
+        # This channel is not available, geo restricted to JP
+        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        # Don't return True if the url can be extracted with another youtube
+        # extractor; the regex is too permissive and it would match.
+        other_yt_ies = (
+            klass for (name, klass) in globals().items()
+            if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
+        if any(ie.suitable(url) for ie in other_yt_ies):
+            return False
+        else:
+            return super(YoutubeUserIE, cls).suitable(url)
+
+    def _build_template_url(self, url, channel_id):
+        mobj = re.match(self._VALID_URL, url)
+        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+
+
+class YoutubeLiveIE(YoutubeBaseInfoExtractor):
+    IE_DESC = 'YouTube.com live streams'
+    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
+    IE_NAME = 'youtube:live'
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
+        'info_dict': {
+            'id': 'a48o2S1cPoo',
+            'ext': 'mp4',
+            'title': 'The Young Turks - Live Main Show',
+            'uploader': 'The Young Turks',
+            'uploader_id': 'TheYoungTurks',
+            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
+            'upload_date': '20150715',
+            'license': 'Standard YouTube License',
+            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
+            'categories': ['News & Politics'],
+            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
+            'like_count': int,
+            'dislike_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/TheYoungTurks/live',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        channel_id = mobj.group('id')
+        base_url = mobj.group('base_url')
+        webpage = self._download_webpage(url, channel_id, fatal=False)
+        if webpage:
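+            # A /live URL serves a video page while a stream is live; otherwise fall back to the channel page.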
+            page_type = self._og_search_property(
+                'type', webpage, 'page type', default='')
+            video_id = self._html_search_meta(
+                'videoId', webpage, 'video id', default=None)
+            if page_type.startswith('video') and video_id and re.match(
+                    r'^[0-9A-Za-z_-]{11}$', video_id):
+                return self.url_result(video_id, YoutubeIE.ie_key())
+        return self.url_result(base_url)
+
+
+class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
+    IE_DESC = 'YouTube.com user/channel playlists'
+    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
+    IE_NAME = 'youtube:playlists'
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
+        'playlist_mincount': 4,
+        'info_dict': {
+            'id': 'ThirstForScience',
+            'title': 'ThirstForScience',
+        },
+    }, {
+        # with "Load more" button
+        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+        'playlist_mincount': 70,
+        'info_dict': {
+            'id': 'igorkle1',
+            'title': 'Игорь Клейнер',
+        },
+    }, {
+        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
+        'playlist_mincount': 17,
+        'info_dict': {
+            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
+            'title': 'Chem Player',
+        },
+        'skip': 'Blocked',
+    }, {
+        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
+        'only_matching': True,
+    }]
+
+
+class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
+
+
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
+    IE_DESC = 'YouTube.com searches'
+    # There doesn't appear to be a real limit; for example, a search for
+    # 'python' yields more than 8,000,000 results.
+    _MAX_RESULTS = float('inf')
+    IE_NAME = 'youtube:search'
+    _SEARCH_KEY = 'ytsearch'
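+    # As with other SearchInfoExtractors: "ytsearch:<query>" takes the first
+    # result, "ytsearchN:<query>" the first N, and "ytsearchall:<query>"
+    # everything up to _MAX_RESULTS.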
+    _EXTRA_QUERY_ARGS = {}
+    _TESTS = []
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+
+        videos = []
+        limit = n
+
+        url_query = {
+            'search_query': query.encode('utf-8'),
+        }
+        url_query.update(self._EXTRA_QUERY_ARGS)
+        result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
+
+        for pagenum in itertools.count(1):
+            data = self._download_json(
+                result_url, video_id='query "%s"' % query,
+                note='Downloading page %s' % pagenum,
+                errnote='Unable to download API page',
+                query={'spf': 'navigate'})
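+            # With spf=navigate the endpoint returns a JSON array; the
+            # rendered results HTML sits in its second element under
+            # body.content (see the lookup below).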
+            html_content = data[1]['body']['content']
+
+            if 'class="search-message' in html_content:
+                raise ExtractorError(
+                    '[youtube] No video results', expected=True)
+
+            new_videos = list(self._process_page(html_content))
+            videos += new_videos
+            if not new_videos or len(videos) > limit:
+                break
+            next_link = self._html_search_regex(
+                r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next',
+                html_content, 'next link', default=None)
+            if next_link is None:
+                break
+            result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link)
+
+        if len(videos) > n:
+            videos = videos[:n]
+        return self.playlist_result(videos, query)
+
+
+class YoutubeSearchDateIE(YoutubeSearchIE):
+    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
+    _SEARCH_KEY = 'ytsearchdate'
+    IE_DESC = 'YouTube.com searches, newest videos first'
+    _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
+
+
+class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
+    IE_DESC = 'YouTube.com search URLs'
+    IE_NAME = 'youtube:search_url'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+    _TESTS = [{
+        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
+        'playlist_mincount': 5,
+        'info_dict': {
+            'title': 'youtube-dl test video',
+        }
+    }, {
+        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        query = compat_urllib_parse_unquote_plus(mobj.group('query'))
+        webpage = self._download_webpage(url, query)
+        return self.playlist_result(self._process_page(webpage), playlist_title=query)
+
+
+class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
+    IE_DESC = 'YouTube.com (multi-season) shows'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
+    IE_NAME = 'youtube:show'
+    _TESTS = [{
+        'url': 'https://www.youtube.com/show/airdisasters',
+        'playlist_mincount': 5,
+        'info_dict': {
+            'id': 'airdisasters',
+            'title': 'Air Disasters',
+        }
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        return super(YoutubeShowIE, self)._real_extract(
+            'https://www.youtube.com/show/%s/playlists' % playlist_id)
+
+
+class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+    """
+    Base class for feed extractors
+    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
+    """
+    _LOGIN_REQUIRED = True
+
+    @property
+    def IE_NAME(self):
+        return 'youtube:%s' % self._FEED_NAME
+
+    def _real_initialize(self):
+        self._login()
+
+    def _entries(self, page):
+        # The extraction process is the same as for playlists, but the regex
+        # for the video ids doesn't contain an index
+        ids = []
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
+
+            # The 'recommended' feed has an infinite 'load more', and each new portion
+            # serves the same videos in a (sometimes) slightly different order, so check
+            # for uniqueness and stop once a portion yields no new videos
+            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
+            if not new_ids:
+                break
+
+            ids.extend(new_ids)
+
+            for entry in self._ids_to_results(new_ids):
+                yield entry
+
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape,
+                headers=self._YOUTUBE_CLIENT_HEADERS)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
+
+    def _real_extract(self, url):
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
+            self._PLAYLIST_TITLE)
+        return self.playlist_result(
+            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
+
+
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:watchlater'
+    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/playlist?list=WL',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        _, video = self._check_download_just_video(url, 'WL')
+        if video:
+            return video
+        _, playlist = self._extract_playlist('WL')
+        return playlist
+
+
+class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
+    IE_NAME = 'youtube:favorites'
+    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
+    _LOGIN_REQUIRED = True
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
+        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
+        return self.url_result(playlist_id, 'YoutubePlaylist')
+
+
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = 'Youtube Recommended videos'
+
+
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = 'Youtube Subscriptions'
+
+
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
+
+
+class YoutubeTruncatedURLIE(InfoExtractor):
+    IE_NAME = 'youtube:truncated_url'
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'''(?x)
+        (?:https?://)?
+        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
+        (?:watch\?(?:
+            feature=[a-z_]+|
+            annotation_id=annotation_[^&]+|
+            x-yt-cl=[0-9]+|
+            hl=[^&]*|
+            t=[0-9]+
+        )?
+        |
+            attribution_link\?a=[^&]+
+        )
+        $
+    '''
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?feature=foo',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?hl=en-GB',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?t=2372',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        raise ExtractorError(
+            'Did you forget to quote the URL? Remember that & is a meta '
+            'character in most shells, so you want to put the URL in quotes, '
+            'like  youtube-dl '
+            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
+            ' or simply  youtube-dl BaW_jenozKc  .',
+            expected=True)
+
+
+class YoutubeTruncatedIDIE(InfoExtractor):
+    IE_NAME = 'youtube:truncated_id'
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
+
+    _TESTS = [{
+        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        raise ExtractorError(
+            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
+            expected=True)
diff --git a/youtube_dl/extractor/zapiks.py b/youtube_dl/extractor/zapiks.py
new file mode 100644
index 0000000..f6496f5
--- /dev/null
+++ b/youtube_dl/extractor/zapiks.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+    parse_iso8601,
+    xpath_with_ns,
+    xpath_text,
+    int_or_none,
+)
+
+
+class ZapiksIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))'
+    _TESTS = [
+        {
+            'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html',
+            'md5': 'aeb3c473b2d564b2d46d664d28d5f050',
+            'info_dict': {
+                'id': '80798',
+                'ext': 'mp4',
+                'title': 'EP2S3 - Bon Appétit - Eh bé viva les pyrénées con!',
+                'description': 'md5:7054d6f6f620c6519be1fe710d4da847',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 528,
+                'timestamp': 1359044972,
+                'upload_date': '20130124',
+                'view_count': int,
+            },
+        },
+        {
+            'url': 'http://www.zapiks.com/ep3s5-bon-appetit-baqueira-m-1.html',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.zapiks.com/nl/ep3s5-bon-appetit-baqueira-m-1.html',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.zapiks.fr/index.php?action=playerIframe&amp;media_id=118046&amp;width=640&amp;height=360&amp;autoStart=false&amp;language=fr',
+            'only_matching': True,
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        if not video_id:
+            video_id = self._search_regex(
+                r'data-media-id="(\d+)"', webpage, 'video id')
+
+        playlist = self._download_xml(
+            'http://www.zapiks.fr/view/index.php?action=playlist&media_id=%s&lang=en' % video_id,
+            display_id)
+
+        NS_MAP = {
+            'jwplayer': 'http://rss.jwpcdn.com/'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
+        item = playlist.find('./channel/item')
+
+        title = xpath_text(item, 'title', 'title') or self._og_search_title(webpage)
+        description = self._og_search_description(webpage, default=None)
+        thumbnail = xpath_text(
+            item, ns('./jwplayer:image'), 'thumbnail') or self._og_search_thumbnail(webpage, default=None)
+        duration = parse_duration(self._html_search_meta(
+            'duration', webpage, 'duration', default=None))
+        timestamp = parse_iso8601(self._html_search_meta(
+            'uploadDate', webpage, 'upload date', default=None), ' ')
+
+        view_count = int_or_none(self._search_regex(
+            r'UserPlays:(\d+)', webpage, 'view count', default=None))
+        comment_count = int_or_none(self._search_regex(
+            r'UserComments:(\d+)', webpage, 'comment count', default=None))
+
+        formats = []
+        for source in item.findall(ns('./jwplayer:source')):
+            format_id = source.attrib['label']
+            f = {
+                'url': source.attrib['file'],
+                'format_id': format_id,
+            }
+            m = re.search(r'^(?P<height>\d+)[pP]', format_id)
+            if m:
+                f['height'] = int(m.group('height'))
+            formats.append(f)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py
new file mode 100644
index 0000000..889aff5
--- /dev/null
+++ b/youtube_dl/extractor/zaq1.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://zaq1.pl/video/xev0e',
+        'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+        'info_dict': {
+            'id': 'xev0e',
+            'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+            'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+            'ext': 'mp4',
+            'duration': 511,
+            'timestamp': 1490896361,
+            'uploader': 'Anonim',
+            'upload_date': '20170330',
+            'view_count': int,
+        }
+    }, {
+        # malformed JSON-LD
+        'url': 'http://zaq1.pl/video/x81vn',
+        'info_dict': {
+            'id': 'x81vn',
+            'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+            'ext': 'mp4',
+            'duration': 6234,
+            'timestamp': 1493494860,
+            'uploader': 'Anonim',
+            'upload_date': '20170429',
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to parse JSON'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'video url', group='url')
+
+        info = self._search_json_ld(webpage, video_id, fatal=False)
+
+        def extract_data(field, name, fatal=False):
+            return self._search_regex(
+                r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+                webpage, name, fatal=fatal, group='field')
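+        # e.g. extract_data('file-name', 'title') reads the value of the
+        # page's data-file-name attribute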
+
+        if not info.get('title'):
+            info['title'] = extract_data('file-name', 'title', fatal=True)
+
+        if not info.get('duration'):
+            info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+        if not info.get('thumbnail'):
+            info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+        if not info.get('timestamp'):
+            info['timestamp'] = unified_timestamp(self._html_search_meta(
+                'uploadDate', webpage, 'timestamp'))
+
+        if not info.get('view_count'):
+            info['view_count'] = int_or_none(self._html_search_meta(
+                'interactionCount', webpage, 'view count'))
+
+        uploader = self._html_search_regex(
+            r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+            fatal=False)
+
+        width = int_or_none(self._html_search_meta(
+            'width', webpage, fatal=False))
+        height = int_or_none(self._html_search_meta(
+            'height', webpage, fatal=False))
+
+        info.update({
+            'id': video_id,
+            'formats': [{
+                'url': video_url,
+                'width': width,
+                'height': height,
+                'http_headers': {
+                    'Referer': url,
+                },
+            }],
+            'uploader': uploader,
+        })
+
+        return info
diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py
new file mode 100644
index 0000000..6bac302
--- /dev/null
+++ b/youtube_dl/extractor/zattoo.py
@@ -0,0 +1,433 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from uuid import uuid4
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    try_get,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class ZattooPlatformBaseIE(InfoExtractor):
+    _power_guide_hash = None
+
+    def _host_url(self):
+        return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST)
+
+    def _login(self):
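+        # Credentials come from --username/--password or from a .netrc entry
+        # whose machine name matches the subclass's _NETRC_MACHINE, e.g.:
+        #   machine zattoo login <user> password <pass>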
+        username, password = self._get_login_info()
+        if not username or not password:
+            self.raise_login_required(
+                'A valid %s account is needed to access this media.'
+                % self._NETRC_MACHINE)
+
+        try:
+            data = self._download_json(
+                '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in',
+                data=urlencode_postdata({
+                    'login': username,
+                    'password': password,
+                    'remember': 'true',
+                }), headers={
+                    'Referer': '%s/login' % self._host_url(),
+                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                })
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                raise ExtractorError(
+                    'Unable to login: incorrect username and/or password',
+                    expected=True)
+            raise
+
+        self._power_guide_hash = data['session']['power_guide_hash']
+
+    def _real_initialize(self):
+        webpage = self._download_webpage(
+            self._host_url(), None, 'Downloading app token')
+        app_token = self._html_search_regex(
+            r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1',
+            webpage, 'app token', group='token')
+        app_version = self._html_search_regex(
+            r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2')
+
+        # Will set up the appropriate cookies
+        self._request_webpage(
+            '%s/zapi/v2/session/hello' % self._host_url(), None,
+            'Opening session', data=urlencode_postdata({
+                'client_app_token': app_token,
+                'uuid': compat_str(uuid4()),
+                'lang': 'en',
+                'app_version': app_version,
+                'format': 'json',
+            }))
+
+        self._login()
+
+    def _extract_cid(self, video_id, channel_name):
+        channel_groups = self._download_json(
+            '%s/zapi/v2/cached/channels/%s' % (self._host_url(),
+                                               self._power_guide_hash),
+            video_id, 'Downloading channel list',
+            query={'details': False})['channel_groups']
+        channel_list = []
+        for chgrp in channel_groups:
+            channel_list.extend(chgrp['channels'])
+        try:
+            return next(
+                chan['cid'] for chan in channel_list
+                if chan.get('cid') and (
+                    chan.get('display_alias') == channel_name
+                    or chan.get('cid') == channel_name))
+        except StopIteration:
+            raise ExtractorError('Could not extract channel id')
+
+    def _extract_cid_and_video_info(self, video_id):
+        data = self._download_json(
+            '%s/zapi/v2/cached/program/power_details/%s' % (
+                self._host_url(), self._power_guide_hash),
+            video_id,
+            'Downloading video information',
+            query={
+                'program_ids': video_id,
+                'complete': True,
+            })
+
+        p = data['programs'][0]
+        cid = p['cid']
+
+        info_dict = {
+            'id': video_id,
+            'title': p.get('t') or p['et'],
+            'description': p.get('d'),
+            'thumbnail': p.get('i_url'),
+            'creator': p.get('channel_name'),
+            'episode': p.get('et'),
+            'episode_number': int_or_none(p.get('e_no')),
+            'season_number': int_or_none(p.get('s_no')),
+            'release_year': int_or_none(p.get('year')),
+            'categories': try_get(p, lambda x: x['c'], list),
+            'tags': try_get(p, lambda x: x['g'], list)
+        }
+
+        return cid, info_dict
+
+    def _extract_formats(self, cid, video_id, record_id=None, is_live=False):
+        postdata_common = {
+            'https_watch_urls': True,
+        }
+
+        if is_live:
+            postdata_common.update({'timeshift': 10800})
+            url = '%s/zapi/watch/live/%s' % (self._host_url(), cid)
+        elif record_id:
+            url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id)
+        else:
+            url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id)
+
+        formats = []
+        for stream_type in ('dash', 'hls', 'hls5', 'hds'):
+            postdata = postdata_common.copy()
+            postdata['stream_type'] = stream_type
+
+            data = self._download_json(
+                url, video_id, 'Downloading %s formats' % stream_type.upper(),
+                data=urlencode_postdata(postdata), fatal=False)
+            if not data:
+                continue
+
+            watch_urls = try_get(
+                data, lambda x: x['stream']['watch_urls'], list)
+            if not watch_urls:
+                continue
+
+            for watch in watch_urls:
+                if not isinstance(watch, dict):
+                    continue
+                watch_url = url_or_none(watch.get('url'))
+                if not watch_url:
+                    continue
+                format_id_list = [stream_type]
+                maxrate = watch.get('maxrate')
+                if maxrate:
+                    format_id_list.append(compat_str(maxrate))
+                audio_channel = watch.get('audio_channel')
+                if audio_channel:
+                    format_id_list.append(compat_str(audio_channel))
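+                # rank the primary audio channel ('A') above alternative tracks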
+                preference = 1 if audio_channel == 'A' else None
+                format_id = '-'.join(format_id_list)
+                if stream_type in ('dash', 'dash_widevine', 'dash_playready'):
+                    this_formats = self._extract_mpd_formats(
+                        watch_url, video_id, mpd_id=format_id, fatal=False)
+                elif stream_type in ('hls', 'hls5', 'hls5_fairplay'):
+                    this_formats = self._extract_m3u8_formats(
+                        watch_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id=format_id,
+                        fatal=False)
+                elif stream_type == 'hds':
+                    this_formats = self._extract_f4m_formats(
+                        watch_url, video_id, f4m_id=format_id, fatal=False)
+                elif stream_type == 'smooth_playready':
+                    this_formats = self._extract_ism_formats(
+                        watch_url, video_id, ism_id=format_id, fatal=False)
+                else:
+                    assert False
+                for this_format in this_formats:
+                    this_format['preference'] = preference
+                formats.extend(this_formats)
+        self._sort_formats(formats)
+        return formats
+
+    def _extract_video(self, channel_name, video_id, record_id=None, is_live=False):
+        if is_live:
+            cid = self._extract_cid(video_id, channel_name)
+            info_dict = {
+                'id': channel_name,
+                'title': self._live_title(channel_name),
+                'is_live': True,
+            }
+        else:
+            cid, info_dict = self._extract_cid_and_video_info(video_id)
+        formats = self._extract_formats(
+            cid, video_id, record_id=record_id, is_live=is_live)
+        info_dict['formats'] = formats
+        return info_dict
+
+
+class QuicklineBaseIE(ZattooPlatformBaseIE):
+    _NETRC_MACHINE = 'quickline'
+    _HOST = 'mobiltv.quickline.com'
+
+
+class QuicklineIE(QuicklineBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST)
+
+    _TEST = {
+        'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        channel_name, video_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_video(channel_name, video_id)
+
+
+class QuicklineLiveIE(QuicklineBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST)
+
+    _TEST = {
+        'url': 'https://mobiltv.quickline.com/watch/srf1',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        channel_name = video_id = self._match_id(url)
+        return self._extract_video(channel_name, video_id, is_live=True)
+
+
+class ZattooBaseIE(ZattooPlatformBaseIE):
+    _NETRC_MACHINE = 'zattoo'
+    _HOST = 'zattoo.com'
+
+
+def _make_valid_url(tmpl, host):
+    return tmpl % re.escape(host)
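+# e.g. _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, 'zattoo.com') expands to
+# r'https?://(?:www\.)?zattoo\.com/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?'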
+
+
+class ZattooIE(ZattooBaseIE):
+    _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?'
+    _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST)
+
+    # Since regular videos are only available for 7 days and recorded videos
+    # are only available for a specific user, we cannot have detailed tests.
+    _TESTS = [{
+        'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste',
+        'only_matching': True,
+    }, {
+        'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        channel_name, video_id, record_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_video(channel_name, video_id, record_id)
+
+
+class ZattooLiveIE(ZattooBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)'
+
+    _TEST = {
+        'url': 'https://zattoo.com/watch/srf1',
+        'only_matching': True,
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        channel_name = video_id = self._match_id(url)
+        return self._extract_video(channel_name, video_id, is_live=True)
+
+
+class NetPlusIE(ZattooIE):
+    _NETRC_MACHINE = 'netplus'
+    _HOST = 'netplus.tv'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.netplus.tv/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class MNetTVIE(ZattooIE):
+    _NETRC_MACHINE = 'mnettv'
+    _HOST = 'tvplus.m-net.de'
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://tvplus.m-net.de/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class WalyTVIE(ZattooIE):
+    _NETRC_MACHINE = 'walytv'
+    _HOST = 'player.waly.tv'
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://player.waly.tv/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class BBVTVIE(ZattooIE):
+    _NETRC_MACHINE = 'bbvtv'
+    _HOST = 'bbv-tv.net'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.bbv-tv.net/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class VTXTVIE(ZattooIE):
+    _NETRC_MACHINE = 'vtxtv'
+    _HOST = 'vtxtv.ch'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.vtxtv.ch/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class MyVisionTVIE(ZattooIE):
+    _NETRC_MACHINE = 'myvisiontv'
+    _HOST = 'myvisiontv.ch'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.myvisiontv.ch/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class GlattvisionTVIE(ZattooIE):
+    _NETRC_MACHINE = 'glattvisiontv'
+    _HOST = 'iptv.glattvision.ch'
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://iptv.glattvision.ch/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class SAKTVIE(ZattooIE):
+    _NETRC_MACHINE = 'saktv'
+    _HOST = 'saktv.ch'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.saktv.ch/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class EWETVIE(ZattooIE):
+    _NETRC_MACHINE = 'ewetv'
+    _HOST = 'tvonline.ewe.de'
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://tvonline.ewe.de/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class QuantumTVIE(ZattooIE):
+    _NETRC_MACHINE = 'quantumtv'
+    _HOST = 'quantum-tv.com'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.quantum-tv.com/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class OsnatelTVIE(ZattooIE):
+    _NETRC_MACHINE = 'osnateltv'
+    _HOST = 'tvonline.osnatel.de'
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://tvonline.osnatel.de/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class EinsUndEinsTVIE(ZattooIE):
+    _NETRC_MACHINE = '1und1tv'
+    _HOST = '1und1.tv'
+    _API_HOST = 'www.%s' % _HOST
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://www.1und1.tv/watch/abc/123-abc',
+        'only_matching': True,
+    }]
+
+
+class SaltTVIE(ZattooIE):
+    _NETRC_MACHINE = 'salttv'
+    _HOST = 'tv.salt.ch'
+    _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST)
+
+    _TESTS = [{
+        'url': 'https://tv.salt.ch/watch/abc/123-abc',
+        'only_matching': True,
+    }]
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
new file mode 100644
index 0000000..7b5ad4a
--- /dev/null
+++ b/youtube_dl/extractor/zdf.py
@@ -0,0 +1,332 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    NO_DEFAULT,
+    orderedSet,
+    parse_codecs,
+    qualities,
+    try_get,
+    unified_timestamp,
+    update_url_query,
+    url_or_none,
+    urljoin,
+)
+
+
+class ZDFBaseIE(InfoExtractor):
+    def _call_api(self, url, player, referrer, video_id, item):
+        return self._download_json(
+            url, video_id, 'Downloading JSON %s' % item,
+            headers={
+                'Referer': referrer,
+                'Api-Auth': 'Bearer %s' % player['apiToken'],
+            })
+
+    def _extract_player(self, webpage, video_id, fatal=True):
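+        # The player configuration is embedded in the page as JSON in a
+        # data-zdfplayer-jsb attribute, roughly:
+        #   <div ... data-zdfplayer-jsb='{"apiToken": "...", "content": "..."}'>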
+        return self._parse_json(
+            self._search_regex(
+                r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
+                'player JSON', default='{}' if not fatal else NO_DEFAULT,
+                group='json'),
+            video_id)
+
+
+class ZDFIE(ZDFBaseIE):
+    IE_NAME = "ZDF-3sat"
+    _VALID_URL = r'https?://www\.(zdf|3sat)\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html'
+    _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh')
+    _GEO_COUNTRIES = ['DE']
+
+    _TESTS = [{
+        'url': 'https://www.3sat.de/wissen/wissenschaftsdoku/luxusgut-lebensraum-100.html',
+        'info_dict': {
+            'id': 'luxusgut-lebensraum-100',
+            'ext': 'mp4',
+            'title': 'Luxusgut Lebensraum',
+            'description': 'md5:5c09b2f45ac3bc5233d1b50fc543d061',
+            'duration': 2601,
+            'timestamp': 1566497700,
+            'upload_date': '20190822',
+        }
+    }, {
+        'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
+        'info_dict': {
+            'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100',
+            'ext': 'mp4',
+            'title': 'Die Magie der Farben (2/2)',
+            'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
+            'duration': 2615,
+            'timestamp': 1465021200,
+            'upload_date': '20160604',
+        },
+    }, {
+        'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_subtitles(src):
+        subtitles = {}
+        for caption in try_get(src, lambda x: x['captions'], list) or []:
+            subtitle_url = url_or_none(caption.get('uri'))
+            if subtitle_url:
+                lang = caption.get('language', 'deu')
+                subtitles.setdefault(lang, []).append({
+                    'url': subtitle_url,
+                })
+        return subtitles
+
+    def _extract_format(self, video_id, formats, format_urls, meta):
+        format_url = url_or_none(meta.get('url'))
+        if not format_url:
+            return
+        if format_url in format_urls:
+            return
+        format_urls.add(format_url)
+        mime_type = meta.get('mimeType')
+        ext = determine_ext(format_url)
+        if mime_type == 'application/x-mpegURL' or ext == 'm3u8':
+            formats.extend(self._extract_m3u8_formats(
+                format_url, video_id, 'mp4', m3u8_id='hls',
+                entry_protocol='m3u8_native', fatal=False))
+        elif mime_type == 'application/f4m+xml' or ext == 'f4m':
+            formats.extend(self._extract_f4m_formats(
+                update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False))
+        else:
+            f = parse_codecs(meta.get('mimeCodec'))
+            format_id = ['http']
+            for p in (meta.get('type'), meta.get('quality')):
+                if p and isinstance(p, compat_str):
+                    format_id.append(p)
+            f.update({
+                'url': format_url,
+                'format_id': '-'.join(format_id),
+                'format_note': meta.get('quality'),
+                'language': meta.get('language'),
+                'quality': qualities(self._QUALITIES)(meta.get('quality')),
+                'preference': -10,
+            })
+            formats.append(f)
+
+    def _extract_entry(self, url, player, content, video_id):
+        title = content.get('title') or content['teaserHeadline']
+
+        t = content['mainVideoContent']['http://zdf.de/rels/target']
+
+        ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
+
+        if not ptmd_path:
+            ptmd_path = t[
+                'http://zdf.de/rels/streams/ptmd-template'].replace(
+                '{playerId}', 'portal')
+
+        ptmd = self._call_api(
+            urljoin(url, ptmd_path), player, url, video_id, 'metadata')
+
+        formats = []
+        track_uris = set()
+        for p in ptmd['priorityList']:
+            formitaeten = p.get('formitaeten')
+            if not isinstance(formitaeten, list):
+                continue
+            for f in formitaeten:
+                f_qualities = f.get('qualities')
+                if not isinstance(f_qualities, list):
+                    continue
+                for quality in f_qualities:
+                    tracks = try_get(quality, lambda x: x['audio']['tracks'], list)
+                    if not tracks:
+                        continue
+                    for track in tracks:
+                        self._extract_format(
+                            video_id, formats, track_uris, {
+                                'url': track.get('uri'),
+                                'type': f.get('type'),
+                                'mimeType': f.get('mimeType'),
+                                'quality': quality.get('quality'),
+                                'language': track.get('language'),
+                            })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        layouts = try_get(
+            content, lambda x: x['teaserImageRef']['layouts'], dict)
+        if layouts:
+            for layout_key, layout_url in layouts.items():
+                layout_url = url_or_none(layout_url)
+                if not layout_url:
+                    continue
+                thumbnail = {
+                    'url': layout_url,
+                    'format_id': layout_key,
+                }
+                mobj = re.search(r'(?P<width>\d+)x(?P<height>\d+)', layout_key)
+                if mobj:
+                    thumbnail.update({
+                        'width': int(mobj.group('width')),
+                        'height': int(mobj.group('height')),
+                    })
+                thumbnails.append(thumbnail)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': content.get('leadParagraph') or content.get('teasertext'),
+            'duration': int_or_none(t.get('duration')),
+            'timestamp': unified_timestamp(content.get('editorialDate')),
+            'thumbnails': thumbnails,
+            'subtitles': self._extract_subtitles(ptmd),
+            'formats': formats,
+        }
+
+    def _extract_regular(self, url, player, video_id):
+        content = self._call_api(
+            player['content'], player, url, video_id, 'content')
+        return self._extract_entry(player['content'], player, content, video_id)
+
+    def _extract_mobile(self, video_id):
+        document = self._download_json(
+            'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
+            video_id)['document']
+
+        title = document['titel']
+
+        formats = []
+        format_urls = set()
+        for f in document['formitaeten']:
+            self._extract_format(video_id, formats, format_urls, f)
+        self._sort_formats(formats)
+
+        thumbnails = []
+        teaser_bild = document.get('teaserBild')
+        if isinstance(teaser_bild, dict):
+            for thumbnail_key, thumbnail in teaser_bild.items():
+                thumbnail_url = try_get(
+                    thumbnail, lambda x: x['url'], compat_str)
+                if thumbnail_url:
+                    thumbnails.append({
+                        'url': thumbnail_url,
+                        'id': thumbnail_key,
+                        'width': int_or_none(thumbnail.get('width')),
+                        'height': int_or_none(thumbnail.get('height')),
+                    })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': document.get('beschreibung'),
+            'duration': int_or_none(document.get('length')),
+            'timestamp': unified_timestamp(try_get(
+                document, lambda x: x['meta']['editorialDate'], compat_str)),
+            'thumbnails': thumbnails,
+            'subtitles': self._extract_subtitles(document),
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id, fatal=False)
+        if webpage:
+            player = self._extract_player(webpage, url, fatal=False)
+            if player:
+                return self._extract_regular(url, player, video_id)
+
+        return self._extract_mobile(video_id)
+
+
+class ZDFChannelIE(ZDFBaseIE):
+    _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio',
+        'info_dict': {
+            'id': 'das-aktuelle-sportstudio',
+            'title': 'das aktuelle sportstudio | ZDF',
+        },
+        'playlist_mincount': 23,
+    }, {
+        'url': 'https://www.zdf.de/dokumentation/planet-e',
+        'info_dict': {
+            'id': 'planet-e',
+            'title': 'planet e.',
+        },
+        'playlist_mincount': 50,
+    }, {
+        'url': 'https://www.zdf.de/filme/taunuskrimi/',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, channel_id)
+
+        entries = [
+            self.url_result(item_url, ie=ZDFIE.ie_key())
+            for item_url in orderedSet(re.findall(
+                r'data-plusbar-url=["\'](http.+?\.html)', webpage))]
+
+        return self.playlist_result(
+            entries, channel_id, self._og_search_title(webpage, fatal=False))
+
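+        # Unreachable reference implementation of API-based channel
+        # extraction, apparently kept below for future use: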
+        r"""
+        player = self._extract_player(webpage, channel_id)
+
+        channel_id = self._search_regex(
+            r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage,
+            'channel id', group='id')
+
+        channel = self._call_api(
+            'https://api.zdf.de/content/documents/%s.json' % channel_id,
+            player, url, channel_id)
+
+        items = []
+        for module in channel['module']:
+            for teaser in try_get(module, lambda x: x['teaser'], list) or []:
+                t = try_get(
+                    teaser, lambda x: x['http://zdf.de/rels/target'], dict)
+                if not t:
+                    continue
+                items.extend(try_get(
+                    t,
+                    lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'],
+                    list) or [])
+            items.extend(try_get(
+                module,
+                lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'],
+                list) or [])
+
+        entries = []
+        entry_urls = set()
+        for item in items:
+            t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict)
+            if not t:
+                continue
+            sharing_url = t.get('http://zdf.de/rels/sharing-url')
+            if not sharing_url or not isinstance(sharing_url, compat_str):
+                continue
+            if sharing_url in entry_urls:
+                continue
+            entry_urls.add(sharing_url)
+            entries.append(self.url_result(
+                sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id')))
+
+        return self.playlist_result(entries, channel_id, channel.get('title'))
+        """
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
new file mode 100644
index 0000000..adfdcaa
--- /dev/null
+++ b/youtube_dl/extractor/zingmp3.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    update_url_query,
+)
+
+
+class ZingMp3BaseInfoExtractor(InfoExtractor):
+
+    def _extract_item(self, item, page_type, fatal=True):
+        error_message = item.get('msg')
+        if error_message:
+            if not fatal:
+                return
+            raise ExtractorError(
+                '%s returned error: %s' % (self.IE_NAME, error_message),
+                expected=True)
+
+        formats = []
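+        # Quality labels and stream URLs arrive as parallel lists (the older
+        # API variant uses 'quality'/'source'); pair them up positionally.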
+        qualities = item.get('qualities') or item.get('quality', [])
+        sources = item.get('source_list') or item.get('source', [])
+        for quality, source_url in zip(qualities, sources):
+            if not source_url or source_url == 'require vip':
+                continue
+            if not re.match(r'https?://', source_url):
+                source_url = '//' + source_url
+            source_url = self._proto_relative_url(source_url, 'http:')
+            quality_num = int_or_none(quality)
+            f = {
+                'format_id': quality,
+                'url': source_url,
+            }
+            if page_type == 'video':
+                f.update({
+                    'height': quality_num,
+                    'ext': 'mp4',
+                })
+            else:
+                f.update({
+                    'abr': quality_num,
+                    'ext': 'mp3',
+                })
+            formats.append(f)
+
+        cover = item.get('cover')
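+        # cover appears to be host-absolute ('/host/path'), so prefixing the
+        # single-slash 'http:/' below yields a full http:// URL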
+
+        return {
+            'title': (item.get('name') or item.get('title')).strip(),
+            'formats': formats,
+            'thumbnail': 'http:/' + cover if cover else None,
+            'artist': item.get('artist'),
+        }
+
+    def _extract_player_json(self, player_json_url, page_id, page_type, playlist_title=None):
+        player_json = self._download_json(player_json_url, page_id, 'Downloading Player JSON')
+        items = player_json['data']
+        if 'item' in items:
+            items = items['item']
+
+        if len(items) == 1:
+            # a single song
+            data = self._extract_item(items[0], page_type)
+            data['id'] = page_id
+
+            return data
+        else:
+            # a playlist of songs
+            entries = []
+
+            for i, item in enumerate(items, 1):
+                entry = self._extract_item(item, page_type, fatal=False)
+                if not entry:
+                    continue
+                entry['id'] = '%s-%d' % (page_id, i)
+                entries.append(entry)
+
+            return {
+                '_type': 'playlist',
+                'id': page_id,
+                'title': playlist_title,
+                'entries': entries,
+            }
+
+
+class ZingMp3IE(ZingMp3BaseInfoExtractor):
+    _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html'
+    _TESTS = [{
+        'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
+        'md5': 'ead7ae13693b3205cbc89536a077daed',
+        'info_dict': {
+            'id': 'ZWZB9WAB',
+            'title': 'Xa Mãi Xa',
+            'ext': 'mp3',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html',
+        'md5': '870295a9cd8045c0e15663565902618d',
+        'info_dict': {
+            'id': 'ZW6BAEA0',
+            'title': 'Let It Go (Frozen OST)',
+            'ext': 'mp4',
+        },
+    }, {
+        'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
+        'info_dict': {
+            '_type': 'playlist',
+            'id': 'ZWZBWDAF',
+            'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless',
+        },
+        'playlist_count': 10,
+        'skip': 'removed at the request of the owner',
+    }, {
+        'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
+        'only_matching': True,
+    }]
+    IE_NAME = 'zingmp3'
+    IE_DESC = 'mp3.zing.vn'
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+
+        player_json_url = self._search_regex([
+            r'data-xml="([^"]+)',
+            r'&amp;xmlURL=([^&]+)&'
+        ], webpage, 'player xml url')
+
+        playlist_title = None
+        page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type')
+        if page_type == 'video':
+            player_json_url = update_url_query(player_json_url, {'format': 'json'})
+        else:
+            player_json_url = player_json_url.replace('/xml/', '/html5xml/')
+            if page_type == 'album':
+                playlist_title = self._og_search_title(webpage)
+
+        return self._extract_player_json(player_json_url, page_id, page_type, playlist_title)
diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py
new file mode 100644
index 0000000..2e2e97a
--- /dev/null
+++ b/youtube_dl/extractor/zype.py
@@ -0,0 +1,134 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    dict_get,
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+)
+
+
+class ZypeIE(InfoExtractor):
+    _ID_RE = r'[\da-fA-F]+'
+    _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)='
+    _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE))
+    _TEST = {
+        'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false',
+        'md5': 'eaee31d474c76a955bdaba02a505c595',
+        'info_dict': {
+            'id': '5b400b834b32992a310622b9',
+            'ext': 'mp4',
+            'title': 'Smoky Barbecue Favorites',
+            'thumbnail': r're:^https?://.*\.jpe?g',
+            'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
+            'timestamp': 1504915200,
+            'upload_date': '20170909',
+        },
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
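+        # Collects embedded player script tags of the form
+        #   <script src="https://player.zype.com/embed/<id>.js?api_key=...">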
+        return [
+            mobj.group('url')
+            for mobj in re.finditer(
+                r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE),
+                webpage)]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        try:
+            response = self._download_json(re.sub(
+                r'\.(?:js|html)\?', '.json?', url), video_id)['response']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403):
+                raise ExtractorError(self._parse_json(
+                    e.cause.read().decode(), video_id)['message'], expected=True)
+            raise
+
+        body = response['body']
+        video = response['video']
+        title = video['title']
+
+        if isinstance(body, dict):
+            formats = []
+            for output in body.get('outputs', []):
+                output_url = output.get('url')
+                if not output_url:
+                    continue
+                name = output.get('name')
+                if name == 'm3u8':
+                    formats = self._extract_m3u8_formats(
+                        output_url, video_id, 'mp4',
+                        'm3u8_native', m3u8_id='hls', fatal=False)
+                else:
+                    f = {
+                        'format_id': name,
+                        'tbr': int_or_none(output.get('bitrate')),
+                        'url': output_url,
+                    }
+                    if name in ('m4a', 'mp3'):
+                        f['vcodec'] = 'none'
+                    else:
+                        f.update({
+                            'height': int_or_none(output.get('height')),
+                            'width': int_or_none(output.get('width')),
+                        })
+                    formats.append(f)
+            text_tracks = body.get('subtitles') or []
+        else:
+            m3u8_url = self._search_regex(
+                r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
+                body, 'm3u8 url', group='url')
+            formats = self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+            text_tracks = self._search_regex(
+                r'textTracks\s*:\s*(\[[^]]+\])',
+                body, 'text tracks', default=None)
+            if text_tracks:
+                text_tracks = self._parse_json(
+                    text_tracks, video_id, js_to_json, False)
+        self._sort_formats(formats)
+
+        subtitles = {}
+        if text_tracks:
+            for text_track in text_tracks:
+                tt_url = dict_get(text_track, ('file', 'src'))
+                if not tt_url:
+                    continue
+                subtitles.setdefault(text_track.get('label') or 'English', []).append({
+                    'url': tt_url,
+                })
+
+        thumbnails = []
+        for thumbnail in video.get('thumbnails', []):
+            thumbnail_url = thumbnail.get('url')
+            if not thumbnail_url:
+                continue
+            thumbnails.append({
+                'url': thumbnail_url,
+                'width': int_or_none(thumbnail.get('width')),
+                'height': int_or_none(thumbnail.get('height')),
+            })
+
+        return {
+            'id': video_id,
+            'display_id': video.get('friendly_title'),
+            'title': title,
+            'thumbnails': thumbnails,
+            'description': dict_get(video, ('description', 'ott_description', 'short_description')),
+            'timestamp': parse_iso8601(video.get('published_at')),
+            'duration': int_or_none(video.get('duration')),
+            'view_count': int_or_none(video.get('request_count')),
+            'average_rating': int_or_none(video.get('rating')),
+            'season_number': int_or_none(video.get('season')),
+            'episode_number': int_or_none(video.get('episode')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
new file mode 100644
index 0000000..7bda596
--- /dev/null
+++ b/youtube_dl/jsinterp.py
@@ -0,0 +1,262 @@
+from __future__ import unicode_literals
+
+import json
+import operator
+import re
+
+from .utils import (
+    ExtractorError,
+    remove_quotes,
+)
+
+_OPERATORS = [
+    ('|', operator.or_),
+    ('^', operator.xor),
+    ('&', operator.and_),
+    ('>>', operator.rshift),
+    ('<<', operator.lshift),
+    ('-', operator.sub),
+    ('+', operator.add),
+    ('%', operator.mod),
+    ('/', operator.truediv),
+    ('*', operator.mul),
+]
+_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS]
+_ASSIGN_OPERATORS.append(('=', lambda cur, right: right))
+
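
Each entry in _ASSIGN_OPERATORS pairs a compound token with the plain operator's function, and bare '=' simply returns the right-hand side. A quick illustrative check against the list order above:

    # index 5 corresponds to '-' in _OPERATORS, hence ('-=', operator.sub)
    op, opfunc = _ASSIGN_OPERATORS[5]
    assert op == '-=' and opfunc(7, 2) == 5
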
+_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'
+
+
+class JSInterpreter(object):
+    def __init__(self, code, objects=None):
+        if objects is None:
+            objects = {}
+        self.code = code
+        self._functions = {}
+        self._objects = objects
+
+    def interpret_statement(self, stmt, local_vars, allow_recursion=100):
+        if allow_recursion < 0:
+            raise ExtractorError('Recursion limit reached')
+
+        should_abort = False
+        stmt = stmt.lstrip()
+        stmt_m = re.match(r'var\s', stmt)
+        if stmt_m:
+            expr = stmt[len(stmt_m.group(0)):]
+        else:
+            return_m = re.match(r'return(?:\s+|$)', stmt)
+            if return_m:
+                expr = stmt[len(return_m.group(0)):]
+                should_abort = True
+            else:
+                # Try interpreting it as an expression
+                expr = stmt
+
+        v = self.interpret_expression(expr, local_vars, allow_recursion)
+        return v, should_abort
+
+    def interpret_expression(self, expr, local_vars, allow_recursion):
+        expr = expr.strip()
+        if expr == '':  # Empty expression
+            return None
+
+        if expr.startswith('('):
+            parens_count = 0
+            for m in re.finditer(r'[()]', expr):
+                if m.group(0) == '(':
+                    parens_count += 1
+                else:
+                    parens_count -= 1
+                    if parens_count == 0:
+                        sub_expr = expr[1:m.start()]
+                        sub_result = self.interpret_expression(
+                            sub_expr, local_vars, allow_recursion)
+                        remaining_expr = expr[m.end():].strip()
+                        if not remaining_expr:
+                            return sub_result
+                        else:
+                            expr = json.dumps(sub_result) + remaining_expr
+                        break
+            else:
+                raise ExtractorError('Premature end of parens in %r' % expr)
+
+        for op, opfunc in _ASSIGN_OPERATORS:
+            m = re.match(r'''(?x)
+                (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
+                \s*%s
+                (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
+            if not m:
+                continue
+            right_val = self.interpret_expression(
+                m.group('expr'), local_vars, allow_recursion - 1)
+
+            if m.groupdict().get('index'):
+                lvar = local_vars[m.group('out')]
+                idx = self.interpret_expression(
+                    m.group('index'), local_vars, allow_recursion)
+                assert isinstance(idx, int)
+                cur = lvar[idx]
+                val = opfunc(cur, right_val)
+                lvar[idx] = val
+                return val
+            else:
+                cur = local_vars.get(m.group('out'))
+                val = opfunc(cur, right_val)
+                local_vars[m.group('out')] = val
+                return val
+
+        if expr.isdigit():
+            return int(expr)
+
+        var_m = re.match(
+            r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE,
+            expr)
+        if var_m:
+            return local_vars[var_m.group('name')]
+
+        try:
+            return json.loads(expr)
+        except ValueError:
+            pass
+
+        m = re.match(
+            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(
+                m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+
+        m = re.match(
+            r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+            expr)
+        if m:
+            variable = m.group('var')
+            member = remove_quotes(m.group('member') or m.group('member2'))
+            arg_str = m.group('args')
+
+            if variable in local_vars:
+                obj = local_vars[variable]
+            else:
+                if variable not in self._objects:
+                    self._objects[variable] = self.extract_object(variable)
+                obj = self._objects[variable]
+
+            if arg_str is None:
+                # Member access
+                if member == 'length':
+                    return len(obj)
+                return obj[member]
+
+            assert expr.endswith(')')
+            # Function call
+            if arg_str == '':
+                argvals = tuple()
+            else:
+                argvals = tuple([
+                    self.interpret_expression(v, local_vars, allow_recursion)
+                    for v in arg_str.split(',')])
+
+            if member == 'split':
+                assert argvals == ('',)
+                return list(obj)
+            if member == 'join':
+                assert len(argvals) == 1
+                return argvals[0].join(obj)
+            if member == 'reverse':
+                assert len(argvals) == 0
+                obj.reverse()
+                return obj
+            if member == 'slice':
+                assert len(argvals) == 1
+                return obj[argvals[0]:]
+            if member == 'splice':
+                assert isinstance(obj, list)
+                index, howMany = argvals
+                res = []
+                for i in range(index, min(index + howMany, len(obj))):
+                    res.append(obj.pop(index))
+                return res
+
+            return obj[member](argvals)
+
+        for op, opfunc in _OPERATORS:
+            m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
+            if not m:
+                continue
+            x, abort = self.interpret_statement(
+                m.group('x'), local_vars, allow_recursion - 1)
+            if abort:
+                raise ExtractorError(
+                    'Premature left-side return of %s in %r' % (op, expr))
+            y, abort = self.interpret_statement(
+                m.group('y'), local_vars, allow_recursion - 1)
+            if abort:
+                raise ExtractorError(
+                    'Premature right-side return of %s in %r' % (op, expr))
+            return opfunc(x, y)
+
+        m = re.match(
+            r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
+        if m:
+            fname = m.group('func')
+            argvals = tuple([
+                int(v) if v.isdigit() else local_vars[v]
+                for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple()
+            if fname not in self._functions:
+                self._functions[fname] = self.extract_function(fname)
+            return self._functions[fname](argvals)
+
+        raise ExtractorError('Unsupported JS expression %r' % expr)
+
+    def extract_object(self, objname):
+        _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
+        obj = {}
+        obj_m = re.search(
+            r'''(?x)
+                (?<!this\.)%s\s*=\s*{\s*
+                    (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
+                }\s*;
+            ''' % (re.escape(objname), _FUNC_NAME_RE),
+            self.code)
+        fields = obj_m.group('fields')
+        # Currently, it only supports function definitions
+        fields_m = re.finditer(
+            r'''(?x)
+                (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
+            ''' % _FUNC_NAME_RE,
+            fields)
+        for f in fields_m:
+            argnames = f.group('args').split(',')
+            obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
+
+        return obj
+
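
An illustrative call for extract_object (the object literal is made up): it returns a dict of callables, each of which — see build_function below — takes a single tuple of arguments:

    jsi = JSInterpreter('var ab={de:function(a){return a[0]}};')
    fn = jsi.extract_object('ab')['de']
    assert fn(('xy',)) == 'x'  # the function indexes its first argument
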
+    def extract_function(self, funcname):
+        func_m = re.search(
+            r'''(?x)
+                (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
+                \((?P<args>[^)]*)\)\s*
+                \{(?P<code>[^}]+)\}''' % (
+                re.escape(funcname), re.escape(funcname), re.escape(funcname)),
+            self.code)
+        if func_m is None:
+            raise ExtractorError('Could not find JS function %r' % funcname)
+        argnames = func_m.group('args').split(',')
+
+        return self.build_function(argnames, func_m.group('code'))
+
+    def call_function(self, funcname, *args):
+        f = self.extract_function(funcname)
+        return f(args)
+
+    def build_function(self, argnames, code):
+        def resf(args):
+            local_vars = dict(zip(argnames, args))
+            for stmt in code.split(';'):
+                res, abort = self.interpret_statement(stmt, local_vars)
+                if abort:
+                    break
+            return res
+        return resf
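
A rough end-to-end sketch of the interpreter (the JS function body is illustrative, not from any real player script): extract_function compiles a named function into a Python callable, and call_function wraps extraction and invocation in one step:

    from youtube_dl.jsinterp import JSInterpreter

    jsi = JSInterpreter('function f(x){var y=x*2;return y+1;}')
    assert jsi.call_function('f', 5) == 11
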
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
new file mode 100644 (file)
index 0000000..2cc5eee
--- /dev/null
@@ -0,0 +1,916 @@
+from __future__ import unicode_literals
+
+import os.path
+import optparse
+import re
+import sys
+
+from .downloader.external import list_external_downloaders
+from .compat import (
+    compat_expanduser,
+    compat_get_terminal_size,
+    compat_getenv,
+    compat_kwargs,
+    compat_shlex_split,
+)
+from .utils import (
+    preferredencoding,
+    write_string,
+)
+from .version import __version__
+
+
+def _hide_login_info(opts):
+    PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
+    eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+    def _scrub_eq(o):
+        m = eqre.match(o)
+        if m:
+            return m.group('key') + '=PRIVATE'
+        else:
+            return o
+
+    opts = list(map(_scrub_eq, opts))
+    for idx, opt in enumerate(opts):
+        if opt in PRIVATE_OPTS and idx + 1 < len(opts):
+            opts[idx + 1] = 'PRIVATE'
+    return opts
+
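
A quick illustrative check of the scrubbing above (the credentials are made up): both the separate-argument and the --opt=value spellings get masked:

    assert _hide_login_info(['-u', 'alice', '--password=hunter2', '-v']) == [
        '-u', 'PRIVATE', '--password=PRIVATE', '-v']
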
+
+def parseOpts(overrideArguments=None):
+    def _readOptions(filename_bytes, default=[]):
+        try:
+            optionf = open(filename_bytes)
+        except IOError:
+            return default  # silently skip if file is not present
+        try:
+            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
+            contents = optionf.read()
+            if sys.version_info < (3,):
+                contents = contents.decode(preferredencoding())
+            res = compat_shlex_split(contents, comments=True)
+        finally:
+            optionf.close()
+        return res
+
+    def _readUserConf():
+        xdg_config_home = compat_getenv('XDG_CONFIG_HOME')
+        if xdg_config_home:
+            userConfFile = os.path.join(xdg_config_home, 'youtube-dlc', 'config')
+            if not os.path.isfile(userConfFile):
+                userConfFile = os.path.join(xdg_config_home, 'youtube-dlc.conf')
+        else:
+            userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dlc', 'config')
+            if not os.path.isfile(userConfFile):
+                userConfFile = os.path.join(compat_expanduser('~'), '.config', 'youtube-dlc.conf')
+        userConf = _readOptions(userConfFile, None)
+
+        if userConf is None:
+            appdata_dir = compat_getenv('appdata')
+            if appdata_dir:
+                userConf = _readOptions(
+                    os.path.join(appdata_dir, 'youtube-dlc', 'config'),
+                    default=None)
+                if userConf is None:
+                    userConf = _readOptions(
+                        os.path.join(appdata_dir, 'youtube-dlc', 'config.txt'),
+                        default=None)
+
+        if userConf is None:
+            userConf = _readOptions(
+                os.path.join(compat_expanduser('~'), 'youtube-dlc.conf'),
+                default=None)
+        if userConf is None:
+            userConf = _readOptions(
+                os.path.join(compat_expanduser('~'), 'youtube-dlc.conf.txt'),
+                default=None)
+
+        if userConf is None:
+            userConf = []
+
+        return userConf
+
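
Summarizing the cascade above, the user config lookup tries, in order (a descriptive sketch, not executable in this module):

    # 1. $XDG_CONFIG_HOME/youtube-dlc/config, else $XDG_CONFIG_HOME/youtube-dlc.conf
    #    (with ~/.config substituted when XDG_CONFIG_HOME is unset)
    # 2. %APPDATA%/youtube-dlc/config, then %APPDATA%/youtube-dlc/config.txt
    # 3. ~/youtube-dlc.conf, then ~/youtube-dlc.conf.txt
    # 4. otherwise an empty option list
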
+    def _format_option_string(option):
+        ''' ('-o', '--option') -> -o, --option METAVAR'''
+
+        opts = []
+
+        if option._short_opts:
+            opts.append(option._short_opts[0])
+        if option._long_opts:
+            opts.append(option._long_opts[0])
+        if len(opts) > 1:
+            opts.insert(1, ', ')
+
+        if option.takes_value():
+            opts.append(' %s' % option.metavar)
+
+        return ''.join(opts)
+
+    def _comma_separated_values_options_callback(option, opt_str, value, parser):
+        setattr(parser.values, option.dest, value.split(','))
+
+    # No need to wrap help messages if we're on a wide console
+    columns = compat_get_terminal_size().columns
+    max_width = columns if columns else 80
+    max_help_position = 80
+
+    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
+    fmt.format_option_strings = _format_option_string
+
+    kw = {
+        'version': __version__,
+        'formatter': fmt,
+        'usage': '%prog [OPTIONS] URL [URL...]',
+        'conflict_handler': 'resolve',
+    }
+
+    parser = optparse.OptionParser(**compat_kwargs(kw))
+
+    general = optparse.OptionGroup(parser, 'General Options')
+    general.add_option(
+        '-h', '--help',
+        action='help',
+        help='Print this help text and exit')
+    general.add_option(
+        '--version',
+        action='version',
+        help='Print program version and exit')
+    general.add_option(
+        '-U', '--update',
+        action='store_true', dest='update_self',
+        help='Update this program to the latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
+    general.add_option(
+        '-i', '--ignore-errors',
+        action='store_true', dest='ignoreerrors', default=False,
+        help='Continue on download errors, for example to skip unavailable videos in a playlist')
+    general.add_option(
+        '--abort-on-error',
+        action='store_false', dest='ignoreerrors',
+        help='Abort downloading of further videos (in the playlist or the command line) if an error occurs')
+    general.add_option(
+        '--dump-user-agent',
+        action='store_true', dest='dump_user_agent', default=False,
+        help='Display the current browser identification')
+    general.add_option(
+        '--list-extractors',
+        action='store_true', dest='list_extractors', default=False,
+        help='List all supported extractors')
+    general.add_option(
+        '--extractor-descriptions',
+        action='store_true', dest='list_extractor_descriptions', default=False,
+        help='Output descriptions of all supported extractors')
+    general.add_option(
+        '--force-generic-extractor',
+        action='store_true', dest='force_generic_extractor', default=False,
+        help='Force extraction to use the generic extractor')
+    general.add_option(
+        '--default-search',
+        dest='default_search', metavar='PREFIX',
+        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dlc "large apple". Use the value "auto" to let youtube-dlc guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching.')
+    general.add_option(
+        '--ignore-config',
+        action='store_true',
+        help='Do not read configuration files. '
+        'When given in the global configuration file /etc/youtube-dlc.conf: '
+        'Do not read the user configuration in ~/.config/youtube-dlc/config '
+        '(%APPDATA%/youtube-dlc/config.txt on Windows)')
+    general.add_option(
+        '--config-location',
+        dest='config_location', metavar='PATH',
+        help='Location of the configuration file; either the path to the config or its containing directory.')
+    general.add_option(
+        '--flat-playlist',
+        action='store_const', dest='extract_flat', const='in_playlist',
+        default=False,
+        help='Do not extract the videos of a playlist, only list them.')
+    general.add_option(
+        '--mark-watched',
+        action='store_true', dest='mark_watched', default=False,
+        help='Mark videos watched (YouTube only)')
+    general.add_option(
+        '--no-mark-watched',
+        action='store_false', dest='mark_watched', default=False,
+        help='Do not mark videos watched (YouTube only)')
+    general.add_option(
+        '--no-color', '--no-colors',
+        action='store_true', dest='no_color',
+        default=False,
+        help='Do not emit color codes in output')
+
+    network = optparse.OptionGroup(parser, 'Network Options')
+    network.add_option(
+        '--proxy', dest='proxy',
+        default=None, metavar='URL',
+        help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable '
+             'SOCKS proxy, specify a proper scheme. For example '
+             'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
+             'for direct connection')
+    network.add_option(
+        '--socket-timeout',
+        dest='socket_timeout', type=float, default=None, metavar='SECONDS',
+        help='Time to wait before giving up, in seconds')
+    network.add_option(
+        '--source-address',
+        metavar='IP', dest='source_address', default=None,
+        help='Client-side IP address to bind to',
+    )
+    network.add_option(
+        '-4', '--force-ipv4',
+        action='store_const', const='0.0.0.0', dest='source_address',
+        help='Make all connections via IPv4',
+    )
+    network.add_option(
+        '-6', '--force-ipv6',
+        action='store_const', const='::', dest='source_address',
+        help='Make all connections via IPv6',
+    )
+
+    geo = optparse.OptionGroup(parser, 'Geo Restriction')
+    geo.add_option(
+        '--geo-verification-proxy',
+        dest='geo_verification_proxy', default=None, metavar='URL',
+        help='Use this proxy to verify the IP address for some geo-restricted sites. '
+        'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading.')
+    geo.add_option(
+        '--cn-verification-proxy',
+        dest='cn_verification_proxy', default=None, metavar='URL',
+        help=optparse.SUPPRESS_HELP)
+    geo.add_option(
+        '--geo-bypass',
+        action='store_true', dest='geo_bypass', default=True,
+        help='Bypass geographic restriction via faking X-Forwarded-For HTTP header')
+    geo.add_option(
+        '--no-geo-bypass',
+        action='store_false', dest='geo_bypass', default=True,
+        help='Do not bypass geographic restriction via faking X-Forwarded-For HTTP header')
+    geo.add_option(
+        '--geo-bypass-country', metavar='CODE',
+        dest='geo_bypass_country', default=None,
+        help='Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 country code')
+    geo.add_option(
+        '--geo-bypass-ip-block', metavar='IP_BLOCK',
+        dest='geo_bypass_ip_block', default=None,
+        help='Force bypass geographic restriction with explicitly provided IP block in CIDR notation')
+
+    selection = optparse.OptionGroup(parser, 'Video Selection')
+    selection.add_option(
+        '--playlist-start',
+        dest='playliststart', metavar='NUMBER', default=1, type=int,
+        help='Playlist video to start at (default is %default)')
+    selection.add_option(
+        '--playlist-end',
+        dest='playlistend', metavar='NUMBER', default=None, type=int,
+        help='Playlist video to end at (default is last)')
+    selection.add_option(
+        '--playlist-items',
+        dest='playlist_items', metavar='ITEM_SPEC', default=None,
+        help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can also specify ranges: "--playlist-items 1-3,7,10-13" will download the videos at indices 1, 2, 3, 7, 10, 11, 12 and 13.')
+    selection.add_option(
+        '--match-title',
+        dest='matchtitle', metavar='REGEX',
+        help='Download only matching titles (regex or caseless sub-string)')
+    selection.add_option(
+        '--reject-title',
+        dest='rejecttitle', metavar='REGEX',
+        help='Skip download for matching titles (regex or caseless sub-string)')
+    selection.add_option(
+        '--max-downloads',
+        dest='max_downloads', metavar='NUMBER', type=int, default=None,
+        help='Abort after downloading NUMBER files')
+    selection.add_option(
+        '--min-filesize',
+        metavar='SIZE', dest='min_filesize', default=None,
+        help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)')
+    selection.add_option(
+        '--max-filesize',
+        metavar='SIZE', dest='max_filesize', default=None,
+        help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)')
+    selection.add_option(
+        '--date',
+        metavar='DATE', dest='date', default=None,
+        help='Download only videos uploaded on this date')
+    selection.add_option(
+        '--datebefore',
+        metavar='DATE', dest='datebefore', default=None,
+        help='Download only videos uploaded on or before this date (i.e. inclusive)')
+    selection.add_option(
+        '--dateafter',
+        metavar='DATE', dest='dateafter', default=None,
+        help='Download only videos uploaded on or after this date (i.e. inclusive)')
+    selection.add_option(
+        '--min-views',
+        metavar='COUNT', dest='min_views', default=None, type=int,
+        help='Do not download any videos with fewer than COUNT views')
+    selection.add_option(
+        '--max-views',
+        metavar='COUNT', dest='max_views', default=None, type=int,
+        help='Do not download any videos with more than COUNT views')
+    selection.add_option(
+        '--match-filter',
+        metavar='FILTER', dest='match_filter', default=None,
+        help=(
+            'Generic video filter. '
+            'Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) to '
+            'match if the key is present, '
+            '!key to check if the key is not present, '
+            'key > NUMBER (like "comment_count > 12", also works with '
+            '>=, <, <=, !=, =) to compare against a number, '
+            'key = \'LITERAL\' (like "uploader = \'Mike Smith\'", also works with !=) '
+            'to match against a string literal '
+            'and & to require multiple matches. '
+            'Values which are not known are excluded unless you '
+            'put a question mark (?) after the operator. '
+            'For example, to only match videos that have been liked more than '
+            '100 times and disliked less than 50 times (or the dislike '
+            'functionality is not available at the given service), but which '
+            'also have a description, use --match-filter '
+            '"like_count > 100 & dislike_count <? 50 & description" .'
+        ))
+    selection.add_option(
+        '--no-playlist',
+        action='store_true', dest='noplaylist', default=False,
+        help='Download only the video, if the URL refers to a video and a playlist.')
+    selection.add_option(
+        '--yes-playlist',
+        action='store_false', dest='noplaylist', default=False,
+        help='Download the playlist, if the URL refers to a video and a playlist.')
+    selection.add_option(
+        '--age-limit',
+        metavar='YEARS', dest='age_limit', default=None, type=int,
+        help='Download only videos suitable for the given age')
+    selection.add_option(
+        '--download-archive', metavar='FILE',
+        dest='download_archive',
+        help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
+    selection.add_option(
+        '--include-ads',
+        dest='include_ads', action='store_true',
+        help='Download advertisements as well (experimental)')
+
+    authentication = optparse.OptionGroup(parser, 'Authentication Options')
+    authentication.add_option(
+        '-u', '--username',
+        dest='username', metavar='USERNAME',
+        help='Login with this account ID')
+    authentication.add_option(
+        '-p', '--password',
+        dest='password', metavar='PASSWORD',
+        help='Account password. If this option is left out, youtube-dlc will ask interactively.')
+    authentication.add_option(
+        '-2', '--twofactor',
+        dest='twofactor', metavar='TWOFACTOR',
+        help='Two-factor authentication code')
+    authentication.add_option(
+        '-n', '--netrc',
+        action='store_true', dest='usenetrc', default=False,
+        help='Use .netrc authentication data')
+    authentication.add_option(
+        '--video-password',
+        dest='videopassword', metavar='PASSWORD',
+        help='Video password (vimeo, smotri, youku)')
+
+    adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options')
+    adobe_pass.add_option(
+        '--ap-mso',
+        dest='ap_mso', metavar='MSO',
+        help='Adobe Pass multiple-system operator (TV provider) identifier, use --ap-list-mso for a list of available MSOs')
+    adobe_pass.add_option(
+        '--ap-username',
+        dest='ap_username', metavar='USERNAME',
+        help='Multiple-system operator account login')
+    adobe_pass.add_option(
+        '--ap-password',
+        dest='ap_password', metavar='PASSWORD',
+        help='Multiple-system operator account password. If this option is left out, youtube-dlc will ask interactively.')
+    adobe_pass.add_option(
+        '--ap-list-mso',
+        action='store_true', dest='ap_list_mso', default=False,
+        help='List all supported multiple-system operators')
+
+    video_format = optparse.OptionGroup(parser, 'Video Format Options')
+    video_format.add_option(
+        '-f', '--format',
+        action='store', dest='format', metavar='FORMAT', default=None,
+        help='Video format code, see the "FORMAT SELECTION" for all the info')
+    video_format.add_option(
+        '--all-formats',
+        action='store_const', dest='format', const='all',
+        help='Download all available video formats')
+    video_format.add_option(
+        '--prefer-free-formats',
+        action='store_true', dest='prefer_free_formats', default=False,
+        help='Prefer free video formats unless a specific one is requested')
+    video_format.add_option(
+        '-F', '--list-formats',
+        action='store_true', dest='listformats',
+        help='List all available formats of requested videos')
+    video_format.add_option(
+        '--youtube-include-dash-manifest',
+        action='store_true', dest='youtube_include_dash_manifest', default=True,
+        help=optparse.SUPPRESS_HELP)
+    video_format.add_option(
+        '--youtube-skip-dash-manifest',
+        action='store_false', dest='youtube_include_dash_manifest',
+        help='Do not download the DASH manifests and related data on YouTube videos')
+    video_format.add_option(
+        '--merge-output-format',
+        action='store', dest='merge_output_format', metavar='FORMAT', default=None,
+        help=(
+            'If a merge is required (e.g. bestvideo+bestaudio), '
+            'output to given container format. One of mkv, mp4, ogg, webm, flv. '
+            'Ignored if no merge is required'))
+
+    subtitles = optparse.OptionGroup(parser, 'Subtitle Options')
+    subtitles.add_option(
+        '--write-sub', '--write-srt',
+        action='store_true', dest='writesubtitles', default=False,
+        help='Write subtitle file')
+    subtitles.add_option(
+        '--write-auto-sub', '--write-automatic-sub',
+        action='store_true', dest='writeautomaticsub', default=False,
+        help='Write automatically generated subtitle file (YouTube only)')
+    subtitles.add_option(
+        '--all-subs',
+        action='store_true', dest='allsubtitles', default=False,
+        help='Download all the available subtitles of the video')
+    subtitles.add_option(
+        '--list-subs',
+        action='store_true', dest='listsubtitles', default=False,
+        help='List all available subtitles for the video')
+    subtitles.add_option(
+        '--sub-format',
+        action='store', dest='subtitlesformat', metavar='FORMAT', default='best',
+        help='Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"')
+    subtitles.add_option(
+        '--sub-lang', '--sub-langs', '--srt-lang',
+        action='callback', dest='subtitleslangs', metavar='LANGS', type='str',
+        default=[], callback=_comma_separated_values_options_callback,
+        help='Languages of the subtitles to download (optional), separated by commas. Use --list-subs for available language tags')
+
+    downloader = optparse.OptionGroup(parser, 'Download Options')
+    downloader.add_option(
+        '-r', '--limit-rate', '--rate-limit',
+        dest='ratelimit', metavar='RATE',
+        help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)')
+    downloader.add_option(
+        '-R', '--retries',
+        dest='retries', metavar='RETRIES', default=10,
+        help='Number of retries (default is %default), or "infinite".')
+    downloader.add_option(
+        '--fragment-retries',
+        dest='fragment_retries', metavar='RETRIES', default=10,
+        help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)')
+    downloader.add_option(
+        '--skip-unavailable-fragments',
+        action='store_true', dest='skip_unavailable_fragments', default=True,
+        help='Skip unavailable fragments (DASH, hlsnative and ISM)')
+    downloader.add_option(
+        '--abort-on-unavailable-fragment',
+        action='store_false', dest='skip_unavailable_fragments',
+        help='Abort downloading when some fragment is not available')
+    downloader.add_option(
+        '--keep-fragments',
+        action='store_true', dest='keep_fragments', default=False,
+        help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default')
+    downloader.add_option(
+        '--buffer-size',
+        dest='buffersize', metavar='SIZE', default='1024',
+        help='Size of download buffer (e.g. 1024 or 16K) (default is %default)')
+    downloader.add_option(
+        '--no-resize-buffer',
+        action='store_true', dest='noresizebuffer', default=False,
+        help='Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.')
+    downloader.add_option(
+        '--http-chunk-size',
+        dest='http_chunk_size', metavar='SIZE', default=None,
+        help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). '
+             'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)')
+    downloader.add_option(
+        '--test',
+        action='store_true', dest='test', default=False,
+        help=optparse.SUPPRESS_HELP)
+    downloader.add_option(
+        '--playlist-reverse',
+        action='store_true',
+        help='Download playlist videos in reverse order')
+    downloader.add_option(
+        '--playlist-random',
+        action='store_true',
+        help='Download playlist videos in random order')
+    downloader.add_option(
+        '--xattr-set-filesize',
+        dest='xattr_set_filesize', action='store_true',
+        help='Set file xattribute ytdl.filesize with expected file size')
+    downloader.add_option(
+        '--hls-prefer-native',
+        dest='hls_prefer_native', action='store_true', default=None,
+        help='Use the native HLS downloader instead of ffmpeg')
+    downloader.add_option(
+        '--hls-prefer-ffmpeg',
+        dest='hls_prefer_native', action='store_false', default=None,
+        help='Use ffmpeg instead of the native HLS downloader')
+    downloader.add_option(
+        '--hls-use-mpegts',
+        dest='hls_use_mpegts', action='store_true',
+        help='Use the mpegts container for HLS videos, so that the video '
+             'can be played while it is still downloading (some players may not be able to play it)')
+    downloader.add_option(
+        '--external-downloader',
+        dest='external_downloader', metavar='COMMAND',
+        help='Use the specified external downloader. '
+             'Currently supports %s' % ','.join(list_external_downloaders()))
+    downloader.add_option(
+        '--external-downloader-args',
+        dest='external_downloader_args', metavar='ARGS',
+        help='Give these arguments to the external downloader')
+
+    workarounds = optparse.OptionGroup(parser, 'Workarounds')
+    workarounds.add_option(
+        '--encoding',
+        dest='encoding', metavar='ENCODING',
+        help='Force the specified encoding (experimental)')
+    workarounds.add_option(
+        '--no-check-certificate',
+        action='store_true', dest='no_check_certificate', default=False,
+        help='Suppress HTTPS certificate validation')
+    workarounds.add_option(
+        '--prefer-insecure',
+        '--prefer-unsecure', action='store_true', dest='prefer_insecure',
+        help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
+    workarounds.add_option(
+        '--user-agent',
+        metavar='UA', dest='user_agent',
+        help='Specify a custom user agent')
+    workarounds.add_option(
+        '--referer',
+        metavar='URL', dest='referer', default=None,
+        help='Specify a custom referer, use if the video access is restricted to one domain',
+    )
+    workarounds.add_option(
+        '--add-header',
+        metavar='FIELD:VALUE', dest='headers', action='append',
+        help='Specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times',
+    )
+    workarounds.add_option(
+        '--bidi-workaround',
+        dest='bidi_workaround', action='store_true',
+        help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
+    workarounds.add_option(
+        '--sleep-interval', '--min-sleep-interval', metavar='SECONDS',
+        dest='sleep_interval', type=float,
+        help=(
+            'Number of seconds to sleep before each download when used alone '
+            'or a lower bound of a range for randomized sleep before each download '
+            '(minimum possible number of seconds to sleep) when used along with '
+            '--max-sleep-interval.'))
+    workarounds.add_option(
+        '--max-sleep-interval', metavar='SECONDS',
+        dest='max_sleep_interval', type=float,
+        help=(
+            'Upper bound of a range for randomized sleep before each download '
+            '(maximum possible number of seconds to sleep). Must only be used '
+            'along with --min-sleep-interval.'))
+
+    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
+    verbosity.add_option(
+        '-q', '--quiet',
+        action='store_true', dest='quiet', default=False,
+        help='Activate quiet mode')
+    verbosity.add_option(
+        '--no-warnings',
+        dest='no_warnings', action='store_true', default=False,
+        help='Ignore warnings')
+    verbosity.add_option(
+        '-s', '--simulate',
+        action='store_true', dest='simulate', default=False,
+        help='Do not download the video and do not write anything to disk')
+    verbosity.add_option(
+        '--skip-download',
+        action='store_true', dest='skip_download', default=False,
+        help='Do not download the video')
+    verbosity.add_option(
+        '-g', '--get-url',
+        action='store_true', dest='geturl', default=False,
+        help='Simulate, quiet but print URL')
+    verbosity.add_option(
+        '-e', '--get-title',
+        action='store_true', dest='gettitle', default=False,
+        help='Simulate, quiet but print title')
+    verbosity.add_option(
+        '--get-id',
+        action='store_true', dest='getid', default=False,
+        help='Simulate, quiet but print id')
+    verbosity.add_option(
+        '--get-thumbnail',
+        action='store_true', dest='getthumbnail', default=False,
+        help='Simulate, quiet but print thumbnail URL')
+    verbosity.add_option(
+        '--get-description',
+        action='store_true', dest='getdescription', default=False,
+        help='Simulate, quiet but print video description')
+    verbosity.add_option(
+        '--get-duration',
+        action='store_true', dest='getduration', default=False,
+        help='Simulate, quiet but print video length')
+    verbosity.add_option(
+        '--get-filename',
+        action='store_true', dest='getfilename', default=False,
+        help='Simulate, quiet but print output filename')
+    verbosity.add_option(
+        '--get-format',
+        action='store_true', dest='getformat', default=False,
+        help='Simulate, quiet but print output format')
+    verbosity.add_option(
+        '-j', '--dump-json',
+        action='store_true', dest='dumpjson', default=False,
+        help='Simulate, quiet but print JSON information. See the "OUTPUT TEMPLATE" for a description of available keys.')
+    verbosity.add_option(
+        '-J', '--dump-single-json',
+        action='store_true', dest='dump_single_json', default=False,
+        help='Simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.')
+    verbosity.add_option(
+        '--print-json',
+        action='store_true', dest='print_json', default=False,
+        help='Be quiet and print the video information as JSON (video is still being downloaded).',
+    )
+    verbosity.add_option(
+        '--newline',
+        action='store_true', dest='progress_with_newline', default=False,
+        help='Output progress bar as new lines')
+    verbosity.add_option(
+        '--no-progress',
+        action='store_true', dest='noprogress', default=False,
+        help='Do not print progress bar')
+    verbosity.add_option(
+        '--console-title',
+        action='store_true', dest='consoletitle', default=False,
+        help='Display progress in console titlebar')
+    verbosity.add_option(
+        '-v', '--verbose',
+        action='store_true', dest='verbose', default=False,
+        help='Print various debugging information')
+    verbosity.add_option(
+        '--dump-pages', '--dump-intermediate-pages',
+        action='store_true', dest='dump_intermediate_pages', default=False,
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
+    verbosity.add_option(
+        '--write-pages',
+        action='store_true', dest='write_pages', default=False,
+        help='Write downloaded intermediary pages to files in the current directory to debug problems')
+    verbosity.add_option(
+        '--youtube-print-sig-code',
+        action='store_true', dest='youtube_print_sig_code', default=False,
+        help=optparse.SUPPRESS_HELP)
+    verbosity.add_option(
+        '--print-traffic', '--dump-headers',
+        dest='debug_printtraffic', action='store_true', default=False,
+        help='Display sent and read HTTP traffic')
+    verbosity.add_option(
+        '-C', '--call-home',
+        dest='call_home', action='store_true', default=False,
+        help='Contact the youtube-dlc server for debugging')
+    verbosity.add_option(
+        '--no-call-home',
+        dest='call_home', action='store_false', default=False,
+        help='Do NOT contact the youtube-dlc server for debugging')
+
+    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
+    filesystem.add_option(
+        '-a', '--batch-file',
+        dest='batchfile', metavar='FILE',
+        help="File containing URLs to download ('-' for stdin), one URL per line. "
+             "Lines starting with '#', ';' or ']' are considered as comments and ignored.")
+    filesystem.add_option(
+        '--id', default=False,
+        action='store_true', dest='useid', help='Use only video ID in file name')
+    filesystem.add_option(
+        '-o', '--output',
+        dest='outtmpl', metavar='TEMPLATE',
+        help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info'))
+    filesystem.add_option(
+        '--autonumber-size',
+        dest='autonumber_size', metavar='NUMBER', type=int,
+        help=optparse.SUPPRESS_HELP)
+    filesystem.add_option(
+        '--autonumber-start',
+        dest='autonumber_start', metavar='NUMBER', default=1, type=int,
+        help='Specify the start value for %(autonumber)s (default is %default)')
+    filesystem.add_option(
+        '--restrict-filenames',
+        action='store_true', dest='restrictfilenames', default=False,
+        help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames')
+    filesystem.add_option(
+        '-A', '--auto-number',
+        action='store_true', dest='autonumber', default=False,
+        help=optparse.SUPPRESS_HELP)
+    filesystem.add_option(
+        '-t', '--title',
+        action='store_true', dest='usetitle', default=False,
+        help=optparse.SUPPRESS_HELP)
+    filesystem.add_option(
+        '-l', '--literal', default=False,
+        action='store_true', dest='usetitle',
+        help=optparse.SUPPRESS_HELP)
+    filesystem.add_option(
+        '-w', '--no-overwrites',
+        action='store_true', dest='nooverwrites', default=False,
+        help='Do not overwrite files')
+    filesystem.add_option(
+        '-c', '--continue',
+        action='store_true', dest='continue_dl', default=True,
+        help='Force resume of partially downloaded files. By default, youtube-dlc will resume downloads if possible.')
+    filesystem.add_option(
+        '--no-continue',
+        action='store_false', dest='continue_dl',
+        help='Do not resume partially downloaded files (restart from beginning)')
+    filesystem.add_option(
+        '--no-part',
+        action='store_true', dest='nopart', default=False,
+        help='Do not use .part files - write directly into output file')
+    filesystem.add_option(
+        '--no-mtime',
+        action='store_false', dest='updatetime', default=True,
+        help='Do not use the Last-modified header to set the file modification time')
+    filesystem.add_option(
+        '--write-description',
+        action='store_true', dest='writedescription', default=False,
+        help='Write video description to a .description file')
+    filesystem.add_option(
+        '--write-info-json',
+        action='store_true', dest='writeinfojson', default=False,
+        help='Write video metadata to a .info.json file')
+    filesystem.add_option(
+        '--write-annotations',
+        action='store_true', dest='writeannotations', default=False,
+        help='Write video annotations to a .annotations.xml file')
+    filesystem.add_option(
+        '--load-info-json', '--load-info',
+        dest='load_info_filename', metavar='FILE',
+        help='JSON file containing the video information (created with the "--write-info-json" option)')
+    filesystem.add_option(
+        '--cookies',
+        dest='cookiefile', metavar='FILE',
+        help='File to read cookies from and dump cookie jar in')
+    filesystem.add_option(
+        '--cache-dir', dest='cachedir', default=None, metavar='DIR',
+        help='Location in the filesystem where youtube-dlc can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dlc or ~/.cache/youtube-dlc. At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
+    filesystem.add_option(
+        '--no-cache-dir', action='store_const', const=False, dest='cachedir',
+        help='Disable filesystem caching')
+    filesystem.add_option(
+        '--rm-cache-dir',
+        action='store_true', dest='rm_cachedir',
+        help='Delete all filesystem cache files')
+
+    thumbnail = optparse.OptionGroup(parser, 'Thumbnail images')
+    thumbnail.add_option(
+        '--write-thumbnail',
+        action='store_true', dest='writethumbnail', default=False,
+        help='Write thumbnail image to disk')
+    thumbnail.add_option(
+        '--write-all-thumbnails',
+        action='store_true', dest='write_all_thumbnails', default=False,
+        help='Write all thumbnail image formats to disk')
+    thumbnail.add_option(
+        '--list-thumbnails',
+        action='store_true', dest='list_thumbnails', default=False,
+        help='Simulate and list all available thumbnail formats')
+
+    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
+    postproc.add_option(
+        '-x', '--extract-audio',
+        action='store_true', dest='extractaudio', default=False,
+        help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)')
+    postproc.add_option(
+        '--audio-format', metavar='FORMAT', dest='audioformat', default='best',
+        help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; no effect without -x')
+    postproc.add_option(
+        '--audio-quality', metavar='QUALITY',
+        dest='audioquality', default='5',
+        help='Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)')
+    postproc.add_option(
+        '--recode-video',
+        metavar='FORMAT', dest='recodevideo', default=None,
+        help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)')
+    postproc.add_option(
+        '--postprocessor-args',
+        dest='postprocessor_args', metavar='ARGS',
+        help='Give these arguments to the postprocessor')
+    postproc.add_option(
+        '-k', '--keep-video',
+        action='store_true', dest='keepvideo', default=False,
+        help='Keep the video file on disk after the post-processing; the video is erased by default')
+    postproc.add_option(
+        '--no-post-overwrites',
+        action='store_true', dest='nopostoverwrites', default=False,
+        help='Do not overwrite post-processed files; the post-processed files are overwritten by default')
+    postproc.add_option(
+        '--embed-subs',
+        action='store_true', dest='embedsubtitles', default=False,
+        help='Embed subtitles in the video (only for mp4, webm and mkv videos)')
+    postproc.add_option(
+        '--embed-thumbnail',
+        action='store_true', dest='embedthumbnail', default=False,
+        help='Embed thumbnail in the audio as cover art')
+    postproc.add_option(
+        '--add-metadata',
+        action='store_true', dest='addmetadata', default=False,
+        help='Write metadata to the video file')
+    postproc.add_option(
+        '--metadata-from-title',
+        metavar='FORMAT', dest='metafromtitle',
+        help='Parse additional metadata like song title / artist from the video title. '
+             'The format syntax is the same as --output. Regular expression with '
+             'named capture groups may also be used. '
+             'The parsed parameters replace existing values. '
+             'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
+             '"Coldplay - Paradise". '
+             'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"')
+    postproc.add_option(
+        '--xattrs',
+        action='store_true', dest='xattrs', default=False,
+        help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)')
+    postproc.add_option(
+        '--fixup',
+        metavar='POLICY', dest='fixup', default='detect_or_warn',
+        help='Automatically correct known faults of the file. '
+             'One of never (do nothing), warn (only emit a warning), '
+             'detect_or_warn (the default; fix file if we can, warn otherwise)')
+    postproc.add_option(
+        '--prefer-avconv',
+        action='store_false', dest='prefer_ffmpeg',
+        help='Prefer avconv over ffmpeg for running the postprocessors')
+    postproc.add_option(
+        '--prefer-ffmpeg',
+        action='store_true', dest='prefer_ffmpeg',
+        help='Prefer ffmpeg over avconv for running the postprocessors (default)')
+    postproc.add_option(
+        '--ffmpeg-location', '--avconv-location', metavar='PATH',
+        dest='ffmpeg_location',
+        help='Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory.')
+    postproc.add_option(
+        '--exec',
+        metavar='CMD', dest='exec_cmd',
+        help='Execute a command on the file after downloading and post-processing, similar to find\'s -exec syntax. Example: --exec \'adb push {} /sdcard/Music/ && rm {}\'')
+    postproc.add_option(
+        '--convert-subs', '--convert-subtitles',
+        metavar='FORMAT', dest='convertsubtitles', default=None,
+        help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)')
+
+    parser.add_option_group(general)
+    parser.add_option_group(network)
+    parser.add_option_group(geo)
+    parser.add_option_group(selection)
+    parser.add_option_group(downloader)
+    parser.add_option_group(filesystem)
+    parser.add_option_group(thumbnail)
+    parser.add_option_group(verbosity)
+    parser.add_option_group(workarounds)
+    parser.add_option_group(video_format)
+    parser.add_option_group(subtitles)
+    parser.add_option_group(authentication)
+    parser.add_option_group(adobe_pass)
+    parser.add_option_group(postproc)
+
+    if overrideArguments is not None:
+        opts, args = parser.parse_args(overrideArguments)
+        if opts.verbose:
+            write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')
+    else:
+        def compat_conf(conf):
+            if sys.version_info < (3,):
+                return [a.decode(preferredencoding(), 'replace') for a in conf]
+            return conf
+
+        command_line_conf = compat_conf(sys.argv[1:])
+        opts, args = parser.parse_args(command_line_conf)
+
+        system_conf = user_conf = custom_conf = []
+
+        if '--config-location' in command_line_conf:
+            location = compat_expanduser(opts.config_location)
+            if os.path.isdir(location):
+                location = os.path.join(location, 'youtube-dlc.conf')
+            if not os.path.exists(location):
+                parser.error('config-location %s does not exist.' % location)
+            custom_conf = _readOptions(location)
+        elif '--ignore-config' in command_line_conf:
+            pass
+        else:
+            system_conf = _readOptions('/etc/youtube-dlc.conf')
+            if '--ignore-config' not in system_conf:
+                user_conf = _readUserConf()
+
+        argv = system_conf + user_conf + custom_conf + command_line_conf
+        opts, args = parser.parse_args(argv)
+        if opts.verbose:
+            for conf_label, conf in (
+                    ('System config', system_conf),
+                    ('User config', user_conf),
+                    ('Custom config', custom_conf),
+                    ('Command-line args', command_line_conf)):
+                write_string('[debug] %s: %s\n' % (conf_label, repr(_hide_login_info(conf))))
+
+    return parser, opts, args
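
A minimal usage sketch (the URL is a placeholder): passing overrideArguments bypasses the whole config-file cascade, which makes parseOpts easy to drive programmatically:

    from youtube_dl.options import parseOpts

    parser, opts, args = parseOpts(overrideArguments=['-f', 'best', 'https://example.com/v'])
    # opts is an optparse.Values instance; args holds the positional URLs
    assert opts.format == 'best' and args == ['https://example.com/v']
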
diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py
new file mode 100644 (file)
index 0000000..3ea5183
--- /dev/null
@@ -0,0 +1,40 @@
+from __future__ import unicode_literals
+
+from .embedthumbnail import EmbedThumbnailPP
+from .ffmpeg import (
+    FFmpegPostProcessor,
+    FFmpegEmbedSubtitlePP,
+    FFmpegExtractAudioPP,
+    FFmpegFixupStretchedPP,
+    FFmpegFixupM3u8PP,
+    FFmpegFixupM4aPP,
+    FFmpegMergerPP,
+    FFmpegMetadataPP,
+    FFmpegVideoConvertorPP,
+    FFmpegSubtitlesConvertorPP,
+)
+from .xattrpp import XAttrMetadataPP
+from .execafterdownload import ExecAfterDownloadPP
+from .metadatafromtitle import MetadataFromTitlePP
+
+
+def get_postprocessor(key):
+    return globals()[key + 'PP']
+
+
+__all__ = [
+    'EmbedThumbnailPP',
+    'ExecAfterDownloadPP',
+    'FFmpegEmbedSubtitlePP',
+    'FFmpegExtractAudioPP',
+    'FFmpegFixupM3u8PP',
+    'FFmpegFixupM4aPP',
+    'FFmpegFixupStretchedPP',
+    'FFmpegMergerPP',
+    'FFmpegMetadataPP',
+    'FFmpegPostProcessor',
+    'FFmpegSubtitlesConvertorPP',
+    'FFmpegVideoConvertorPP',
+    'MetadataFromTitlePP',
+    'XAttrMetadataPP',
+]
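
get_postprocessor is just a globals() lookup keyed by class-name prefix — for example:

    from youtube_dl.postprocessor import EmbedThumbnailPP, get_postprocessor

    # 'EmbedThumbnail' + 'PP' resolves to the EmbedThumbnailPP class itself
    assert get_postprocessor('EmbedThumbnail') is EmbedThumbnailPP
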
diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py
new file mode 100644 (file)
index 0000000..599dd1d
--- /dev/null
@@ -0,0 +1,69 @@
+from __future__ import unicode_literals
+
+import os
+
+from ..utils import (
+    PostProcessingError,
+    cli_configuration_args,
+    encodeFilename,
+)
+
+
+class PostProcessor(object):
+    """Post Processor class.
+
+    PostProcessor objects can be added to downloaders with their
+    add_post_processor() method. When the downloader has finished a
+    successful download, it will take its internal chain of PostProcessors
+    and start calling the run() method on each one of them, first with
+    an initial argument and then with the returned value of the previous
+    PostProcessor.
+
+    The chain will be stopped if one of them ever returns None or the end
+    of the chain is reached.
+
+    PostProcessor objects follow a "mutual registration" process similar
+    to InfoExtractor objects.
+
+    Optionally PostProcessor can use a list of additional command-line arguments
+    with self._configuration_args.
+    """
+
+    _downloader = None
+
+    def __init__(self, downloader=None):
+        self._downloader = downloader
+
+    def set_downloader(self, downloader):
+        """Sets the downloader for this PP."""
+        self._downloader = downloader
+
+    def run(self, information):
+        """Run the PostProcessor.
+
+        The "information" argument is a dictionary like the ones
+        composed by InfoExtractors. The only difference is that this
+        one has an extra field called "filepath" that points to the
+        downloaded file.
+
+        This method returns a tuple: the first element is a list of the
+        files that can be deleted, and the second is the updated
+        information.
+
+        In addition, this method may raise a PostProcessingError
+        exception if post processing fails.
+        """
+        return [], information  # by default, keep file and do nothing
+
+    def try_utime(self, path, atime, mtime, errnote='Cannot update utime of file'):
+        try:
+            os.utime(encodeFilename(path), (atime, mtime))
+        except Exception:
+            self._downloader.report_warning(errnote)
+
+    def _configuration_args(self, default=[]):
+        return cli_configuration_args(self._downloader.params, 'postprocessor_args', default)
+
+
+class AudioConversionError(PostProcessingError):
+    pass
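
A minimal subclass sketch under the contract documented above (the class is illustrative): run() receives the info dict, may inspect info['filepath'], and returns (files_to_delete, info); returning the dict unchanged keeps the chain going:

    class PrintFilepathPP(PostProcessor):
        """Toy post-processor: logs the downloaded path, deletes nothing."""
        def run(self, information):
            # assumes the attached downloader exposes to_screen(), as YoutubeDL does
            self._downloader.to_screen('[printfilepath] %s' % information['filepath'])
            return [], information
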
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
new file mode 100644 (file)
index 0000000..e2002ab
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+import os
+import subprocess
+
+from .ffmpeg import FFmpegPostProcessor
+
+from ..utils import (
+    check_executable,
+    encodeArgument,
+    encodeFilename,
+    PostProcessingError,
+    prepend_extension,
+    shell_quote
+)
+
+
+class EmbedThumbnailPPError(PostProcessingError):
+    pass
+
+
+class EmbedThumbnailPP(FFmpegPostProcessor):
+    def __init__(self, downloader=None, already_have_thumbnail=False):
+        super(EmbedThumbnailPP, self).__init__(downloader)
+        self._already_have_thumbnail = already_have_thumbnail
+
+    def run(self, info):
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+
+        if not info.get('thumbnails'):
+            self._downloader.to_screen('[embedthumbnail] There aren\'t any thumbnails to embed')
+            return [], info
+
+        thumbnail_filename = info['thumbnails'][-1]['filename']
+
+        if not os.path.exists(encodeFilename(thumbnail_filename)):
+            self._downloader.report_warning(
+                'Skipping embedding the thumbnail because the file is missing.')
+            return [], info
+
+        # Check for mislabeled webp file
+        with open(encodeFilename(thumbnail_filename), "rb") as f:
+            b = f.read(16)
+        if b'\x57\x45\x42\x50' in b:  # Binary for WEBP
+            thumbnail_filename_path, thumbnail_filename_extension = os.path.splitext(thumbnail_filename)
+            if thumbnail_filename_extension != ".webp":
+                webp_thumbnail_filename = thumbnail_filename_path + ".webp"
+                os.rename(encodeFilename(thumbnail_filename), encodeFilename(webp_thumbnail_filename))
+                thumbnail_filename = webp_thumbnail_filename
+
+        # If not a jpg or png thumbnail, convert it to jpg using ffmpeg
+        if os.path.splitext(thumbnail_filename)[1].lower() not in ['.jpg', '.png']:
+            jpg_thumbnail_filename = os.path.splitext(thumbnail_filename)[0] + ".jpg"
+            jpg_thumbnail_filename = os.path.join(os.path.dirname(jpg_thumbnail_filename), os.path.basename(jpg_thumbnail_filename).replace('%', '_'))  # ffmpeg interprets % as image sequence
+
+            self._downloader.to_screen('[ffmpeg] Converting thumbnail "%s" to JPEG' % thumbnail_filename)
+
+            self.run_ffmpeg(thumbnail_filename, jpg_thumbnail_filename, ['-bsf:v', 'mjpeg2jpeg'])
+
+            os.remove(encodeFilename(thumbnail_filename))
+            thumbnail_filename = jpg_thumbnail_filename
+
+        if info['ext'] == 'mp3':
+            options = [
+                '-c', 'copy', '-map', '0', '-map', '1',
+                '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
+
+            self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
+
+            self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options)
+
+            if not self._already_have_thumbnail:
+                os.remove(encodeFilename(thumbnail_filename))
+            os.remove(encodeFilename(filename))
+            os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        elif info['ext'] in ['m4a', 'mp4']:
+            if not check_executable('AtomicParsley', ['-v']):
+                raise EmbedThumbnailPPError('AtomicParsley was not found. Please install it.')
+
+            cmd = [encodeFilename('AtomicParsley', True),
+                   encodeFilename(filename, True),
+                   encodeArgument('--artwork'),
+                   encodeFilename(thumbnail_filename, True),
+                   encodeArgument('-o'),
+                   encodeFilename(temp_filename, True)]
+
+            self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
+
+            if self._downloader.params.get('verbose', False):
+                self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd))
+
+            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            stdout, stderr = p.communicate()
+
+            if p.returncode != 0:
+                msg = stderr.decode('utf-8', 'replace').strip()
+                raise EmbedThumbnailPPError(msg)
+
+            if not self._already_have_thumbnail:
+                os.remove(encodeFilename(thumbnail_filename))
+            # for formats that don't support thumbnails (like 3gp), AtomicParsley
+            # won't create the temporary file
+            if b'No changes' in stdout:
+                self._downloader.report_warning('The file format doesn\'t support embedding a thumbnail')
+            else:
+                os.remove(encodeFilename(filename))
+                os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        else:
+            raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
+
+        return [], info
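+
+    # For the mp3 branch above, the resulting command is, illustratively
+    # (assuming a hypothetical 'song.mp3' with cover 'song.jpg'; quoting
+    # simplified):
+    #
+    #     ffmpeg -y -loglevel repeat+info -i file:song.mp3 -i file:song.jpg \
+    #         -c copy -map 0 -map 1 -metadata:s:v title="Album cover" \
+    #         -metadata:s:v comment="Cover (Front)" file:song.temp.mp3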
diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py
new file mode 100644
index 0000000..64dabe7
--- /dev/null
+++ b/youtube_dl/postprocessor/execafterdownload.py
@@ -0,0 +1,31 @@
+from __future__ import unicode_literals
+
+import subprocess
+
+from .common import PostProcessor
+from ..compat import compat_shlex_quote
+from ..utils import (
+    encodeArgument,
+    PostProcessingError,
+)
+
+
+class ExecAfterDownloadPP(PostProcessor):
+    def __init__(self, downloader, exec_cmd):
+        super(ExecAfterDownloadPP, self).__init__(downloader)
+        self.exec_cmd = exec_cmd
+
+    def run(self, information):
+        cmd = self.exec_cmd
+        if '{}' not in cmd:
+            cmd += ' {}'
+
+        cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))
+
+        self._downloader.to_screen('[exec] Executing command: %s' % cmd)
+        retCode = subprocess.call(encodeArgument(cmd), shell=True)
+        if retCode != 0:
+            raise PostProcessingError(
+                'Command returned error code %d' % retCode)
+
+        return [], information
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
new file mode 100644
index 0000000..dbc736c
--- /dev/null
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -0,0 +1,657 @@
+from __future__ import unicode_literals
+
+import io
+import os
+import subprocess
+import time
+import re
+
+
+from .common import AudioConversionError, PostProcessor
+
+from ..utils import (
+    encodeArgument,
+    encodeFilename,
+    get_exe_version,
+    is_outdated_version,
+    PostProcessingError,
+    prepend_extension,
+    shell_quote,
+    subtitles_filename,
+    dfxp2srt,
+    ISO639Utils,
+    replace_extension,
+)
+
+
+EXT_TO_OUT_FORMATS = {
+    'aac': 'adts',
+    'flac': 'flac',
+    'm4a': 'ipod',
+    'mka': 'matroska',
+    'mkv': 'matroska',
+    'mpg': 'mpeg',
+    'ogv': 'ogg',
+    'ts': 'mpegts',
+    'wma': 'asf',
+    'wmv': 'asf',
+}
+ACODECS = {
+    'mp3': 'libmp3lame',
+    'aac': 'aac',
+    'flac': 'flac',
+    'm4a': 'aac',
+    'opus': 'libopus',
+    'vorbis': 'libvorbis',
+    'wav': None,
+}
+
+
+class FFmpegPostProcessorError(PostProcessingError):
+    pass
+
+
+class FFmpegPostProcessor(PostProcessor):
+    def __init__(self, downloader=None):
+        PostProcessor.__init__(self, downloader)
+        self._determine_executables()
+
+    def check_version(self):
+        if not self.available:
+            raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.')
+
+        required_version = '10-0' if self.basename == 'avconv' else '1.0'
+        if is_outdated_version(
+                self._versions[self.basename], required_version):
+            warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % (
+                self.basename, self.basename, required_version)
+            if self._downloader:
+                self._downloader.report_warning(warning)
+
+    @staticmethod
+    def get_versions(downloader=None):
+        return FFmpegPostProcessor(downloader)._versions
+
+    def _determine_executables(self):
+        programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']
+        prefer_ffmpeg = True
+
+        def get_ffmpeg_version(path):
+            ver = get_exe_version(path, args=['-version'])
+            if ver:
+                regexs = [
+                    r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$',  # Ubuntu, see [1]
+                    r'n([0-9.]+)$',  # Arch Linux
+                    # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/
+                ]
+                for regex in regexs:
+                    mobj = re.match(regex, ver)
+                    if mobj:
+                        ver = mobj.group(1)
+            return ver
+
+        self.basename = None
+        self.probe_basename = None
+
+        self._paths = None
+        self._versions = None
+        if self._downloader:
+            prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True)
+            location = self._downloader.params.get('ffmpeg_location')
+            if location is not None:
+                if not os.path.exists(location):
+                    self._downloader.report_warning(
+                        'ffmpeg-location %s does not exist! '
+                        'Continuing without avconv/ffmpeg.' % (location))
+                    self._versions = {}
+                    return
+                elif not os.path.isdir(location):
+                    basename = os.path.splitext(os.path.basename(location))[0]
+                    if basename not in programs:
+                        self._downloader.report_warning(
+                            'Cannot identify executable %s; its basename should be one of %s. '
+                            'Continuing without avconv/ffmpeg.' %
+                            (location, ', '.join(programs)))
+                        self._versions = {}
+                        return None
+                    location = os.path.dirname(os.path.abspath(location))
+                    if basename in ('ffmpeg', 'ffprobe'):
+                        prefer_ffmpeg = True
+
+                self._paths = dict(
+                    (p, os.path.join(location, p)) for p in programs)
+                self._versions = dict(
+                    (p, get_ffmpeg_version(self._paths[p])) for p in programs)
+        if self._versions is None:
+            self._versions = dict(
+                (p, get_ffmpeg_version(p)) for p in programs)
+            self._paths = dict((p, p) for p in programs)
+
+        if prefer_ffmpeg is False:
+            prefs = ('avconv', 'ffmpeg')
+        else:
+            prefs = ('ffmpeg', 'avconv')
+        for p in prefs:
+            if self._versions[p]:
+                self.basename = p
+                break
+
+        if prefer_ffmpeg is False:
+            prefs = ('avprobe', 'ffprobe')
+        else:
+            prefs = ('ffprobe', 'avprobe')
+        for p in prefs:
+            if self._versions[p]:
+                self.probe_basename = p
+                break
+
+    @property
+    def available(self):
+        return self.basename is not None
+
+    @property
+    def executable(self):
+        return self._paths[self.basename]
+
+    @property
+    def probe_available(self):
+        return self.probe_basename is not None
+
+    @property
+    def probe_executable(self):
+        return self._paths[self.probe_basename]
+
+    def get_audio_codec(self, path):
+        if not self.probe_available and not self.available:
+            raise PostProcessingError('ffprobe/avprobe and ffmpeg/avconv not found. Please install one.')
+        try:
+            if self.probe_available:
+                cmd = [
+                    encodeFilename(self.probe_executable, True),
+                    encodeArgument('-show_streams')]
+            else:
+                cmd = [
+                    encodeFilename(self.executable, True),
+                    encodeArgument('-i')]
+            cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
+            if self._downloader.params.get('verbose', False):
+                self._downloader.to_screen(
+                    '[debug] %s command line: %s' % (self.basename, shell_quote(cmd)))
+            handle = subprocess.Popen(
+                cmd, stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE, stdin=subprocess.PIPE)
+            stdout_data, stderr_data = handle.communicate()
+            expected_ret = 0 if self.probe_available else 1
+            if handle.wait() != expected_ret:
+                return None
+        except (IOError, OSError):
+            return None
+        output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore')
+        if self.probe_available:
+            audio_codec = None
+            for line in output.split('\n'):
+                if line.startswith('codec_name='):
+                    audio_codec = line.split('=')[1].strip()
+                elif line.strip() == 'codec_type=audio' and audio_codec is not None:
+                    return audio_codec
+        else:
+            # Stream #FILE_INDEX:STREAM_INDEX[STREAM_ID](LANGUAGE): CODEC_TYPE: CODEC_NAME
+            mobj = re.search(
+                r'Stream\s*#\d+:\d+(?:\[0x[0-9a-f]+\])?(?:\([a-z]{3}\))?:\s*Audio:\s*([0-9a-z]+)',
+                output)
+            if mobj:
+                return mobj.group(1)
+        return None
+
+    def run_ffmpeg_multiple_files(self, input_paths, out_path, opts):
+        self.check_version()
+
+        oldest_mtime = min(
+            os.stat(encodeFilename(path)).st_mtime for path in input_paths)
+
+        opts += self._configuration_args()
+
+        files_cmd = []
+        for path in input_paths:
+            files_cmd.extend([
+                encodeArgument('-i'),
+                encodeFilename(self._ffmpeg_filename_argument(path), True)
+            ])
+        cmd = [encodeFilename(self.executable, True), encodeArgument('-y')]
+        # avconv does not have repeat option
+        if self.basename == 'ffmpeg':
+            cmd += [encodeArgument('-loglevel'), encodeArgument('repeat+info')]
+        cmd += (files_cmd
+                + [encodeArgument(o) for o in opts]
+                + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
+
+        if self._downloader.params.get('verbose', False):
+            self._downloader.to_screen('[debug] ffmpeg command line: %s' % shell_quote(cmd))
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+        stdout, stderr = p.communicate()
+        if p.returncode != 0:
+            stderr = stderr.decode('utf-8', 'replace')
+            msg = stderr.strip().split('\n')[-1]
+            raise FFmpegPostProcessorError(msg)
+        self.try_utime(out_path, oldest_mtime, oldest_mtime)
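+
+    # For orientation, an illustrative command line this method assembles,
+    # assuming ffmpeg, hypothetical inputs 'a.m4a' and 'v.mp4', and
+    # opts=['-c', 'copy'] (the 'file:' prefix comes from
+    # _ffmpeg_filename_argument below):
+    #
+    #     ffmpeg -y -loglevel repeat+info -i file:a.m4a -i file:v.mp4 \
+    #         -c copy file:out.mp4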
+
+    def run_ffmpeg(self, path, out_path, opts):
+        self.run_ffmpeg_multiple_files([path], out_path, opts)
+
+    def _ffmpeg_filename_argument(self, fn):
+        # Always use 'file:' because the filename may contain ':' (ffmpeg
+        # interprets that as a protocol) or can start with '-' (-- is broken in
+        # ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details)
+        # Also leave '-' intact in order not to break streaming to stdout.
+        return 'file:' + fn if fn != '-' else fn
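+
+    # e.g. (illustrative) a file saved as '-raw.mp4' is passed as
+    # 'file:-raw.mp4', which ffmpeg no longer mistakes for an option, while a
+    # bare '-' is left untouched so streaming to stdout keeps working.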
+
+
+class FFmpegExtractAudioPP(FFmpegPostProcessor):
+    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False):
+        FFmpegPostProcessor.__init__(self, downloader)
+        if preferredcodec is None:
+            preferredcodec = 'best'
+        self._preferredcodec = preferredcodec
+        self._preferredquality = preferredquality
+        self._nopostoverwrites = nopostoverwrites
+
+    def run_ffmpeg(self, path, out_path, codec, more_opts):
+        if codec is None:
+            acodec_opts = []
+        else:
+            acodec_opts = ['-acodec', codec]
+        opts = ['-vn'] + acodec_opts + more_opts
+        try:
+            FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts)
+        except FFmpegPostProcessorError as err:
+            raise AudioConversionError(err.msg)
+
+    def run(self, information):
+        path = information['filepath']
+
+        filecodec = self.get_audio_codec(path)
+        if filecodec is None:
+            raise PostProcessingError('Unable to obtain file audio codec with ffprobe')
+
+        more_opts = []
+        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
+            if filecodec == 'aac' and self._preferredcodec in ['m4a', 'best']:
+                # Lossless, but in another container
+                acodec = 'copy'
+                extension = 'm4a'
+                more_opts = ['-bsf:a', 'aac_adtstoasc']
+            elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']:
+                # Lossless if possible
+                acodec = 'copy'
+                extension = filecodec
+                if filecodec == 'aac':
+                    more_opts = ['-f', 'adts']
+                if filecodec == 'vorbis':
+                    extension = 'ogg'
+            else:
+                # MP3 otherwise.
+                acodec = 'libmp3lame'
+                extension = 'mp3'
+                more_opts = []
+                if self._preferredquality is not None:
+                    if int(self._preferredquality) < 10:
+                        more_opts += ['-q:a', self._preferredquality]
+                    else:
+                        more_opts += ['-b:a', self._preferredquality + 'k']
+        else:
+            # We convert the audio (lossy if codec is lossy)
+            acodec = ACODECS[self._preferredcodec]
+            extension = self._preferredcodec
+            more_opts = []
+            if self._preferredquality is not None:
+                # The opus codec doesn't support the -aq option
+                if int(self._preferredquality) < 10 and extension != 'opus':
+                    more_opts += ['-q:a', self._preferredquality]
+                else:
+                    more_opts += ['-b:a', self._preferredquality + 'k']
+            if self._preferredcodec == 'aac':
+                more_opts += ['-f', 'adts']
+            if self._preferredcodec == 'm4a':
+                more_opts += ['-bsf:a', 'aac_adtstoasc']
+            if self._preferredcodec == 'vorbis':
+                extension = 'ogg'
+            if self._preferredcodec == 'wav':
+                extension = 'wav'
+                more_opts += ['-f', 'wav']
+
+        prefix, sep, ext = path.rpartition('.')  # not os.path.splitext, since the latter does not work on unicode in all setups
+        new_path = prefix + sep + extension
+
+        information['filepath'] = new_path
+        information['ext'] = extension
+
+        # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
+        if (new_path == path
+                or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))):
+            self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path)
+            return [], information
+
+        try:
+            self._downloader.to_screen('[ffmpeg] Destination: ' + new_path)
+            self.run_ffmpeg(path, new_path, acodec, more_opts)
+        except AudioConversionError as e:
+            raise PostProcessingError(
+                'audio conversion failed: ' + e.msg)
+        except Exception:
+            raise PostProcessingError('error running ' + self.basename)
+
+        # Try to update the date time for extracted audio file.
+        if information.get('filetime') is not None:
+            self.try_utime(
+                new_path, time.time(), information['filetime'],
+                errnote='Cannot update utime of audio file')
+
+        return [path], information
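+
+    # An illustrative configuration, assuming a hypothetical downloader `ydl`:
+    #
+    #     FFmpegExtractAudioPP(ydl, preferredcodec='mp3', preferredquality='192')
+    #
+    # For a non-mp3 source this converts with
+    # ['-vn', '-acodec', 'libmp3lame', '-b:a', '192k']; since '192' is not
+    # below 10 it is treated as a bitrate rather than a -q:a quality scale.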
+
+
+class FFmpegVideoConvertorPP(FFmpegPostProcessor):
+    def __init__(self, downloader=None, preferedformat=None):
+        super(FFmpegVideoConvertorPP, self).__init__(downloader)
+        self._preferedformat = preferedformat
+
+    def run(self, information):
+        path = information['filepath']
+        if information['ext'] == self._preferedformat:
+            self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat))
+            return [], information
+        options = []
+        if self._preferedformat == 'avi':
+            options.extend(['-c:v', 'libxvid', '-vtag', 'XVID'])
+        prefix, sep, ext = path.rpartition('.')
+        outpath = prefix + sep + self._preferedformat
+        self._downloader.to_screen('[ffmpeg] Converting video from %s to %s, Destination: %s' % (information['ext'], self._preferedformat, outpath))
+        self.run_ffmpeg(path, outpath, options)
+        information['filepath'] = outpath
+        information['format'] = self._preferedformat
+        information['ext'] = self._preferedformat
+        return [path], information
+
+
+class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
+    def run(self, information):
+        if information['ext'] not in ('mp4', 'webm', 'mkv'):
+            self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files')
+            return [], information
+        subtitles = information.get('requested_subtitles')
+        if not subtitles:
+            self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed')
+            return [], information
+
+        filename = information['filepath']
+
+        ext = information['ext']
+        sub_langs = []
+        sub_filenames = []
+        webm_vtt_warn = False
+
+        for lang, sub_info in subtitles.items():
+            sub_ext = sub_info['ext']
+            if ext != 'webm' or (ext == 'webm' and sub_ext == 'vtt'):
+                sub_langs.append(lang)
+                sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext))
+            else:
+                if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt':
+                    webm_vtt_warn = True
+                    self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files')
+
+        if not sub_langs:
+            return [], information
+
+        input_files = [filename] + sub_filenames
+
+        opts = [
+            '-map', '0',
+            '-c', 'copy',
+            # Don't copy the existing subtitles, as we may be running the
+            # postprocessor a second time
+            '-map', '-0:s',
+            # Don't copy Apple TV chapters track, bin_data (see #19042, #19024,
+            # https://trac.ffmpeg.org/ticket/6016)
+            '-map', '-0:d',
+        ]
+        if information['ext'] == 'mp4':
+            opts += ['-c:s', 'mov_text']
+        for (i, lang) in enumerate(sub_langs):
+            opts.extend(['-map', '%d:0' % (i + 1)])
+            lang_code = ISO639Utils.short2long(lang) or lang
+            opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
+
+        temp_filename = prepend_extension(filename, 'temp')
+        self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename)
+        self.run_ffmpeg_multiple_files(input_files, temp_filename, opts)
+        os.remove(encodeFilename(filename))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        return sub_filenames, information
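+
+    # For orientation, the opts assembled above for a hypothetical mp4 with a
+    # single 'en' subtitle would be (illustrative):
+    #
+    #     ['-map', '0', '-c', 'copy', '-map', '-0:s', '-map', '-0:d',
+    #      '-c:s', 'mov_text', '-map', '1:0', '-metadata:s:s:0', 'language=eng']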
+
+
+class FFmpegMetadataPP(FFmpegPostProcessor):
+    def run(self, info):
+        metadata = {}
+
+        def add(meta_list, info_list=None):
+            if not info_list:
+                info_list = meta_list
+            if not isinstance(meta_list, (list, tuple)):
+                meta_list = (meta_list,)
+            if not isinstance(info_list, (list, tuple)):
+                info_list = (info_list,)
+            for info_f in info_list:
+                if info.get(info_f) is not None:
+                    for meta_f in meta_list:
+                        metadata[meta_f] = info[info_f]
+                    break
+
+        # See [1-4] for some info on media metadata/metadata supported
+        # by ffmpeg.
+        # 1. https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/
+        # 2. https://wiki.multimedia.cx/index.php/FFmpeg_Metadata
+        # 3. https://kodi.wiki/view/Video_file_tagging
+        # 4. http://atomicparsley.sourceforge.net/mpeg-4files.html
+
+        add('title', ('track', 'title'))
+        add('date', 'upload_date')
+        add(('description', 'comment'), 'description')
+        add('purl', 'webpage_url')
+        add('track', 'track_number')
+        add('artist', ('artist', 'creator', 'uploader', 'uploader_id'))
+        add('genre')
+        add('album')
+        add('album_artist')
+        add('disc', 'disc_number')
+        add('show', 'series')
+        add('season_number')
+        add('episode_id', ('episode', 'episode_id'))
+        add('episode_sort', 'episode_number')
+
+        if not metadata:
+            self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
+            return [], info
+
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+        in_filenames = [filename]
+        options = []
+
+        if info['ext'] == 'm4a':
+            options.extend(['-vn', '-acodec', 'copy'])
+        else:
+            options.extend(['-c', 'copy'])
+
+        for (name, value) in metadata.items():
+            options.extend(['-metadata', '%s=%s' % (name, value)])
+
+        chapters = info.get('chapters', [])
+        if chapters:
+            metadata_filename = replace_extension(filename, 'meta')
+            with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
+                def ffmpeg_escape(text):
+                    return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
+
+                metadata_file_content = ';FFMETADATA1\n'
+                for chapter in chapters:
+                    metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
+                    metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
+                    metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
+                    chapter_title = chapter.get('title')
+                    if chapter_title:
+                        metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
+                f.write(metadata_file_content)
+                in_filenames.append(metadata_filename)
+                options.extend(['-map_metadata', '1'])
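+
+        # The metadata file written above looks like this, illustratively,
+        # assuming two chapters (0-60s 'Intro', 60-120s 'Main'):
+        #
+        #     ;FFMETADATA1
+        #     [CHAPTER]
+        #     TIMEBASE=1/1000
+        #     START=0
+        #     END=60000
+        #     title=Intro
+        #     [CHAPTER]
+        #     TIMEBASE=1/1000
+        #     START=60000
+        #     END=120000
+        #     title=Main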
+
+        self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
+        self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options)
+        if chapters:
+            os.remove(metadata_filename)
+        os.remove(encodeFilename(filename))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        return [], info
+
+
+class FFmpegMergerPP(FFmpegPostProcessor):
+    def run(self, info):
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+        args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0']
+        self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename)
+        self.run_ffmpeg_multiple_files(info['__files_to_merge'], temp_filename, args)
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        return info['__files_to_merge'], info
+
+    def can_merge(self):
+        # TODO: figure out merge-capable ffmpeg version
+        if self.basename != 'avconv':
+            return True
+
+        required_version = '10-0'
+        if is_outdated_version(
+                self._versions[self.basename], required_version):
+            warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files; '
+                       'youtube-dlc will download single-file media. '
+                       'Update %s to version %s or newer to fix this.') % (
+                           self.basename, self.basename, required_version)
+            if self._downloader:
+                self._downloader.report_warning(warning)
+            return False
+        return True
+
+
+class FFmpegFixupStretchedPP(FFmpegPostProcessor):
+    def run(self, info):
+        stretched_ratio = info.get('stretched_ratio')
+        if stretched_ratio is None or stretched_ratio == 1:
+            return [], info
+
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+
+        options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio]
+        self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename)
+        self.run_ffmpeg(filename, temp_filename, options)
+
+        os.remove(encodeFilename(filename))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        return [], info
+
+
+class FFmpegFixupM4aPP(FFmpegPostProcessor):
+    def run(self, info):
+        if info.get('container') != 'm4a_dash':
+            return [], info
+
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+
+        options = ['-c', 'copy', '-f', 'mp4']
+        self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename)
+        self.run_ffmpeg(filename, temp_filename, options)
+
+        os.remove(encodeFilename(filename))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        return [], info
+
+
+class FFmpegFixupM3u8PP(FFmpegPostProcessor):
+    def run(self, info):
+        filename = info['filepath']
+        if self.get_audio_codec(filename) == 'aac':
+            temp_filename = prepend_extension(filename, 'temp')
+
+            options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
+            self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename)
+            self.run_ffmpeg(filename, temp_filename, options)
+
+            os.remove(encodeFilename(filename))
+            os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        return [], info
+
+
+class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
+    def __init__(self, downloader=None, format=None):
+        super(FFmpegSubtitlesConvertorPP, self).__init__(downloader)
+        self.format = format
+
+    def run(self, info):
+        subs = info.get('requested_subtitles')
+        filename = info['filepath']
+        new_ext = self.format
+        new_format = new_ext
+        if new_format == 'vtt':
+            new_format = 'webvtt'
+        if subs is None:
+            self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to convert')
+            return [], info
+        self._downloader.to_screen('[ffmpeg] Converting subtitles')
+        sub_filenames = []
+        for lang, sub in subs.items():
+            ext = sub['ext']
+            if ext == new_ext:
+                self._downloader.to_screen(
+                    '[ffmpeg] Subtitle file is already in the requested format %s' % new_ext)
+                continue
+            old_file = subtitles_filename(filename, lang, ext, info.get('ext'))
+            sub_filenames.append(old_file)
+            new_file = subtitles_filename(filename, lang, new_ext, info.get('ext'))
+
+            if ext in ('dfxp', 'ttml', 'tt'):
+                self._downloader.report_warning(
+                    'You have requested to convert dfxp (TTML) subtitles into another format, '
+                    'which results in style information loss')
+
+                dfxp_file = old_file
+                srt_file = subtitles_filename(filename, lang, 'srt', info.get('ext'))
+
+                with open(dfxp_file, 'rb') as f:
+                    srt_data = dfxp2srt(f.read())
+
+                with io.open(srt_file, 'wt', encoding='utf-8') as f:
+                    f.write(srt_data)
+                old_file = srt_file
+
+                subs[lang] = {
+                    'ext': 'srt',
+                    'data': srt_data
+                }
+
+                if new_ext == 'srt':
+                    continue
+                else:
+                    sub_filenames.append(srt_file)
+
+            self.run_ffmpeg(old_file, new_file, ['-f', new_format])
+
+            with io.open(new_file, 'rt', encoding='utf-8') as f:
+                subs[lang] = {
+                    'ext': new_ext,
+                    'data': f.read(),
+                }
+
+        return sub_filenames, info
diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py
new file mode 100644
index 0000000..f5c14d9
--- /dev/null
+++ b/youtube_dl/postprocessor/metadatafromtitle.py
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import PostProcessor
+
+
+class MetadataFromTitlePP(PostProcessor):
+    def __init__(self, downloader, titleformat):
+        super(MetadataFromTitlePP, self).__init__(downloader)
+        self._titleformat = titleformat
+        self._titleregex = (self.format_to_regex(titleformat)
+                            if re.search(r'%\(\w+\)s', titleformat)
+                            else titleformat)
+
+    def format_to_regex(self, fmt):
+        r"""
+        Converts a string like
+           '%(title)s - %(artist)s'
+        to a regex like
+           '(?P<title>.+)\ \-\ (?P<artist>.+)'
+        """
+        lastpos = 0
+        regex = ''
+        # replace %(..)s with regex group and escape other string parts
+        for match in re.finditer(r'%\((\w+)\)s', fmt):
+            regex += re.escape(fmt[lastpos:match.start()])
+            regex += r'(?P<' + match.group(1) + '>.+)'
+            lastpos = match.end()
+        if lastpos < len(fmt):
+            regex += re.escape(fmt[lastpos:])
+        return regex
+
+    def run(self, info):
+        title = info['title']
+        match = re.match(self._titleregex, title)
+        if match is None:
+            self._downloader.to_screen(
+                '[fromtitle] Could not interpret title of video as "%s"'
+                % self._titleformat)
+            return [], info
+        for attribute, value in match.groupdict().items():
+            info[attribute] = value
+            self._downloader.to_screen(
+                '[fromtitle] parsed %s: %s'
+                % (attribute, value if value is not None else 'NA'))
+
+        return [], info
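+
+
+# An illustrative run, assuming the hypothetical format '%(artist)s - %(title)s':
+# format_to_regex yields r'(?P<artist>.+)\ \-\ (?P<title>.+)', so a video
+# titled 'Some Band - Some Song' ends up with info['artist'] = 'Some Band'
+# and info['title'] = 'Some Song'.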
diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py
new file mode 100644
index 0000000..814dabe
--- /dev/null
+++ b/youtube_dl/postprocessor/xattrpp.py
@@ -0,0 +1,79 @@
+from __future__ import unicode_literals
+
+from .common import PostProcessor
+from ..compat import compat_os_name
+from ..utils import (
+    hyphenate_date,
+    write_xattr,
+    XAttrMetadataError,
+    XAttrUnavailableError,
+)
+
+
+class XAttrMetadataPP(PostProcessor):
+
+    #
+    # More info about extended attributes for media:
+    #   http://freedesktop.org/wiki/CommonExtendedAttributes/
+    #   http://www.freedesktop.org/wiki/PhreedomDraft/
+    #   http://dublincore.org/documents/usageguide/elements.shtml
+    #
+    # TODO:
+    #  * capture youtube keywords and put them in 'user.dublincore.subject' (comma-separated)
+    #  * figure out which xattrs can be used for 'duration', 'thumbnail', 'resolution'
+    #
+
+    def run(self, info):
+        """ Set extended attributes on downloaded file (if xattr support is found). """
+
+        # Write the metadata to the file's xattrs
+        self._downloader.to_screen('[metadata] Writing metadata to file\'s xattrs')
+
+        filename = info['filepath']
+
+        try:
+            xattr_mapping = {
+                'user.xdg.referrer.url': 'webpage_url',
+                # 'user.xdg.comment':            'description',
+                'user.dublincore.title': 'title',
+                'user.dublincore.date': 'upload_date',
+                'user.dublincore.description': 'description',
+                'user.dublincore.contributor': 'uploader',
+                'user.dublincore.format': 'format',
+            }
+
+            num_written = 0
+            for xattrname, infoname in xattr_mapping.items():
+
+                value = info.get(infoname)
+
+                if value:
+                    if infoname == 'upload_date':
+                        value = hyphenate_date(value)
+
+                    byte_value = value.encode('utf-8')
+                    write_xattr(filename, xattrname, byte_value)
+                    num_written += 1
+
+            return [], info
+
+        except XAttrUnavailableError as e:
+            self._downloader.report_error(str(e))
+            return [], info
+
+        except XAttrMetadataError as e:
+            if e.reason == 'NO_SPACE':
+                self._downloader.report_warning(
+                    'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. '
+                    + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize())
+            elif e.reason == 'VALUE_TOO_LONG':
+                self._downloader.report_warning(
+                    'Unable to write extended attributes due to too long values.')
+            else:
+                msg = 'This filesystem doesn\'t support extended attributes. '
+                if compat_os_name == 'nt':
+                    msg += 'You need to use NTFS.'
+                else:
+                    msg += '(You may have to enable them in your /etc/fstab)'
+                self._downloader.report_error(msg)
+            return [], info
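+
+
+# On filesystems with user xattrs enabled, the written attributes can be
+# inspected with e.g. `getfattr -d <file>` on Linux, which should list keys
+# such as user.dublincore.title and user.xdg.referrer.url (illustrative).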
diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py
new file mode 100644
index 0000000..5d4adbe
--- /dev/null
+++ b/youtube_dl/socks.py
@@ -0,0 +1,273 @@
+# Public Domain SOCKS proxy protocol implementation
+# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3
+
+from __future__ import unicode_literals
+
+# References:
+# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol
+# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol
+# SOCKS5 protocol https://tools.ietf.org/html/rfc1928
+# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929
+
+import collections
+import socket
+
+from .compat import (
+    compat_ord,
+    compat_struct_pack,
+    compat_struct_unpack,
+)
+
+__author__ = 'Timo Schmid <coding@timoschmid.de>'
+
+SOCKS4_VERSION = 4
+SOCKS4_REPLY_VERSION = 0x00
+# Excerpt from SOCKS4A protocol:
+# if the client cannot resolve the destination host's domain name to find its
+# IP address, it should set the first three bytes of DSTIP to NULL and the last
+# byte to a non-zero value.
+SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF)
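+# Illustratively, a SOCKS4A CONNECT request for a hypothetical
+# example.com:80 with remote DNS is therefore: 0x04 0x01, the port as !H,
+# DSTIP 0.0.0.255, userid + NUL, then b'example.com' + NUL
+# (see _setup_socks4 below).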
+
+SOCKS5_VERSION = 5
+SOCKS5_USER_AUTH_VERSION = 0x01
+SOCKS5_USER_AUTH_SUCCESS = 0x00
+
+
+class Socks4Command(object):
+    CMD_CONNECT = 0x01
+    CMD_BIND = 0x02
+
+
+class Socks5Command(Socks4Command):
+    CMD_UDP_ASSOCIATE = 0x03
+
+
+class Socks5Auth(object):
+    AUTH_NONE = 0x00
+    AUTH_GSSAPI = 0x01
+    AUTH_USER_PASS = 0x02
+    AUTH_NO_ACCEPTABLE = 0xFF  # For server response
+
+
+class Socks5AddressType(object):
+    ATYP_IPV4 = 0x01
+    ATYP_DOMAINNAME = 0x03
+    ATYP_IPV6 = 0x04
+
+
+class ProxyError(socket.error):
+    ERR_SUCCESS = 0x00
+
+    def __init__(self, code=None, msg=None):
+        if code is not None and msg is None:
+            msg = self.CODES.get(code) or 'unknown error'
+        super(ProxyError, self).__init__(code, msg)
+
+
+class InvalidVersionError(ProxyError):
+    def __init__(self, expected_version, got_version):
+        msg = ('Invalid response version from server. Expected {0:02x}, got '
+               '{1:02x}'.format(expected_version, got_version))
+        super(InvalidVersionError, self).__init__(0, msg)
+
+
+class Socks4Error(ProxyError):
+    ERR_SUCCESS = 90
+
+    CODES = {
+        91: 'request rejected or failed',
+        92: 'request rejected because SOCKS server cannot connect to identd on the client',
+        93: 'request rejected because the client program and identd report different user-ids'
+    }
+
+
+class Socks5Error(ProxyError):
+    ERR_GENERAL_FAILURE = 0x01
+
+    CODES = {
+        0x01: 'general SOCKS server failure',
+        0x02: 'connection not allowed by ruleset',
+        0x03: 'Network unreachable',
+        0x04: 'Host unreachable',
+        0x05: 'Connection refused',
+        0x06: 'TTL expired',
+        0x07: 'Command not supported',
+        0x08: 'Address type not supported',
+        0xFE: 'unknown username or invalid password',
+        0xFF: 'all offered authentication methods were rejected'
+    }
+
+
+class ProxyType(object):
+    SOCKS4 = 0
+    SOCKS4A = 1
+    SOCKS5 = 2
+
+
+Proxy = collections.namedtuple('Proxy', (
+    'type', 'host', 'port', 'username', 'password', 'remote_dns'))
+
+
+class sockssocket(socket.socket):
+    def __init__(self, *args, **kwargs):
+        self._proxy = None
+        super(sockssocket, self).__init__(*args, **kwargs)
+
+    def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None):
+        assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5)
+
+        self._proxy = Proxy(proxytype, addr, port, username, password, rdns)
+
+    def recvall(self, cnt):
+        data = b''
+        while len(data) < cnt:
+            cur = self.recv(cnt - len(data))
+            if not cur:
+                raise EOFError('{0} bytes missing'.format(cnt - len(data)))
+            data += cur
+        return data
+
+    def _recv_bytes(self, cnt):
+        data = self.recvall(cnt)
+        return compat_struct_unpack('!{0}B'.format(cnt), data)
+
+    @staticmethod
+    def _len_and_data(data):
+        return compat_struct_pack('!B', len(data)) + data
+
+    def _check_response_version(self, expected_version, got_version):
+        if got_version != expected_version:
+            self.close()
+            raise InvalidVersionError(expected_version, got_version)
+
+    def _resolve_address(self, destaddr, default, use_remote_dns):
+        try:
+            return socket.inet_aton(destaddr)
+        except socket.error:
+            if use_remote_dns and self._proxy.remote_dns:
+                return default
+            else:
+                return socket.inet_aton(socket.gethostbyname(destaddr))
+
+    def _setup_socks4(self, address, is_4a=False):
+        destaddr, port = address
+
+        ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
+
+        packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
+
+        username = (self._proxy.username or '').encode('utf-8')
+        packet += username + b'\x00'
+
+        if is_4a and self._proxy.remote_dns:
+            packet += destaddr.encode('utf-8') + b'\x00'
+
+        self.sendall(packet)
+
+        version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8))
+
+        self._check_response_version(SOCKS4_REPLY_VERSION, version)
+
+        if resp_code != Socks4Error.ERR_SUCCESS:
+            self.close()
+            raise Socks4Error(resp_code)
+
+        return (dsthost, dstport)
+
+    def _setup_socks4a(self, address):
+        self._setup_socks4(address, is_4a=True)
+
+    def _socks5_auth(self):
+        packet = compat_struct_pack('!B', SOCKS5_VERSION)
+
+        auth_methods = [Socks5Auth.AUTH_NONE]
+        if self._proxy.username and self._proxy.password:
+            auth_methods.append(Socks5Auth.AUTH_USER_PASS)
+
+        packet += compat_struct_pack('!B', len(auth_methods))
+        packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods)
+
+        self.sendall(packet)
+
+        version, method = self._recv_bytes(2)
+
+        self._check_response_version(SOCKS5_VERSION, version)
+
+        if method == Socks5Auth.AUTH_NO_ACCEPTABLE or (
+                method == Socks5Auth.AUTH_USER_PASS and (not self._proxy.username or not self._proxy.password)):
+            self.close()
+            raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE)
+
+        if method == Socks5Auth.AUTH_USER_PASS:
+            username = self._proxy.username.encode('utf-8')
+            password = self._proxy.password.encode('utf-8')
+            packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION)
+            packet += self._len_and_data(username) + self._len_and_data(password)
+            self.sendall(packet)
+
+            version, status = self._recv_bytes(2)
+
+            self._check_response_version(SOCKS5_USER_AUTH_VERSION, version)
+
+            if status != SOCKS5_USER_AUTH_SUCCESS:
+                self.close()
+                raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE)
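+
+    # Illustrative greeting built above when both credentials are set:
+    # b'\x05\x02\x00\x02' (version 5, two methods: NONE and USER/PASS); a
+    # server reply of b'\x05\x02' then selects the RFC 1929 subnegotiation.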
+
+    def _setup_socks5(self, address):
+        destaddr, port = address
+
+        ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
+
+        self._socks5_auth()
+
+        reserved = 0
+        packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved)
+        if ipaddr is None:
+            destaddr = destaddr.encode('utf-8')
+            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
+            packet += self._len_and_data(destaddr)
+        else:
+            packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
+        packet += compat_struct_pack('!H', port)
+
+        self.sendall(packet)
+
+        version, status, reserved, atype = self._recv_bytes(4)
+
+        self._check_response_version(SOCKS5_VERSION, version)
+
+        if status != Socks5Error.ERR_SUCCESS:
+            self.close()
+            raise Socks5Error(status)
+
+        if atype == Socks5AddressType.ATYP_IPV4:
+            destaddr = self.recvall(4)
+        elif atype == Socks5AddressType.ATYP_DOMAINNAME:
+            alen = compat_ord(self.recv(1))
+            destaddr = self.recvall(alen)
+        elif atype == Socks5AddressType.ATYP_IPV6:
+            destaddr = self.recvall(16)
+        destport = compat_struct_unpack('!H', self.recvall(2))[0]
+
+        return (destaddr, destport)
+
+    def _make_proxy(self, connect_func, address):
+        if not self._proxy:
+            return connect_func(self, address)
+
+        result = connect_func(self, (self._proxy.host, self._proxy.port))
+        if result != 0 and result is not None:
+            return result
+        setup_funcs = {
+            ProxyType.SOCKS4: self._setup_socks4,
+            ProxyType.SOCKS4A: self._setup_socks4a,
+            ProxyType.SOCKS5: self._setup_socks5,
+        }
+        setup_funcs[self._proxy.type](address)
+        return result
+
+    def connect(self, address):
+        self._make_proxy(socket.socket.connect, address)
+
+    def connect_ex(self, address):
+        return self._make_proxy(socket.socket.connect_ex, address)
diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py
new file mode 100644
index 0000000..0c71585
--- /dev/null
+++ b/youtube_dl/swfinterp.py
@@ -0,0 +1,834 @@
+from __future__ import unicode_literals
+
+import collections
+import io
+import zlib
+
+from .compat import (
+    compat_str,
+    compat_struct_unpack,
+)
+from .utils import (
+    ExtractorError,
+)
+
+
+def _extract_tags(file_contents):
+    if file_contents[1:3] != b'WS':
+        raise ExtractorError(
+            'Not an SWF file; header is %r' % file_contents[:3])
+    if file_contents[:1] == b'C':
+        content = zlib.decompress(file_contents[8:])
+    else:
+        raise NotImplementedError(
+            'Unsupported compression format %r' %
+            file_contents[:1])
+
+    # Determine number of bits in framesize rectangle
+    framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3
+    framesize_len = (5 + 4 * framesize_nbits + 7) // 8
+
+    pos = framesize_len + 2 + 2
+    while pos < len(content):
+        header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0]
+        pos += 2
+        tag_code = header16 >> 6
+        tag_len = header16 & 0x3f
+        if tag_len == 0x3f:
+            tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0]
+            pos += 4
+        assert pos + tag_len <= len(content), \
+            ('Tag %d ends at %d+%d - that\'s longer than the file (%d)'
+                % (tag_code, pos, tag_len, len(content)))
+        yield (tag_code, content[pos:pos + tag_len])
+        pos += tag_len
+
+
+class _AVMClass_Object(object):
+    def __init__(self, avm_class):
+        self.avm_class = avm_class
+
+    def __repr__(self):
+        return '%s#%x' % (self.avm_class.name, id(self))
+
+
+class _ScopeDict(dict):
+    def __init__(self, avm_class):
+        super(_ScopeDict, self).__init__()
+        self.avm_class = avm_class
+
+    def __repr__(self):
+        return '%s__Scope(%s)' % (
+            self.avm_class.name,
+            super(_ScopeDict, self).__repr__())
+
+
+class _AVMClass(object):
+    def __init__(self, name_idx, name, static_properties=None):
+        self.name_idx = name_idx
+        self.name = name
+        self.method_names = {}
+        self.method_idxs = {}
+        self.methods = {}
+        self.method_pyfunctions = {}
+        self.static_properties = static_properties if static_properties else {}
+
+        self.variables = _ScopeDict(self)
+        self.constants = {}
+
+    def make_object(self):
+        return _AVMClass_Object(self)
+
+    def __repr__(self):
+        return '_AVMClass(%s)' % (self.name)
+
+    def register_methods(self, methods):
+        self.method_names.update(methods.items())
+        self.method_idxs.update(dict(
+            (idx, name)
+            for name, idx in methods.items()))
+
+
+class _Multiname(object):
+    def __init__(self, kind):
+        self.kind = kind
+
+    def __repr__(self):
+        return '[MULTINAME kind: 0x%x]' % self.kind
+
+
+def _read_int(reader):
+    res = 0
+    shift = 0
+    for _ in range(5):
+        buf = reader.read(1)
+        assert len(buf) == 1
+        b = compat_struct_unpack('<B', buf)[0]
+        res = res | ((b & 0x7f) << shift)
+        if b & 0x80 == 0:
+            break
+        shift += 7
+    return res
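+
+
+# A worked example of the variable-length encoding read above: the bytes
+# 0xE5 0x8E 0x26 decode to 0x65 | (0x0E << 7) | (0x26 << 14) = 624485.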
+
+
+def _u30(reader):
+    res = _read_int(reader)
+    assert res & 0xf0000000 == 0
+    return res
+
+
+_u32 = _read_int
+
+
+def _s32(reader):
+    v = _read_int(reader)
+    if v & 0x80000000 != 0:
+        v = - ((v ^ 0xffffffff) + 1)
+    return v
+
+
+def _s24(reader):
+    bs = reader.read(3)
+    assert len(bs) == 3
+    last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00'
+    return compat_struct_unpack('<i', bs + last_byte)[0]
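+
+
+# Quick sanity examples: _s24 sign-extends, so b'\xff\xff\xff' decodes to -1
+# and b'\x01\x00\x00' to 1.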
+
+
+def _read_string(reader):
+    slen = _u30(reader)
+    resb = reader.read(slen)
+    assert len(resb) == slen
+    return resb.decode('utf-8')
+
+
+def _read_bytes(count, reader):
+    assert count >= 0
+    resb = reader.read(count)
+    assert len(resb) == count
+    return resb
+
+
+def _read_byte(reader):
+    resb = _read_bytes(1, reader=reader)
+    res = compat_struct_unpack('<B', resb)[0]
+    return res
+
+
+StringClass = _AVMClass('(no name idx)', 'String')
+ByteArrayClass = _AVMClass('(no name idx)', 'ByteArray')
+TimerClass = _AVMClass('(no name idx)', 'Timer')
+TimerEventClass = _AVMClass('(no name idx)', 'TimerEvent', {'TIMER': 'timer'})
+_builtin_classes = {
+    StringClass.name: StringClass,
+    ByteArrayClass.name: ByteArrayClass,
+    TimerClass.name: TimerClass,
+    TimerEventClass.name: TimerEventClass,
+}
+
+
+class _Undefined(object):
+    def __bool__(self):
+        return False
+    __nonzero__ = __bool__
+
+    def __hash__(self):
+        return 0
+
+    def __str__(self):
+        return 'undefined'
+    __repr__ = __str__
+
+
+undefined = _Undefined()
+
+
+class SWFInterpreter(object):
+    def __init__(self, file_contents):
+        self._patched_functions = {
+            (TimerClass, 'addEventListener'): lambda params: undefined,
+        }
+        code_tag = next(tag
+                        for tag_code, tag in _extract_tags(file_contents)
+                        if tag_code == 82)
+        p = code_tag.index(b'\0', 4) + 1
+        code_reader = io.BytesIO(code_tag[p:])
+
+        # Parse ABC (AVM2 ByteCode)
+
+        # Define a couple of convenience methods
+        u30 = lambda *args: _u30(*args, reader=code_reader)
+        s32 = lambda *args: _s32(*args, reader=code_reader)
+        u32 = lambda *args: _u32(*args, reader=code_reader)
+        read_bytes = lambda *args: _read_bytes(*args, reader=code_reader)
+        read_byte = lambda *args: _read_byte(*args, reader=code_reader)
+
+        # minor_version + major_version
+        read_bytes(2 + 2)
+
+        # Constant pool
+        int_count = u30()
+        self.constant_ints = [0]
+        for _c in range(1, int_count):
+            self.constant_ints.append(s32())
+        self.constant_uints = [0]
+        uint_count = u30()
+        for _c in range(1, uint_count):
+            self.constant_uints.append(u32())
+        double_count = u30()
+        read_bytes(max(0, (double_count - 1)) * 8)
+        string_count = u30()
+        self.constant_strings = ['']
+        for _c in range(1, string_count):
+            s = _read_string(code_reader)
+            self.constant_strings.append(s)
+        namespace_count = u30()
+        for _c in range(1, namespace_count):
+            read_bytes(1)  # kind
+            u30()  # name
+        ns_set_count = u30()
+        for _c in range(1, ns_set_count):
+            count = u30()
+            for _c2 in range(count):
+                u30()
+        multiname_count = u30()
+        MULTINAME_SIZES = {
+            0x07: 2,  # QName
+            0x0d: 2,  # QNameA
+            0x0f: 1,  # RTQName
+            0x10: 1,  # RTQNameA
+            0x11: 0,  # RTQNameL
+            0x12: 0,  # RTQNameLA
+            0x09: 2,  # Multiname
+            0x0e: 2,  # MultinameA
+            0x1b: 1,  # MultinameL
+            0x1c: 1,  # MultinameLA
+        }
+        self.multinames = ['']
+        for _c in range(1, multiname_count):
+            kind = u30()
+            assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind
+            if kind == 0x07:
+                u30()  # namespace_idx
+                name_idx = u30()
+                self.multinames.append(self.constant_strings[name_idx])
+            elif kind == 0x09:
+                name_idx = u30()
+                u30()
+                self.multinames.append(self.constant_strings[name_idx])
+            else:
+                self.multinames.append(_Multiname(kind))
+                for _c2 in range(MULTINAME_SIZES[kind]):
+                    u30()
+
+        # Methods
+        method_count = u30()
+        MethodInfo = collections.namedtuple(
+            'MethodInfo',
+            ['NEED_ARGUMENTS', 'NEED_REST'])
+        method_infos = []
+        for method_id in range(method_count):
+            param_count = u30()
+            u30()  # return type
+            for _ in range(param_count):
+                u30()  # param type
+            u30()  # name index (always 0 for youtube)
+            flags = read_byte()
+            if flags & 0x08 != 0:
+                # Options present
+                option_count = u30()
+                for c in range(option_count):
+                    u30()  # val
+                    read_bytes(1)  # kind
+            if flags & 0x80 != 0:
+                # Param names present
+                for _ in range(param_count):
+                    u30()  # param name
+            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
+            method_infos.append(mi)
+
+        # Metadata
+        metadata_count = u30()
+        for _c in range(metadata_count):
+            u30()  # name
+            item_count = u30()
+            for _c2 in range(item_count):
+                u30()  # key
+                u30()  # value
+
+        def parse_traits_info():
+            trait_name_idx = u30()
+            kind_full = read_byte()
+            kind = kind_full & 0x0f
+            attrs = kind_full >> 4
+            methods = {}
+            constants = None
+            if kind == 0x00:  # Slot
+                u30()  # Slot id
+                u30()  # type_name_idx
+                vindex = u30()
+                if vindex != 0:
+                    read_byte()  # vkind
+            elif kind == 0x06:  # Const
+                u30()  # Slot id
+                u30()  # type_name_idx
+                vindex = u30()
+                vkind = 'any'
+                if vindex != 0:
+                    vkind = read_byte()
+                if vkind == 0x03:  # Constant_Int
+                    value = self.constant_ints[vindex]
+                elif vkind == 0x04:  # Constant_UInt
+                    value = self.constant_uints[vindex]
+                else:
+                    return {}, None  # Ignore silently for now
+                constants = {self.multinames[trait_name_idx]: value}
+            elif kind in (0x01, 0x02, 0x03):  # Method / Getter / Setter
+                u30()  # disp_id
+                method_idx = u30()
+                methods[self.multinames[trait_name_idx]] = method_idx
+            elif kind == 0x04:  # Class
+                u30()  # slot_id
+                u30()  # classi
+            elif kind == 0x05:  # Function
+                u30()  # slot_id
+                function_idx = u30()
+                methods[function_idx] = self.multinames[trait_name_idx]
+            else:
+                raise ExtractorError('Unsupported trait kind %d' % kind)
+
+            if attrs & 0x4 != 0:  # Metadata present
+                metadata_count = u30()
+                for _c3 in range(metadata_count):
+                    u30()  # metadata index
+
+            return methods, constants
+
+        # Classes
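+        # One instance_info record per class: its multiname, superclass,
+        # interfaces and instance traits. Trait methods and constants are
+        # registered on the corresponding _AVMClass.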
+        class_count = u30()
+        classes = []
+        for class_id in range(class_count):
+            name_idx = u30()
+
+            cname = self.multinames[name_idx]
+            avm_class = _AVMClass(name_idx, cname)
+            classes.append(avm_class)
+
+            u30()  # super_name idx
+            flags = read_byte()
+            if flags & 0x08 != 0:  # Protected namespace is present
+                u30()  # protected_ns_idx
+            intrf_count = u30()
+            for _c2 in range(intrf_count):
+                u30()
+            u30()  # iinit
+            trait_count = u30()
+            for _c2 in range(trait_count):
+                trait_methods, trait_constants = parse_traits_info()
+                avm_class.register_methods(trait_methods)
+                if trait_constants:
+                    avm_class.constants.update(trait_constants)
+
+        assert len(classes) == class_count
+        self._classes_by_name = dict((c.name, c) for c in classes)
+
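+        # The matching class_info records follow in the same order, each
+        # holding the static initializer (cinit) method index and the
+        # static traits.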
+        for avm_class in classes:
+            avm_class.cinit_idx = u30()
+            trait_count = u30()
+            for _c2 in range(trait_count):
+                trait_methods, trait_constants = parse_traits_info()
+                avm_class.register_methods(trait_methods)
+                if trait_constants:
+                    avm_class.constants.update(trait_constants)
+
+        # Scripts
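+        # Script records are parsed only to keep the stream position;
+        # their init methods and traits are not needed here.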
+        script_count = u30()
+        for _c in range(script_count):
+            u30()  # init
+            trait_count = u30()
+            for _c2 in range(trait_count):
+                parse_traits_info()
+
+        # Method bodies
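+        # Method bodies carry the actual bytecode. Exception handlers and
+        # body traits are parsed but ignored; the code and local register
+        # count are attached to every class that references the method.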
+        method_body_count = u30()
+        Method = collections.namedtuple('Method', ['code', 'local_count'])
+        self._all_methods = []
+        for _c in range(method_body_count):
+            method_idx = u30()
+            u30()  # max_stack
+            local_count = u30()
+            u30()  # init_scope_depth
+            u30()  # max_scope_depth
+            code_length = u30()
+            code = read_bytes(code_length)
+            m = Method(code, local_count)
+            self._all_methods.append(m)
+            for avm_class in classes:
+                if method_idx in avm_class.method_idxs:
+                    avm_class.methods[avm_class.method_idxs[method_idx]] = m
+            exception_count = u30()
+            for _c2 in range(exception_count):
+                u30()  # from
+                u30()  # to
+                u30()  # target
+                u30()  # exc_type
+                u30()  # var_name
+            trait_count = u30()
+            for _c2 in range(trait_count):
+                parse_traits_info()
+
+        assert p + code_reader.tell() == len(code_tag)
+
+    def patch_function(self, avm_class, func_name, f):
+        self._patched_functions[(avm_class, func_name)] = f
+
+    def extract_class(self, class_name, call_cinit=True):
+        try:
+            res = self._classes_by_name[class_name]
+        except KeyError:
+            raise ExtractorError('Class %r not found' % class_name)
+
+        if call_cinit and hasattr(res, 'cinit_idx'):
+            res.register_methods({'$cinit': res.cinit_idx})
+            res.methods['$cinit'] = self._all_methods[res.cinit_idx]
+            cinit = self.extract_function(res, '$cinit')
+            cinit([])
+
+        return res
+
+    def extract_function(self, avm_class, func_name):
+        p = self._patched_functions.get((avm_class, func_name))
+        if p:
+            return p
+        if func_name in avm_class.method_pyfunctions:
+            return avm_class.method_pyfunctions[func_name]
+        if func_name in self._classes_by_name:
+            return self._classes_by_name[func_name].make_object()
+        if func_name not in avm_class.methods:
+            raise ExtractorError('Cannot find function %s.%s' % (
+                avm_class.name, func_name))
+        m = avm_class.methods[func_name]
+
+        def resfunc(args):
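+            # Interprets the method's AVM2 bytecode on each call. Register 0
+            # holds the class variables (standing in for `this`), followed
+            # by the call arguments and the method's local slots.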
+            # Helper functions
+            coder = io.BytesIO(m.code)
+            s24 = lambda: _s24(coder)
+            u30 = lambda: _u30(coder)
+
+            registers = [avm_class.variables] + list(args) + [None] * m.local_count
+            stack = []
+            scopes = collections.deque([
+                self._classes_by_name, avm_class.constants, avm_class.variables])
+            while True:
+                opcode = _read_byte(coder)
+                if opcode == 9:  # label
+                    pass  # Spec says: "Do nothing."
+                elif opcode == 16:  # jump
+                    offset = s24()
+                    coder.seek(coder.tell() + offset)
+                elif opcode == 17:  # iftrue
+                    offset = s24()
+                    value = stack.pop()
+                    if value:
+                        coder.seek(coder.tell() + offset)
+                elif opcode == 18:  # iffalse
+                    offset = s24()
+                    value = stack.pop()
+                    if not value:
+                        coder.seek(coder.tell() + offset)
+                elif opcode == 19:  # ifeq
+                    offset = s24()
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    if value2 == value1:
+                        coder.seek(coder.tell() + offset)
+                elif opcode == 20:  # ifne
+                    offset = s24()
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    if value2 != value1:
+                        coder.seek(coder.tell() + offset)
+                elif opcode == 21:  # iflt
+                    offset = s24()
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    if value1 < value2:
+                        coder.seek(coder.tell() + offset)
+                elif opcode == 32:  # pushnull
+                    stack.append(None)
+                elif opcode == 33:  # pushundefined
+                    stack.append(undefined)
+                elif opcode == 36:  # pushbyte
+                    v = _read_byte(coder)
+                    stack.append(v)
+                elif opcode == 37:  # pushshort
+                    v = u30()
+                    stack.append(v)
+                elif opcode == 38:  # pushtrue
+                    stack.append(True)
+                elif opcode == 39:  # pushfalse
+                    stack.append(False)
+                elif opcode == 40:  # pushnan
+                    stack.append(float('NaN'))
+                elif opcode == 42:  # dup
+                    value = stack[-1]
+                    stack.append(value)
+                elif opcode == 44:  # pushstring
+                    idx = u30()
+                    stack.append(self.constant_strings[idx])
+                elif opcode == 48:  # pushscope
+                    new_scope = stack.pop()
+                    scopes.append(new_scope)
+                elif opcode == 66:  # construct
+                    arg_count = u30()
+                    args = list(reversed(
+                        [stack.pop() for _ in range(arg_count)]))
+                    obj = stack.pop()
+                    res = obj.avm_class.make_object()
+                    stack.append(res)
+                elif opcode == 70:  # callproperty
+                    index = u30()
+                    mname = self.multinames[index]
+                    arg_count = u30()
+                    args = list(reversed(
+                        [stack.pop() for _ in range(arg_count)]))
+                    obj = stack.pop()
+
+                    if obj == StringClass:
+                        if mname == 'String':
+                            assert len(args) == 1
+                            assert isinstance(args[0], (
+                                int, compat_str, _Undefined))
+                            if args[0] == undefined:
+                                res = 'undefined'
+                            else:
+                                res = compat_str(args[0])
+                            stack.append(res)
+                            continue
+                        else:
+                            raise NotImplementedError(
+                                'Function String.%s is not yet implemented'
+                                % mname)
+                    elif isinstance(obj, _AVMClass_Object):
+                        func = self.extract_function(obj.avm_class, mname)
+                        res = func(args)
+                        stack.append(res)
+                        continue
+                    elif isinstance(obj, _AVMClass):
+                        func = self.extract_function(obj, mname)
+                        res = func(args)
+                        stack.append(res)
+                        continue
+                    elif isinstance(obj, _ScopeDict):
+                        if mname in obj.avm_class.method_names:
+                            func = self.extract_function(obj.avm_class, mname)
+                            res = func(args)
+                        else:
+                            res = obj[mname]
+                        stack.append(res)
+                        continue
+                    elif isinstance(obj, compat_str):
+                        if mname == 'split':
+                            assert len(args) == 1
+                            assert isinstance(args[0], compat_str)
+                            if args[0] == '':
+                                res = list(obj)
+                            else:
+                                res = obj.split(args[0])
+                            stack.append(res)
+                            continue
+                        elif mname == 'charCodeAt':
+                            assert len(args) <= 1
+                            idx = 0 if len(args) == 0 else args[0]
+                            assert isinstance(idx, int)
+                            res = ord(obj[idx])
+                            stack.append(res)
+                            continue
+                    elif isinstance(obj, list):
+                        if mname == 'slice':
+                            assert len(args) == 1
+                            assert isinstance(args[0], int)
+                            res = obj[args[0]:]
+                            stack.append(res)
+                            continue
+                        elif mname == 'join':
+                            assert len(args) == 1
+                            assert isinstance(args[0], compat_str)
+                            res = args[0].join(obj)
+                            stack.append(res)
+                            continue
+                    raise NotImplementedError(
+                        'Unsupported property %r on %r'
+                        % (mname, obj))
+                elif opcode == 71:  # returnvoid
+                    res = undefined
+                    return res
+                elif opcode == 72:  # returnvalue
+                    res = stack.pop()
+                    return res
+                elif opcode == 73:  # constructsuper
+                    # Not yet implemented; assume the code works without the super constructor
+                    arg_count = u30()
+                    args = list(reversed(
+                        [stack.pop() for _ in range(arg_count)]))
+                    obj = stack.pop()
+                elif opcode == 74:  # constructproperty
+                    index = u30()
+                    arg_count = u30()
+                    args = list(reversed(
+                        [stack.pop() for _ in range(arg_count)]))
+                    obj = stack.pop()
+
+                    mname = self.multinames[index]
+                    assert isinstance(obj, _AVMClass)
+
+                    # We do not actually call the constructor for now;
+                    # we just pretend it does nothing
+                    stack.append(obj.make_object())
+                elif opcode == 79:  # callpropvoid
+                    index = u30()
+                    mname = self.multinames[index]
+                    arg_count = u30()
+                    args = list(reversed(
+                        [stack.pop() for _ in range(arg_count)]))
+                    obj = stack.pop()
+                    if isinstance(obj, _AVMClass_Object):
+                        func = self.extract_function(obj.avm_class, mname)
+                        res = func(args)
+                        assert res is undefined
+                        continue
+                    if isinstance(obj, _ScopeDict):
+                        assert mname in obj.avm_class.method_names
+                        func = self.extract_function(obj.avm_class, mname)
+                        res = func(args)
+                        assert res is undefined
+                        continue
+                    if mname == 'reverse':
+                        assert isinstance(obj, list)
+                        obj.reverse()
+                    else:
+                        raise NotImplementedError(
+                            'Unsupported (void) property %r on %r'
+                            % (mname, obj))
+                elif opcode == 86:  # newarray
+                    arg_count = u30()
+                    arr = []
+                    for i in range(arg_count):
+                        arr.append(stack.pop())
+                    arr = arr[::-1]
+                    stack.append(arr)
+                elif opcode == 93:  # findpropstrict
+                    index = u30()
+                    mname = self.multinames[index]
+                    for s in reversed(scopes):
+                        if mname in s:
+                            res = s
+                            break
+                    else:
+                        res = scopes[0]
+                    if mname not in res and mname in _builtin_classes:
+                        stack.append(_builtin_classes[mname])
+                    else:
+                        stack.append(res[mname])
+                elif opcode == 94:  # findproperty
+                    index = u30()
+                    mname = self.multinames[index]
+                    for s in reversed(scopes):
+                        if mname in s:
+                            res = s
+                            break
+                    else:
+                        res = avm_class.variables
+                    stack.append(res)
+                elif opcode == 96:  # getlex
+                    index = u30()
+                    mname = self.multinames[index]
+                    for s in reversed(scopes):
+                        if mname in s:
+                            scope = s
+                            break
+                    else:
+                        scope = avm_class.variables
+
+                    if mname in scope:
+                        res = scope[mname]
+                    elif mname in _builtin_classes:
+                        res = _builtin_classes[mname]
+                    else:
+                        # Assume uninitialized
+                        # TODO warn here
+                        res = undefined
+                    stack.append(res)
+                elif opcode == 97:  # setproperty
+                    index = u30()
+                    value = stack.pop()
+                    idx = self.multinames[index]
+                    if isinstance(idx, _Multiname):
+                        idx = stack.pop()
+                    obj = stack.pop()
+                    obj[idx] = value
+                elif opcode == 98:  # getlocal
+                    index = u30()
+                    stack.append(registers[index])
+                elif opcode == 99:  # setlocal
+                    index = u30()
+                    value = stack.pop()
+                    registers[index] = value
+                elif opcode == 102:  # getproperty
+                    index = u30()
+                    pname = self.multinames[index]
+                    if pname == 'length':
+                        obj = stack.pop()
+                        assert isinstance(obj, (compat_str, list))
+                        stack.append(len(obj))
+                    elif isinstance(pname, compat_str):  # Member access
+                        obj = stack.pop()
+                        if isinstance(obj, _AVMClass):
+                            res = obj.static_properties[pname]
+                            stack.append(res)
+                            continue
+
+                        assert isinstance(obj, (dict, _ScopeDict)),\
+                            'Accessing member %r on %r' % (pname, obj)
+                        res = obj.get(pname, undefined)
+                        stack.append(res)
+                    else:  # Assume attribute access
+                        idx = stack.pop()
+                        assert isinstance(idx, int)
+                        obj = stack.pop()
+                        assert isinstance(obj, list)
+                        stack.append(obj[idx])
+                elif opcode == 104:  # initproperty
+                    index = u30()
+                    value = stack.pop()
+                    idx = self.multinames[index]
+                    if isinstance(idx, _Multiname):
+                        idx = stack.pop()
+                    obj = stack.pop()
+                    obj[idx] = value
+                elif opcode == 115:  # convert_i
+                    value = stack.pop()
+                    intvalue = int(value)
+                    stack.append(intvalue)
+                elif opcode == 128:  # coerce
+                    u30()
+                elif opcode == 130:  # coerce_a
+                    value = stack.pop()
+                    # coerce_a: coercing to the "any" type leaves the value unchanged
+                    stack.append(value)
+                elif opcode == 133:  # coerce_s
+                    assert isinstance(stack[-1], (type(None), compat_str))
+                elif opcode == 147:  # decrement
+                    value = stack.pop()
+                    assert isinstance(value, int)
+                    stack.append(value - 1)
+                elif opcode == 149:  # typeof
+                    value = stack.pop()
+                    return {
+                        _Undefined: 'undefined',
+                        compat_str: 'String',
+                        int: 'Number',
+                        float: 'Number',
+                    }[type(value)]
+                elif opcode == 160:  # add
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    res = value1 + value2
+                    stack.append(res)
+                elif opcode == 161:  # subtract
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    res = value1 - value2
+                    stack.append(res)
+                elif opcode == 162:  # multiply
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    res = value1 * value2
+                    stack.append(res)
+                elif opcode == 164:  # modulo
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    res = value1 % value2
+                    stack.append(res)
+                elif opcode == 168:  # bitand
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    assert isinstance(value1, int)
+                    assert isinstance(value2, int)
+                    res = value1 & value2
+                    stack.append(res)
+                elif opcode == 171:  # equals
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    result = value1 == value2
+                    stack.append(result)
+                elif opcode == 175:  # greaterequals
+                    value2 = stack.pop()
+                    value1 = stack.pop()
+                    result = value1 >= value2
+                    stack.append(result)
+                elif opcode == 192:  # increment_i
+                    value = stack.pop()
+                    assert isinstance(value, int)
+                    stack.append(value + 1)
+                elif opcode == 208:  # getlocal_0
+                    stack.append(registers[0])
+                elif opcode == 209:  # getlocal_1
+                    stack.append(registers[1])
+                elif opcode == 210:  # getlocal_2
+                    stack.append(registers[2])
+                elif opcode == 211:  # getlocal_3
+                    stack.append(registers[3])
+                elif opcode == 212:  # setlocal_0
+                    registers[0] = stack.pop()
+                elif opcode == 213:  # setlocal_1
+                    registers[1] = stack.pop()
+                elif opcode == 214:  # setlocal_2
+                    registers[2] = stack.pop()
+                elif opcode == 215:  # setlocal_3
+                    registers[3] = stack.pop()
+                else:
+                    raise NotImplementedError(
+                        'Unsupported opcode %d' % opcode)
+
+        avm_class.method_pyfunctions[func_name] = resfunc
+        return resfunc
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
new file mode 100644 (file)
index 0000000..d95a07c
--- /dev/null
@@ -0,0 +1,190 @@
+from __future__ import unicode_literals
+
+import io
+import json
+import traceback
+import hashlib
+import os
+import subprocess
+import sys
+from zipimport import zipimporter
+
+from .compat import compat_realpath
+from .utils import encode_compat_str
+
+from .version import __version__
+
+
+def rsa_verify(message, signature, key):
+    from hashlib import sha256
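+    # Hand-rolled PKCS#1 v1.5 verification: compute signature**e mod n and
+    # compare it (in hex) against the expected padding || DigestInfo ||
+    # SHA-256(message). The asn1 constant below is the DER-encoded
+    # DigestInfo header for SHA-256.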
+    assert isinstance(message, bytes)
+    byte_size = (len(bin(key[0])) - 2 + 8 - 1) // 8
+    signature = ('%x' % pow(int(signature, 16), key[1], key[0])).encode()
+    signature = (byte_size * 2 - len(signature)) * b'0' + signature
+    asn1 = b'3031300d060960864801650304020105000420'
+    asn1 += sha256(message).hexdigest().encode()
+    if byte_size < len(asn1) // 2 + 11:
+        return False
+    expected = b'0001' + (byte_size - len(asn1) // 2 - 3) * b'ff' + b'00' + asn1
+    return expected == signature
+
+
+def update_self(to_screen, verbose, opener):
+    """Update the program file with the latest version from the repository"""
+
+    UPDATE_URL = 'https://yt-dl.org/update/'
+    VERSION_URL = UPDATE_URL + 'LATEST_VERSION'
+    JSON_URL = UPDATE_URL + 'versions.json'
+    UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
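+    # versions.json is signed: its 'signature' field is verified against the
+    # hard-coded RSA key above before any download URL or hash is trusted.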
+
+    if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, 'frozen'):
+        to_screen('It looks like you installed youtube-dlc with a package manager, pip, setup.py or a tarball. Please use that to update.')
+        return
+
+    # Check if there is a new version
+    try:
+        newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()
+    except Exception:
+        if verbose:
+            to_screen(encode_compat_str(traceback.format_exc()))
+        to_screen('ERROR: can\'t find the current version. Please try again later.')
+        return
+    if newversion == __version__:
+        to_screen('youtube-dlc is up to date (' + __version__ + ')')
+        return
+
+    # Download and check versions info
+    try:
+        versions_info = opener.open(JSON_URL).read().decode('utf-8')
+        versions_info = json.loads(versions_info)
+    except Exception:
+        if verbose:
+            to_screen(encode_compat_str(traceback.format_exc()))
+        to_screen('ERROR: can\'t obtain versions info. Please try again later.')
+        return
+    if 'signature' not in versions_info:
+        to_screen('ERROR: the versions file is not signed or corrupted. Aborting.')
+        return
+    signature = versions_info['signature']
+    del versions_info['signature']
+    if not rsa_verify(json.dumps(versions_info, sort_keys=True).encode('utf-8'), signature, UPDATES_RSA_KEY):
+        to_screen('ERROR: the versions file signature is invalid. Aborting.')
+        return
+
+    version_id = versions_info['latest']
+
+    def version_tuple(version_str):
+        return tuple(map(int, version_str.split('.')))
+    if version_tuple(__version__) >= version_tuple(version_id):
+        to_screen('youtube-dlc is up to date (%s)' % __version__)
+        return
+
+    to_screen('Updating to version ' + version_id + ' ...')
+    version = versions_info['versions'][version_id]
+
+    print_notes(to_screen, versions_info['versions'])
+
+    # sys.executable is set to the full pathname of the exe-file for py2exe,
+    # but symlinks are not followed, so we need to resolve them manually
+    # with the help of realpath
+    filename = compat_realpath(sys.executable if hasattr(sys, 'frozen') else sys.argv[0])
+
+    if not os.access(filename, os.W_OK):
+        to_screen('ERROR: no write permissions on %s' % filename)
+        return
+
+    # Py2EXE
+    if hasattr(sys, 'frozen'):
+        exe = filename
+        directory = os.path.dirname(exe)
+        if not os.access(directory, os.W_OK):
+            to_screen('ERROR: no write permissions on %s' % directory)
+            return
+
+        try:
+            urlh = opener.open(version['exe'][0])
+            newcontent = urlh.read()
+            urlh.close()
+        except (IOError, OSError):
+            if verbose:
+                to_screen(encode_compat_str(traceback.format_exc()))
+            to_screen('ERROR: unable to download latest version')
+            return
+
+        newcontent_hash = hashlib.sha256(newcontent).hexdigest()
+        if newcontent_hash != version['exe'][1]:
+            to_screen('ERROR: the downloaded file hash does not match. Aborting.')
+            return
+
+        try:
+            with open(exe + '.new', 'wb') as outf:
+                outf.write(newcontent)
+        except (IOError, OSError):
+            if verbose:
+                to_screen(encode_compat_str(traceback.format_exc()))
+            to_screen('ERROR: unable to write the new version')
+            return
+
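+        # Windows keeps the running .exe locked, so the actual swap is
+        # delegated to a throwaway batch file that waits for this process
+        # to exit, moves the new binary into place and deletes itself.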
+        try:
+            bat = os.path.join(directory, 'youtube-dlc-updater.bat')
+            with io.open(bat, 'w') as batfile:
+                batfile.write('''
+@echo off
+echo Waiting for file handle to be closed ...
+ping 127.0.0.1 -n 5 -w 1000 > NUL
+move /Y "%s.new" "%s" > NUL
+echo Updated youtube-dlc to version %s.
+start /b "" cmd /c del "%%~f0"&exit /b"
+                \n''' % (exe, exe, version_id))
+
+            subprocess.Popen([bat])  # Continues to run in the background
+            return  # Do not show premature success messages
+        except (IOError, OSError):
+            if verbose:
+                to_screen(encode_compat_str(traceback.format_exc()))
+            to_screen('ERROR: unable to overwrite current version')
+            return
+
+    # Zip unix package
+    elif isinstance(globals().get('__loader__'), zipimporter):
+        try:
+            urlh = opener.open(version['bin'][0])
+            newcontent = urlh.read()
+            urlh.close()
+        except (IOError, OSError):
+            if verbose:
+                to_screen(encode_compat_str(traceback.format_exc()))
+            to_screen('ERROR: unable to download latest version')
+            return
+
+        newcontent_hash = hashlib.sha256(newcontent).hexdigest()
+        if newcontent_hash != version['bin'][1]:
+            to_screen('ERROR: the downloaded file hash does not match. Aborting.')
+            return
+
+        try:
+            with open(filename, 'wb') as outf:
+                outf.write(newcontent)
+        except (IOError, OSError):
+            if verbose:
+                to_screen(encode_compat_str(traceback.format_exc()))
+            to_screen('ERROR: unable to overwrite current version')
+            return
+
+    to_screen('Updated youtube-dlc. Restart youtube-dlc to use the new version.')
+
+
+def get_notes(versions, fromVersion):
+    notes = []
+    for v, vdata in sorted(versions.items()):
+        if v > fromVersion:
+            notes.extend(vdata.get('notes', []))
+    return notes
+
+
+def print_notes(to_screen, versions, fromVersion=__version__):
+    notes = get_notes(versions, fromVersion)
+    if notes:
+        to_screen('PLEASE NOTE:')
+        for note in notes:
+            to_screen(note)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
new file mode 100644 (file)
index 0000000..7dafaca
--- /dev/null
@@ -0,0 +1,5707 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import base64
+import binascii
+import calendar
+import codecs
+import collections
+import contextlib
+import ctypes
+import datetime
+import email.utils
+import email.header
+import errno
+import functools
+import gzip
+import io
+import itertools
+import json
+import locale
+import math
+import operator
+import os
+import platform
+import random
+import re
+import socket
+import ssl
+import subprocess
+import sys
+import tempfile
+import time
+import traceback
+import xml.etree.ElementTree
+import zlib
+
+from .compat import (
+    compat_HTMLParseError,
+    compat_HTMLParser,
+    compat_basestring,
+    compat_chr,
+    compat_cookiejar,
+    compat_ctypes_WINFUNCTYPE,
+    compat_etree_fromstring,
+    compat_expanduser,
+    compat_html_entities,
+    compat_html_entities_html5,
+    compat_http_client,
+    compat_integer_types,
+    compat_kwargs,
+    compat_os_name,
+    compat_parse_qs,
+    compat_shlex_quote,
+    compat_str,
+    compat_struct_pack,
+    compat_struct_unpack,
+    compat_urllib_error,
+    compat_urllib_parse,
+    compat_urllib_parse_urlencode,
+    compat_urllib_parse_urlparse,
+    compat_urllib_parse_unquote_plus,
+    compat_urllib_request,
+    compat_urlparse,
+    compat_xpath,
+)
+
+from .socks import (
+    ProxyType,
+    sockssocket,
+)
+
+
+def register_socks_protocols():
+    # "Register" SOCKS protocols
+    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
+    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
+    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
+        if scheme not in compat_urlparse.uses_netloc:
+            compat_urlparse.uses_netloc.append(scheme)
+
+
+# This is not clearly defined otherwise
+compiled_regex_type = type(re.compile(''))
+
+
+def random_user_agent():
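+    # Picks one of the real (static) Chrome version strings below and
+    # substitutes it into the UA template, so requests look like they come
+    # from an ordinary browser.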
+    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
+    _CHROME_VERSIONS = (
+        '74.0.3729.129',
+        '76.0.3780.3',
+        '76.0.3780.2',
+        '74.0.3729.128',
+        '76.0.3780.1',
+        '76.0.3780.0',
+        '75.0.3770.15',
+        '74.0.3729.127',
+        '74.0.3729.126',
+        '76.0.3779.1',
+        '76.0.3779.0',
+        '75.0.3770.14',
+        '74.0.3729.125',
+        '76.0.3778.1',
+        '76.0.3778.0',
+        '75.0.3770.13',
+        '74.0.3729.124',
+        '74.0.3729.123',
+        '73.0.3683.121',
+        '76.0.3777.1',
+        '76.0.3777.0',
+        '75.0.3770.12',
+        '74.0.3729.122',
+        '76.0.3776.4',
+        '75.0.3770.11',
+        '74.0.3729.121',
+        '76.0.3776.3',
+        '76.0.3776.2',
+        '73.0.3683.120',
+        '74.0.3729.120',
+        '74.0.3729.119',
+        '74.0.3729.118',
+        '76.0.3776.1',
+        '76.0.3776.0',
+        '76.0.3775.5',
+        '75.0.3770.10',
+        '74.0.3729.117',
+        '76.0.3775.4',
+        '76.0.3775.3',
+        '74.0.3729.116',
+        '75.0.3770.9',
+        '76.0.3775.2',
+        '76.0.3775.1',
+        '76.0.3775.0',
+        '75.0.3770.8',
+        '74.0.3729.115',
+        '74.0.3729.114',
+        '76.0.3774.1',
+        '76.0.3774.0',
+        '75.0.3770.7',
+        '74.0.3729.113',
+        '74.0.3729.112',
+        '74.0.3729.111',
+        '76.0.3773.1',
+        '76.0.3773.0',
+        '75.0.3770.6',
+        '74.0.3729.110',
+        '74.0.3729.109',
+        '76.0.3772.1',
+        '76.0.3772.0',
+        '75.0.3770.5',
+        '74.0.3729.108',
+        '74.0.3729.107',
+        '76.0.3771.1',
+        '76.0.3771.0',
+        '75.0.3770.4',
+        '74.0.3729.106',
+        '74.0.3729.105',
+        '75.0.3770.3',
+        '74.0.3729.104',
+        '74.0.3729.103',
+        '74.0.3729.102',
+        '75.0.3770.2',
+        '74.0.3729.101',
+        '75.0.3770.1',
+        '75.0.3770.0',
+        '74.0.3729.100',
+        '75.0.3769.5',
+        '75.0.3769.4',
+        '74.0.3729.99',
+        '75.0.3769.3',
+        '75.0.3769.2',
+        '75.0.3768.6',
+        '74.0.3729.98',
+        '75.0.3769.1',
+        '75.0.3769.0',
+        '74.0.3729.97',
+        '73.0.3683.119',
+        '73.0.3683.118',
+        '74.0.3729.96',
+        '75.0.3768.5',
+        '75.0.3768.4',
+        '75.0.3768.3',
+        '75.0.3768.2',
+        '74.0.3729.95',
+        '74.0.3729.94',
+        '75.0.3768.1',
+        '75.0.3768.0',
+        '74.0.3729.93',
+        '74.0.3729.92',
+        '73.0.3683.117',
+        '74.0.3729.91',
+        '75.0.3766.3',
+        '74.0.3729.90',
+        '75.0.3767.2',
+        '75.0.3767.1',
+        '75.0.3767.0',
+        '74.0.3729.89',
+        '73.0.3683.116',
+        '75.0.3766.2',
+        '74.0.3729.88',
+        '75.0.3766.1',
+        '75.0.3766.0',
+        '74.0.3729.87',
+        '73.0.3683.115',
+        '74.0.3729.86',
+        '75.0.3765.1',
+        '75.0.3765.0',
+        '74.0.3729.85',
+        '73.0.3683.114',
+        '74.0.3729.84',
+        '75.0.3764.1',
+        '75.0.3764.0',
+        '74.0.3729.83',
+        '73.0.3683.113',
+        '75.0.3763.2',
+        '75.0.3761.4',
+        '74.0.3729.82',
+        '75.0.3763.1',
+        '75.0.3763.0',
+        '74.0.3729.81',
+        '73.0.3683.112',
+        '75.0.3762.1',
+        '75.0.3762.0',
+        '74.0.3729.80',
+        '75.0.3761.3',
+        '74.0.3729.79',
+        '73.0.3683.111',
+        '75.0.3761.2',
+        '74.0.3729.78',
+        '74.0.3729.77',
+        '75.0.3761.1',
+        '75.0.3761.0',
+        '73.0.3683.110',
+        '74.0.3729.76',
+        '74.0.3729.75',
+        '75.0.3760.0',
+        '74.0.3729.74',
+        '75.0.3759.8',
+        '75.0.3759.7',
+        '75.0.3759.6',
+        '74.0.3729.73',
+        '75.0.3759.5',
+        '74.0.3729.72',
+        '73.0.3683.109',
+        '75.0.3759.4',
+        '75.0.3759.3',
+        '74.0.3729.71',
+        '75.0.3759.2',
+        '74.0.3729.70',
+        '73.0.3683.108',
+        '74.0.3729.69',
+        '75.0.3759.1',
+        '75.0.3759.0',
+        '74.0.3729.68',
+        '73.0.3683.107',
+        '74.0.3729.67',
+        '75.0.3758.1',
+        '75.0.3758.0',
+        '74.0.3729.66',
+        '73.0.3683.106',
+        '74.0.3729.65',
+        '75.0.3757.1',
+        '75.0.3757.0',
+        '74.0.3729.64',
+        '73.0.3683.105',
+        '74.0.3729.63',
+        '75.0.3756.1',
+        '75.0.3756.0',
+        '74.0.3729.62',
+        '73.0.3683.104',
+        '75.0.3755.3',
+        '75.0.3755.2',
+        '73.0.3683.103',
+        '75.0.3755.1',
+        '75.0.3755.0',
+        '74.0.3729.61',
+        '73.0.3683.102',
+        '74.0.3729.60',
+        '75.0.3754.2',
+        '74.0.3729.59',
+        '75.0.3753.4',
+        '74.0.3729.58',
+        '75.0.3754.1',
+        '75.0.3754.0',
+        '74.0.3729.57',
+        '73.0.3683.101',
+        '75.0.3753.3',
+        '75.0.3752.2',
+        '75.0.3753.2',
+        '74.0.3729.56',
+        '75.0.3753.1',
+        '75.0.3753.0',
+        '74.0.3729.55',
+        '73.0.3683.100',
+        '74.0.3729.54',
+        '75.0.3752.1',
+        '75.0.3752.0',
+        '74.0.3729.53',
+        '73.0.3683.99',
+        '74.0.3729.52',
+        '75.0.3751.1',
+        '75.0.3751.0',
+        '74.0.3729.51',
+        '73.0.3683.98',
+        '74.0.3729.50',
+        '75.0.3750.0',
+        '74.0.3729.49',
+        '74.0.3729.48',
+        '74.0.3729.47',
+        '75.0.3749.3',
+        '74.0.3729.46',
+        '73.0.3683.97',
+        '75.0.3749.2',
+        '74.0.3729.45',
+        '75.0.3749.1',
+        '75.0.3749.0',
+        '74.0.3729.44',
+        '73.0.3683.96',
+        '74.0.3729.43',
+        '74.0.3729.42',
+        '75.0.3748.1',
+        '75.0.3748.0',
+        '74.0.3729.41',
+        '75.0.3747.1',
+        '73.0.3683.95',
+        '75.0.3746.4',
+        '74.0.3729.40',
+        '74.0.3729.39',
+        '75.0.3747.0',
+        '75.0.3746.3',
+        '75.0.3746.2',
+        '74.0.3729.38',
+        '75.0.3746.1',
+        '75.0.3746.0',
+        '74.0.3729.37',
+        '73.0.3683.94',
+        '75.0.3745.5',
+        '75.0.3745.4',
+        '75.0.3745.3',
+        '75.0.3745.2',
+        '74.0.3729.36',
+        '75.0.3745.1',
+        '75.0.3745.0',
+        '75.0.3744.2',
+        '74.0.3729.35',
+        '73.0.3683.93',
+        '74.0.3729.34',
+        '75.0.3744.1',
+        '75.0.3744.0',
+        '74.0.3729.33',
+        '73.0.3683.92',
+        '74.0.3729.32',
+        '74.0.3729.31',
+        '73.0.3683.91',
+        '75.0.3741.2',
+        '75.0.3740.5',
+        '74.0.3729.30',
+        '75.0.3741.1',
+        '75.0.3741.0',
+        '74.0.3729.29',
+        '75.0.3740.4',
+        '73.0.3683.90',
+        '74.0.3729.28',
+        '75.0.3740.3',
+        '73.0.3683.89',
+        '75.0.3740.2',
+        '74.0.3729.27',
+        '75.0.3740.1',
+        '75.0.3740.0',
+        '74.0.3729.26',
+        '73.0.3683.88',
+        '73.0.3683.87',
+        '74.0.3729.25',
+        '75.0.3739.1',
+        '75.0.3739.0',
+        '73.0.3683.86',
+        '74.0.3729.24',
+        '73.0.3683.85',
+        '75.0.3738.4',
+        '75.0.3738.3',
+        '75.0.3738.2',
+        '75.0.3738.1',
+        '75.0.3738.0',
+        '74.0.3729.23',
+        '73.0.3683.84',
+        '74.0.3729.22',
+        '74.0.3729.21',
+        '75.0.3737.1',
+        '75.0.3737.0',
+        '74.0.3729.20',
+        '73.0.3683.83',
+        '74.0.3729.19',
+        '75.0.3736.1',
+        '75.0.3736.0',
+        '74.0.3729.18',
+        '73.0.3683.82',
+        '74.0.3729.17',
+        '75.0.3735.1',
+        '75.0.3735.0',
+        '74.0.3729.16',
+        '73.0.3683.81',
+        '75.0.3734.1',
+        '75.0.3734.0',
+        '74.0.3729.15',
+        '73.0.3683.80',
+        '74.0.3729.14',
+        '75.0.3733.1',
+        '75.0.3733.0',
+        '75.0.3732.1',
+        '74.0.3729.13',
+        '74.0.3729.12',
+        '73.0.3683.79',
+        '74.0.3729.11',
+        '75.0.3732.0',
+        '74.0.3729.10',
+        '73.0.3683.78',
+        '74.0.3729.9',
+        '74.0.3729.8',
+        '74.0.3729.7',
+        '75.0.3731.3',
+        '75.0.3731.2',
+        '75.0.3731.0',
+        '74.0.3729.6',
+        '73.0.3683.77',
+        '73.0.3683.76',
+        '75.0.3730.5',
+        '75.0.3730.4',
+        '73.0.3683.75',
+        '74.0.3729.5',
+        '73.0.3683.74',
+        '75.0.3730.3',
+        '75.0.3730.2',
+        '74.0.3729.4',
+        '73.0.3683.73',
+        '73.0.3683.72',
+        '75.0.3730.1',
+        '75.0.3730.0',
+        '74.0.3729.3',
+        '73.0.3683.71',
+        '74.0.3729.2',
+        '73.0.3683.70',
+        '74.0.3729.1',
+        '74.0.3729.0',
+        '74.0.3726.4',
+        '73.0.3683.69',
+        '74.0.3726.3',
+        '74.0.3728.0',
+        '74.0.3726.2',
+        '73.0.3683.68',
+        '74.0.3726.1',
+        '74.0.3726.0',
+        '74.0.3725.4',
+        '73.0.3683.67',
+        '73.0.3683.66',
+        '74.0.3725.3',
+        '74.0.3725.2',
+        '74.0.3725.1',
+        '74.0.3724.8',
+        '74.0.3725.0',
+        '73.0.3683.65',
+        '74.0.3724.7',
+        '74.0.3724.6',
+        '74.0.3724.5',
+        '74.0.3724.4',
+        '74.0.3724.3',
+        '74.0.3724.2',
+        '74.0.3724.1',
+        '74.0.3724.0',
+        '73.0.3683.64',
+        '74.0.3723.1',
+        '74.0.3723.0',
+        '73.0.3683.63',
+        '74.0.3722.1',
+        '74.0.3722.0',
+        '73.0.3683.62',
+        '74.0.3718.9',
+        '74.0.3702.3',
+        '74.0.3721.3',
+        '74.0.3721.2',
+        '74.0.3721.1',
+        '74.0.3721.0',
+        '74.0.3720.6',
+        '73.0.3683.61',
+        '72.0.3626.122',
+        '73.0.3683.60',
+        '74.0.3720.5',
+        '72.0.3626.121',
+        '74.0.3718.8',
+        '74.0.3720.4',
+        '74.0.3720.3',
+        '74.0.3718.7',
+        '74.0.3720.2',
+        '74.0.3720.1',
+        '74.0.3720.0',
+        '74.0.3718.6',
+        '74.0.3719.5',
+        '73.0.3683.59',
+        '74.0.3718.5',
+        '74.0.3718.4',
+        '74.0.3719.4',
+        '74.0.3719.3',
+        '74.0.3719.2',
+        '74.0.3719.1',
+        '73.0.3683.58',
+        '74.0.3719.0',
+        '73.0.3683.57',
+        '73.0.3683.56',
+        '74.0.3718.3',
+        '73.0.3683.55',
+        '74.0.3718.2',
+        '74.0.3718.1',
+        '74.0.3718.0',
+        '73.0.3683.54',
+        '74.0.3717.2',
+        '73.0.3683.53',
+        '74.0.3717.1',
+        '74.0.3717.0',
+        '73.0.3683.52',
+        '74.0.3716.1',
+        '74.0.3716.0',
+        '73.0.3683.51',
+        '74.0.3715.1',
+        '74.0.3715.0',
+        '73.0.3683.50',
+        '74.0.3711.2',
+        '74.0.3714.2',
+        '74.0.3713.3',
+        '74.0.3714.1',
+        '74.0.3714.0',
+        '73.0.3683.49',
+        '74.0.3713.1',
+        '74.0.3713.0',
+        '72.0.3626.120',
+        '73.0.3683.48',
+        '74.0.3712.2',
+        '74.0.3712.1',
+        '74.0.3712.0',
+        '73.0.3683.47',
+        '72.0.3626.119',
+        '73.0.3683.46',
+        '74.0.3710.2',
+        '72.0.3626.118',
+        '74.0.3711.1',
+        '74.0.3711.0',
+        '73.0.3683.45',
+        '72.0.3626.117',
+        '74.0.3710.1',
+        '74.0.3710.0',
+        '73.0.3683.44',
+        '72.0.3626.116',
+        '74.0.3709.1',
+        '74.0.3709.0',
+        '74.0.3704.9',
+        '73.0.3683.43',
+        '72.0.3626.115',
+        '74.0.3704.8',
+        '74.0.3704.7',
+        '74.0.3708.0',
+        '74.0.3706.7',
+        '74.0.3704.6',
+        '73.0.3683.42',
+        '72.0.3626.114',
+        '74.0.3706.6',
+        '72.0.3626.113',
+        '74.0.3704.5',
+        '74.0.3706.5',
+        '74.0.3706.4',
+        '74.0.3706.3',
+        '74.0.3706.2',
+        '74.0.3706.1',
+        '74.0.3706.0',
+        '73.0.3683.41',
+        '72.0.3626.112',
+        '74.0.3705.1',
+        '74.0.3705.0',
+        '73.0.3683.40',
+        '72.0.3626.111',
+        '73.0.3683.39',
+        '74.0.3704.4',
+        '73.0.3683.38',
+        '74.0.3704.3',
+        '74.0.3704.2',
+        '74.0.3704.1',
+        '74.0.3704.0',
+        '73.0.3683.37',
+        '72.0.3626.110',
+        '72.0.3626.109',
+        '74.0.3703.3',
+        '74.0.3703.2',
+        '73.0.3683.36',
+        '74.0.3703.1',
+        '74.0.3703.0',
+        '73.0.3683.35',
+        '72.0.3626.108',
+        '74.0.3702.2',
+        '74.0.3699.3',
+        '74.0.3702.1',
+        '74.0.3702.0',
+        '73.0.3683.34',
+        '72.0.3626.107',
+        '73.0.3683.33',
+        '74.0.3701.1',
+        '74.0.3701.0',
+        '73.0.3683.32',
+        '73.0.3683.31',
+        '72.0.3626.105',
+        '74.0.3700.1',
+        '74.0.3700.0',
+        '73.0.3683.29',
+        '72.0.3626.103',
+        '74.0.3699.2',
+        '74.0.3699.1',
+        '74.0.3699.0',
+        '73.0.3683.28',
+        '72.0.3626.102',
+        '73.0.3683.27',
+        '73.0.3683.26',
+        '74.0.3698.0',
+        '74.0.3696.2',
+        '72.0.3626.101',
+        '73.0.3683.25',
+        '74.0.3696.1',
+        '74.0.3696.0',
+        '74.0.3694.8',
+        '72.0.3626.100',
+        '74.0.3694.7',
+        '74.0.3694.6',
+        '74.0.3694.5',
+        '74.0.3694.4',
+        '72.0.3626.99',
+        '72.0.3626.98',
+        '74.0.3694.3',
+        '73.0.3683.24',
+        '72.0.3626.97',
+        '72.0.3626.96',
+        '72.0.3626.95',
+        '73.0.3683.23',
+        '72.0.3626.94',
+        '73.0.3683.22',
+        '73.0.3683.21',
+        '72.0.3626.93',
+        '74.0.3694.2',
+        '72.0.3626.92',
+        '74.0.3694.1',
+        '74.0.3694.0',
+        '74.0.3693.6',
+        '73.0.3683.20',
+        '72.0.3626.91',
+        '74.0.3693.5',
+        '74.0.3693.4',
+        '74.0.3693.3',
+        '74.0.3693.2',
+        '73.0.3683.19',
+        '74.0.3693.1',
+        '74.0.3693.0',
+        '73.0.3683.18',
+        '72.0.3626.90',
+        '74.0.3692.1',
+        '74.0.3692.0',
+        '73.0.3683.17',
+        '72.0.3626.89',
+        '74.0.3687.3',
+        '74.0.3691.1',
+        '74.0.3691.0',
+        '73.0.3683.16',
+        '72.0.3626.88',
+        '72.0.3626.87',
+        '73.0.3683.15',
+        '74.0.3690.1',
+        '74.0.3690.0',
+        '73.0.3683.14',
+        '72.0.3626.86',
+        '73.0.3683.13',
+        '73.0.3683.12',
+        '74.0.3689.1',
+        '74.0.3689.0',
+        '73.0.3683.11',
+        '72.0.3626.85',
+        '73.0.3683.10',
+        '72.0.3626.84',
+        '73.0.3683.9',
+        '74.0.3688.1',
+        '74.0.3688.0',
+        '73.0.3683.8',
+        '72.0.3626.83',
+        '74.0.3687.2',
+        '74.0.3687.1',
+        '74.0.3687.0',
+        '73.0.3683.7',
+        '72.0.3626.82',
+        '74.0.3686.4',
+        '72.0.3626.81',
+        '74.0.3686.3',
+        '74.0.3686.2',
+        '74.0.3686.1',
+        '74.0.3686.0',
+        '73.0.3683.6',
+        '72.0.3626.80',
+        '74.0.3685.1',
+        '74.0.3685.0',
+        '73.0.3683.5',
+        '72.0.3626.79',
+        '74.0.3684.1',
+        '74.0.3684.0',
+        '73.0.3683.4',
+        '72.0.3626.78',
+        '72.0.3626.77',
+        '73.0.3683.3',
+        '73.0.3683.2',
+        '72.0.3626.76',
+        '73.0.3683.1',
+        '73.0.3683.0',
+        '72.0.3626.75',
+        '71.0.3578.141',
+        '73.0.3682.1',
+        '73.0.3682.0',
+        '72.0.3626.74',
+        '71.0.3578.140',
+        '73.0.3681.4',
+        '73.0.3681.3',
+        '73.0.3681.2',
+        '73.0.3681.1',
+        '73.0.3681.0',
+        '72.0.3626.73',
+        '71.0.3578.139',
+        '72.0.3626.72',
+        '72.0.3626.71',
+        '73.0.3680.1',
+        '73.0.3680.0',
+        '72.0.3626.70',
+        '71.0.3578.138',
+        '73.0.3678.2',
+        '73.0.3679.1',
+        '73.0.3679.0',
+        '72.0.3626.69',
+        '71.0.3578.137',
+        '73.0.3678.1',
+        '73.0.3678.0',
+        '71.0.3578.136',
+        '73.0.3677.1',
+        '73.0.3677.0',
+        '72.0.3626.68',
+        '72.0.3626.67',
+        '71.0.3578.135',
+        '73.0.3676.1',
+        '73.0.3676.0',
+        '73.0.3674.2',
+        '72.0.3626.66',
+        '71.0.3578.134',
+        '73.0.3674.1',
+        '73.0.3674.0',
+        '72.0.3626.65',
+        '71.0.3578.133',
+        '73.0.3673.2',
+        '73.0.3673.1',
+        '73.0.3673.0',
+        '72.0.3626.64',
+        '71.0.3578.132',
+        '72.0.3626.63',
+        '72.0.3626.62',
+        '72.0.3626.61',
+        '72.0.3626.60',
+        '73.0.3672.1',
+        '73.0.3672.0',
+        '72.0.3626.59',
+        '71.0.3578.131',
+        '73.0.3671.3',
+        '73.0.3671.2',
+        '73.0.3671.1',
+        '73.0.3671.0',
+        '72.0.3626.58',
+        '71.0.3578.130',
+        '73.0.3670.1',
+        '73.0.3670.0',
+        '72.0.3626.57',
+        '71.0.3578.129',
+        '73.0.3669.1',
+        '73.0.3669.0',
+        '72.0.3626.56',
+        '71.0.3578.128',
+        '73.0.3668.2',
+        '73.0.3668.1',
+        '73.0.3668.0',
+        '72.0.3626.55',
+        '71.0.3578.127',
+        '73.0.3667.2',
+        '73.0.3667.1',
+        '73.0.3667.0',
+        '72.0.3626.54',
+        '71.0.3578.126',
+        '73.0.3666.1',
+        '73.0.3666.0',
+        '72.0.3626.53',
+        '71.0.3578.125',
+        '73.0.3665.4',
+        '73.0.3665.3',
+        '72.0.3626.52',
+        '73.0.3665.2',
+        '73.0.3664.4',
+        '73.0.3665.1',
+        '73.0.3665.0',
+        '72.0.3626.51',
+        '71.0.3578.124',
+        '72.0.3626.50',
+        '73.0.3664.3',
+        '73.0.3664.2',
+        '73.0.3664.1',
+        '73.0.3664.0',
+        '73.0.3663.2',
+        '72.0.3626.49',
+        '71.0.3578.123',
+        '73.0.3663.1',
+        '73.0.3663.0',
+        '72.0.3626.48',
+        '71.0.3578.122',
+        '73.0.3662.1',
+        '73.0.3662.0',
+        '72.0.3626.47',
+        '71.0.3578.121',
+        '73.0.3661.1',
+        '72.0.3626.46',
+        '73.0.3661.0',
+        '72.0.3626.45',
+        '71.0.3578.120',
+        '73.0.3660.2',
+        '73.0.3660.1',
+        '73.0.3660.0',
+        '72.0.3626.44',
+        '71.0.3578.119',
+        '73.0.3659.1',
+        '73.0.3659.0',
+        '72.0.3626.43',
+        '71.0.3578.118',
+        '73.0.3658.1',
+        '73.0.3658.0',
+        '72.0.3626.42',
+        '71.0.3578.117',
+        '73.0.3657.1',
+        '73.0.3657.0',
+        '72.0.3626.41',
+        '71.0.3578.116',
+        '73.0.3656.1',
+        '73.0.3656.0',
+        '72.0.3626.40',
+        '71.0.3578.115',
+        '73.0.3655.1',
+        '73.0.3655.0',
+        '72.0.3626.39',
+        '71.0.3578.114',
+        '73.0.3654.1',
+        '73.0.3654.0',
+        '72.0.3626.38',
+        '71.0.3578.113',
+        '73.0.3653.1',
+        '73.0.3653.0',
+        '72.0.3626.37',
+        '71.0.3578.112',
+        '73.0.3652.1',
+        '73.0.3652.0',
+        '72.0.3626.36',
+        '71.0.3578.111',
+        '73.0.3651.1',
+        '73.0.3651.0',
+        '72.0.3626.35',
+        '71.0.3578.110',
+        '73.0.3650.1',
+        '73.0.3650.0',
+        '72.0.3626.34',
+        '71.0.3578.109',
+        '73.0.3649.1',
+        '73.0.3649.0',
+        '72.0.3626.33',
+        '71.0.3578.108',
+        '73.0.3648.2',
+        '73.0.3648.1',
+        '73.0.3648.0',
+        '72.0.3626.32',
+        '71.0.3578.107',
+        '73.0.3647.2',
+        '73.0.3647.1',
+        '73.0.3647.0',
+        '72.0.3626.31',
+        '71.0.3578.106',
+        '73.0.3635.3',
+        '73.0.3646.2',
+        '73.0.3646.1',
+        '73.0.3646.0',
+        '72.0.3626.30',
+        '71.0.3578.105',
+        '72.0.3626.29',
+        '73.0.3645.2',
+        '73.0.3645.1',
+        '73.0.3645.0',
+        '72.0.3626.28',
+        '71.0.3578.104',
+        '72.0.3626.27',
+        '72.0.3626.26',
+        '72.0.3626.25',
+        '72.0.3626.24',
+        '73.0.3644.0',
+        '73.0.3643.2',
+        '72.0.3626.23',
+        '71.0.3578.103',
+        '73.0.3643.1',
+        '73.0.3643.0',
+        '72.0.3626.22',
+        '71.0.3578.102',
+        '73.0.3642.1',
+        '73.0.3642.0',
+        '72.0.3626.21',
+        '71.0.3578.101',
+        '73.0.3641.1',
+        '73.0.3641.0',
+        '72.0.3626.20',
+        '71.0.3578.100',
+        '72.0.3626.19',
+        '73.0.3640.1',
+        '73.0.3640.0',
+        '72.0.3626.18',
+        '73.0.3639.1',
+        '71.0.3578.99',
+        '73.0.3639.0',
+        '72.0.3626.17',
+        '73.0.3638.2',
+        '72.0.3626.16',
+        '73.0.3638.1',
+        '73.0.3638.0',
+        '72.0.3626.15',
+        '71.0.3578.98',
+        '73.0.3635.2',
+        '71.0.3578.97',
+        '73.0.3637.1',
+        '73.0.3637.0',
+        '72.0.3626.14',
+        '71.0.3578.96',
+        '71.0.3578.95',
+        '72.0.3626.13',
+        '71.0.3578.94',
+        '73.0.3636.2',
+        '71.0.3578.93',
+        '73.0.3636.1',
+        '73.0.3636.0',
+        '72.0.3626.12',
+        '71.0.3578.92',
+        '73.0.3635.1',
+        '73.0.3635.0',
+        '72.0.3626.11',
+        '71.0.3578.91',
+        '73.0.3634.2',
+        '73.0.3634.1',
+        '73.0.3634.0',
+        '72.0.3626.10',
+        '71.0.3578.90',
+        '71.0.3578.89',
+        '73.0.3633.2',
+        '73.0.3633.1',
+        '73.0.3633.0',
+        '72.0.3610.4',
+        '72.0.3626.9',
+        '71.0.3578.88',
+        '73.0.3632.5',
+        '73.0.3632.4',
+        '73.0.3632.3',
+        '73.0.3632.2',
+        '73.0.3632.1',
+        '73.0.3632.0',
+        '72.0.3626.8',
+        '71.0.3578.87',
+        '73.0.3631.2',
+        '73.0.3631.1',
+        '73.0.3631.0',
+        '72.0.3626.7',
+        '71.0.3578.86',
+        '72.0.3626.6',
+        '73.0.3630.1',
+        '73.0.3630.0',
+        '72.0.3626.5',
+        '71.0.3578.85',
+        '72.0.3626.4',
+        '73.0.3628.3',
+        '73.0.3628.2',
+        '73.0.3629.1',
+        '73.0.3629.0',
+        '72.0.3626.3',
+        '71.0.3578.84',
+        '73.0.3628.1',
+        '73.0.3628.0',
+        '71.0.3578.83',
+        '73.0.3627.1',
+        '73.0.3627.0',
+        '72.0.3626.2',
+        '71.0.3578.82',
+        '71.0.3578.81',
+        '71.0.3578.80',
+        '72.0.3626.1',
+        '72.0.3626.0',
+        '71.0.3578.79',
+        '70.0.3538.124',
+        '71.0.3578.78',
+        '72.0.3623.4',
+        '72.0.3625.2',
+        '72.0.3625.1',
+        '72.0.3625.0',
+        '71.0.3578.77',
+        '70.0.3538.123',
+        '72.0.3624.4',
+        '72.0.3624.3',
+        '72.0.3624.2',
+        '71.0.3578.76',
+        '72.0.3624.1',
+        '72.0.3624.0',
+        '72.0.3623.3',
+        '71.0.3578.75',
+        '70.0.3538.122',
+        '71.0.3578.74',
+        '72.0.3623.2',
+        '72.0.3610.3',
+        '72.0.3623.1',
+        '72.0.3623.0',
+        '72.0.3622.3',
+        '72.0.3622.2',
+        '71.0.3578.73',
+        '70.0.3538.121',
+        '72.0.3622.1',
+        '72.0.3622.0',
+        '71.0.3578.72',
+        '70.0.3538.120',
+        '72.0.3621.1',
+        '72.0.3621.0',
+        '71.0.3578.71',
+        '70.0.3538.119',
+        '72.0.3620.1',
+        '72.0.3620.0',
+        '71.0.3578.70',
+        '70.0.3538.118',
+        '71.0.3578.69',
+        '72.0.3619.1',
+        '72.0.3619.0',
+        '71.0.3578.68',
+        '70.0.3538.117',
+        '71.0.3578.67',
+        '72.0.3618.1',
+        '72.0.3618.0',
+        '71.0.3578.66',
+        '70.0.3538.116',
+        '72.0.3617.1',
+        '72.0.3617.0',
+        '71.0.3578.65',
+        '70.0.3538.115',
+        '72.0.3602.3',
+        '71.0.3578.64',
+        '72.0.3616.1',
+        '72.0.3616.0',
+        '71.0.3578.63',
+        '70.0.3538.114',
+        '71.0.3578.62',
+        '72.0.3615.1',
+        '72.0.3615.0',
+        '71.0.3578.61',
+        '70.0.3538.113',
+        '72.0.3614.1',
+        '72.0.3614.0',
+        '71.0.3578.60',
+        '70.0.3538.112',
+        '72.0.3613.1',
+        '72.0.3613.0',
+        '71.0.3578.59',
+        '70.0.3538.111',
+        '72.0.3612.2',
+        '72.0.3612.1',
+        '72.0.3612.0',
+        '70.0.3538.110',
+        '71.0.3578.58',
+        '70.0.3538.109',
+        '72.0.3611.2',
+        '72.0.3611.1',
+        '72.0.3611.0',
+        '71.0.3578.57',
+        '70.0.3538.108',
+        '72.0.3610.2',
+        '71.0.3578.56',
+        '71.0.3578.55',
+        '72.0.3610.1',
+        '72.0.3610.0',
+        '71.0.3578.54',
+        '70.0.3538.107',
+        '71.0.3578.53',
+        '72.0.3609.3',
+        '71.0.3578.52',
+        '72.0.3609.2',
+        '71.0.3578.51',
+        '72.0.3608.5',
+        '72.0.3609.1',
+        '72.0.3609.0',
+        '71.0.3578.50',
+        '70.0.3538.106',
+        '72.0.3608.4',
+        '72.0.3608.3',
+        '72.0.3608.2',
+        '71.0.3578.49',
+        '72.0.3608.1',
+        '72.0.3608.0',
+        '70.0.3538.105',
+        '71.0.3578.48',
+        '72.0.3607.1',
+        '72.0.3607.0',
+        '71.0.3578.47',
+        '70.0.3538.104',
+        '72.0.3606.2',
+        '72.0.3606.1',
+        '72.0.3606.0',
+        '71.0.3578.46',
+        '70.0.3538.103',
+        '70.0.3538.102',
+        '72.0.3605.3',
+        '72.0.3605.2',
+        '72.0.3605.1',
+        '72.0.3605.0',
+        '71.0.3578.45',
+        '70.0.3538.101',
+        '71.0.3578.44',
+        '71.0.3578.43',
+        '70.0.3538.100',
+        '70.0.3538.99',
+        '71.0.3578.42',
+        '72.0.3604.1',
+        '72.0.3604.0',
+        '71.0.3578.41',
+        '70.0.3538.98',
+        '71.0.3578.40',
+        '72.0.3603.2',
+        '72.0.3603.1',
+        '72.0.3603.0',
+        '71.0.3578.39',
+        '70.0.3538.97',
+        '72.0.3602.2',
+        '71.0.3578.38',
+        '71.0.3578.37',
+        '72.0.3602.1',
+        '72.0.3602.0',
+        '71.0.3578.36',
+        '70.0.3538.96',
+        '72.0.3601.1',
+        '72.0.3601.0',
+        '71.0.3578.35',
+        '70.0.3538.95',
+        '72.0.3600.1',
+        '72.0.3600.0',
+        '71.0.3578.34',
+        '70.0.3538.94',
+        '72.0.3599.3',
+        '72.0.3599.2',
+        '72.0.3599.1',
+        '72.0.3599.0',
+        '71.0.3578.33',
+        '70.0.3538.93',
+        '72.0.3598.1',
+        '72.0.3598.0',
+        '71.0.3578.32',
+        '70.0.3538.87',
+        '72.0.3597.1',
+        '72.0.3597.0',
+        '72.0.3596.2',
+        '71.0.3578.31',
+        '70.0.3538.86',
+        '71.0.3578.30',
+        '71.0.3578.29',
+        '72.0.3596.1',
+        '72.0.3596.0',
+        '71.0.3578.28',
+        '70.0.3538.85',
+        '72.0.3595.2',
+        '72.0.3591.3',
+        '72.0.3595.1',
+        '72.0.3595.0',
+        '71.0.3578.27',
+        '70.0.3538.84',
+        '72.0.3594.1',
+        '72.0.3594.0',
+        '71.0.3578.26',
+        '70.0.3538.83',
+        '72.0.3593.2',
+        '72.0.3593.1',
+        '72.0.3593.0',
+        '71.0.3578.25',
+        '70.0.3538.82',
+        '72.0.3589.3',
+        '72.0.3592.2',
+        '72.0.3592.1',
+        '72.0.3592.0',
+        '71.0.3578.24',
+        '72.0.3589.2',
+        '70.0.3538.81',
+        '70.0.3538.80',
+        '72.0.3591.2',
+        '72.0.3591.1',
+        '72.0.3591.0',
+        '71.0.3578.23',
+        '70.0.3538.79',
+        '71.0.3578.22',
+        '72.0.3590.1',
+        '72.0.3590.0',
+        '71.0.3578.21',
+        '70.0.3538.78',
+        '70.0.3538.77',
+        '72.0.3589.1',
+        '72.0.3589.0',
+        '71.0.3578.20',
+        '70.0.3538.76',
+        '71.0.3578.19',
+        '70.0.3538.75',
+        '72.0.3588.1',
+        '72.0.3588.0',
+        '71.0.3578.18',
+        '70.0.3538.74',
+        '72.0.3586.2',
+        '72.0.3587.0',
+        '71.0.3578.17',
+        '70.0.3538.73',
+        '72.0.3586.1',
+        '72.0.3586.0',
+        '71.0.3578.16',
+        '70.0.3538.72',
+        '72.0.3585.1',
+        '72.0.3585.0',
+        '71.0.3578.15',
+        '70.0.3538.71',
+        '71.0.3578.14',
+        '72.0.3584.1',
+        '72.0.3584.0',
+        '71.0.3578.13',
+        '70.0.3538.70',
+        '72.0.3583.2',
+        '71.0.3578.12',
+        '72.0.3583.1',
+        '72.0.3583.0',
+        '71.0.3578.11',
+        '70.0.3538.69',
+        '71.0.3578.10',
+        '72.0.3582.0',
+        '72.0.3581.4',
+        '71.0.3578.9',
+        '70.0.3538.67',
+        '72.0.3581.3',
+        '72.0.3581.2',
+        '72.0.3581.1',
+        '72.0.3581.0',
+        '71.0.3578.8',
+        '70.0.3538.66',
+        '72.0.3580.1',
+        '72.0.3580.0',
+        '71.0.3578.7',
+        '70.0.3538.65',
+        '71.0.3578.6',
+        '72.0.3579.1',
+        '72.0.3579.0',
+        '71.0.3578.5',
+        '70.0.3538.64',
+        '71.0.3578.4',
+        '71.0.3578.3',
+        '71.0.3578.2',
+        '71.0.3578.1',
+        '71.0.3578.0',
+        '70.0.3538.63',
+        '69.0.3497.128',
+        '70.0.3538.62',
+        '70.0.3538.61',
+        '70.0.3538.60',
+        '70.0.3538.59',
+        '71.0.3577.1',
+        '71.0.3577.0',
+        '70.0.3538.58',
+        '69.0.3497.127',
+        '71.0.3576.2',
+        '71.0.3576.1',
+        '71.0.3576.0',
+        '70.0.3538.57',
+        '70.0.3538.56',
+        '71.0.3575.2',
+        '70.0.3538.55',
+        '69.0.3497.126',
+        '70.0.3538.54',
+        '71.0.3575.1',
+        '71.0.3575.0',
+        '71.0.3574.1',
+        '71.0.3574.0',
+        '70.0.3538.53',
+        '69.0.3497.125',
+        '70.0.3538.52',
+        '71.0.3573.1',
+        '71.0.3573.0',
+        '70.0.3538.51',
+        '69.0.3497.124',
+        '71.0.3572.1',
+        '71.0.3572.0',
+        '70.0.3538.50',
+        '69.0.3497.123',
+        '71.0.3571.2',
+        '70.0.3538.49',
+        '69.0.3497.122',
+        '71.0.3571.1',
+        '71.0.3571.0',
+        '70.0.3538.48',
+        '69.0.3497.121',
+        '71.0.3570.1',
+        '71.0.3570.0',
+        '70.0.3538.47',
+        '69.0.3497.120',
+        '71.0.3568.2',
+        '71.0.3569.1',
+        '71.0.3569.0',
+        '70.0.3538.46',
+        '69.0.3497.119',
+        '70.0.3538.45',
+        '71.0.3568.1',
+        '71.0.3568.0',
+        '70.0.3538.44',
+        '69.0.3497.118',
+        '70.0.3538.43',
+        '70.0.3538.42',
+        '71.0.3567.1',
+        '71.0.3567.0',
+        '70.0.3538.41',
+        '69.0.3497.117',
+        '71.0.3566.1',
+        '71.0.3566.0',
+        '70.0.3538.40',
+        '69.0.3497.116',
+        '71.0.3565.1',
+        '71.0.3565.0',
+        '70.0.3538.39',
+        '69.0.3497.115',
+        '71.0.3564.1',
+        '71.0.3564.0',
+        '70.0.3538.38',
+        '69.0.3497.114',
+        '71.0.3563.0',
+        '71.0.3562.2',
+        '70.0.3538.37',
+        '69.0.3497.113',
+        '70.0.3538.36',
+        '70.0.3538.35',
+        '71.0.3562.1',
+        '71.0.3562.0',
+        '70.0.3538.34',
+        '69.0.3497.112',
+        '70.0.3538.33',
+        '71.0.3561.1',
+        '71.0.3561.0',
+        '70.0.3538.32',
+        '69.0.3497.111',
+        '71.0.3559.6',
+        '71.0.3560.1',
+        '71.0.3560.0',
+        '71.0.3559.5',
+        '71.0.3559.4',
+        '70.0.3538.31',
+        '69.0.3497.110',
+        '71.0.3559.3',
+        '70.0.3538.30',
+        '69.0.3497.109',
+        '71.0.3559.2',
+        '71.0.3559.1',
+        '71.0.3559.0',
+        '70.0.3538.29',
+        '69.0.3497.108',
+        '71.0.3558.2',
+        '71.0.3558.1',
+        '71.0.3558.0',
+        '70.0.3538.28',
+        '69.0.3497.107',
+        '71.0.3557.2',
+        '71.0.3557.1',
+        '71.0.3557.0',
+        '70.0.3538.27',
+        '69.0.3497.106',
+        '71.0.3554.4',
+        '70.0.3538.26',
+        '71.0.3556.1',
+        '71.0.3556.0',
+        '70.0.3538.25',
+        '71.0.3554.3',
+        '69.0.3497.105',
+        '71.0.3554.2',
+        '70.0.3538.24',
+        '69.0.3497.104',
+        '71.0.3555.2',
+        '70.0.3538.23',
+        '71.0.3555.1',
+        '71.0.3555.0',
+        '70.0.3538.22',
+        '69.0.3497.103',
+        '71.0.3554.1',
+        '71.0.3554.0',
+        '70.0.3538.21',
+        '69.0.3497.102',
+        '71.0.3553.3',
+        '70.0.3538.20',
+        '69.0.3497.101',
+        '71.0.3553.2',
+        '69.0.3497.100',
+        '71.0.3553.1',
+        '71.0.3553.0',
+        '70.0.3538.19',
+        '69.0.3497.99',
+        '69.0.3497.98',
+        '69.0.3497.97',
+        '71.0.3552.6',
+        '71.0.3552.5',
+        '71.0.3552.4',
+        '71.0.3552.3',
+        '71.0.3552.2',
+        '71.0.3552.1',
+        '71.0.3552.0',
+        '70.0.3538.18',
+        '69.0.3497.96',
+        '71.0.3551.3',
+        '71.0.3551.2',
+        '71.0.3551.1',
+        '71.0.3551.0',
+        '70.0.3538.17',
+        '69.0.3497.95',
+        '71.0.3550.3',
+        '71.0.3550.2',
+        '71.0.3550.1',
+        '71.0.3550.0',
+        '70.0.3538.16',
+        '69.0.3497.94',
+        '71.0.3549.1',
+        '71.0.3549.0',
+        '70.0.3538.15',
+        '69.0.3497.93',
+        '69.0.3497.92',
+        '71.0.3548.1',
+        '71.0.3548.0',
+        '70.0.3538.14',
+        '69.0.3497.91',
+        '71.0.3547.1',
+        '71.0.3547.0',
+        '70.0.3538.13',
+        '69.0.3497.90',
+        '71.0.3546.2',
+        '69.0.3497.89',
+        '71.0.3546.1',
+        '71.0.3546.0',
+        '70.0.3538.12',
+        '69.0.3497.88',
+        '71.0.3545.4',
+        '71.0.3545.3',
+        '71.0.3545.2',
+        '71.0.3545.1',
+        '71.0.3545.0',
+        '70.0.3538.11',
+        '69.0.3497.87',
+        '71.0.3544.5',
+        '71.0.3544.4',
+        '71.0.3544.3',
+        '71.0.3544.2',
+        '71.0.3544.1',
+        '71.0.3544.0',
+        '69.0.3497.86',
+        '70.0.3538.10',
+        '69.0.3497.85',
+        '70.0.3538.9',
+        '69.0.3497.84',
+        '71.0.3543.4',
+        '70.0.3538.8',
+        '71.0.3543.3',
+        '71.0.3543.2',
+        '71.0.3543.1',
+        '71.0.3543.0',
+        '70.0.3538.7',
+        '69.0.3497.83',
+        '71.0.3542.2',
+        '71.0.3542.1',
+        '71.0.3542.0',
+        '70.0.3538.6',
+        '69.0.3497.82',
+        '69.0.3497.81',
+        '71.0.3541.1',
+        '71.0.3541.0',
+        '70.0.3538.5',
+        '69.0.3497.80',
+        '71.0.3540.1',
+        '71.0.3540.0',
+        '70.0.3538.4',
+        '69.0.3497.79',
+        '70.0.3538.3',
+        '71.0.3539.1',
+        '71.0.3539.0',
+        '69.0.3497.78',
+        '68.0.3440.134',
+        '69.0.3497.77',
+        '70.0.3538.2',
+        '70.0.3538.1',
+        '70.0.3538.0',
+        '69.0.3497.76',
+        '68.0.3440.133',
+        '69.0.3497.75',
+        '70.0.3537.2',
+        '70.0.3537.1',
+        '70.0.3537.0',
+        '69.0.3497.74',
+        '68.0.3440.132',
+        '70.0.3536.0',
+        '70.0.3535.5',
+        '70.0.3535.4',
+        '70.0.3535.3',
+        '69.0.3497.73',
+        '68.0.3440.131',
+        '70.0.3532.8',
+        '70.0.3532.7',
+        '69.0.3497.72',
+        '69.0.3497.71',
+        '70.0.3535.2',
+        '70.0.3535.1',
+        '70.0.3535.0',
+        '69.0.3497.70',
+        '68.0.3440.130',
+        '69.0.3497.69',
+        '68.0.3440.129',
+        '70.0.3534.4',
+        '70.0.3534.3',
+        '70.0.3534.2',
+        '70.0.3534.1',
+        '70.0.3534.0',
+        '69.0.3497.68',
+        '68.0.3440.128',
+        '70.0.3533.2',
+        '70.0.3533.1',
+        '70.0.3533.0',
+        '69.0.3497.67',
+        '68.0.3440.127',
+        '70.0.3532.6',
+        '70.0.3532.5',
+        '70.0.3532.4',
+        '69.0.3497.66',
+        '68.0.3440.126',
+        '70.0.3532.3',
+        '70.0.3532.2',
+        '70.0.3532.1',
+        '69.0.3497.60',
+        '69.0.3497.65',
+        '69.0.3497.64',
+        '70.0.3532.0',
+        '70.0.3531.0',
+        '70.0.3530.4',
+        '70.0.3530.3',
+        '70.0.3530.2',
+        '69.0.3497.58',
+        '68.0.3440.125',
+        '69.0.3497.57',
+        '69.0.3497.56',
+        '69.0.3497.55',
+        '69.0.3497.54',
+        '70.0.3530.1',
+        '70.0.3530.0',
+        '69.0.3497.53',
+        '68.0.3440.124',
+        '69.0.3497.52',
+        '70.0.3529.3',
+        '70.0.3529.2',
+        '70.0.3529.1',
+        '70.0.3529.0',
+        '69.0.3497.51',
+        '70.0.3528.4',
+        '68.0.3440.123',
+        '70.0.3528.3',
+        '70.0.3528.2',
+        '70.0.3528.1',
+        '70.0.3528.0',
+        '69.0.3497.50',
+        '68.0.3440.122',
+        '70.0.3527.1',
+        '70.0.3527.0',
+        '69.0.3497.49',
+        '68.0.3440.121',
+        '70.0.3526.1',
+        '70.0.3526.0',
+        '68.0.3440.120',
+        '69.0.3497.48',
+        '69.0.3497.47',
+        '68.0.3440.119',
+        '68.0.3440.118',
+        '70.0.3525.5',
+        '70.0.3525.4',
+        '70.0.3525.3',
+        '68.0.3440.117',
+        '69.0.3497.46',
+        '70.0.3525.2',
+        '70.0.3525.1',
+        '70.0.3525.0',
+        '69.0.3497.45',
+        '68.0.3440.116',
+        '70.0.3524.4',
+        '70.0.3524.3',
+        '69.0.3497.44',
+        '70.0.3524.2',
+        '70.0.3524.1',
+        '70.0.3524.0',
+        '70.0.3523.2',
+        '69.0.3497.43',
+        '68.0.3440.115',
+        '70.0.3505.9',
+        '69.0.3497.42',
+        '70.0.3505.8',
+        '70.0.3523.1',
+        '70.0.3523.0',
+        '69.0.3497.41',
+        '68.0.3440.114',
+        '70.0.3505.7',
+        '69.0.3497.40',
+        '70.0.3522.1',
+        '70.0.3522.0',
+        '70.0.3521.2',
+        '69.0.3497.39',
+        '68.0.3440.113',
+        '70.0.3505.6',
+        '70.0.3521.1',
+        '70.0.3521.0',
+        '69.0.3497.38',
+        '68.0.3440.112',
+        '70.0.3520.1',
+        '70.0.3520.0',
+        '69.0.3497.37',
+        '68.0.3440.111',
+        '70.0.3519.3',
+        '70.0.3519.2',
+        '70.0.3519.1',
+        '70.0.3519.0',
+        '69.0.3497.36',
+        '68.0.3440.110',
+        '70.0.3518.1',
+        '70.0.3518.0',
+        '69.0.3497.35',
+        '69.0.3497.34',
+        '68.0.3440.109',
+        '70.0.3517.1',
+        '70.0.3517.0',
+        '69.0.3497.33',
+        '68.0.3440.108',
+        '69.0.3497.32',
+        '70.0.3516.3',
+        '70.0.3516.2',
+        '70.0.3516.1',
+        '70.0.3516.0',
+        '69.0.3497.31',
+        '68.0.3440.107',
+        '70.0.3515.4',
+        '68.0.3440.106',
+        '70.0.3515.3',
+        '70.0.3515.2',
+        '70.0.3515.1',
+        '70.0.3515.0',
+        '69.0.3497.30',
+        '68.0.3440.105',
+        '68.0.3440.104',
+        '70.0.3514.2',
+        '70.0.3514.1',
+        '70.0.3514.0',
+        '69.0.3497.29',
+        '68.0.3440.103',
+        '70.0.3513.1',
+        '70.0.3513.0',
+        '69.0.3497.28',
+    )
+    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
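+
+# Illustrative result (a sketch: _USER_AGENT_TPL is defined earlier in this
+# file, so only the Chrome version token varies between calls):
+#   random_user_agent()  ->  'Mozilla/5.0 (...) Chrome/72.0.3626.8 Safari/(...)'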
+
+
+std_headers = {
+    'User-Agent': random_user_agent(),
+    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Encoding': 'gzip, deflate',
+    'Accept-Language': 'en-us,en;q=0.5',
+}
+
+
+USER_AGENTS = {
+    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
+}
+
+
+NO_DEFAULT = object()
+
+ENGLISH_MONTH_NAMES = [
+    'January', 'February', 'March', 'April', 'May', 'June',
+    'July', 'August', 'September', 'October', 'November', 'December']
+
+MONTH_NAMES = {
+    'en': ENGLISH_MONTH_NAMES,
+    'fr': [
+        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
+        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
+}
+
+KNOWN_EXTENSIONS = (
+    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+    'flv', 'f4v', 'f4a', 'f4b',
+    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+    'mkv', 'mka', 'mk3d',
+    'avi', 'divx',
+    'mov',
+    'asf', 'wmv', 'wma',
+    '3gp', '3g2',
+    'mp3',
+    'flac',
+    'ape',
+    'wav',
+    'f4f', 'f4m', 'm3u8', 'smil')
+
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
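+
+# For example (illustrative lookups): ACCENT_CHARS['é'] -> 'e',
+# ACCENT_CHARS['Æ'] -> 'AE', ACCENT_CHARS['ß'] -> 'ss'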
+
+DATE_FORMATS = (
+    '%d %B %Y',
+    '%d %b %Y',
+    '%B %d %Y',
+    '%B %dst %Y',
+    '%B %dnd %Y',
+    '%B %drd %Y',
+    '%B %dth %Y',
+    '%b %d %Y',
+    '%b %dst %Y',
+    '%b %dnd %Y',
+    '%b %drd %Y',
+    '%b %dth %Y',
+    '%b %dst %Y %I:%M',
+    '%b %dnd %Y %I:%M',
+    '%b %drd %Y %I:%M',
+    '%b %dth %Y %I:%M',
+    '%Y %m %d',
+    '%Y-%m-%d',
+    '%Y/%m/%d',
+    '%Y/%m/%d %H:%M',
+    '%Y/%m/%d %H:%M:%S',
+    '%Y-%m-%d %H:%M',
+    '%Y-%m-%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S.%f',
+    '%d.%m.%Y %H:%M',
+    '%d.%m.%Y %H.%M',
+    '%Y-%m-%dT%H:%M:%SZ',
+    '%Y-%m-%dT%H:%M:%S.%fZ',
+    '%Y-%m-%dT%H:%M:%S.%f0Z',
+    '%Y-%m-%dT%H:%M:%S',
+    '%Y-%m-%dT%H:%M:%S.%f',
+    '%Y-%m-%dT%H:%M',
+    '%b %d %Y at %H:%M',
+    '%b %d %Y at %H:%M:%S',
+    '%B %d %Y at %H:%M',
+    '%B %d %Y at %H:%M:%S',
+)
+
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+    '%d-%m-%Y',
+    '%d.%m.%Y',
+    '%d.%m.%y',
+    '%d/%m/%Y',
+    '%d/%m/%y',
+    '%d/%m/%Y %H:%M:%S',
+])
+
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+    '%m-%d-%Y',
+    '%m.%d.%Y',
+    '%m/%d/%Y',
+    '%m/%d/%y',
+    '%m/%d/%Y %H:%M:%S',
+])
+
+PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
+
+
+def preferredencoding():
+    """Get preferred encoding.
+
+    Returns the best encoding scheme for the system, based on
+    locale.getpreferredencoding() and some further tweaks.
+    """
+    try:
+        pref = locale.getpreferredencoding()
+        'TEST'.encode(pref)
+    except Exception:
+        pref = 'UTF-8'
+
+    return pref
+
+
+def write_json_file(obj, fn):
+    """ Encode obj as JSON and write it to fn, atomically if possible """
+
+    fn = encodeFilename(fn)
+    if sys.version_info < (3, 0) and sys.platform != 'win32':
+        encoding = get_filesystem_encoding()
+        # os.path.basename returns a bytes object, but NamedTemporaryFile
+        # will fail if the filename contains non-ASCII characters unless we
+        # use a unicode object
+        path_basename = lambda f: os.path.basename(fn).decode(encoding)
+        # the same for os.path.dirname
+        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
+    else:
+        path_basename = os.path.basename
+        path_dirname = os.path.dirname
+
+    args = {
+        'suffix': '.tmp',
+        'prefix': path_basename(fn) + '.',
+        'dir': path_dirname(fn),
+        'delete': False,
+    }
+
+    # In Python 2.x, json.dump expects a bytestream.
+    # In Python 3.x, it writes to a character stream
+    if sys.version_info < (3, 0):
+        args['mode'] = 'wb'
+    else:
+        args.update({
+            'mode': 'w',
+            'encoding': 'utf-8',
+        })
+
+    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
+
+    try:
+        with tf:
+            json.dump(obj, tf)
+        if sys.platform == 'win32':
+            # Need to remove existing file on Windows, else os.rename raises
+            # WindowsError or FileExistsError.
+            try:
+                os.unlink(fn)
+            except OSError:
+                pass
+        try:
+            mask = os.umask(0)
+            os.umask(mask)
+            os.chmod(tf.name, 0o666 & ~mask)
+        except OSError:
+            pass
+        os.rename(tf.name, fn)
+    except Exception:
+        try:
+            os.remove(tf.name)
+        except OSError:
+            pass
+        raise
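+
+# Typical usage (illustrative; the filename is hypothetical):
+#   write_json_file({'id': 'abc', 'title': 'Example'}, 'example.info.json')
+# The JSON is first written to a temporary file in the target directory and
+# then renamed over the destination (on Windows the old file is removed
+# first), so readers never observe a partially written file.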
+
+
+if sys.version_info >= (2, 7):
+    def find_xpath_attr(node, xpath, key, val=None):
+        """ Find the xpath xpath[@key=val] """
+        assert re.match(r'^[a-zA-Z_-]+$', key)
+        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
+        return node.find(expr)
+else:
+    def find_xpath_attr(node, xpath, key, val=None):
+        for f in node.findall(compat_xpath(xpath)):
+            if key not in f.attrib:
+                continue
+            if val is None or f.attrib.get(key) == val:
+                return f
+        return None
+
+# On python2.6 the xml.etree.ElementTree.Element methods don't support
+# the namespace parameter
+
+
+def xpath_with_ns(path, ns_map):
+    components = [c.split(':') for c in path.split('/')]
+    replaced = []
+    for c in components:
+        if len(c) == 1:
+            replaced.append(c[0])
+        else:
+            ns, tag = c
+            replaced.append('{%s}%s' % (ns_map[ns], tag))
+    return '/'.join(replaced)
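+
+# Illustrative example (assumed namespace URI):
+#   xpath_with_ns('media:song/url', {'media': 'http://example.com/'})
+#   -> '{http://example.com/}song/url'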
+
+
+def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+    def _find_xpath(xpath):
+        return node.find(compat_xpath(xpath))
+
+    if isinstance(xpath, (str, compat_str)):
+        n = _find_xpath(xpath)
+    else:
+        for xp in xpath:
+            n = _find_xpath(xp)
+            if n is not None:
+                break
+
+    if n is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = xpath if name is None else name
+            raise ExtractorError('Could not find XML element %s' % name)
+        else:
+            return None
+    return n
+
+
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
+    if n is None or n == default:
+        return n
+    if n.text is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = xpath if name is None else name
+            raise ExtractorError('Could not find XML element\'s text %s' % name)
+        else:
+            return None
+    return n.text
+
+
+def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
+    n = find_xpath_attr(node, xpath, key)
+    if n is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = '%s[@%s]' % (xpath, key) if name is None else name
+            raise ExtractorError('Could not find XML attribute %s' % name)
+        else:
+            return None
+    return n.attrib[key]
+
+
+def get_element_by_id(id, html):
+    """Return the content of the tag with the specified ID in the passed HTML document"""
+    return get_element_by_attribute('id', id, html)
+
+
+def get_element_by_class(class_name, html):
+    """Return the content of the first tag with the specified class in the passed HTML document"""
+    retval = get_elements_by_class(class_name, html)
+    return retval[0] if retval else None
+
+
+def get_element_by_attribute(attribute, value, html, escape_value=True):
+    retval = get_elements_by_attribute(attribute, value, html, escape_value)
+    return retval[0] if retval else None
+
+
+def get_elements_by_class(class_name, html):
+    """Return the content of all tags with the specified class in the passed HTML document as a list"""
+    return get_elements_by_attribute(
+        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        html, escape_value=False)
+
+
+def get_elements_by_attribute(attribute, value, html, escape_value=True):
+    """Return the content of the tag with the specified attribute in the passed HTML document"""
+
+    value = re.escape(value) if escape_value else value
+
+    retlist = []
+    for m in re.finditer(r'''(?xs)
+        <([a-zA-Z0-9:._-]+)
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+         \s+%s=['"]?%s['"]?
+         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+        \s*>
+        (?P<content>.*?)
+        </\1>
+    ''' % (re.escape(attribute), value), html):
+        res = m.group('content')
+
+        if res.startswith('"') or res.startswith("'"):
+            res = res[1:-1]
+
+        retlist.append(unescapeHTML(res))
+
+    return retlist
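+
+# Illustrative example (assumed HTML snippet):
+#   get_element_by_class('foo', '<div class="foo bar">nice</div>')  ->  'nice'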
+
+
+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser to gather the attributes for a single element"""
+    def __init__(self):
+        self.attrs = {}
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        self.attrs = dict(attrs)
+
+
+def extract_attributes(html_element):
+    """Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    Decode and return a dictionary of attributes.
+    {
+        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': '\''
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    try:
+        parser.feed(html_element)
+        parser.close()
+    # Older Python may throw HTMLParseError in case of malformed HTML
+    except compat_HTMLParseError:
+        pass
+    return parser.attrs
+
+
+def clean_html(html):
+    """Clean an HTML snippet into a readable string"""
+
+    if html is None:  # Convenience for sanitizing descriptions etc.
+        return html
+
+    # Newline vs <br />
+    html = html.replace('\n', ' ')
+    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+    # Strip html tags
+    html = re.sub('<.*?>', '', html)
+    # Replace html entities
+    html = unescapeHTML(html)
+    return html.strip()
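+
+# Illustrative example:
+#   clean_html('<p>one</p><p>two &amp; three</p>')  ->  'one\ntwo & three'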
+
+
+def sanitize_open(filename, open_mode):
+    """Try to open the given filename, and slightly tweak it if this fails.
+
+    Attempts to open the given filename. If this fails, it tries to change
+    the filename slightly, step by step, until it's either able to open it
+    or it fails and raises a final exception, like the standard open()
+    function.
+
+    It returns the tuple (stream, definitive_file_name).
+    """
+    try:
+        if filename == '-':
+            if sys.platform == 'win32':
+                import msvcrt
+                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
+            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
+        stream = open(encodeFilename(filename), open_mode)
+        return (stream, filename)
+    except (IOError, OSError) as err:
+        if err.errno in (errno.EACCES,):
+            raise
+
+        # In case of error, try to remove win32 forbidden chars
+        alt_filename = sanitize_path(filename)
+        if alt_filename == filename:
+            raise
+        else:
+            # An exception here should be caught in the caller
+            stream = open(encodeFilename(alt_filename), open_mode)
+            return (stream, alt_filename)
+
+
+def timeconvert(timestr):
+    """Convert RFC 2822 defined time string into system timestamp"""
+    timestamp = None
+    timetuple = email.utils.parsedate_tz(timestr)
+    if timetuple is not None:
+        timestamp = email.utils.mktime_tz(timetuple)
+    return timestamp
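+
+# Illustrative example:
+#   timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')  ->  0
+# Returns None if the string cannot be parsed as an RFC 2822 date.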
+
+
+def sanitize_filename(s, restricted=False, is_id=False):
+    """Sanitizes a string so it could be used as part of a filename.
+    If restricted is set, use a stricter subset of allowed characters.
+    Set is_id if this is not an arbitrary string, but an ID that should be kept
+    if possible.
+    """
+    def replace_insane(char):
+        if restricted and char in ACCENT_CHARS:
+            return ACCENT_CHARS[char]
+        if char == '?' or ord(char) < 32 or ord(char) == 127:
+            return ''
+        elif char == '"':
+            return '' if restricted else '\''
+        elif char == ':':
+            return '_-' if restricted else ' -'
+        elif char in '\\/|*<>':
+            return '_'
+        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
+            return '_'
+        if restricted and ord(char) > 127:
+            return '_'
+        return char
+
+    # Handle timestamps
+    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
+    result = ''.join(map(replace_insane, s))
+    if not is_id:
+        while '__' in result:
+            result = result.replace('__', '_')
+        result = result.strip('_')
+        # Common case of "Foreign band name - English song title"
+        if restricted and result.startswith('-_'):
+            result = result[2:]
+        if result.startswith('-'):
+            result = '_' + result[len('-'):]
+        result = result.lstrip('.')
+        if not result:
+            result = '_'
+    return result
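+
+# Illustrative examples (assumed inputs):
+#   sanitize_filename('New World record at 0:12:34')
+#   -> 'New World record at 0_12_34'
+#   sanitize_filename('AC/DC: Back in Black', restricted=True)
+#   -> 'AC_DC_-_Back_in_Black'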
+
+
+def sanitize_path(s):
+    """Sanitizes and normalizes path on Windows"""
+    if sys.platform != 'win32':
+        return s
+    drive_or_unc, _ = os.path.splitdrive(s)
+    if sys.version_info < (2, 7) and not drive_or_unc:
+        drive_or_unc, _ = os.path.splitunc(s)
+    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
+    if drive_or_unc:
+        norm_path.pop(0)
+    sanitized_path = [
+        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
+        for path_part in norm_path]
+    if drive_or_unc:
+        sanitized_path.insert(0, drive_or_unc + os.path.sep)
+    return os.path.join(*sanitized_path)
+
+
+def sanitize_url(url):
+    # Prepend protocol-less URLs with the `http:` scheme in order to reduce
+    # the number of unwanted failures caused by a missing protocol
+    if url.startswith('//'):
+        return 'http:%s' % url
+    # Fix some common typos seen so far
+    COMMON_TYPOS = (
+        # https://github.com/ytdl-org/youtube-dl/issues/15649
+        (r'^httpss://', r'https://'),
+        # https://bx1.be/lives/direct-tv/
+        (r'^rmtp([es]?)://', r'rtmp\1://'),
+    )
+    for mistake, fixup in COMMON_TYPOS:
+        if re.match(mistake, url):
+            return re.sub(mistake, fixup, url)
+    return url
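+
+# Illustrative examples:
+#   sanitize_url('//example.com/video')   ->  'http://example.com/video'
+#   sanitize_url('httpss://example.com')  ->  'https://example.com'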
+
+
+def sanitized_Request(url, *args, **kwargs):
+    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
+
+
+def expand_path(s):
+    """Expand shell variables and ~"""
+    return os.path.expandvars(compat_expanduser(s))
+
+
+def orderedSet(iterable):
+    """ Remove all duplicates from the input iterable """
+    res = []
+    for el in iterable:
+        if el not in res:
+            res.append(el)
+    return res
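+
+# Illustrative example: orderedSet([1, 2, 1, 3, 3])  ->  [1, 2, 3]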
+
+
+def _htmlentity_transform(entity_with_semicolon):
+    """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
+    # Known non-numeric HTML entity
+    if entity in compat_html_entities.name2codepoint:
+        return compat_chr(compat_html_entities.name2codepoint[entity])
+
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Éric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
+    if mobj is not None:
+        numstr = mobj.group(1)
+        if numstr.startswith('x'):
+            base = 16
+            numstr = '0%s' % numstr
+        else:
+            base = 10
+        # See https://github.com/ytdl-org/youtube-dl/issues/7518
+        try:
+            return compat_chr(int(numstr, base))
+        except ValueError:
+            pass
+
+    # Unknown entity in name, return its literal representation
+    return '&%s;' % entity
+
+
+def unescapeHTML(s):
+    if s is None:
+        return None
+    assert type(s) == compat_str
+
+    return re.sub(
+        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+
+
+def get_subprocess_encoding():
+    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
+        # For subprocess calls, encode with locale encoding
+        # Refer to http://stackoverflow.com/a/9951851/35070
+        encoding = preferredencoding()
+    else:
+        encoding = sys.getfilesystemencoding()
+    if encoding is None:
+        encoding = 'utf-8'
+    return encoding
+
+
+def encodeFilename(s, for_subprocess=False):
+    """
+    @param s The name of the file
+    """
+
+    assert type(s) == compat_str
+
+    # Python 3 has a Unicode API
+    if sys.version_info >= (3, 0):
+        return s
+
+    # Pass '' directly to use Unicode APIs on Windows 2000 and up
+    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
+        return s
+
+    # Jython assumes filenames are Unicode strings even though it reports itself as Python 2.x compatible
+    if sys.platform.startswith('java'):
+        return s
+
+    return s.encode(get_subprocess_encoding(), 'ignore')
+
+
+def decodeFilename(b, for_subprocess=False):
+
+    if sys.version_info >= (3, 0):
+        return b
+
+    if not isinstance(b, bytes):
+        return b
+
+    return b.decode(get_subprocess_encoding(), 'ignore')
+
+
+def encodeArgument(s):
+    if not isinstance(s, compat_str):
+        # Legacy code that uses byte strings
+        # Uncomment the following line after fixing all post processors
+        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+        s = s.decode('ascii')
+    return encodeFilename(s, True)
+
+
+def decodeArgument(b):
+    return decodeFilename(b, True)
+
+
+def decodeOption(optval):
+    if optval is None:
+        return optval
+    if isinstance(optval, bytes):
+        optval = optval.decode(preferredencoding())
+
+    assert isinstance(optval, compat_str)
+    return optval
+
+
+def formatSeconds(secs):
+    if secs >= 3600:
+        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
+    elif secs >= 60:
+        return '%d:%02d' % (secs // 60, secs % 60)
+    else:
+        return '%d' % secs
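+
+# Illustrative examples: formatSeconds(3661) -> '1:01:01',
+# formatSeconds(75) -> '1:15', formatSeconds(9) -> '9'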
+
+
+def make_HTTPS_handler(params, **kwargs):
+    opts_no_check_certificate = params.get('nocheckcertificate', False)
+    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
+        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
+        if opts_no_check_certificate:
+            context.check_hostname = False
+            context.verify_mode = ssl.CERT_NONE
+        try:
+            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
+        except TypeError:
+            # Python 2.7.8
+            # (create_default_context present but HTTPSHandler has no context=)
+            pass
+
+    if sys.version_info < (3, 2):
+        return YoutubeDLHTTPSHandler(params, **kwargs)
+    else:  # Python < 3.4
+        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
+        context.verify_mode = (ssl.CERT_NONE
+                               if opts_no_check_certificate
+                               else ssl.CERT_REQUIRED)
+        context.set_default_verify_paths()
+        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
+
+
+def bug_reports_message():
+    if ytdl_is_updateable():
+        update_cmd = 'type  youtube-dlc -U  to update'
+    else:
+        update_cmd = 'see  https://yt-dl.org/update  on how to update'
+    msg = '; please report this issue on https://yt-dl.org/bug .'
+    msg += ' Make sure you are using the latest version; %s.' % update_cmd
+    msg += ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.'
+    return msg
+
+
+class YoutubeDLError(Exception):
+    """Base exception for YoutubeDL errors."""
+    pass
+
+
+class ExtractorError(YoutubeDLError):
+    """Error during info extraction."""
+
+    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
+        """ tb, if given, is the original traceback (so that it can be printed out).
+        If expected is set, this is a normal error message and most likely not a bug in youtube-dlc.
+        """
+
+        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+            expected = True
+        if video_id is not None:
+            msg = video_id + ': ' + msg
+        if cause:
+            msg += ' (caused by %r)' % cause
+        if not expected:
+            msg += bug_reports_message()
+        super(ExtractorError, self).__init__(msg)
+
+        self.traceback = tb
+        self.exc_info = sys.exc_info()  # preserve original exception
+        self.cause = cause
+        self.video_id = video_id
+
+    def format_traceback(self):
+        if self.traceback is None:
+            return None
+        return ''.join(traceback.format_tb(self.traceback))
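+
+# Typical usage inside an extractor (illustrative; message and video ID are
+# hypothetical):
+#   raise ExtractorError('Unable to extract title', expected=True,
+#                        video_id='abc123')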
+
+
+class UnsupportedError(ExtractorError):
+    def __init__(self, url):
+        super(UnsupportedError, self).__init__(
+            'Unsupported URL: %s' % url, expected=True)
+        self.url = url
+
+
+class RegexNotFoundError(ExtractorError):
+    """Error when a regex didn't match"""
+    pass
+
+
+class GeoRestrictedError(ExtractorError):
+    """Geographic restriction Error exception.
+
+    This exception may be thrown when a video is not available in your
+    geographic location due to restrictions imposed by a website.
+    """
+    def __init__(self, msg, countries=None):
+        super(GeoRestrictedError, self).__init__(msg, expected=True)
+        self.msg = msg
+        self.countries = countries
+
+
+class DownloadError(YoutubeDLError):
+    """Download Error exception.
+
+    This exception may be thrown by FileDownloader objects if they are not
+    configured to continue on errors. They will contain the appropriate
+    error message.
+    """
+
+    def __init__(self, msg, exc_info=None):
+        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
+        super(DownloadError, self).__init__(msg)
+        self.exc_info = exc_info
+
+
+class SameFileError(YoutubeDLError):
+    """Same File exception.
+
+    This exception will be thrown by FileDownloader objects if they detect
+    multiple files would have to be downloaded to the same file on disk.
+    """
+    pass
+
+
+class PostProcessingError(YoutubeDLError):
+    """Post Processing exception.
+
+    This exception may be raised by PostProcessor's .run() method to
+    indicate an error in the postprocessing task.
+    """
+
+    def __init__(self, msg):
+        super(PostProcessingError, self).__init__(msg)
+        self.msg = msg
+
+
+class MaxDownloadsReached(YoutubeDLError):
+    """ --max-downloads limit has been reached. """
+    pass
+
+
+class UnavailableVideoError(YoutubeDLError):
+    """Unavailable Format exception.
+
+    This exception will be thrown when a video is requested
+    in a format that is not available for that video.
+    """
+    pass
+
+
+class ContentTooShortError(YoutubeDLError):
+    """Content Too Short exception.
+
+    This exception may be raised by FileDownloader objects when a file they
+    download is smaller than what the server initially announced, indicating
+    that the connection was probably interrupted.
+    """
+
+    def __init__(self, downloaded, expected):
+        super(ContentTooShortError, self).__init__(
+            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
+        )
+        # Both in bytes
+        self.downloaded = downloaded
+        self.expected = expected
+
+
+class XAttrMetadataError(YoutubeDLError):
+    def __init__(self, code=None, msg='Unknown error'):
+        super(XAttrMetadataError, self).__init__(msg)
+        self.code = code
+        self.msg = msg
+
+        # Parsing code and msg
+        if (self.code in (errno.ENOSPC, errno.EDQUOT)
+                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
+            self.reason = 'NO_SPACE'
+        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
+            self.reason = 'VALUE_TOO_LONG'
+        else:
+            self.reason = 'NOT_SUPPORTED'
+
+
+class XAttrUnavailableError(YoutubeDLError):
+    pass
+
+
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    # Work around a Python 2 bug (see http://bugs.python.org/issue17849) by requiring
+    # expected HTTP responses to be HTTP/1.0 or later (see also
+    # https://github.com/ytdl-org/youtube-dl/issues/6727)
+    if sys.version_info < (3, 0):
+        kwargs['strict'] = True
+    hc = http_class(*args, **compat_kwargs(kwargs))
+    source_address = ydl_handler._params.get('source_address')
+
+    if source_address is not None:
+        # This works around _create_connection() from socket, which tries all
+        # address data from getaddrinfo(), including IPv6. This filters the result of
+        # getaddrinfo() based on the source_address value.
+        # This is based on the CPython socket.create_connection() function.
+        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
+        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+            host, port = address
+            err = None
+            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+            ip_addrs = [addr for addr in addrs if addr[0] == af]
+            if addrs and not ip_addrs:
+                ip_version = 'v4' if af == socket.AF_INET else 'v6'
+                raise socket.error(
+                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
+                    % (ip_version, source_address[0]))
+            for res in ip_addrs:
+                af, socktype, proto, canonname, sa = res
+                sock = None
+                try:
+                    sock = socket.socket(af, socktype, proto)
+                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+                        sock.settimeout(timeout)
+                    sock.bind(source_address)
+                    sock.connect(sa)
+                    err = None  # Explicitly break reference cycle
+                    return sock
+                except socket.error as _:
+                    err = _
+                    if sock is not None:
+                        sock.close()
+            if err is not None:
+                raise err
+            else:
+                raise socket.error('getaddrinfo returns an empty list')
+        if hasattr(hc, '_create_connection'):
+            hc._create_connection = _create_connection
+        sa = (source_address, 0)
+        if hasattr(hc, 'source_address'):  # Python 2.7+
+            hc.source_address = sa
+        else:  # Python 2.6
+            def _hc_connect(self, *args, **kwargs):
+                sock = _create_connection(
+                    (self.host, self.port), self.timeout, sa)
+                if is_https:
+                    self.sock = ssl.wrap_socket(
+                        sock, self.key_file, self.cert_file,
+                        ssl_version=ssl.PROTOCOL_TLSv1)
+                else:
+                    self.sock = sock
+            hc.connect = functools.partial(_hc_connect, hc)
+
+    return hc
+
+
+def handle_youtubedl_headers(headers):
+    filtered_headers = headers
+
+    if 'Youtubedl-no-compression' in filtered_headers:
+        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
+        del filtered_headers['Youtubedl-no-compression']
+
+    return filtered_headers
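+
+# Illustrative effect:
+#   handle_youtubedl_headers({'User-Agent': 'UA/1.0', 'Accept-Encoding': 'gzip',
+#                             'Youtubedl-no-compression': 'True'})
+#   -> {'User-Agent': 'UA/1.0'}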
+
+
+class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
+    """Handler for HTTP requests and responses.
+
+    This class, when installed with an OpenerDirector, automatically adds
+    the standard headers to every HTTP request and handles gzipped and
+    deflated responses from web servers. If compression is to be avoided in
+    a particular request, the original request in the program code only has
+    to include the HTTP header "Youtubedl-no-compression", which will be
+    removed before making the real request.
+
+    Part of this code was copied from:
+
+    http://techknack.net/python-urllib2-handlers/
+
+    Andrew Rowls, the author of that code, agreed to release it to the
+    public domain.
+    """
+
+    def __init__(self, params, *args, **kwargs):
+        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
+        self._params = params
+
+    def http_open(self, req):
+        conn_class = compat_http_client.HTTPConnection
+
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+
+        return self.do_open(functools.partial(
+            _create_http_connection, self, conn_class, False),
+            req)
+
+    @staticmethod
+    def deflate(data):
+        try:
+            return zlib.decompress(data, -zlib.MAX_WBITS)
+        except zlib.error:
+            return zlib.decompress(data)
+
+    def http_request(self, req):
+        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
+        # not always respected by websites: some hand out URLs with non-percent-encoded
+        # non-ASCII characters (see telemb.py, ard.py [#3412])
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
+        # To work around the aforementioned issue we replace the request's original URL
+        # with a percent-encoded one
+        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
+        # the code of this workaround has been moved here from YoutubeDL.urlopen()
+        url = req.get_full_url()
+        url_escaped = escape_url(url)
+
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            req = update_Request(req, url=url_escaped)
+
+        for h, v in std_headers.items():
+            # Calling capitalize() is needed because urllib capitalizes the dict keys
+            # due to Python bug 2275: http://bugs.python.org/issue2275
+            if h.capitalize() not in req.headers:
+                req.add_header(h, v)
+
+        req.headers = handle_youtubedl_headers(req.headers)
+
+        if sys.version_info < (2, 7) and '#' in req.get_full_url():
+            # Python 2.6 is brain-dead when it comes to fragments
+            req._Request__original = req._Request__original.partition('#')[0]
+            req._Request__r_type = req._Request__r_type.partition('#')[0]
+
+        return req
+
+    def http_response(self, req, resp):
+        old_resp = resp
+        # gzip
+        if resp.headers.get('Content-encoding', '') == 'gzip':
+            content = resp.read()
+            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
+            try:
+                uncompressed = io.BytesIO(gz.read())
+            except IOError as original_ioerror:
+                # There may be junk at the end of the file
+                # See http://stackoverflow.com/q/4928560/35070 for details
+                for i in range(1, 1024):
+                    try:
+                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
+                        uncompressed = io.BytesIO(gz.read())
+                    except IOError:
+                        continue
+                    break
+                else:
+                    raise original_ioerror
+            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+            del resp.headers['Content-encoding']
+        # deflate
+        if resp.headers.get('Content-encoding', '') == 'deflate':
+            gz = io.BytesIO(self.deflate(resp.read()))
+            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+            del resp.headers['Content-encoding']
+        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
+        # https://github.com/ytdl-org/youtube-dl/issues/6457).
+        if 300 <= resp.code < 400:
+            location = resp.headers.get('Location')
+            if location:
+                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
+                if sys.version_info >= (3, 0):
+                    location = location.encode('iso-8859-1').decode('utf-8')
+                else:
+                    location = location.decode('utf-8')
+                location_escaped = escape_url(location)
+                if location != location_escaped:
+                    del resp.headers['Location']
+                    if sys.version_info < (3, 0):
+                        location_escaped = location_escaped.encode('utf-8')
+                    resp.headers['Location'] = location_escaped
+        return resp
+
+    https_request = http_request
+    https_response = http_response
+
+
+def make_socks_conn_class(base_class, socks_proxy):
+    assert issubclass(base_class, (
+        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
+
+    url_components = compat_urlparse.urlparse(socks_proxy)
+    if url_components.scheme.lower() == 'socks5':
+        socks_type = ProxyType.SOCKS5
+    elif url_components.scheme.lower() in ('socks', 'socks4'):
+        socks_type = ProxyType.SOCKS4
+    elif url_components.scheme.lower() == 'socks4a':
+        socks_type = ProxyType.SOCKS4A
+
+    def unquote_if_non_empty(s):
+        if not s:
+            return s
+        return compat_urllib_parse_unquote_plus(s)
+
+    proxy_args = (
+        socks_type,
+        url_components.hostname, url_components.port or 1080,
+        True,  # Remote DNS
+        unquote_if_non_empty(url_components.username),
+        unquote_if_non_empty(url_components.password),
+    )
+
+    class SocksConnection(base_class):
+        def connect(self):
+            self.sock = sockssocket()
+            self.sock.setproxy(*proxy_args)
+            if type(self.timeout) in (int, float):
+                self.sock.settimeout(self.timeout)
+            self.sock.connect((self.host, self.port))
+
+            if isinstance(self, compat_http_client.HTTPSConnection):
+                if hasattr(self, '_context'):  # Python > 2.6
+                    self.sock = self._context.wrap_socket(
+                        self.sock, server_hostname=self.host)
+                else:
+                    self.sock = ssl.wrap_socket(self.sock)
+
+    return SocksConnection
+
+
+class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
+    def __init__(self, params, https_conn_class=None, *args, **kwargs):
+        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
+        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
+        self._params = params
+
+    def https_open(self, req):
+        kwargs = {}
+        conn_class = self._https_conn_class
+
+        if hasattr(self, '_context'):  # python > 2.6
+            kwargs['context'] = self._context
+        if hasattr(self, '_check_hostname'):  # python 3.x
+            kwargs['check_hostname'] = self._check_hostname
+
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+
+        return self.do_open(functools.partial(
+            _create_http_connection, self, conn_class, True),
+            req, **kwargs)
+
+
+class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
+    """
+    See [1] for cookie file format.
+
+    1. https://curl.haxx.se/docs/http-cookies.html
+    """
+    _HTTPONLY_PREFIX = '#HttpOnly_'
+    _ENTRY_LEN = 7
+    _HEADER = '''# Netscape HTTP Cookie File
+# This file is generated by youtube-dlc.  Do not edit.
+
+'''
+    _CookieFileEntry = collections.namedtuple(
+        'CookieFileEntry',
+        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
+
+    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+        """
+        Save cookies to a file.
+
+        Most of the code is taken from CPython 3.8 and slightly adapted
+        to support cookie files with UTF-8 in both python 2 and 3.
+        """
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
+
+        # Store session cookies with `expires` set to 0 instead of an empty
+        # string
+        for cookie in self:
+            if cookie.expires is None:
+                cookie.expires = 0
+
+        with io.open(filename, 'w', encoding='utf-8') as f:
+            f.write(self._HEADER)
+            now = time.time()
+            for cookie in self:
+                if not ignore_discard and cookie.discard:
+                    continue
+                if not ignore_expires and cookie.is_expired(now):
+                    continue
+                if cookie.secure:
+                    secure = 'TRUE'
+                else:
+                    secure = 'FALSE'
+                if cookie.domain.startswith('.'):
+                    initial_dot = 'TRUE'
+                else:
+                    initial_dot = 'FALSE'
+                if cookie.expires is not None:
+                    expires = compat_str(cookie.expires)
+                else:
+                    expires = ''
+                if cookie.value is None:
+                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
+                    # with no name, whereas http.cookiejar regards it as a
+                    # cookie with no value.
+                    name = ''
+                    value = cookie.name
+                else:
+                    name = cookie.name
+                    value = cookie.value
+                f.write(
+                    '\t'.join([cookie.domain, initial_dot, cookie.path,
+                               secure, expires, name, value]) + '\n')
+
+    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
+        """Load cookies from a file."""
+        if filename is None:
+            if self.filename is not None:
+                filename = self.filename
+            else:
+                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
+
+        def prepare_line(line):
+            if line.startswith(self._HTTPONLY_PREFIX):
+                line = line[len(self._HTTPONLY_PREFIX):]
+            # comments and empty lines are fine
+            if line.startswith('#') or not line.strip():
+                return line
+            cookie_list = line.split('\t')
+            if len(cookie_list) != self._ENTRY_LEN:
+                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
+            cookie = self._CookieFileEntry(*cookie_list)
+            if cookie.expires_at and not cookie.expires_at.isdigit():
+                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
+            return line
+
+        cf = io.StringIO()
+        with io.open(filename, encoding='utf-8') as f:
+            for line in f:
+                try:
+                    cf.write(prepare_line(line))
+                except compat_cookiejar.LoadError as e:
+                    write_string(
+                        'WARNING: skipping cookie file entry due to %s: %r\n'
+                        % (e, line), sys.stderr)
+                    continue
+        cf.seek(0)
+        self._really_load(cf, filename, ignore_discard, ignore_expires)
+        # Session cookies are denoted by either `expires` field set to
+        # an empty string or 0. MozillaCookieJar only recognizes the former
+        # (see [1]), so we need to force the latter to be recognized as session
+        # cookies on our own.
+        # Session cookies may be important for cookie-based authentication:
+        # e.g. when a user does not check the 'Remember me' box while logging
+        # in on a site, some important cookies are stored as session cookies,
+        # and failing to recognize them results in a failed login.
+        # 1. https://bugs.python.org/issue17164
+        for cookie in self:
+            # Treat `expires=0` cookies as session cookies
+            if cookie.expires == 0:
+                cookie.expires = None
+                cookie.discard = True
+
+
+class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
+    def __init__(self, cookiejar=None):
+        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
+
+    def http_response(self, request, response):
+        # Python 2 will choke on the next HTTP request in a row if there are
+        # non-ASCII characters in the Set-Cookie header of the last response (see
+        # https://github.com/ytdl-org/youtube-dl/issues/6769).
+        # In order to at least prevent crashing, we percent-encode the Set-Cookie
+        # header before HTTPCookieProcessor starts processing it.
+        # if sys.version_info < (3, 0) and response.headers:
+        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
+        #         set_cookie = response.headers.get(set_cookie_header)
+        #         if set_cookie:
+        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
+        #             if set_cookie != set_cookie_escaped:
+        #                 del response.headers[set_cookie_header]
+        #                 response.headers[set_cookie_header] = set_cookie_escaped
+        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
+
+    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
+    https_response = http_response
+
+
+class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
+    if sys.version_info[0] < 3:
+        def redirect_request(self, req, fp, code, msg, headers, newurl):
+            # On Python 2, urlh.geturl() may sometimes return the redirect URL
+            # as a byte string instead of unicode. This workaround forces it
+            # to always return unicode.
+            return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl))
+
+
+def extract_timezone(date_str):
+    m = re.search(
+        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        date_str)
+    if not m:
+        timezone = datetime.timedelta()
+    else:
+        date_str = date_str[:-len(m.group('tz'))]
+        if not m.group('sign'):
+            timezone = datetime.timedelta()
+        else:
+            sign = 1 if m.group('sign') == '+' else -1
+            timezone = datetime.timedelta(
+                hours=sign * int(m.group('hours')),
+                minutes=sign * int(m.group('minutes')))
+    return timezone, date_str
+
+
+def parse_iso8601(date_str, delimiter='T', timezone=None):
+    """ Return a UNIX timestamp from the given date """
+
+    if date_str is None:
+        return None
+
+    date_str = re.sub(r'\.[0-9]+', '', date_str)
+
+    if timezone is None:
+        timezone, date_str = extract_timezone(date_str)
+
+    try:
+        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+        dt = datetime.datetime.strptime(date_str, date_format) - timezone
+        return calendar.timegm(dt.timetuple())
+    except ValueError:
+        pass
+
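+# Illustrative usage of parse_iso8601 above (not part of the original diff;
+# the sample strings are assumptions based on the formats it handles):
+#   parse_iso8601('1970-01-01T00:00:00Z')      == 0
+#   parse_iso8601('1970-01-01T01:00:00+01:00') == 0  # the offset is subtracted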
+
+def date_formats(day_first=True):
+    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
+
+
+def unified_strdate(date_str, day_first=True):
+    """Return a string with the date in the format YYYYMMDD"""
+
+    if date_str is None:
+        return None
+    upload_date = None
+    # Replace commas
+    date_str = date_str.replace(',', ' ')
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+    _, date_str = extract_timezone(date_str)
+
+    for expression in date_formats(day_first):
+        try:
+            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
+        except ValueError:
+            pass
+    if upload_date is None:
+        timetuple = email.utils.parsedate_tz(date_str)
+        if timetuple:
+            try:
+                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+            except ValueError:
+                pass
+    if upload_date is not None:
+        return compat_str(upload_date)
+
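+# Illustrative usage of unified_strdate above (not part of the original diff;
+# sample inputs are assumptions based on the DATE_FORMATS defined earlier):
+#   unified_strdate('December 21, 2010') == '20101221'
+#   unified_strdate('2010/12/21')        == '20101221'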
+
+def unified_timestamp(date_str, day_first=True):
+    if date_str is None:
+        return None
+
+    date_str = re.sub(r'[,|]', '', date_str)
+
+    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
+    timezone, date_str = extract_timezone(date_str)
+
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+    # Remove unrecognized timezones from ISO 8601 alike timestamps
+    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+    if m:
+        date_str = date_str[:-len(m.group('tz'))]
+
+    # Python only supports microseconds, so remove nanoseconds
+    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
+    if m:
+        date_str = m.group(1)
+
+    for expression in date_formats(day_first):
+        try:
+            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
+            return calendar.timegm(dt.timetuple())
+        except ValueError:
+            pass
+    timetuple = email.utils.parsedate_tz(date_str)
+    if timetuple:
+        return calendar.timegm(timetuple) + pm_delta * 3600
+
+
+def determine_ext(url, default_ext='unknown_video'):
+    if url is None or '.' not in url:
+        return default_ext
+    guess = url.partition('?')[0].rpartition('.')[2]
+    if re.match(r'^[A-Za-z0-9]+$', guess):
+        return guess
+    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
+    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
+        return guess.rstrip('/')
+    else:
+        return default_ext
+
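+# Illustrative usage of determine_ext above (not part of the original diff):
+#   determine_ext('http://example.com/video.mp4')              == 'mp4'
+#   determine_ext('http://example.com/foo/bar.mp4/?download')  == 'mp4'
+#   determine_ext('http://example.com/download')               == 'unknown_video'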
+
+def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
+    return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
+
+
+def date_from_str(date_str):
+    """
+    Return a datetime object from a string in the format YYYYMMDD or
+    (now|today)[+-][0-9](day|week|month|year)(s)?"""
+    today = datetime.date.today()
+    if date_str in ('now', 'today'):
+        return today
+    if date_str == 'yesterday':
+        return today - datetime.timedelta(days=1)
+    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
+    if match is not None:
+        sign = match.group('sign')
+        time = int(match.group('time'))
+        if sign == '-':
+            time = -time
+        unit = match.group('unit')
+        # A rough approximation: treat months as 30 days and years as 365 days
+        if unit == 'month':
+            unit = 'day'
+            time *= 30
+        elif unit == 'year':
+            unit = 'day'
+            time *= 365
+        unit += 's'
+        delta = datetime.timedelta(**{unit: time})
+        return today + delta
+    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
+
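+# Illustrative usage of date_from_str above (not part of the original diff):
+#   date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(days=7)
+#   date_from_str('20200902')  == datetime.date(2020, 9, 2)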
+
+def hyphenate_date(date_str):
+    """
+    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
+    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
+    if match is not None:
+        return '-'.join(match.groups())
+    else:
+        return date_str
+
+
+class DateRange(object):
+    """Represents a time interval between two dates"""
+
+    def __init__(self, start=None, end=None):
+        """start and end must be strings in the format accepted by date"""
+        if start is not None:
+            self.start = date_from_str(start)
+        else:
+            self.start = datetime.datetime.min.date()
+        if end is not None:
+            self.end = date_from_str(end)
+        else:
+            self.end = datetime.datetime.max.date()
+        if self.start > self.end:
+            raise ValueError('Date range: "%s", the start date must be before the end date' % self)
+
+    @classmethod
+    def day(cls, day):
+        """Returns a range that only contains the given day"""
+        return cls(day, day)
+
+    def __contains__(self, date):
+        """Check if the date is in the range"""
+        if not isinstance(date, datetime.date):
+            date = date_from_str(date)
+        return self.start <= date <= self.end
+
+    def __str__(self):
+        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
+
+
+def platform_name():
+    """ Returns the platform name as a compat_str """
+    res = platform.platform()
+    if isinstance(res, bytes):
+        res = res.decode(preferredencoding())
+
+    assert isinstance(res, compat_str)
+    return res
+
+
+def _windows_write_string(s, out):
+    """ Returns True if the string was written using special methods,
+    False if it has yet to be written out."""
+    # Adapted from http://stackoverflow.com/a/3259271/35070
+
+    import ctypes
+    import ctypes.wintypes
+
+    WIN_OUTPUT_IDS = {
+        1: -11,
+        2: -12,
+    }
+
+    try:
+        fileno = out.fileno()
+    except AttributeError:
+        # If the output stream doesn't have a fileno, it's virtual
+        return False
+    except io.UnsupportedOperation:
+        # Some strange Windows pseudo files?
+        return False
+    if fileno not in WIN_OUTPUT_IDS:
+        return False
+
+    GetStdHandle = compat_ctypes_WINFUNCTYPE(
+        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
+        ('GetStdHandle', ctypes.windll.kernel32))
+    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
+
+    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
+        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
+        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
+        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
+    written = ctypes.wintypes.DWORD(0)
+
+    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
+    FILE_TYPE_CHAR = 0x0002
+    FILE_TYPE_REMOTE = 0x8000
+    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
+        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
+        ctypes.POINTER(ctypes.wintypes.DWORD))(
+        ('GetConsoleMode', ctypes.windll.kernel32))
+    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
+
+    def not_a_console(handle):
+        if handle == INVALID_HANDLE_VALUE or handle is None:
+            return True
+        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
+                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
+
+    if not_a_console(h):
+        return False
+
+    def next_nonbmp_pos(s):
+        try:
+            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
+        except StopIteration:
+            return len(s)
+
+    while s:
+        count = min(next_nonbmp_pos(s), 1024)
+
+        ret = WriteConsoleW(
+            h, s, count if count else 2, ctypes.byref(written), None)
+        if ret == 0:
+            raise OSError('Failed to write string')
+        if not count:  # We just wrote a non-BMP character
+            assert written.value == 2
+            s = s[1:]
+        else:
+            assert written.value > 0
+            s = s[written.value:]
+    return True
+
+
+def write_string(s, out=None, encoding=None):
+    if out is None:
+        out = sys.stderr
+    assert type(s) == compat_str
+
+    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
+        if _windows_write_string(s, out):
+            return
+
+    if ('b' in getattr(out, 'mode', '')
+            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
+        byt = s.encode(encoding or preferredencoding(), 'ignore')
+        out.write(byt)
+    elif hasattr(out, 'buffer'):
+        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
+        byt = s.encode(enc, 'ignore')
+        out.buffer.write(byt)
+    else:
+        out.write(s)
+    out.flush()
+
+
+def bytes_to_intlist(bs):
+    if not bs:
+        return []
+    if isinstance(bs[0], int):  # Python 3
+        return list(bs)
+    else:
+        return [ord(c) for c in bs]
+
+
+def intlist_to_bytes(xs):
+    if not xs:
+        return b''
+    return compat_struct_pack('%dB' % len(xs), *xs)
+
+
+# Cross-platform file locking
+if sys.platform == 'win32':
+    import ctypes.wintypes
+    import msvcrt
+
+    class OVERLAPPED(ctypes.Structure):
+        _fields_ = [
+            ('Internal', ctypes.wintypes.LPVOID),
+            ('InternalHigh', ctypes.wintypes.LPVOID),
+            ('Offset', ctypes.wintypes.DWORD),
+            ('OffsetHigh', ctypes.wintypes.DWORD),
+            ('hEvent', ctypes.wintypes.HANDLE),
+        ]
+
+    kernel32 = ctypes.windll.kernel32
+    LockFileEx = kernel32.LockFileEx
+    LockFileEx.argtypes = [
+        ctypes.wintypes.HANDLE,     # hFile
+        ctypes.wintypes.DWORD,      # dwFlags
+        ctypes.wintypes.DWORD,      # dwReserved
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
+        ctypes.POINTER(OVERLAPPED)  # Overlapped
+    ]
+    LockFileEx.restype = ctypes.wintypes.BOOL
+    UnlockFileEx = kernel32.UnlockFileEx
+    UnlockFileEx.argtypes = [
+        ctypes.wintypes.HANDLE,     # hFile
+        ctypes.wintypes.DWORD,      # dwReserved
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
+        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
+        ctypes.POINTER(OVERLAPPED)  # Overlapped
+    ]
+    UnlockFileEx.restype = ctypes.wintypes.BOOL
+    whole_low = 0xffffffff
+    whole_high = 0x7fffffff
+
+    def _lock_file(f, exclusive):
+        overlapped = OVERLAPPED()
+        overlapped.Offset = 0
+        overlapped.OffsetHigh = 0
+        overlapped.hEvent = 0
+        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
+        handle = msvcrt.get_osfhandle(f.fileno())
+        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
+                          whole_low, whole_high, f._lock_file_overlapped_p):
+            raise OSError('Locking file failed: %r' % ctypes.FormatError())
+
+    def _unlock_file(f):
+        assert f._lock_file_overlapped_p
+        handle = msvcrt.get_osfhandle(f.fileno())
+        if not UnlockFileEx(handle, 0,
+                            whole_low, whole_high, f._lock_file_overlapped_p):
+            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
+
+else:
+    # Some platforms, such as Jython, are missing fcntl
+    try:
+        import fcntl
+
+        def _lock_file(f, exclusive):
+            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+
+        def _unlock_file(f):
+            fcntl.flock(f, fcntl.LOCK_UN)
+    except ImportError:
+        UNSUPPORTED_MSG = 'file locking is not supported on this platform'
+
+        def _lock_file(f, exclusive):
+            raise IOError(UNSUPPORTED_MSG)
+
+        def _unlock_file(f):
+            raise IOError(UNSUPPORTED_MSG)
+
+
+class locked_file(object):
+    def __init__(self, filename, mode, encoding=None):
+        assert mode in ['r', 'a', 'w']
+        self.f = io.open(filename, mode, encoding=encoding)
+        self.mode = mode
+
+    def __enter__(self):
+        exclusive = self.mode != 'r'
+        try:
+            _lock_file(self.f, exclusive)
+        except IOError:
+            self.f.close()
+            raise
+        return self
+
+    def __exit__(self, etype, value, traceback):
+        try:
+            _unlock_file(self.f)
+        finally:
+            self.f.close()
+
+    def __iter__(self):
+        return iter(self.f)
+
+    def write(self, *args):
+        return self.f.write(*args)
+
+    def read(self, *args):
+        return self.f.read(*args)
+
+
+def get_filesystem_encoding():
+    encoding = sys.getfilesystemencoding()
+    return encoding if encoding is not None else 'utf-8'
+
+
+def shell_quote(args):
+    quoted_args = []
+    encoding = get_filesystem_encoding()
+    for a in args:
+        if isinstance(a, bytes):
+            # We may get a filename encoded with 'encodeFilename'
+            a = a.decode(encoding)
+        quoted_args.append(compat_shlex_quote(a))
+    return ' '.join(quoted_args)
+
+
+def smuggle_url(url, data):
+    """ Pass additional data in a URL for internal use. """
+
+    url, idata = unsmuggle_url(url, {})
+    data.update(idata)
+    sdata = compat_urllib_parse_urlencode(
+        {'__youtubedl_smuggle': json.dumps(data)})
+    return url + '#' + sdata
+
+
+def unsmuggle_url(smug_url, default=None):
+    if '#__youtubedl_smuggle' not in smug_url:
+        return smug_url, default
+    url, _, sdata = smug_url.rpartition('#')
+    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
+    data = json.loads(jsond)
+    return url, data
+
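+# Illustrative round trip for smuggle_url/unsmuggle_url above (not part of the
+# original diff; the 'referer' key is just an example payload):
+#   url = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
+#   unsmuggle_url(url) == ('http://example.com/video', {'referer': 'http://example.com'})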
+
+def format_bytes(bytes):
+    if bytes is None:
+        return 'N/A'
+    if type(bytes) is str:
+        bytes = float(bytes)
+    if bytes == 0.0:
+        exponent = 0
+    else:
+        exponent = int(math.log(bytes, 1024.0))
+    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
+    converted = float(bytes) / float(1024 ** exponent)
+    return '%.2f%s' % (converted, suffix)
+
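+# Illustrative usage of format_bytes above (not part of the original diff):
+#   format_bytes(1536) == '1.50KiB'
+#   format_bytes(None) == 'N/A'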
+
+def lookup_unit_table(unit_table, s):
+    units_re = '|'.join(re.escape(u) for u in unit_table)
+    m = re.match(
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
+    if not m:
+        return None
+    num_str = m.group('num').replace(',', '.')
+    mult = unit_table[m.group('unit')]
+    return int(float(num_str) * mult)
+
+
+def parse_filesize(s):
+    if s is None:
+        return None
+
+    # The lower-case forms are of course incorrect and unofficial,
+    # but we support those too
+    _UNIT_TABLE = {
+        'B': 1,
+        'b': 1,
+        'bytes': 1,
+        'KiB': 1024,
+        'KB': 1000,
+        'kB': 1024,
+        'Kb': 1000,
+        'kb': 1000,
+        'kilobytes': 1000,
+        'kibibytes': 1024,
+        'MiB': 1024 ** 2,
+        'MB': 1000 ** 2,
+        'mB': 1024 ** 2,
+        'Mb': 1000 ** 2,
+        'mb': 1000 ** 2,
+        'megabytes': 1000 ** 2,
+        'mebibytes': 1024 ** 2,
+        'GiB': 1024 ** 3,
+        'GB': 1000 ** 3,
+        'gB': 1024 ** 3,
+        'Gb': 1000 ** 3,
+        'gb': 1000 ** 3,
+        'gigabytes': 1000 ** 3,
+        'gibibytes': 1024 ** 3,
+        'TiB': 1024 ** 4,
+        'TB': 1000 ** 4,
+        'tB': 1024 ** 4,
+        'Tb': 1000 ** 4,
+        'tb': 1000 ** 4,
+        'terabytes': 1000 ** 4,
+        'tebibytes': 1024 ** 4,
+        'PiB': 1024 ** 5,
+        'PB': 1000 ** 5,
+        'pB': 1024 ** 5,
+        'Pb': 1000 ** 5,
+        'pb': 1000 ** 5,
+        'petabytes': 1000 ** 5,
+        'pebibytes': 1024 ** 5,
+        'EiB': 1024 ** 6,
+        'EB': 1000 ** 6,
+        'eB': 1024 ** 6,
+        'Eb': 1000 ** 6,
+        'eb': 1000 ** 6,
+        'exabytes': 1000 ** 6,
+        'exbibytes': 1024 ** 6,
+        'ZiB': 1024 ** 7,
+        'ZB': 1000 ** 7,
+        'zB': 1024 ** 7,
+        'Zb': 1000 ** 7,
+        'zb': 1000 ** 7,
+        'zettabytes': 1000 ** 7,
+        'zebibytes': 1024 ** 7,
+        'YiB': 1024 ** 8,
+        'YB': 1000 ** 8,
+        'yB': 1024 ** 8,
+        'Yb': 1000 ** 8,
+        'yb': 1000 ** 8,
+        'yottabytes': 1000 ** 8,
+        'yobibytes': 1024 ** 8,
+    }
+
+    return lookup_unit_table(_UNIT_TABLE, s)
+
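+# Illustrative usage of parse_filesize above (not part of the original diff):
+#   parse_filesize('1.5 MiB') == 1572864   # 1.5 * 1024 ** 2
+#   parse_filesize('500 kB')  == 512000    # note: 'kB' is mapped to 1024 here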
+
+def parse_count(s):
+    if s is None:
+        return None
+
+    s = s.strip()
+
+    if re.match(r'^[\d,.]+$', s):
+        return str_to_int(s)
+
+    _UNIT_TABLE = {
+        'k': 1000,
+        'K': 1000,
+        'm': 1000 ** 2,
+        'M': 1000 ** 2,
+        'kk': 1000 ** 2,
+        'KK': 1000 ** 2,
+    }
+
+    return lookup_unit_table(_UNIT_TABLE, s)
+
+
+def parse_resolution(s):
+    if s is None:
+        return {}
+
+    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
+    if mobj:
+        return {
+            'width': int(mobj.group('w')),
+            'height': int(mobj.group('h')),
+        }
+
+    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
+    if mobj:
+        return {'height': int(mobj.group(1))}
+
+    mobj = re.search(r'\b([48])[kK]\b', s)
+    if mobj:
+        return {'height': int(mobj.group(1)) * 540}
+
+    return {}
+
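+# Illustrative usage of parse_resolution above (not part of the original diff):
+#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
+#   parse_resolution('720p')      == {'height': 720}
+#   parse_resolution('4k')        == {'height': 2160}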
+
+def parse_bitrate(s):
+    if not isinstance(s, compat_str):
+        return
+    mobj = re.search(r'\b(\d+)\s*kbps', s)
+    if mobj:
+        return int(mobj.group(1))
+
+
+def month_by_name(name, lang='en'):
+    """ Return the number of a month by (locale-independently) English name """
+
+    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
+
+    try:
+        return month_names.index(name) + 1
+    except ValueError:
+        return None
+
+
+def month_by_abbreviation(abbrev):
+    """ Return the number of a month by (locale-independently) English
+        abbreviations """
+
+    try:
+        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
+    except ValueError:
+        return None
+
+
+def fix_xml_ampersands(xml_str):
+    """Replace all the '&' by '&amp;' in XML"""
+    return re.sub(
+        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
+        '&amp;',
+        xml_str)
+
+
+def setproctitle(title):
+    assert isinstance(title, compat_str)
+
+    # ctypes in Jython is not complete
+    # http://bugs.jython.org/issue2148
+    if sys.platform.startswith('java'):
+        return
+
+    try:
+        libc = ctypes.cdll.LoadLibrary('libc.so.6')
+    except OSError:
+        return
+    except TypeError:
+        # LoadLibrary in Windows Python 2.7.13 only expects
+        # a bytestring, but since unicode_literals turns
+        # every string into a unicode string, it fails.
+        return
+    title_bytes = title.encode('utf-8')
+    buf = ctypes.create_string_buffer(len(title_bytes))
+    buf.value = title_bytes
+    try:
+        libc.prctl(15, buf, 0, 0, 0)
+    except AttributeError:
+        return  # Strange libc, just skip this
+
+
+def remove_start(s, start):
+    return s[len(start):] if s is not None and s.startswith(start) else s
+
+
+def remove_end(s, end):
+    return s[:-len(end)] if s is not None and s.endswith(end) else s
+
+
+def remove_quotes(s):
+    if s is None or len(s) < 2:
+        return s
+    for quote in ('"', "'", ):
+        if s[0] == quote and s[-1] == quote:
+            return s[1:-1]
+    return s
+
+
+def url_basename(url):
+    path = compat_urlparse.urlparse(url).path
+    return path.strip('/').split('/')[-1]
+
+
+def base_url(url):
+    return re.match(r'https?://[^?#&]+/', url).group()
+
+
+def urljoin(base, path):
+    if isinstance(path, bytes):
+        path = path.decode('utf-8')
+    if not isinstance(path, compat_str) or not path:
+        return None
+    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
+        return path
+    if isinstance(base, bytes):
+        base = base.decode('utf-8')
+    if not isinstance(base, compat_str) or not re.match(
+            r'^(?:https?:)?//', base):
+        return None
+    return compat_urlparse.urljoin(base, path)
+
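+# Illustrative usage of urljoin above (not part of the original diff):
+#   urljoin('https://example.com/a/', 'b/c.mp4') == 'https://example.com/a/b/c.mp4'
+#   urljoin('https://example.com/a/', None)      is None   # invalid paths yield None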
+
+class HEADRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'HEAD'
+
+
+class PUTRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
+def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
+    if get_attr:
+        if v is not None:
+            v = getattr(v, get_attr, None)
+    if v == '':
+        v = None
+    if v is None:
+        return default
+    try:
+        return int(v) * invscale // scale
+    except (ValueError, TypeError):
+        return default
+
+
+def str_or_none(v, default=None):
+    return default if v is None else compat_str(v)
+
+
+def str_to_int(int_str):
+    """ A more relaxed version of int_or_none """
+    if isinstance(int_str, compat_integer_types):
+        return int_str
+    elif isinstance(int_str, compat_str):
+        int_str = re.sub(r'[,\.\+]', '', int_str)
+        return int_or_none(int_str)
+
+
+def float_or_none(v, scale=1, invscale=1, default=None):
+    if v is None:
+        return default
+    try:
+        return float(v) * invscale / scale
+    except (ValueError, TypeError):
+        return default
+
+
+def bool_or_none(v, default=None):
+    return v if isinstance(v, bool) else default
+
+
+def strip_or_none(v, default=None):
+    return v.strip() if isinstance(v, compat_str) else default
+
+
+def url_or_none(url):
+    if not url or not isinstance(url, compat_str):
+        return None
+    url = url.strip()
+    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
+
+
+def parse_duration(s):
+    if not isinstance(s, compat_basestring):
+        return None
+
+    s = s.strip()
+
+    days, hours, mins, secs, ms = [None] * 5
+    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
+    if m:
+        days, hours, mins, secs, ms = m.groups()
+    else:
+        m = re.match(
+            r'''(?ix)(?:P?
+                (?:
+                    [0-9]+\s*y(?:ears?)?\s*
+                )?
+                (?:
+                    [0-9]+\s*m(?:onths?)?\s*
+                )?
+                (?:
+                    [0-9]+\s*w(?:eeks?)?\s*
+                )?
+                (?:
+                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
+                )?
+                T)?
+                (?:
+                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
+                )?
+                (?:
+                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
+                )?
+                (?:
+                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
+                )?Z?$''', s)
+        if m:
+            days, hours, mins, secs, ms = m.groups()
+        else:
+            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
+            if m:
+                hours, mins = m.groups()
+            else:
+                return None
+
+    duration = 0
+    if secs:
+        duration += float(secs)
+    if mins:
+        duration += float(mins) * 60
+    if hours:
+        duration += float(hours) * 60 * 60
+    if days:
+        duration += float(days) * 24 * 60 * 60
+    if ms:
+        duration += float(ms)
+    return duration
+
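+# Illustrative usage of parse_duration above (not part of the original diff):
+#   parse_duration('1:23:45') == 5025.0   # 1h 23m 45s
+#   parse_duration('3 min')   == 180.0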
+
+def prepend_extension(filename, ext, expected_real_ext=None):
+    name, real_ext = os.path.splitext(filename)
+    return (
+        '{0}.{1}{2}'.format(name, ext, real_ext)
+        if not expected_real_ext or real_ext[1:] == expected_real_ext
+        else '{0}.{1}'.format(filename, ext))
+
+
+def replace_extension(filename, ext, expected_real_ext=None):
+    name, real_ext = os.path.splitext(filename)
+    return '{0}.{1}'.format(
+        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
+        ext)
+
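+# Illustrative usage of prepend_extension/replace_extension above (not part of
+# the original diff):
+#   prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
+#   replace_extension('video.mp4', 'webm') == 'video.webm'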
+
+def check_executable(exe, args=[]):
+    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
+    args can be a list of arguments for a short output (like -version) """
+    try:
+        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+    except OSError:
+        return False
+    return exe
+
+
+def get_exe_version(exe, args=['--version'],
+                    version_re=None, unrecognized='present'):
+    """ Returns the version of the specified executable,
+    or False if the executable is not present """
+    try:
+        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
+        # SIGTTOU if youtube-dlc is run in the background.
+        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
+        out, _ = subprocess.Popen(
+            [encodeArgument(exe)] + args,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
+    except OSError:
+        return False
+    if isinstance(out, bytes):  # Python 2.x
+        out = out.decode('ascii', 'ignore')
+    return detect_exe_version(out, version_re, unrecognized)
+
+
+def detect_exe_version(output, version_re=None, unrecognized='present'):
+    assert isinstance(output, compat_str)
+    if version_re is None:
+        version_re = r'version\s+([-0-9._a-zA-Z]+)'
+    m = re.search(version_re, output)
+    if m:
+        return m.group(1)
+    else:
+        return unrecognized
+
+
+class PagedList(object):
+    def __len__(self):
+        # This is only useful for tests
+        return len(self.getslice())
+
+
+class OnDemandPagedList(PagedList):
+    def __init__(self, pagefunc, pagesize, use_cache=True):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+        self._use_cache = use_cache
+        if use_cache:
+            self._cache = {}
+
+    def getslice(self, start=0, end=None):
+        res = []
+        for pagenum in itertools.count(start // self._pagesize):
+            firstid = pagenum * self._pagesize
+            nextfirstid = pagenum * self._pagesize + self._pagesize
+            if start >= nextfirstid:
+                continue
+
+            page_results = None
+            if self._use_cache:
+                page_results = self._cache.get(pagenum)
+            if page_results is None:
+                page_results = list(self._pagefunc(pagenum))
+            if self._use_cache:
+                self._cache[pagenum] = page_results
+
+            startv = (
+                start % self._pagesize
+                if firstid <= start < nextfirstid
+                else 0)
+
+            endv = (
+                ((end - 1) % self._pagesize) + 1
+                if (end is not None and firstid <= end <= nextfirstid)
+                else None)
+
+            if startv != 0 or endv is not None:
+                page_results = page_results[startv:endv]
+            res.extend(page_results)
+
+            # A little optimization: if the current page is not "full", i.e.
+            # does not contain page_size videos, then we can assume that this
+            # page is the last one - there are no more ids on further pages -
+            # so there is no need to query again.
+            if len(page_results) + startv < self._pagesize:
+                break
+
+            # If we got the whole page, but the next page is not interesting,
+            # break out early as well
+            if end == nextfirstid:
+                break
+        return res
+
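+# Illustrative usage of OnDemandPagedList above (not part of the original diff;
+# the lambda is a stand-in page function producing two entries per page):
+#   pl = OnDemandPagedList(lambda n: iter(range(n * 2, n * 2 + 2)), 2)
+#   pl.getslice(1, 5) == [1, 2, 3, 4]   # the end index is exclusive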
+
+class InAdvancePagedList(PagedList):
+    def __init__(self, pagefunc, pagecount, pagesize):
+        self._pagefunc = pagefunc
+        self._pagecount = pagecount
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        start_page = start // self._pagesize
+        end_page = (
+            self._pagecount if end is None else (end // self._pagesize + 1))
+        skip_elems = start - start_page * self._pagesize
+        only_more = None if end is None else end - start
+        for pagenum in range(start_page, end_page):
+            page = list(self._pagefunc(pagenum))
+            if skip_elems:
+                page = page[skip_elems:]
+                skip_elems = None
+            if only_more is not None:
+                if len(page) < only_more:
+                    only_more -= len(page)
+                else:
+                    page = page[:only_more]
+                    res.extend(page)
+                    break
+            res.extend(page)
+        return res
+
+
+def uppercase_escape(s):
+    unicode_escape = codecs.getdecoder('unicode_escape')
+    return re.sub(
+        r'\\U[0-9a-fA-F]{8}',
+        lambda m: unicode_escape(m.group(0))[0],
+        s)
+
+
+def lowercase_escape(s):
+    unicode_escape = codecs.getdecoder('unicode_escape')
+    return re.sub(
+        r'\\u[0-9a-fA-F]{4}',
+        lambda m: unicode_escape(m.group(0))[0],
+        s)
+
+
+def escape_rfc3986(s):
+    """Escape non-ASCII characters as suggested by RFC 3986"""
+    if sys.version_info < (3, 0) and isinstance(s, compat_str):
+        s = s.encode('utf-8')
+    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
+
+
+def escape_url(url):
+    """Escape URL as suggested by RFC 3986"""
+    url_parsed = compat_urllib_parse_urlparse(url)
+    return url_parsed._replace(
+        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
+        path=escape_rfc3986(url_parsed.path),
+        params=escape_rfc3986(url_parsed.params),
+        query=escape_rfc3986(url_parsed.query),
+        fragment=escape_rfc3986(url_parsed.fragment)
+    ).geturl()
+
+
+def read_batch_urls(batch_fd):
+    def fixup(url):
+        if not isinstance(url, compat_str):
+            url = url.decode('utf-8', 'replace')
+        BOM_UTF8 = '\xef\xbb\xbf'
+        if url.startswith(BOM_UTF8):
+            url = url[len(BOM_UTF8):]
+        url = url.strip()
+        if url.startswith(('#', ';', ']')):
+            return False
+        return url
+
+    with contextlib.closing(batch_fd) as fd:
+        return [url for url in map(fixup, fd) if url]
+
+
+def urlencode_postdata(*args, **kargs):
+    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
+
+
+def update_url_query(url, query):
+    if not query:
+        return url
+    parsed_url = compat_urlparse.urlparse(url)
+    qs = compat_parse_qs(parsed_url.query)
+    qs.update(query)
+    return compat_urlparse.urlunparse(parsed_url._replace(
+        query=compat_urllib_parse_urlencode(qs, True)))
+
+
+def update_Request(req, url=None, data=None, headers={}, query={}):
+    req_headers = req.headers.copy()
+    req_headers.update(headers)
+    req_data = data or req.data
+    req_url = update_url_query(url or req.get_full_url(), query)
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = compat_urllib_request.Request
+    new_req = req_type(
+        req_url, data=req_data, headers=req_headers,
+        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+    if hasattr(req, 'timeout'):
+        new_req.timeout = req.timeout
+    return new_req
+
+
+def _multipart_encode_impl(data, boundary):
+    content_type = 'multipart/form-data; boundary=%s' % boundary
+
+    out = b''
+    for k, v in data.items():
+        out += b'--' + boundary.encode('ascii') + b'\r\n'
+        if isinstance(k, compat_str):
+            k = k.encode('utf-8')
+        if isinstance(v, compat_str):
+            v = v.encode('utf-8')
+        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
+        if boundary.encode('ascii') in content:
+            raise ValueError('Boundary overlaps with data')
+        out += content
+
+    out += b'--' + boundary.encode('ascii') + b'--\r\n'
+
+    return out, content_type
+
+
+def multipart_encode(data, boundary=None):
+    '''
+    Encode a dict to RFC 7578-compliant form-data
+
+    data:
+        A dict where keys and values can be either Unicode or bytes-like
+        objects.
+    boundary:
+        If specified, it must be a Unicode object, and it is used as the
+        boundary. Otherwise a random boundary is generated.
+
+    Reference: https://tools.ietf.org/html/rfc7578
+    '''
+    has_specified_boundary = boundary is not None
+
+    while True:
+        if boundary is None:
+            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+
+        try:
+            out, content_type = _multipart_encode_impl(data, boundary)
+            break
+        except ValueError:
+            if has_specified_boundary:
+                raise
+            boundary = None
+
+    return out, content_type
+
+
+def dict_get(d, key_or_keys, default=None, skip_false_values=True):
+    if isinstance(key_or_keys, (list, tuple)):
+        for key in key_or_keys:
+            if key not in d or d[key] is None or skip_false_values and not d[key]:
+                continue
+            return d[key]
+        return default
+    return d.get(key_or_keys, default)
+
+
+def try_get(src, getter, expected_type=None):
+    if not isinstance(getter, (list, tuple)):
+        getter = [getter]
+    for get in getter:
+        try:
+            v = get(src)
+        except (AttributeError, KeyError, TypeError, IndexError):
+            pass
+        else:
+            if expected_type is None or isinstance(v, expected_type):
+                return v
+
+
+def merge_dicts(*dicts):
+    merged = {}
+    for a_dict in dicts:
+        for k, v in a_dict.items():
+            if v is None:
+                continue
+            if (k not in merged
+                    or (isinstance(v, compat_str) and v
+                        and isinstance(merged[k], compat_str)
+                        and not merged[k])):
+                merged[k] = v
+    return merged
+
+
+def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
+    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
+
+
+US_RATINGS = {
+    'G': 0,
+    'PG': 10,
+    'PG-13': 13,
+    'R': 16,
+    'NC': 18,
+}
+
+
+TV_PARENTAL_GUIDELINES = {
+    'TV-Y': 0,
+    'TV-Y7': 7,
+    'TV-G': 0,
+    'TV-PG': 0,
+    'TV-14': 14,
+    'TV-MA': 17,
+}
+
+
+def parse_age_limit(s):
+    if type(s) == int:
+        return s if 0 <= s <= 21 else None
+    if not isinstance(s, compat_basestring):
+        return None
+    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
+    if m:
+        return int(m.group('age'))
+    if s in US_RATINGS:
+        return US_RATINGS[s]
+    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
+    if m:
+        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
+    return None
+
+
+def strip_jsonp(code):
+    return re.sub(
+        r'''(?sx)^
+            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
+            (?:\s*&&\s*(?P=func_name))?
+            \s*\(\s*(?P<callback_data>.*)\);?
+            \s*?(?://[^\n]*)*$''',
+        r'\g<callback_data>', code)
+
+
+def js_to_json(code):
+    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
+    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
+    INTEGER_TABLE = (
+        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
+        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
+    )
+
+    def fix_kv(m):
+        v = m.group(0)
+        if v in ('true', 'false', 'null'):
+            return v
+        elif v.startswith('/*') or v.startswith('//') or v == ',':
+            return ""
+
+        if v[0] in ("'", '"'):
+            v = re.sub(r'(?s)\\.|"', lambda m: {
+                '"': '\\"',
+                "\\'": "'",
+                '\\\n': '',
+                '\\x': '\\u00',
+            }.get(m.group(0), m.group(0)), v[1:-1])
+
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(1), base)
+                return '"%d":' % i if v.endswith(':') else '%d' % i
+
+        return '"%s"' % v
+
+    return re.sub(r'''(?sx)
+        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
+        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
+        {comment}|,(?={skip}[\]}}])|
+        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
+        [0-9]+(?={skip}:)
+        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
+
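+# Illustrative usage of js_to_json above (not part of the original diff):
+#   js_to_json("{abc: 'def',}") == '{"abc": "def"}'
+# (bare keys are quoted, single quotes are converted, trailing commas dropped)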
+
+def qualities(quality_ids):
+    """ Get a numeric quality value out of a list of possible values """
+    def q(qid):
+        try:
+            return quality_ids.index(qid)
+        except ValueError:
+            return -1
+    return q
+
+
+DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
+
+
+def limit_length(s, length):
+    """ Add ellipses to overly long strings """
+    if s is None:
+        return None
+    ELLIPSES = '...'
+    if len(s) > length:
+        return s[:length - len(ELLIPSES)] + ELLIPSES
+    return s
+
+
+def version_tuple(v):
+    return tuple(int(e) for e in re.split(r'[-.]', v))
+
+
+def is_outdated_version(version, limit, assume_new=True):
+    if not version:
+        return not assume_new
+    try:
+        return version_tuple(version) < version_tuple(limit)
+    except ValueError:
+        return not assume_new
+
+
+def ytdl_is_updateable():
+    """ Returns if youtube-dlc can be updated with -U """
+    from zipimport import zipimporter
+
+    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
+
+
+def args_to_str(args):
+    # Get a short string representation for a subprocess command
+    return ' '.join(compat_shlex_quote(a) for a in args)
+
+
+def error_to_compat_str(err):
+    err_str = str(err)
+    # On Python 2, the error byte string must be decoded with the proper
+    # encoding rather than ASCII
+    if sys.version_info[0] < 3:
+        err_str = err_str.decode(preferredencoding())
+    return err_str
+
+
+def mimetype2ext(mt):
+    if mt is None:
+        return None
+
+    ext = {
+        'audio/mp4': 'm4a',
+        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
+        # as it's the most popular one
+        'audio/mpeg': 'mp3',
+        'audio/x-wav': 'wav',
+    }.get(mt)
+    if ext is not None:
+        return ext
+
+    _, _, res = mt.rpartition('/')
+    res = res.split(';')[0].strip().lower()
+
+    return {
+        '3gpp': '3gp',
+        'smptett+xml': 'tt',
+        'ttaf+xml': 'dfxp',
+        'ttml+xml': 'ttml',
+        'x-flv': 'flv',
+        'x-mp4-fragmented': 'mp4',
+        'x-ms-sami': 'sami',
+        'x-ms-wmv': 'wmv',
+        'mpegurl': 'm3u8',
+        'x-mpegurl': 'm3u8',
+        'vnd.apple.mpegurl': 'm3u8',
+        'dash+xml': 'mpd',
+        'f4m+xml': 'f4m',
+        'hds+xml': 'f4m',
+        'vnd.ms-sstr+xml': 'ism',
+        'quicktime': 'mov',
+        'mp2t': 'ts',
+    }.get(res, res)
+
+
+def parse_codecs(codecs_str):
+    # http://tools.ietf.org/html/rfc6381
+    if not codecs_str:
+        return {}
+    splited_codecs = list(filter(None, map(
+        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
+    vcodec, acodec = None, None
+    for full_codec in splited_codecs:
+        codec = full_codec.split('.')[0]
+        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
+            if not vcodec:
+                vcodec = full_codec
+        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
+            if not acodec:
+                acodec = full_codec
+        else:
+            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
+    if not vcodec and not acodec:
+        if len(splited_codecs) == 2:
+            return {
+                'vcodec': splited_codecs[0],
+                'acodec': splited_codecs[1],
+            }
+    else:
+        return {
+            'vcodec': vcodec or 'none',
+            'acodec': acodec or 'none',
+        }
+    return {}
+
+
+def urlhandle_detect_ext(url_handle):
+    getheader = url_handle.headers.get
+
+    cd = getheader('Content-Disposition')
+    if cd:
+        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        if m:
+            e = determine_ext(m.group('filename'), default_ext=None)
+            if e:
+                return e
+
+    return mimetype2ext(getheader('Content-Type'))
+
+
+def encode_data_uri(data, mime_type):
+    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
+def age_restricted(content_limit, age_limit):
+    """ Returns True iff the content should be blocked """
+
+    if age_limit is None:  # No limit set
+        return False
+    if content_limit is None:
+        return False  # Content available for everyone
+    return age_limit < content_limit
+
+
+def is_html(first_bytes):
+    """ Detect whether a file contains HTML by examining its first bytes. """
+
+    BOMS = [
+        (b'\xef\xbb\xbf', 'utf-8'),
+        (b'\x00\x00\xfe\xff', 'utf-32-be'),
+        (b'\xff\xfe\x00\x00', 'utf-32-le'),
+        (b'\xff\xfe', 'utf-16-le'),
+        (b'\xfe\xff', 'utf-16-be'),
+    ]
+    for bom, enc in BOMS:
+        if first_bytes.startswith(bom):
+            s = first_bytes[len(bom):].decode(enc, 'replace')
+            break
+    else:
+        s = first_bytes.decode('utf-8', 'replace')
+
+    return re.match(r'^\s*<', s)
+
+
+def determine_protocol(info_dict):
+    protocol = info_dict.get('protocol')
+    if protocol is not None:
+        return protocol
+
+    url = info_dict['url']
+    if url.startswith('rtmp'):
+        return 'rtmp'
+    elif url.startswith('mms'):
+        return 'mms'
+    elif url.startswith('rtsp'):
+        return 'rtsp'
+
+    ext = determine_ext(url)
+    if ext == 'm3u8':
+        return 'm3u8'
+    elif ext == 'f4m':
+        return 'f4m'
+
+    return compat_urllib_parse_urlparse(url).scheme
+
+
+def render_table(header_row, data):
+    """ Render a list of rows, each as a list of values """
+    table = [header_row] + data
+    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
+    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
+    return '\n'.join(format_str % tuple(row) for row in table)
+
+
+def _match_one(filter_part, dct):
+    COMPARISON_OPERATORS = {
+        '<': operator.lt,
+        '<=': operator.le,
+        '>': operator.gt,
+        '>=': operator.ge,
+        '=': operator.eq,
+        '!=': operator.ne,
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<key>[a-z_]+)
+        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+        (?:
+            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
+            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
+            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
+        )
+        \s*$
+        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = COMPARISON_OPERATORS[m.group('op')]
+        actual_value = dct.get(m.group('key'))
+        if (m.group('quotedstrval') is not None
+            or m.group('strval') is not None
+            # If the original field is a string and the matching comparison
+            # value is a number, we should respect the origin of the original
+            # field and process the comparison value as a string (see
+            # https://github.com/ytdl-org/youtube-dl/issues/11082).
+            or actual_value is not None and m.group('intval') is not None
+                and isinstance(actual_value, compat_str)):
+            if m.group('op') not in ('=', '!='):
+                raise ValueError(
+                    'Operator %s does not support string values!' % m.group('op'))
+            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
+            quote = m.group('quote')
+            if quote is not None:
+                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
+        else:
+            try:
+                comparison_value = int(m.group('intval'))
+            except ValueError:
+                comparison_value = parse_filesize(m.group('intval'))
+                if comparison_value is None:
+                    comparison_value = parse_filesize(m.group('intval') + 'B')
+                if comparison_value is None:
+                    raise ValueError(
+                        'Invalid integer value %r in filter part %r' % (
+                            m.group('intval'), filter_part))
+        if actual_value is None:
+            return m.group('none_inclusive')
+        return op(actual_value, comparison_value)
+
+    UNARY_OPERATORS = {
+        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
+        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
+    }
+    operator_rex = re.compile(r'''(?x)\s*
+        (?P<op>%s)\s*(?P<key>[a-z_]+)
+        \s*$
+        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
+    m = operator_rex.search(filter_part)
+    if m:
+        op = UNARY_OPERATORS[m.group('op')]
+        actual_value = dct.get(m.group('key'))
+        return op(actual_value)
+
+    raise ValueError('Invalid filter part %r' % filter_part)
+
+
+def match_str(filter_str, dct):
+    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
+
+    return all(
+        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
+
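+# Illustrative usage of match_str above (not part of the original diff; the
+# field names are example keys from an info dict):
+#   match_str('like_count > 100 & dislike_count <? 50',
+#             {'like_count': 190, 'dislike_count': 10}) == True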
+
+def match_filter_func(filter_str):
+    def _match_func(info_dict):
+        if match_str(filter_str, info_dict):
+            return None
+        else:
+            video_title = info_dict.get('title', info_dict.get('id', 'video'))
+            return '%s does not pass filter %s, skipping ...' % (video_title, filter_str)
+    return _match_func
+
+
+def parse_dfxp_time_expr(time_expr):
+    if not time_expr:
+        return
+
+    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
+    if mobj:
+        return float(mobj.group('time_offset'))
+
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
+    if mobj:
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
+
+
+def srt_subtitles_timecode(seconds):
+    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
+
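+# Illustrative usage of srt_subtitles_timecode above (not part of the original
+# diff):
+#   srt_subtitles_timecode(3723.5) == '01:02:03,500'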
+
+def dfxp2srt(dfxp_data):
+    '''
+    @param dfxp_data A bytes-like object containing DFXP data
+    @returns A unicode object containing converted SRT data
+    '''
+    LEGACY_NAMESPACES = (
+        (b'http://www.w3.org/ns/ttml', [
+            b'http://www.w3.org/2004/11/ttaf1',
+            b'http://www.w3.org/2006/04/ttaf1',
+            b'http://www.w3.org/2006/10/ttaf1',
+        ]),
+        (b'http://www.w3.org/ns/ttml#styling', [
+            b'http://www.w3.org/ns/ttml#style',
+        ]),
+    )
+
+    SUPPORTED_STYLING = [
+        'color',
+        'fontFamily',
+        'fontSize',
+        'fontStyle',
+        'fontWeight',
+        'textDecoration'
+    ]
+
+    _x = functools.partial(xpath_with_ns, ns_map={
+        'xml': 'http://www.w3.org/XML/1998/namespace',
+        'ttml': 'http://www.w3.org/ns/ttml',
+        'tts': 'http://www.w3.org/ns/ttml#styling',
+    })
+
+    styles = {}
+    default_style = {}
+
+    class TTMLPElementParser(object):
+        _out = ''
+        _unclosed_elements = []
+        _applied_styles = []
+
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), 'br'):
+                self._out += '\n'
+            else:
+                unclosed_elements = []
+                style = {}
+                element_style_id = attrib.get('style')
+                if default_style:
+                    style.update(default_style)
+                if element_style_id:
+                    style.update(styles.get(element_style_id, {}))
+                for prop in SUPPORTED_STYLING:
+                    prop_val = attrib.get(_x('tts:' + prop))
+                    if prop_val:
+                        style[prop] = prop_val
+                if style:
+                    font = ''
+                    for k, v in sorted(style.items()):
+                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
+                            continue
+                        if k == 'color':
+                            font += ' color="%s"' % v
+                        elif k == 'fontSize':
+                            font += ' size="%s"' % v
+                        elif k == 'fontFamily':
+                            font += ' face="%s"' % v
+                        elif k == 'fontWeight' and v == 'bold':
+                            self._out += '<b>'
+                            unclosed_elements.append('b')
+                        elif k == 'fontStyle' and v == 'italic':
+                            self._out += '<i>'
+                            unclosed_elements.append('i')
+                        elif k == 'textDecoration' and v == 'underline':
+                            self._out += '<u>'
+                            unclosed_elements.append('u')
+                    if font:
+                        self._out += '<font' + font + '>'
+                        unclosed_elements.append('font')
+                    applied_style = {}
+                    if self._applied_styles:
+                        applied_style.update(self._applied_styles[-1])
+                    applied_style.update(style)
+                    self._applied_styles.append(applied_style)
+                self._unclosed_elements.append(unclosed_elements)
+
+        def end(self, tag):
+            if tag not in (_x('ttml:br'), 'br'):
+                unclosed_elements = self._unclosed_elements.pop()
+                for element in reversed(unclosed_elements):
+                    self._out += '</%s>' % element
+                if unclosed_elements and self._applied_styles:
+                    self._applied_styles.pop()
+
+        def data(self, data):
+            self._out += data
+
+        def close(self):
+            return self._out.strip()
+
+    def parse_node(node):
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()
+
+    for k, v in LEGACY_NAMESPACES:
+        for ns in v:
+            dfxp_data = dfxp_data.replace(ns, k)
+
+    dfxp = compat_etree_fromstring(dfxp_data)
+    out = []
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+
+    if not paras:
+        raise ValueError('Invalid dfxp/TTML subtitle')
+
+    repeat = False
+    while True:
+        for style in dfxp.findall(_x('.//ttml:style')):
+            style_id = style.get('id') or style.get(_x('xml:id'))
+            if not style_id:
+                continue
+            parent_style_id = style.get('style')
+            if parent_style_id:
+                if parent_style_id not in styles:
+                    repeat = True
+                    continue
+                styles[style_id] = styles[parent_style_id].copy()
+            for prop in SUPPORTED_STYLING:
+                prop_val = style.get(_x('tts:' + prop))
+                if prop_val:
+                    styles.setdefault(style_id, {})[prop] = prop_val
+        if repeat:
+            repeat = False
+        else:
+            break
+
+    for p in ('body', 'div'):
+        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
+        if ele is None:
+            continue
+        style = styles.get(ele.get('style'))
+        if not style:
+            continue
+        default_style.update(style)
+
+    for para, index in zip(paras, itertools.count(1)):
+        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
+        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
+        if begin_time is None:
+            continue
+        if not end_time:
+            if not dur:
+                continue
+            end_time = begin_time + dur
+        out.append('%d\n%s --> %s\n%s\n\n' % (
+            index,
+            srt_subtitles_timecode(begin_time),
+            srt_subtitles_timecode(end_time),
+            parse_node(para)))
+
+    return ''.join(out)
+
+
+def cli_option(params, command_option, param):
+    param = params.get(param)
+    if param:
+        param = compat_str(param)
+    return [command_option, param] if param is not None else []
+
+
+def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
+    param = params.get(param)
+    if param is None:
+        return []
+    assert isinstance(param, bool)
+    if separator:
+        return [command_option + separator + (true_value if param else false_value)]
+    return [command_option, true_value if param else false_value]
+
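+# Illustrative usage of cli_bool_option above (not part of the original diff;
+# the option and parameter names are examples only):
+#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate',
+#                   'nocheckcertificate') == ['--no-check-certificate', 'true']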
+
+def cli_valueless_option(params, command_option, param, expected_value=True):
+    param = params.get(param)
+    return [command_option] if param == expected_value else []
+
+
+def cli_configuration_args(params, param, default=[]):
+    ex_args = params.get(param)
+    if ex_args is None:
+        return default
+    assert isinstance(ex_args, list)
+    return ex_args
+
+
+class ISO639Utils(object):
+    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
+    _lang_map = {
+        'aa': 'aar',
+        'ab': 'abk',
+        'ae': 'ave',
+        'af': 'afr',
+        'ak': 'aka',
+        'am': 'amh',
+        'an': 'arg',
+        'ar': 'ara',
+        'as': 'asm',
+        'av': 'ava',
+        'ay': 'aym',
+        'az': 'aze',
+        'ba': 'bak',
+        'be': 'bel',
+        'bg': 'bul',
+        'bh': 'bih',
+        'bi': 'bis',
+        'bm': 'bam',
+        'bn': 'ben',
+        'bo': 'bod',
+        'br': 'bre',
+        'bs': 'bos',
+        'ca': 'cat',
+        'ce': 'che',
+        'ch': 'cha',
+        'co': 'cos',
+        'cr': 'cre',
+        'cs': 'ces',
+        'cu': 'chu',
+        'cv': 'chv',
+        'cy': 'cym',
+        'da': 'dan',
+        'de': 'deu',
+        'dv': 'div',
+        'dz': 'dzo',
+        'ee': 'ewe',
+        'el': 'ell',
+        'en': 'eng',
+        'eo': 'epo',
+        'es': 'spa',
+        'et': 'est',
+        'eu': 'eus',
+        'fa': 'fas',
+        'ff': 'ful',
+        'fi': 'fin',
+        'fj': 'fij',
+        'fo': 'fao',
+        'fr': 'fra',
+        'fy': 'fry',
+        'ga': 'gle',
+        'gd': 'gla',
+        'gl': 'glg',
+        'gn': 'grn',
+        'gu': 'guj',
+        'gv': 'glv',
+        'ha': 'hau',
+        'he': 'heb',
+        'iw': 'heb',  # Replaced by he in 1989 revision
+        'hi': 'hin',
+        'ho': 'hmo',
+        'hr': 'hrv',
+        'ht': 'hat',
+        'hu': 'hun',
+        'hy': 'hye',
+        'hz': 'her',
+        'ia': 'ina',
+        'id': 'ind',
+        'in': 'ind',  # Replaced by id in 1989 revision
+        'ie': 'ile',
+        'ig': 'ibo',
+        'ii': 'iii',
+        'ik': 'ipk',
+        'io': 'ido',
+        'is': 'isl',
+        'it': 'ita',
+        'iu': 'iku',
+        'ja': 'jpn',
+        'jv': 'jav',
+        'ka': 'kat',
+        'kg': 'kon',
+        'ki': 'kik',
+        'kj': 'kua',
+        'kk': 'kaz',
+        'kl': 'kal',
+        'km': 'khm',
+        'kn': 'kan',
+        'ko': 'kor',
+        'kr': 'kau',
+        'ks': 'kas',
+        'ku': 'kur',
+        'kv': 'kom',
+        'kw': 'cor',
+        'ky': 'kir',
+        'la': 'lat',
+        'lb': 'ltz',
+        'lg': 'lug',
+        'li': 'lim',
+        'ln': 'lin',
+        'lo': 'lao',
+        'lt': 'lit',
+        'lu': 'lub',
+        'lv': 'lav',
+        'mg': 'mlg',
+        'mh': 'mah',
+        'mi': 'mri',
+        'mk': 'mkd',
+        'ml': 'mal',
+        'mn': 'mon',
+        'mr': 'mar',
+        'ms': 'msa',
+        'mt': 'mlt',
+        'my': 'mya',
+        'na': 'nau',
+        'nb': 'nob',
+        'nd': 'nde',
+        'ne': 'nep',
+        'ng': 'ndo',
+        'nl': 'nld',
+        'nn': 'nno',
+        'no': 'nor',
+        'nr': 'nbl',
+        'nv': 'nav',
+        'ny': 'nya',
+        'oc': 'oci',
+        'oj': 'oji',
+        'om': 'orm',
+        'or': 'ori',
+        'os': 'oss',
+        'pa': 'pan',
+        'pi': 'pli',
+        'pl': 'pol',
+        'ps': 'pus',
+        'pt': 'por',
+        'qu': 'que',
+        'rm': 'roh',
+        'rn': 'run',
+        'ro': 'ron',
+        'ru': 'rus',
+        'rw': 'kin',
+        'sa': 'san',
+        'sc': 'srd',
+        'sd': 'snd',
+        'se': 'sme',
+        'sg': 'sag',
+        'si': 'sin',
+        'sk': 'slk',
+        'sl': 'slv',
+        'sm': 'smo',
+        'sn': 'sna',
+        'so': 'som',
+        'sq': 'sqi',
+        'sr': 'srp',
+        'ss': 'ssw',
+        'st': 'sot',
+        'su': 'sun',
+        'sv': 'swe',
+        'sw': 'swa',
+        'ta': 'tam',
+        'te': 'tel',
+        'tg': 'tgk',
+        'th': 'tha',
+        'ti': 'tir',
+        'tk': 'tuk',
+        'tl': 'tgl',
+        'tn': 'tsn',
+        'to': 'ton',
+        'tr': 'tur',
+        'ts': 'tso',
+        'tt': 'tat',
+        'tw': 'twi',
+        'ty': 'tah',
+        'ug': 'uig',
+        'uk': 'ukr',
+        'ur': 'urd',
+        'uz': 'uzb',
+        've': 'ven',
+        'vi': 'vie',
+        'vo': 'vol',
+        'wa': 'wln',
+        'wo': 'wol',
+        'xh': 'xho',
+        'yi': 'yid',
+        'ji': 'yid',  # Replaced by yi in 1989 revision
+        'yo': 'yor',
+        'za': 'zha',
+        'zh': 'zho',
+        'zu': 'zul',
+    }
+
+    @classmethod
+    def short2long(cls, code):
+        """Convert language code from ISO 639-1 to ISO 639-2/T"""
+        return cls._lang_map.get(code[:2])
+
+    @classmethod
+    def long2short(cls, code):
+        """Convert language code from ISO 639-2/T to ISO 639-1"""
+        for short_name, long_name in cls._lang_map.items():
+            if long_name == code:
+                return short_name
+
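+# For example (a quick sketch of the mapping above):
+#
+#   >>> ISO639Utils.short2long('en')
+#   'eng'
+#   >>> ISO639Utils.short2long('en-US')  # only the first two chars are used
+#   'eng'
+#   >>> ISO639Utils.long2short('deu')
+#   'de'
+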
+
+class ISO3166Utils(object):
+    # From http://data.okfn.org/data/core/country-list
+    _country_map = {
+        'AF': 'Afghanistan',
+        'AX': 'Åland Islands',
+        'AL': 'Albania',
+        'DZ': 'Algeria',
+        'AS': 'American Samoa',
+        'AD': 'Andorra',
+        'AO': 'Angola',
+        'AI': 'Anguilla',
+        'AQ': 'Antarctica',
+        'AG': 'Antigua and Barbuda',
+        'AR': 'Argentina',
+        'AM': 'Armenia',
+        'AW': 'Aruba',
+        'AU': 'Australia',
+        'AT': 'Austria',
+        'AZ': 'Azerbaijan',
+        'BS': 'Bahamas',
+        'BH': 'Bahrain',
+        'BD': 'Bangladesh',
+        'BB': 'Barbados',
+        'BY': 'Belarus',
+        'BE': 'Belgium',
+        'BZ': 'Belize',
+        'BJ': 'Benin',
+        'BM': 'Bermuda',
+        'BT': 'Bhutan',
+        'BO': 'Bolivia, Plurinational State of',
+        'BQ': 'Bonaire, Sint Eustatius and Saba',
+        'BA': 'Bosnia and Herzegovina',
+        'BW': 'Botswana',
+        'BV': 'Bouvet Island',
+        'BR': 'Brazil',
+        'IO': 'British Indian Ocean Territory',
+        'BN': 'Brunei Darussalam',
+        'BG': 'Bulgaria',
+        'BF': 'Burkina Faso',
+        'BI': 'Burundi',
+        'KH': 'Cambodia',
+        'CM': 'Cameroon',
+        'CA': 'Canada',
+        'CV': 'Cape Verde',
+        'KY': 'Cayman Islands',
+        'CF': 'Central African Republic',
+        'TD': 'Chad',
+        'CL': 'Chile',
+        'CN': 'China',
+        'CX': 'Christmas Island',
+        'CC': 'Cocos (Keeling) Islands',
+        'CO': 'Colombia',
+        'KM': 'Comoros',
+        'CG': 'Congo',
+        'CD': 'Congo, the Democratic Republic of the',
+        'CK': 'Cook Islands',
+        'CR': 'Costa Rica',
+        'CI': 'Côte d\'Ivoire',
+        'HR': 'Croatia',
+        'CU': 'Cuba',
+        'CW': 'Curaçao',
+        'CY': 'Cyprus',
+        'CZ': 'Czech Republic',
+        'DK': 'Denmark',
+        'DJ': 'Djibouti',
+        'DM': 'Dominica',
+        'DO': 'Dominican Republic',
+        'EC': 'Ecuador',
+        'EG': 'Egypt',
+        'SV': 'El Salvador',
+        'GQ': 'Equatorial Guinea',
+        'ER': 'Eritrea',
+        'EE': 'Estonia',
+        'ET': 'Ethiopia',
+        'FK': 'Falkland Islands (Malvinas)',
+        'FO': 'Faroe Islands',
+        'FJ': 'Fiji',
+        'FI': 'Finland',
+        'FR': 'France',
+        'GF': 'French Guiana',
+        'PF': 'French Polynesia',
+        'TF': 'French Southern Territories',
+        'GA': 'Gabon',
+        'GM': 'Gambia',
+        'GE': 'Georgia',
+        'DE': 'Germany',
+        'GH': 'Ghana',
+        'GI': 'Gibraltar',
+        'GR': 'Greece',
+        'GL': 'Greenland',
+        'GD': 'Grenada',
+        'GP': 'Guadeloupe',
+        'GU': 'Guam',
+        'GT': 'Guatemala',
+        'GG': 'Guernsey',
+        'GN': 'Guinea',
+        'GW': 'Guinea-Bissau',
+        'GY': 'Guyana',
+        'HT': 'Haiti',
+        'HM': 'Heard Island and McDonald Islands',
+        'VA': 'Holy See (Vatican City State)',
+        'HN': 'Honduras',
+        'HK': 'Hong Kong',
+        'HU': 'Hungary',
+        'IS': 'Iceland',
+        'IN': 'India',
+        'ID': 'Indonesia',
+        'IR': 'Iran, Islamic Republic of',
+        'IQ': 'Iraq',
+        'IE': 'Ireland',
+        'IM': 'Isle of Man',
+        'IL': 'Israel',
+        'IT': 'Italy',
+        'JM': 'Jamaica',
+        'JP': 'Japan',
+        'JE': 'Jersey',
+        'JO': 'Jordan',
+        'KZ': 'Kazakhstan',
+        'KE': 'Kenya',
+        'KI': 'Kiribati',
+        'KP': 'Korea, Democratic People\'s Republic of',
+        'KR': 'Korea, Republic of',
+        'KW': 'Kuwait',
+        'KG': 'Kyrgyzstan',
+        'LA': 'Lao People\'s Democratic Republic',
+        'LV': 'Latvia',
+        'LB': 'Lebanon',
+        'LS': 'Lesotho',
+        'LR': 'Liberia',
+        'LY': 'Libya',
+        'LI': 'Liechtenstein',
+        'LT': 'Lithuania',
+        'LU': 'Luxembourg',
+        'MO': 'Macao',
+        'MK': 'Macedonia, the Former Yugoslav Republic of',
+        'MG': 'Madagascar',
+        'MW': 'Malawi',
+        'MY': 'Malaysia',
+        'MV': 'Maldives',
+        'ML': 'Mali',
+        'MT': 'Malta',
+        'MH': 'Marshall Islands',
+        'MQ': 'Martinique',
+        'MR': 'Mauritania',
+        'MU': 'Mauritius',
+        'YT': 'Mayotte',
+        'MX': 'Mexico',
+        'FM': 'Micronesia, Federated States of',
+        'MD': 'Moldova, Republic of',
+        'MC': 'Monaco',
+        'MN': 'Mongolia',
+        'ME': 'Montenegro',
+        'MS': 'Montserrat',
+        'MA': 'Morocco',
+        'MZ': 'Mozambique',
+        'MM': 'Myanmar',
+        'NA': 'Namibia',
+        'NR': 'Nauru',
+        'NP': 'Nepal',
+        'NL': 'Netherlands',
+        'NC': 'New Caledonia',
+        'NZ': 'New Zealand',
+        'NI': 'Nicaragua',
+        'NE': 'Niger',
+        'NG': 'Nigeria',
+        'NU': 'Niue',
+        'NF': 'Norfolk Island',
+        'MP': 'Northern Mariana Islands',
+        'NO': 'Norway',
+        'OM': 'Oman',
+        'PK': 'Pakistan',
+        'PW': 'Palau',
+        'PS': 'Palestine, State of',
+        'PA': 'Panama',
+        'PG': 'Papua New Guinea',
+        'PY': 'Paraguay',
+        'PE': 'Peru',
+        'PH': 'Philippines',
+        'PN': 'Pitcairn',
+        'PL': 'Poland',
+        'PT': 'Portugal',
+        'PR': 'Puerto Rico',
+        'QA': 'Qatar',
+        'RE': 'Réunion',
+        'RO': 'Romania',
+        'RU': 'Russian Federation',
+        'RW': 'Rwanda',
+        'BL': 'Saint Barthélemy',
+        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
+        'KN': 'Saint Kitts and Nevis',
+        'LC': 'Saint Lucia',
+        'MF': 'Saint Martin (French part)',
+        'PM': 'Saint Pierre and Miquelon',
+        'VC': 'Saint Vincent and the Grenadines',
+        'WS': 'Samoa',
+        'SM': 'San Marino',
+        'ST': 'Sao Tome and Principe',
+        'SA': 'Saudi Arabia',
+        'SN': 'Senegal',
+        'RS': 'Serbia',
+        'SC': 'Seychelles',
+        'SL': 'Sierra Leone',
+        'SG': 'Singapore',
+        'SX': 'Sint Maarten (Dutch part)',
+        'SK': 'Slovakia',
+        'SI': 'Slovenia',
+        'SB': 'Solomon Islands',
+        'SO': 'Somalia',
+        'ZA': 'South Africa',
+        'GS': 'South Georgia and the South Sandwich Islands',
+        'SS': 'South Sudan',
+        'ES': 'Spain',
+        'LK': 'Sri Lanka',
+        'SD': 'Sudan',
+        'SR': 'Suriname',
+        'SJ': 'Svalbard and Jan Mayen',
+        'SZ': 'Swaziland',
+        'SE': 'Sweden',
+        'CH': 'Switzerland',
+        'SY': 'Syrian Arab Republic',
+        'TW': 'Taiwan, Province of China',
+        'TJ': 'Tajikistan',
+        'TZ': 'Tanzania, United Republic of',
+        'TH': 'Thailand',
+        'TL': 'Timor-Leste',
+        'TG': 'Togo',
+        'TK': 'Tokelau',
+        'TO': 'Tonga',
+        'TT': 'Trinidad and Tobago',
+        'TN': 'Tunisia',
+        'TR': 'Turkey',
+        'TM': 'Turkmenistan',
+        'TC': 'Turks and Caicos Islands',
+        'TV': 'Tuvalu',
+        'UG': 'Uganda',
+        'UA': 'Ukraine',
+        'AE': 'United Arab Emirates',
+        'GB': 'United Kingdom',
+        'US': 'United States',
+        'UM': 'United States Minor Outlying Islands',
+        'UY': 'Uruguay',
+        'UZ': 'Uzbekistan',
+        'VU': 'Vanuatu',
+        'VE': 'Venezuela, Bolivarian Republic of',
+        'VN': 'Viet Nam',
+        'VG': 'Virgin Islands, British',
+        'VI': 'Virgin Islands, U.S.',
+        'WF': 'Wallis and Futuna',
+        'EH': 'Western Sahara',
+        'YE': 'Yemen',
+        'ZM': 'Zambia',
+        'ZW': 'Zimbabwe',
+    }
+
+    @classmethod
+    def short2full(cls, code):
+        """Convert an ISO 3166-2 country code to the corresponding full name"""
+        return cls._country_map.get(code.upper())
+
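+# For example:
+#
+#   >>> ISO3166Utils.short2full('de')
+#   'Germany'
+#   >>> ISO3166Utils.short2full('xx')  # unknown codes yield None
+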
+
+class GeoUtils(object):
+    # Major IPv4 address blocks per country
+    _country_ip_map = {
+        'AD': '46.172.224.0/19',
+        'AE': '94.200.0.0/13',
+        'AF': '149.54.0.0/17',
+        'AG': '209.59.64.0/18',
+        'AI': '204.14.248.0/21',
+        'AL': '46.99.0.0/16',
+        'AM': '46.70.0.0/15',
+        'AO': '105.168.0.0/13',
+        'AP': '182.50.184.0/21',
+        'AQ': '23.154.160.0/24',
+        'AR': '181.0.0.0/12',
+        'AS': '202.70.112.0/20',
+        'AT': '77.116.0.0/14',
+        'AU': '1.128.0.0/11',
+        'AW': '181.41.0.0/18',
+        'AX': '185.217.4.0/22',
+        'AZ': '5.197.0.0/16',
+        'BA': '31.176.128.0/17',
+        'BB': '65.48.128.0/17',
+        'BD': '114.130.0.0/16',
+        'BE': '57.0.0.0/8',
+        'BF': '102.178.0.0/15',
+        'BG': '95.42.0.0/15',
+        'BH': '37.131.0.0/17',
+        'BI': '154.117.192.0/18',
+        'BJ': '137.255.0.0/16',
+        'BL': '185.212.72.0/23',
+        'BM': '196.12.64.0/18',
+        'BN': '156.31.0.0/16',
+        'BO': '161.56.0.0/16',
+        'BQ': '161.0.80.0/20',
+        'BR': '191.128.0.0/12',
+        'BS': '24.51.64.0/18',
+        'BT': '119.2.96.0/19',
+        'BW': '168.167.0.0/16',
+        'BY': '178.120.0.0/13',
+        'BZ': '179.42.192.0/18',
+        'CA': '99.224.0.0/11',
+        'CD': '41.243.0.0/16',
+        'CF': '197.242.176.0/21',
+        'CG': '160.113.0.0/16',
+        'CH': '85.0.0.0/13',
+        'CI': '102.136.0.0/14',
+        'CK': '202.65.32.0/19',
+        'CL': '152.172.0.0/14',
+        'CM': '102.244.0.0/14',
+        'CN': '36.128.0.0/10',
+        'CO': '181.240.0.0/12',
+        'CR': '201.192.0.0/12',
+        'CU': '152.206.0.0/15',
+        'CV': '165.90.96.0/19',
+        'CW': '190.88.128.0/17',
+        'CY': '31.153.0.0/16',
+        'CZ': '88.100.0.0/14',
+        'DE': '53.0.0.0/8',
+        'DJ': '197.241.0.0/17',
+        'DK': '87.48.0.0/12',
+        'DM': '192.243.48.0/20',
+        'DO': '152.166.0.0/15',
+        'DZ': '41.96.0.0/12',
+        'EC': '186.68.0.0/15',
+        'EE': '90.190.0.0/15',
+        'EG': '156.160.0.0/11',
+        'ER': '196.200.96.0/20',
+        'ES': '88.0.0.0/11',
+        'ET': '196.188.0.0/14',
+        'EU': '2.16.0.0/13',
+        'FI': '91.152.0.0/13',
+        'FJ': '144.120.0.0/16',
+        'FK': '80.73.208.0/21',
+        'FM': '119.252.112.0/20',
+        'FO': '88.85.32.0/19',
+        'FR': '90.0.0.0/9',
+        'GA': '41.158.0.0/15',
+        'GB': '25.0.0.0/8',
+        'GD': '74.122.88.0/21',
+        'GE': '31.146.0.0/16',
+        'GF': '161.22.64.0/18',
+        'GG': '62.68.160.0/19',
+        'GH': '154.160.0.0/12',
+        'GI': '95.164.0.0/16',
+        'GL': '88.83.0.0/19',
+        'GM': '160.182.0.0/15',
+        'GN': '197.149.192.0/18',
+        'GP': '104.250.0.0/19',
+        'GQ': '105.235.224.0/20',
+        'GR': '94.64.0.0/13',
+        'GT': '168.234.0.0/16',
+        'GU': '168.123.0.0/16',
+        'GW': '197.214.80.0/20',
+        'GY': '181.41.64.0/18',
+        'HK': '113.252.0.0/14',
+        'HN': '181.210.0.0/16',
+        'HR': '93.136.0.0/13',
+        'HT': '148.102.128.0/17',
+        'HU': '84.0.0.0/14',
+        'ID': '39.192.0.0/10',
+        'IE': '87.32.0.0/12',
+        'IL': '79.176.0.0/13',
+        'IM': '5.62.80.0/20',
+        'IN': '117.192.0.0/10',
+        'IO': '203.83.48.0/21',
+        'IQ': '37.236.0.0/14',
+        'IR': '2.176.0.0/12',
+        'IS': '82.221.0.0/16',
+        'IT': '79.0.0.0/10',
+        'JE': '87.244.64.0/18',
+        'JM': '72.27.0.0/17',
+        'JO': '176.29.0.0/16',
+        'JP': '133.0.0.0/8',
+        'KE': '105.48.0.0/12',
+        'KG': '158.181.128.0/17',
+        'KH': '36.37.128.0/17',
+        'KI': '103.25.140.0/22',
+        'KM': '197.255.224.0/20',
+        'KN': '198.167.192.0/19',
+        'KP': '175.45.176.0/22',
+        'KR': '175.192.0.0/10',
+        'KW': '37.36.0.0/14',
+        'KY': '64.96.0.0/15',
+        'KZ': '2.72.0.0/13',
+        'LA': '115.84.64.0/18',
+        'LB': '178.135.0.0/16',
+        'LC': '24.92.144.0/20',
+        'LI': '82.117.0.0/19',
+        'LK': '112.134.0.0/15',
+        'LR': '102.183.0.0/16',
+        'LS': '129.232.0.0/17',
+        'LT': '78.56.0.0/13',
+        'LU': '188.42.0.0/16',
+        'LV': '46.109.0.0/16',
+        'LY': '41.252.0.0/14',
+        'MA': '105.128.0.0/11',
+        'MC': '88.209.64.0/18',
+        'MD': '37.246.0.0/16',
+        'ME': '178.175.0.0/17',
+        'MF': '74.112.232.0/21',
+        'MG': '154.126.0.0/17',
+        'MH': '117.103.88.0/21',
+        'MK': '77.28.0.0/15',
+        'ML': '154.118.128.0/18',
+        'MM': '37.111.0.0/17',
+        'MN': '49.0.128.0/17',
+        'MO': '60.246.0.0/16',
+        'MP': '202.88.64.0/20',
+        'MQ': '109.203.224.0/19',
+        'MR': '41.188.64.0/18',
+        'MS': '208.90.112.0/22',
+        'MT': '46.11.0.0/16',
+        'MU': '105.16.0.0/12',
+        'MV': '27.114.128.0/18',
+        'MW': '102.70.0.0/15',
+        'MX': '187.192.0.0/11',
+        'MY': '175.136.0.0/13',
+        'MZ': '197.218.0.0/15',
+        'NA': '41.182.0.0/16',
+        'NC': '101.101.0.0/18',
+        'NE': '197.214.0.0/18',
+        'NF': '203.17.240.0/22',
+        'NG': '105.112.0.0/12',
+        'NI': '186.76.0.0/15',
+        'NL': '145.96.0.0/11',
+        'NO': '84.208.0.0/13',
+        'NP': '36.252.0.0/15',
+        'NR': '203.98.224.0/19',
+        'NU': '49.156.48.0/22',
+        'NZ': '49.224.0.0/14',
+        'OM': '5.36.0.0/15',
+        'PA': '186.72.0.0/15',
+        'PE': '186.160.0.0/14',
+        'PF': '123.50.64.0/18',
+        'PG': '124.240.192.0/19',
+        'PH': '49.144.0.0/13',
+        'PK': '39.32.0.0/11',
+        'PL': '83.0.0.0/11',
+        'PM': '70.36.0.0/20',
+        'PR': '66.50.0.0/16',
+        'PS': '188.161.0.0/16',
+        'PT': '85.240.0.0/13',
+        'PW': '202.124.224.0/20',
+        'PY': '181.120.0.0/14',
+        'QA': '37.210.0.0/15',
+        'RE': '102.35.0.0/16',
+        'RO': '79.112.0.0/13',
+        'RS': '93.86.0.0/15',
+        'RU': '5.136.0.0/13',
+        'RW': '41.186.0.0/16',
+        'SA': '188.48.0.0/13',
+        'SB': '202.1.160.0/19',
+        'SC': '154.192.0.0/11',
+        'SD': '102.120.0.0/13',
+        'SE': '78.64.0.0/12',
+        'SG': '8.128.0.0/10',
+        'SI': '188.196.0.0/14',
+        'SK': '78.98.0.0/15',
+        'SL': '102.143.0.0/17',
+        'SM': '89.186.32.0/19',
+        'SN': '41.82.0.0/15',
+        'SO': '154.115.192.0/18',
+        'SR': '186.179.128.0/17',
+        'SS': '105.235.208.0/21',
+        'ST': '197.159.160.0/19',
+        'SV': '168.243.0.0/16',
+        'SX': '190.102.0.0/20',
+        'SY': '5.0.0.0/16',
+        'SZ': '41.84.224.0/19',
+        'TC': '65.255.48.0/20',
+        'TD': '154.68.128.0/19',
+        'TG': '196.168.0.0/14',
+        'TH': '171.96.0.0/13',
+        'TJ': '85.9.128.0/18',
+        'TK': '27.96.24.0/21',
+        'TL': '180.189.160.0/20',
+        'TM': '95.85.96.0/19',
+        'TN': '197.0.0.0/11',
+        'TO': '175.176.144.0/21',
+        'TR': '78.160.0.0/11',
+        'TT': '186.44.0.0/15',
+        'TV': '202.2.96.0/19',
+        'TW': '120.96.0.0/11',
+        'TZ': '156.156.0.0/14',
+        'UA': '37.52.0.0/14',
+        'UG': '102.80.0.0/13',
+        'US': '6.0.0.0/8',
+        'UY': '167.56.0.0/13',
+        'UZ': '84.54.64.0/18',
+        'VA': '212.77.0.0/19',
+        'VC': '207.191.240.0/21',
+        'VE': '186.88.0.0/13',
+        'VG': '66.81.192.0/20',
+        'VI': '146.226.0.0/16',
+        'VN': '14.160.0.0/11',
+        'VU': '202.80.32.0/20',
+        'WF': '117.20.32.0/21',
+        'WS': '202.4.32.0/19',
+        'YE': '134.35.0.0/16',
+        'YT': '41.242.116.0/22',
+        'ZA': '41.0.0.0/11',
+        'ZM': '102.144.0.0/13',
+        'ZW': '102.177.192.0/18',
+    }
+
+    @classmethod
+    def random_ipv4(cls, code_or_block):
+        # Accepts either a two-letter country code or an explicit CIDR block
+        if len(code_or_block) == 2:
+            block = cls._country_ip_map.get(code_or_block.upper())
+            if not block:
+                return None
+        else:
+            block = code_or_block
+        addr, preflen = block.split('/')
+        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
+        addr_max = addr_min | (0xffffffff >> int(preflen))
+        return compat_str(socket.inet_ntoa(
+            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
+
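+# A quick sketch of random_ipv4 (the returned address is, of course, random):
+#
+#   >>> GeoUtils.random_ipv4('NL')   # some address inside 145.96.0.0/11
+#   '145.103.211.40'
+#   >>> GeoUtils.random_ipv4('10.0.0.0/8')  # explicit blocks work too
+#   '10.187.54.3'
+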
+
+class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
+    def __init__(self, proxies=None):
+        # Set default handlers. The lambda's default arguments bind the
+        # current scheme and the bound proxy_open method, so each handler
+        # closes over its own values.
+        for type in ('http', 'https'):
+            setattr(self, '%s_open' % type,
+                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
+                        meth(r, proxy, type))
+        compat_urllib_request.ProxyHandler.__init__(self, proxies)
+
+    def proxy_open(self, req, proxy, type):
+        req_proxy = req.headers.get('Ytdl-request-proxy')
+        if req_proxy is not None:
+            proxy = req_proxy
+            del req.headers['Ytdl-request-proxy']
+
+        if proxy == '__noproxy__':
+            return None  # No Proxy
+        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+            req.add_header('Ytdl-socks-proxy', proxy)
+            # youtube-dlc's http/https handlers take care of wrapping the socket with SOCKS
+            return None
+        return compat_urllib_request.ProxyHandler.proxy_open(
+            self, req, proxy, type)
+
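+# A minimal usage sketch (hypothetical proxy URLs): install the handler in an
+# opener and override the proxy for a single request via the
+# Ytdl-request-proxy header:
+#
+#   opener = compat_urllib_request.build_opener(
+#       PerRequestProxyHandler({'http': 'http://proxy.example:3128'}))
+#   req = compat_urllib_request.Request('http://example.com/')
+#   req.add_header('Ytdl-request-proxy', '__noproxy__')  # bypass for this one
+#   opener.open(req)
+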
+
+# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
+# released into Public Domain
+# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
+
+def long_to_bytes(n, blocksize=0):
+    """long_to_bytes(n:long, blocksize:int) : string
+    Convert a long integer to a byte string.
+
+    If optional blocksize is given and greater than zero, pad the front of the
+    byte string with binary zeros so that the length is a multiple of
+    blocksize.
+    """
+    # after much testing, this algorithm was deemed to be the fastest
+    s = b''
+    n = int(n)
+    while n > 0:
+        s = compat_struct_pack('>I', n & 0xffffffff) + s
+        n = n >> 32
+    # strip off leading zeros
+    for i in range(len(s)):
+        if s[i] != b'\000'[0]:
+            break
+    else:
+        # only happens when n == 0
+        s = b'\000'
+        i = 0
+    s = s[i:]
+    # add back some pad bytes.  this could be done more efficiently w.r.t. the
+    # de-padding being done above, but sigh...
+    if blocksize > 0 and len(s) % blocksize:
+        s = (blocksize - len(s) % blocksize) * b'\000' + s
+    return s
+
+
+def bytes_to_long(s):
+    """bytes_to_long(string) : long
+    Convert a byte string to a long integer.
+
+    This is (essentially) the inverse of long_to_bytes().
+    """
+    acc = 0
+    length = len(s)
+    if length % 4:
+        extra = (4 - length % 4)
+        s = b'\000' * extra + s
+        length = length + extra
+    for i in range(0, length, 4):
+        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
+    return acc
+
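+# The two conversions above round-trip, e.g.:
+#
+#   >>> long_to_bytes(65537)
+#   b'\x01\x00\x01'
+#   >>> long_to_bytes(65537, blocksize=8)  # front-padded to a multiple of 8
+#   b'\x00\x00\x00\x00\x00\x01\x00\x01'
+#   >>> bytes_to_long(b'\x01\x00\x01')
+#   65537
+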
+
+def ohdave_rsa_encrypt(data, exponent, modulus):
+    '''
+    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
+
+    Input:
+        data: data to encrypt, bytes-like object
+        exponent, modulus: parameter e and N of RSA algorithm, both integer
+    Output: hex string of encrypted data
+
+    Limitation: supports one block encryption only
+    '''
+
+    payload = int(binascii.hexlify(data[::-1]), 16)
+    encrypted = pow(payload, exponent, modulus)
+    return '%x' % encrypted
+
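+# A toy example with deliberately tiny parameters (real sites supply large
+# e and N in their player JavaScript); note the input bytes are reversed,
+# so the payload is interpreted little-endian:
+#
+#   >>> ohdave_rsa_encrypt(b'\x02', 3, 101)   # 2 ** 3 % 101 == 8
+#   '8'
+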
+
+def pkcs1pad(data, length):
+    """
+    Padding input data with PKCS#1 scheme
+
+    @param {int[]} data        input data
+    @param {int}   length      target length
+    @returns {int[]}           padded data
+    """
+    if len(data) > length - 11:
+        raise ValueError('Input data too long for PKCS#1 padding')
+
+    # PKCS#1 v1.5 requires the padding string to consist of nonzero octets
+    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
+    return [0, 2] + pseudo_random + [0] + data
+
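+# Note that pkcs1pad works on lists of ints, not bytes. The result follows
+# the EME-PKCS1-v1_5 block layout, e.g. (random bytes abbreviated):
+#
+#   >>> pkcs1pad([255, 254], 16)
+#   [0, 2, <11 nonzero random bytes>, 0, 255, 254]
+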
+
+def encode_base_n(num, n, table=None):
+    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    if not table:
+        table = FULL_TABLE[:n]
+
+    if n > len(table):
+        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
+
+    if num == 0:
+        return table[0]
+
+    ret = ''
+    while num:
+        ret = table[num % n] + ret
+        num = num // n
+    return ret
+
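+# For example:
+#
+#   >>> encode_base_n(255, 16)
+#   'ff'
+#   >>> encode_base_n(0, 2)
+#   '0'
+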
+
+def decode_packed_codes(code):
+    mobj = re.search(PACKED_CODES_RE, code)
+    obfuscated_code, base, count, symbols = mobj.groups()
+    base = int(base)
+    count = int(count)
+    symbols = symbols.split('|')
+    symbol_table = {}
+
+    while count:
+        count -= 1
+        base_n_count = encode_base_n(count, base)
+        symbol_table[base_n_count] = symbols[count] or base_n_count
+
+    return re.sub(
+        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
+        obfuscated_code)
+
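+# A minimal illustration of the P.A.C.K.E.R. layout this reverses (assuming
+# the usual trailing "}('p',a,c,'k'.split('|')..." shape matched by
+# PACKED_CODES_RE, which is defined near the top of this module):
+#
+#   >>> decode_packed_codes("}('0 1',36,2,'hello|world'.split('|'),0,{}))")
+#   'hello world'
+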
+
+def caesar(s, alphabet, shift):
+    if shift == 0:
+        return s
+    alphabet_len = len(alphabet)
+    return ''.join(
+        alphabet[(alphabet.index(c) + shift) % alphabet_len] if c in alphabet else c
+        for c in s)
+
+
+def rot47(s):
+    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
+
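+# For example:
+#
+#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 2)
+#   'cde'
+#   >>> rot47('foo')   # applying rot47 twice restores the input
+#   '7@@'
+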
+
+def parse_m3u8_attributes(attrib):
+    info = {}
+    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
+        if val.startswith('"'):
+            val = val[1:-1]
+        info[key] = val
+    return info
+
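+# For example, parsing the attribute list of an EXT-X-STREAM-INF tag:
+#
+#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
+#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}
+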
+
+def urshift(val, n):
+    return val >> n if val >= 0 else (val + 0x100000000) >> n
+
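+# Emulates JavaScript's unsigned right shift (>>>) on 32-bit values, e.g.:
+#
+#   >>> urshift(-1, 1)   # -1 >>> 1 in JS
+#   2147483647
+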
+
+# Based on png2str() written by @gdkchan and improved by @yokrysty
+# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
+def decode_png(png_data):
+    # Reference: https://www.w3.org/TR/PNG/
+    header = png_data[8:]
+
+    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
+        raise IOError('Not a valid PNG file.')
+
+    int_map = {1: '>B', 2: '>H', 4: '>I'}
+    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
+
+    chunks = []
+
+    while header:
+        length = unpack_integer(header[:4])
+        header = header[4:]
+
+        chunk_type = header[:4]
+        header = header[4:]
+
+        chunk_data = header[:length]
+        header = header[length:]
+
+        header = header[4:]  # Skip CRC
+
+        chunks.append({
+            'type': chunk_type,
+            'length': length,
+            'data': chunk_data
+        })
+
+    ihdr = chunks[0]['data']
+
+    width = unpack_integer(ihdr[:4])
+    height = unpack_integer(ihdr[4:8])
+
+    idat = b''
+
+    for chunk in chunks:
+        if chunk['type'] == b'IDAT':
+            idat += chunk['data']
+
+    if not idat:
+        raise IOError('Unable to read PNG data.')
+
+    decompressed_data = bytearray(zlib.decompress(idat))
+
+    stride = width * 3
+    pixels = []
+
+    def _get_pixel(idx):
+        x = idx % stride
+        y = idx // stride
+        return pixels[y][x]
+
+    for y in range(height):
+        basePos = y * (1 + stride)
+        filter_type = decompressed_data[basePos]
+
+        current_row = []
+
+        pixels.append(current_row)
+
+        for x in range(stride):
+            color = decompressed_data[1 + basePos + x]
+            basex = y * stride + x
+            left = 0
+            up = 0
+
+            if x > 2:
+                left = _get_pixel(basex - 3)
+            if y > 0:
+                up = _get_pixel(basex - stride)
+
+            if filter_type == 1:  # Sub
+                color = (color + left) & 0xff
+            elif filter_type == 2:  # Up
+                color = (color + up) & 0xff
+            elif filter_type == 3:  # Average
+                color = (color + ((left + up) >> 1)) & 0xff
+            elif filter_type == 4:  # Paeth
+                a = left
+                b = up
+                c = 0
+
+                if x > 2 and y > 0:
+                    c = _get_pixel(basex - stride - 3)
+
+                p = a + b - c
+
+                pa = abs(p - a)
+                pb = abs(p - b)
+                pc = abs(p - c)
+
+                if pa <= pb and pa <= pc:
+                    color = (color + a) & 0xff
+                elif pb <= pc:
+                    color = (color + b) & 0xff
+                else:
+                    color = (color + c) & 0xff
+
+            current_row.append(color)
+
+    return width, height, pixels
+
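+# A usage sketch (the PNG bytes are hypothetical): extractors use this to
+# pull data out of PNG images without depending on an image library:
+#
+#   width, height, pixels = decode_png(png_data)
+#   # pixels[y] is row y as a flat list of R, G, B values, 3 per pixel
+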
+
+def write_xattr(path, key, value):
+    # This mess below finds the best xattr tool for the job
+    try:
+        # try the pyxattr module...
+        import xattr
+
+        if hasattr(xattr, 'set'):  # pyxattr
+            # Unicode arguments are not supported in python-pyxattr until
+            # version 0.5.0
+            # See https://github.com/ytdl-org/youtube-dl/issues/5498
+            pyxattr_required_version = '0.5.0'
+            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
+                # TODO: fallback to CLI tools
+                raise XAttrUnavailableError(
+                    'python-pyxattr is detected but is too old. '
+                    'youtube-dlc requires %s or above while your version is %s. '
+                    'Falling back to other xattr implementations' % (
+                        pyxattr_required_version, xattr.__version__))
+
+            setxattr = xattr.set
+        else:  # xattr
+            setxattr = xattr.setxattr
+
+        try:
+            setxattr(path, key, value)
+        except EnvironmentError as e:
+            raise XAttrMetadataError(e.errno, e.strerror)
+
+    except ImportError:
+        if compat_os_name == 'nt':
+            # Write xattrs to NTFS Alternate Data Streams:
+            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
+            assert ':' not in key
+            assert os.path.exists(path)
+
+            ads_fn = path + ':' + key
+            try:
+                with open(ads_fn, 'wb') as f:
+                    f.write(value)
+            except EnvironmentError as e:
+                raise XAttrMetadataError(e.errno, e.strerror)
+        else:
+            user_has_setfattr = check_executable('setfattr', ['--version'])
+            user_has_xattr = check_executable('xattr', ['-h'])
+
+            if user_has_setfattr or user_has_xattr:
+
+                value = value.decode('utf-8')
+                if user_has_setfattr:
+                    executable = 'setfattr'
+                    opts = ['-n', key, '-v', value]
+                elif user_has_xattr:
+                    executable = 'xattr'
+                    opts = ['-w', key, value]
+
+                cmd = ([encodeFilename(executable, True)]
+                       + [encodeArgument(o) for o in opts]
+                       + [encodeFilename(path, True)])
+
+                try:
+                    p = subprocess.Popen(
+                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+                except EnvironmentError as e:
+                    raise XAttrMetadataError(e.errno, e.strerror)
+                stdout, stderr = p.communicate()
+                stderr = stderr.decode('utf-8', 'replace')
+                if p.returncode != 0:
+                    raise XAttrMetadataError(p.returncode, stderr)
+
+            else:
+                # On Unix, but we can't find pyxattr, setfattr, or xattr.
+                if sys.platform.startswith('linux'):
+                    raise XAttrUnavailableError(
+                        "Couldn't find a tool to set the xattrs. "
+                        "Install either the python 'pyxattr' or 'xattr' "
+                        "modules, or the GNU 'attr' package "
+                        "(which contains the 'setfattr' tool).")
+                else:
+                    raise XAttrUnavailableError(
+                        "Couldn't find a tool to set the xattrs. "
+                        "Install either the python 'xattr' module, "
+                        "or the 'xattr' binary.")
+
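+# A typical call (hypothetical path; note the value must be bytes):
+#
+#   write_xattr('video.mp4', 'user.xdg.referrer.url', url.encode('utf-8'))
+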
+
+def random_birthday(year_field, month_field, day_field):
+    start_date = datetime.date(1950, 1, 1)
+    end_date = datetime.date(1995, 12, 31)
+    offset = random.randint(0, (end_date - start_date).days)
+    random_date = start_date + datetime.timedelta(offset)
+    return {
+        year_field: str(random_date.year),
+        month_field: str(random_date.month),
+        day_field: str(random_date.day),
+    }
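+
+# Illustrative use (hypothetical field names): filling an age-gate form with
+# a random date of birth:
+#
+#   >>> random_birthday('birth_year', 'birth_month', 'birth_day')
+#   {'birth_year': '1987', 'birth_month': '6', 'birth_day': '23'}  # random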
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
new file mode 100644 (file)
index 0000000..2d5d946
--- /dev/null
@@ -0,0 +1,3 @@
+from __future__ import unicode_literals
+
+__version__ = '2020.09.02-dev2'
index 2d5d94649de845546aa6f36d8b66dd691f44e563..b48afe6c42737fc7214f0260e5a61c7becbc51a9 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2020.09.02-dev2'
+__version__ = '2020.09.02-dev3'