]> jfr.im git - yt-dlp.git/commitdiff
Merge branch 'pr/duboku' of https://github.com/lkho/youtube-dl into lkho-pr/duboku
author: Tom-Oliver Heidel <redacted>
Sat, 12 Sep 2020 03:47:27 +0000 (05:47 +0200)
committer: Tom-Oliver Heidel <redacted>
Sat, 12 Sep 2020 03:47:27 +0000 (05:47 +0200)
1  2 
youtube_dlc/extractor/duboku.py
youtube_dlc/extractor/extractors.py

index 0000000000000000000000000000000000000000,fdc695bf451adeb9d90ed7067de6dbe862b27f87..fdc695bf451adeb9d90ed7067de6dbe862b27f87
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,242 +1,242 @@@
+ # coding: utf-8
+ from __future__ import unicode_literals
+ import re
+ from .common import InfoExtractor
+ from ..compat import compat_urlparse
+ from ..utils import (
+     clean_html,
+     extract_attributes,
+     ExtractorError,
+     get_elements_by_class,
+     int_or_none,
+     js_to_json,
+     smuggle_url,
+     unescapeHTML,
+ )
+ def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+     """Return a list of regex match objects for every matching element in the passed HTML document.
+     Each match exposes the named groups 'tag' and 'content' (the text between
+     the opening tag and the first closing tag of the same name), plus
+     'attribute' and 'value' when the corresponding filter was supplied.
+     tag is inserted into the pattern as a raw regex (any tag name if None);
+     attribute is always matched literally; value is matched literally unless
+     escape_value is False, in which case it is used as a regex fragment.
+     An empty list is returned when nothing matches.
+     """
+     tag_pattern = '[a-zA-Z0-9:._-]+' if tag is None else tag
+     attribute_pattern = (
+         '' if attribute is None
+         else r'\s+(?P<attribute>%s)' % re.escape(attribute))
+     if value is None:
+         value_pattern = ''
+     else:
+         value_body = re.escape(value) if escape_value else value
+         value_pattern = '=[\'"]?(?P<value>%s)[\'"]?' % value_body
+     # Verbose + dotall: other attributes may surround the requested one, and
+     # the content is matched lazily up to the first closing tag of the same name.
+     element_re = r'''(?xs)
+         <(?P<tag>%s)
+          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+          %s%s
+          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
+         \s*>
+         (?P<content>.*?)
+         </\1>
+     ''' % (tag_pattern, attribute_pattern, value_pattern)
+     return list(re.finditer(element_re, html))
+ def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
+     """Return the first match found by _get_elements_by_tag_and_attrib, or None if there is none."""
+     matches = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
+     if not matches:
+         return None
+     return matches[0]
+ class DubokuIE(InfoExtractor):
+     """Extractor for a single duboku.co episode page (/vodplay/<series>-<season>-<episode>.html)."""
+     IE_NAME = 'duboku'
+     IE_DESC = 'www.duboku.co'
+     _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
+     _TESTS = [{
+         'url': 'https://www.duboku.co/vodplay/1575-1-1.html',
+         'info_dict': {
+             'id': '1575-1-1',
+             'ext': 'ts',
+             'series': '白色月光',
+             'title': 'contains:白色月光',
+             'season_number': 1,
+             'episode_number': 1,
+         },
+         'params': {
+             'skip_download': 'm3u8 download',
+         },
+     }, {
+         'url': 'https://www.duboku.co/vodplay/1588-1-1.html',
+         'info_dict': {
+             'id': '1588-1-1',
+             'ext': 'ts',
+             'series': '亲爱的自己',
+             'title': 'contains:预告片',
+             'season_number': 1,
+             'episode_number': 1,
+         },
+         'params': {
+             'skip_download': 'm3u8 download',
+         },
+     }]
+     # Captures the JS object literal assigned to player_data inside a <script> tag.
+     _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
+     def _real_extract(self, url):
+         """Return the info dict for one episode.
+         Embedded (iframe) sources are delegated as a url_transparent result;
+         everything else is treated as an m3u8 manifest.
+         Raises ExtractorError when player_data carries no url.
+         """
+         video_id = self._match_id(url)
+         # _VALID_URL guarantees at least "<series>-<season>", but the episode
+         # component may be missing (e.g. "1575-1"), so it must not be indexed blindly.
+         id_parts = video_id.split('-')
+         series_id = id_parts[0]
+         season_id = id_parts[1]
+         episode_id = id_parts[2] if len(id_parts) > 2 else None
+         webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id
+         webpage_html = self._download_webpage(webpage_url, video_id)
+         # extract video url
+         player_data = self._search_regex(
+             self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
+         player_data = self._parse_json(player_data, video_id, js_to_json)
+         # extract title: pick the 'title' element whose link points at this
+         # series' detail page (/<series_id>.html)
+         series_title = None
+         title = None
+         for html in get_elements_by_class('title', webpage_html):
+             mobj = re.search(r'<a\s+.*>(.*)</a>', html)
+             if not mobj:
+                 continue
+             href = extract_attributes(mobj.group(0)).get('href')
+             if not href:
+                 continue
+             mobj1 = re.search(r'/(\d+)\.html', href)
+             if mobj1 and mobj1.group(1) == series_id:
+                 # the link text is the series name, the whole element the episode title
+                 series_title = clean_html(mobj.group(0))
+                 series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
+                 title = clean_html(html)
+                 title = re.sub(r'[\s\r\n\t]+', ' ', title)
+                 break
+         data_url = player_data.get('url')
+         if not data_url:
+             raise ExtractorError('Cannot find url in player_data')
+         # metadata shared by both result shapes
+         info = {
+             'id': video_id,
+             'title': title,
+             'series': series_title,
+             'season_number': int_or_none(season_id),
+             'season_id': season_id,
+             'episode_number': int_or_none(episode_id),
+             'episode_id': episode_id,
+         }
+         # if it is an embedded iframe, maybe it's an external source
+         if player_data.get('from') == 'iframe':
+             # use _type url_transparent to retain the meaningful details
+             # of the video.
+             # NOTE(review): title is deliberately left as-is (possibly None) here;
+             # presumably the delegated extractor then supplies it - confirm.
+             info.update({
+                 '_type': 'url_transparent',
+                 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}),
+             })
+             return info
+         if not title:
+             # A regular result needs a title; fall back to the page <title>,
+             # then to the id, instead of handing back None.
+             info['title'] = self._html_search_regex(
+                 r'<title[^>]*>([^<]+)</title>', webpage_html, 'title',
+                 default=None) or video_id
+         info['formats'] = self._extract_m3u8_formats(data_url, video_id, 'mp4')
+         info['http_headers'] = {'Referer': 'https://www.duboku.co/static/player/videojs.html'}
+         return info
+ class DubokuPlaylistIE(InfoExtractor):
+     """Extractor for a duboku.co series page (/voddetail/<series>.html).
+     A '#playlistN' URL fragment selects that playlist; without a fragment the
+     first playlist found in the page is used.
+     """
+     IE_NAME = 'duboku:list'
+     IE_DESC = 'www.duboku.co entire series'
+     _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*'
+     _TESTS = [{
+         'url': 'https://www.duboku.co/voddetail/1575.html',
+         'info_dict': {
+             'id': 'startswith:1575',
+             'title': '白色月光',
+         },
+         'playlist_count': 12,
+     }, {
+         'url': 'https://www.duboku.co/voddetail/1554.html',
+         'info_dict': {
+             'id': 'startswith:1554',
+             'title': '以家人之名',
+         },
+         'playlist_mincount': 30,
+     }, {
+         'url': 'https://www.duboku.co/voddetail/1554.html#playlist2',
+         'info_dict': {
+             'id': '1554#playlist2',
+             'title': '以家人之名',
+         },
+         'playlist_mincount': 27,
+     }]
+     def _real_extract(self, url):
+         """Return a playlist result whose entries are DubokuIE url results.
+         Raises ExtractorError if the URL does not match or no (non-empty)
+         playlist can be selected.
+         """
+         mobj = re.match(self._VALID_URL, url)
+         if mobj is None:
+             raise ExtractorError('Invalid URL: %s' % url)
+         series_id = mobj.group('id')
+         fragment = compat_urlparse.urlparse(url).fragment
+         webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id
+         webpage_html = self._download_webpage(webpage_url, series_id)
+         # extract title: <h1 class="title">, then the keywords meta tag, then <title>
+         title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
+         title = unescapeHTML(title.group('content')) if title else None
+         if not title:
+             title = self._html_search_meta('keywords', webpage_html)
+         if not title:
+             title = _get_element_by_tag_and_attrib(webpage_html, 'title')
+             title = unescapeHTML(title.group('content')) if title else None
+         # extract playlists: every element whose id looks like "playlist<N>"
+         playlists = {}
+         # Remember the first id in document order explicitly: plain dict
+         # iteration order is not guaranteed on Python 2 / Python < 3.7.
+         first_playlist_id = None
+         for div in _get_elements_by_tag_and_attrib(
+                 webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
+             playlist_id = div.group('value')
+             playlists[playlist_id] = [{
+                 'href': unescapeHTML(a.group('value')),
+                 'title': unescapeHTML(a.group('content'))
+             } for a in _get_elements_by_tag_and_attrib(
+                 div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False)]
+             if first_playlist_id is None:
+                 first_playlist_id = playlist_id
+         # select the specified playlist if url fragment exists
+         playlist_id = fragment if fragment else first_playlist_id
+         playlist = playlists.get(playlist_id)
+         if not playlist:
+             raise ExtractorError(
+                 ('Cannot find %s' % fragment) if fragment else 'Cannot extract playlist')
+         # return url results
+         return self.playlist_result([
+             self.url_result(
+                 compat_urlparse.urljoin('https://www.duboku.co', x['href']),
+                 ie=DubokuIE.ie_key(), video_title=x.get('title'))
+             for x in playlist], series_id + '#' + playlist_id, title)
index 42f93b4c7f1e7a2aede85796395888d812d7ba7b,40770171759a61f2aaa940e4e0bbe95e6d7c265d..e70e779ffcc04e844d8e1e7ac85150dcafc743c9
@@@ -36,10 -36,6 +36,10 @@@ from .afreecatv import AfreecaTVI
  from .airmozilla import AirMozillaIE
  from .aljazeera import AlJazeeraIE
  from .alphaporno import AlphaPornoIE
 +from .alura import (
 +    AluraIE,
 +    AluraCourseIE
 +)
  from .amcnetworks import AMCNetworksIE
  from .americastestkitchen import AmericasTestKitchenIE
  from .animeondemand import AnimeOnDemandIE
@@@ -266,10 -262,7 +266,10 @@@ from .daum import 
  )
  from .dbtv import DBTVIE
  from .dctp import DctpTvIE
 -from .deezer import DeezerPlaylistIE
 +from .deezer import (
 +    DeezerPlaylistIE,
 +    DeezerAlbumIE,
 +)
  from .democracynow import DemocracynowIE
  from .dfb import DFBIE
  from .dhm import DHMIE
@@@ -280,6 -273,7 +280,6 @@@ from .douyutv import 
      DouyuTVIE,
  )
  from .dplay import DPlayIE
 -from .dreisat import DreiSatIE
  from .drbonanza import DRBonanzaIE
  from .drtuber import DrTuberIE
  from .drtv import (
  )
  from .dtube import DTubeIE
  from .dvtv import DVTVIE
+ from .duboku import (
+     DubokuIE,
+     DubokuPlaylistIE
+ )
  from .dumpert import DumpertIE
  from .defense import DefenseGouvFrIE
  from .discovery import DiscoveryIE
@@@ -299,7 -297,6 +303,7 @@@ from .discoverynetworks import Discover
  from .discoveryvr import DiscoveryVRIE
  from .disney import DisneyIE
  from .dispeak import DigitallySpeakingIE
 +from .doodstream import DoodStreamIE
  from .dropbox import DropboxIE
  from .dw import (
      DWIE,
@@@ -447,7 -444,6 +451,7 @@@ from .hotstar import 
  )
  from .howcast import HowcastIE
  from .howstuffworks import HowStuffWorksIE
 +from .hrfensehen import HRFernsehenIE
  from .hrti import (
      HRTiIE,
      HRTiPlaylistIE,
@@@ -593,7 -589,6 +597,7 @@@ from .lynda import 
      LyndaCourseIE
  )
  from .m6 import M6IE
 +from .magentamusik360 import MagentaMusik360IE
  from .mailru import (
      MailRuIE,
      MailRuMusicIE,
@@@ -676,7 -671,6 +680,7 @@@ from .myvi import 
      MyviIE,
      MyviEmbedIE,
  )
 +from .myvideoge import MyVideoGeIE
  from .myvidster import MyVidsterIE
  from .nationalgeographic import (
      NationalGeographicVideoIE,
@@@ -868,10 -862,7 +872,10 @@@ from .pluralsight import 
      PluralsightCourseIE,
  )
  from .podomatic import PodomaticIE
 -from .pokemon import PokemonIE
 +from .pokemon import (
 +    PokemonIE,
 +    PokemonWatchIE,
 +)
  from .polskieradio import (
      PolskieRadioIE,
      PolskieRadioCategoryIE,
@@@ -1070,11 -1061,6 +1074,11 @@@ from .spike import 
      BellatorIE,
      ParamountNetworkIE,
  )
 +from .storyfire import (
 +    StoryFireIE,
 +    StoryFireUserIE,
 +    StoryFireSeriesIE,
 +)
  from .stitcher import StitcherIE
  from .sport5 import Sport5IE
  from .sportbox import SportBoxIE
@@@ -1225,7 -1211,6 +1229,7 @@@ from .tvnet import TVNetI
  from .tvnoe import TVNoeIE
  from .tvnow import (
      TVNowIE,
 +    TVNowFilmIE,
      TVNowNewIE,
      TVNowSeasonIE,
      TVNowAnnualIE,
@@@ -1248,11 -1233,14 +1252,11 @@@ from .twentymin import TwentyMinutenI
  from .twentythreevideo import TwentyThreeVideoIE
  from .twitcasting import TwitCastingIE
  from .twitch import (
 -    TwitchVideoIE,
 -    TwitchChapterIE,
      TwitchVodIE,
 -    TwitchProfileIE,
 -    TwitchAllVideosIE,
 -    TwitchUploadsIE,
 -    TwitchPastBroadcastsIE,
 -    TwitchHighlightsIE,
 +    TwitchCollectionIE,
 +    TwitchVideosIE,
 +    TwitchVideosClipsIE,
 +    TwitchVideosCollectionsIE,
      TwitchStreamIE,
      TwitchClipsIE,
  )