yt_dlp/extractor/twitcasting.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     clean_html,
   9     float_or_none,
  10     get_element_by_class,
  11     get_element_by_id,
  12     parse_duration,
  13     str_to_int,
  14     unified_timestamp,
  15     urlencode_postdata,
  16 )
  17
  18
  19 class TwitCastingIE(InfoExtractor):
  20     _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)'
  21     _TESTS = [{
  22         'url': 'https://twitcasting.tv/ivetesangalo/movie/2357609',
  23         'md5': '745243cad58c4681dc752490f7540d7f',
  24         'info_dict': {
  25             'id': '2357609',
  26             'ext': 'mp4',
  27             'title': 'Live #2357609',
  28             'uploader_id': 'ivetesangalo',
  29             'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.',
  30             'thumbnail': r're:^https?://.*\.jpg$',
  31             'upload_date': '20110822',
  32             'timestamp': 1314010824,
  33             'duration': 32,
  34             'view_count': int,
  35         },
  36         'params': {
  37             'skip_download': True,
  38         },
  39     }, {
  40         'url': 'https://twitcasting.tv/mttbernardini/movie/3689740',
  41         'info_dict': {
  42             'id': '3689740',
  43             'ext': 'mp4',
  44             'title': 'Live playing something #3689740',
  45             'uploader_id': 'mttbernardini',
  46             'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.',
  47             'thumbnail': r're:^https?://.*\.jpg$',
  48             'upload_date': '20120212',
  49             'timestamp': 1329028024,
  50             'duration': 681,
  51             'view_count': int,
  52         },
  53         'params': {
  54             'skip_download': True,
  55             'videopassword': 'abc',
  56         },
  57     }]
  58
  59     def _real_extract(self, url):
  60         uploader_id, video_id = re.match(self._VALID_URL, url).groups()
  61
  62         video_password = self.get_param('videopassword')
  63         request_data = None
  64         if video_password:
  65             request_data = urlencode_postdata({
  66                 'password': video_password,
  67             })
  68         webpage = self._download_webpage(
  69             url, video_id, data=request_data,
  70             headers={'Origin': 'https://twitcasting.tv'})
  71
  72         title = clean_html(get_element_by_id(
  73             'movietitle', webpage)) or self._html_search_meta(
  74             ['og:title', 'twitter:title'], webpage, fatal=True)
  75
  76         video_js_data = {}
  77         m3u8_url = self._search_regex(
  78             r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
  79             webpage, 'm3u8 url', group='url', default=None)
  80         if not m3u8_url:
  81             video_js_data = self._parse_json(self._search_regex(
  82                 r'data-movie-playlist=(["\'])(?P<url>(?:(?!\1).)+)',
  83                 webpage, 'movie playlist', group='url'), video_id)
  84             if isinstance(video_js_data, dict):
  85                 video_js_data = list(video_js_data.values())[0]
  86             video_js_data = video_js_data[0]
  87             m3u8_url = video_js_data['source']['url']
  88
  89         formats = self._extract_m3u8_formats(
  90             m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
  91         thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage)
  92         description = clean_html(get_element_by_id(
  93             'authorcomment', webpage)) or self._html_search_meta(
  94             ['description', 'og:description', 'twitter:description'], webpage)
  95         duration = float_or_none(video_js_data.get(
  96             'duration'), 1000) or parse_duration(clean_html(
  97                 get_element_by_class('tw-player-duration-time', webpage)))
  98         view_count = str_to_int(self._search_regex(
  99             r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None))
 100         timestamp = unified_timestamp(self._search_regex(
 101             r'data-toggle="true"[^>]+datetime="([^"]+)"',
 102             webpage, 'datetime', None))
 103
 104         return {
 105             'id': video_id,
 106             'title': title,
 107             'description': description,
 108             'thumbnail': thumbnail,
 109             'timestamp': timestamp,
 110             'uploader_id': uploader_id,
 111             'duration': duration,
 112             'view_count': view_count,
 113             'formats': formats,
 114         }