yt_dlp/extractor/cracked.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from .youtube import YoutubeIE
   5 from ..utils import (
   6     parse_iso8601,
   7     str_to_int,
   8 )
   9
  10
  11 class CrackedIE(InfoExtractor):
  12     _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
  13     _TESTS = [{
  14         'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
  15         'md5': '89b90b9824e3806ca95072c4d78f13f7',
  16         'info_dict': {
  17             'id': '19070',
  18             'ext': 'mp4',
  19             'title': 'If Animal Actors Got E! True Hollywood Stories',
  20             'timestamp': 1404954000,
  21             'upload_date': '20140710',
  22         },
  23     }, {
  24         # youtube embed
  25         'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
  26         'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
  27         'info_dict': {
  28             'id': 'EjI00A3rZD0',
  29             'ext': 'mp4',
  30             'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
  31             'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
  32             'upload_date': '20140725',
  33             'uploader_id': 'Cracked',
  34             'uploader': 'Cracked',
  35         },
  36     }]
  37
  38     def _real_extract(self, url):
  39         video_id = self._match_id(url)
  40
  41         webpage = self._download_webpage(url, video_id)
  42
  43         youtube_url = YoutubeIE._extract_url(webpage)
  44         if youtube_url:
  45             return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
  46
  47         video_url = self._html_search_regex(
  48             [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
  49             webpage, 'video URL')
  50
  51         title = self._search_regex(
  52             [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
  53             webpage, 'title')
  54
  55         description = self._search_regex(
  56             r'name="?(?:og:)?description"?\s+content="([^"]+)"',
  57             webpage, 'description', default=None)
  58
  59         timestamp = self._html_search_regex(
  60             r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)
  61         if timestamp:
  62             timestamp = parse_iso8601(timestamp[:-6])
  63
  64         view_count = str_to_int(self._html_search_regex(
  65             r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
  66             webpage, 'view count', fatal=False))
  67         comment_count = str_to_int(self._html_search_regex(
  68             r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
  69             webpage, 'comment count', fatal=False))
  70
  71         m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url)
  72         if m:
  73             width = int(m.group('width'))
  74             height = int(m.group('height'))
  75         else:
  76             width = height = None
  77
  78         return {
  79             'id': video_id,
  80             'url': video_url,
  81             'title': title,
  82             'description': description,
  83             'timestamp': timestamp,
  84             'view_count': view_count,
  85             'comment_count': comment_count,
  86             'height': height,
  87             'width': width,
  88         }