yt_dlp/extractor/camdemy.py

import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    clean_html,
    parse_duration,
    str_to_int,
    unified_strdate,
    xpath_text,
)
  11
  12
  13 class CamdemyIE(InfoExtractor):
  14     _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
  15     _TESTS = [{
  16         # single file
  17         'url': 'http://www.camdemy.com/media/5181/',
  18         'md5': '5a5562b6a98b37873119102e052e311b',
  19         'info_dict': {
  20             'id': '5181',
  21             'ext': 'mp4',
  22             'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
  23             'thumbnail': r're:^https?://.*\.jpg$',
  24             'creator': 'ss11spring',
  25             'duration': 1591,
  26             'upload_date': '20130114',
  27             'view_count': int,
  28         },
  29     }, {
  30         # With non-empty description
  31         # webpage returns "No permission or not login"
  32         'url': 'http://www.camdemy.com/media/13885',
  33         'md5': '4576a3bb2581f86c61044822adbd1249',
  34         'info_dict': {
  35             'id': '13885',
  36             'ext': 'mp4',
  37             'title': 'EverCam + Camdemy QuickStart',
  38             'thumbnail': r're:^https?://.*\.jpg$',
  39             'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
  40             'creator': 'evercam',
  41             'duration': 318,
  42         },
  43     }, {
  44         # External source (YouTube)
  45         'url': 'http://www.camdemy.com/media/14842',
  46         'info_dict': {
  47             'id': '2vsYQzNIsJo',
  48             'ext': 'mp4',
  49             'title': 'Excel 2013 Tutorial - How to add Password Protection',
  50             'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
  51             'upload_date': '20130211',
  52             'uploader': 'Hun Kim',
  53             'uploader_id': 'hunkimtutorials',
  54         },
  55         'params': {
  56             'skip_download': True,
  57         },
  58     }]
  59
  60     def _real_extract(self, url):
  61         video_id = self._match_id(url)
  62
  63         webpage = self._download_webpage(url, video_id)
  64
  65         src_from = self._html_search_regex(
  66             r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
  67             webpage, 'external source', default=None, group='url')
  68         if src_from:
  69             return self.url_result(src_from)
  70
  71         oembed_obj = self._download_json(
  72             'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
  73
  74         title = oembed_obj['title']
  75         thumb_url = oembed_obj['thumbnail_url']
  76         video_folder = urllib.parse.urljoin(thumb_url, 'video/')
  77         file_list_doc = self._download_xml(
  78             urllib.parse.urljoin(video_folder, 'fileList.xml'),
  79             video_id, 'Downloading filelist XML')
  80         file_name = file_list_doc.find('./video/item/fileName').text
  81         video_url = urllib.parse.urljoin(video_folder, file_name)
  82
  83         # Some URLs return "No permission or not login" in a webpage despite being
  84         # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
  85         upload_date = unified_strdate(self._search_regex(
  86             r'>published on ([^<]+)<', webpage,
  87             'upload date', default=None))
  88         view_count = str_to_int(self._search_regex(
  89             r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
  90             webpage, 'view count', default=None))
  91         description = self._html_search_meta(
  92             'description', webpage, default=None) or clean_html(
  93             oembed_obj.get('description'))
  94
  95         return {
  96             'id': video_id,
  97             'url': video_url,
  98             'title': title,
  99             'thumbnail': thumb_url,
 100             'description': description,
 101             'creator': oembed_obj.get('author_name'),
 102             'duration': parse_duration(oembed_obj.get('duration')),
 103             'upload_date': upload_date,
 104             'view_count': view_count,
 105         }
 106
 107
 108 class CamdemyFolderIE(InfoExtractor):
 109     _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
 110     _TESTS = [{
 111         # links with trailing slash
 112         'url': 'http://www.camdemy.com/folder/450',
 113         'info_dict': {
 114             'id': '450',
 115             'title': '信號與系統 2012 & 2011 (Signals and Systems)',
 116         },
 117         'playlist_mincount': 145,
 118     }, {
 119         # links without trailing slash
 120         # and multi-page
 121         'url': 'http://www.camdemy.com/folder/853',
 122         'info_dict': {
 123             'id': '853',
 124             'title': '科學計算 - 使用 Matlab',
 125         },
 126         'playlist_mincount': 20,
 127     }, {
 128         # with displayMode parameter. For testing the codes to add parameters
 129         'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
 130         'info_dict': {
 131             'id': '853',
 132             'title': '科學計算 - 使用 Matlab',
 133         },
 134         'playlist_mincount': 20,
 135     }]
 136
 137     def _real_extract(self, url):
 138         folder_id = self._match_id(url)
 139
 140         # Add displayMode=list so that all links are displayed in a single page
 141         parsed_url = list(urllib.parse.urlparse(url))
 142         query = dict(urllib.parse.parse_qsl(parsed_url[4]))
 143         query.update({'displayMode': 'list'})
 144         parsed_url[4] = urllib.parse.urlencode(query)
 145         final_url = urllib.parse.urlunparse(parsed_url)
 146
 147         page = self._download_webpage(final_url, folder_id)
 148         matches = re.findall(r"href='(/media/\d+/?)'", page)
 149
 150         entries = [self.url_result('http://www.camdemy.com' + media_path)
 151                    for media_path in matches]
 152
 153         folder_title = self._html_search_meta('keywords', page)
 154
 155         return self.playlist_result(entries, folder_id, folder_title)