[yt-dlp.git] / yt_dlp / extractor / camdemy.py

import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
    clean_html,
    parse_duration,
    str_to_int,
    unified_strdate,
)


class CamdemyIE(InfoExtractor):
    """Extractor for single media pages on camdemy.com (``/media/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
    _TESTS = [{
        # single file
        'url': 'http://www.camdemy.com/media/5181/',
        'md5': '5a5562b6a98b37873119102e052e311b',
        'info_dict': {
            'id': '5181',
            'ext': 'mp4',
            'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'ss11spring',
            'duration': 1591,
            'upload_date': '20130114',
            'view_count': int,
        },
    }, {
        # With non-empty description
        # webpage returns "No permission or not login"
        'url': 'http://www.camdemy.com/media/13885',
        'md5': '4576a3bb2581f86c61044822adbd1249',
        'info_dict': {
            'id': '13885',
            'ext': 'mp4',
            'title': 'EverCam + Camdemy QuickStart',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
            'creator': 'evercam',
            'duration': 318,
        },
    }, {
        # External source (YouTube)
        'url': 'http://www.camdemy.com/media/14842',
        'info_dict': {
            'id': '2vsYQzNIsJo',
            'ext': 'mp4',
            'title': 'Excel 2013 Tutorial - How to add Password Protection',
            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
            'upload_date': '20130211',
            'uploader': 'Hun Kim',
            'uploader_id': 'hunkimtutorials',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract one Camdemy media page.

        Returns a ``url_result`` pointing at the external source when the
        page links to one; otherwise returns an info dict built from the
        oEmbed JSON, the ``fileList.xml`` document and the webpage itself.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Pages that merely embed media hosted elsewhere expose a "Source:"
        # link; hand those over to whichever extractor matches that URL.
        src_from = self._html_search_regex(
            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
            webpage, 'external source', default=None, group='url')
        if src_from:
            return self.url_result(src_from)

        # Pass the page URL through ``query`` so that it gets percent-encoded.
        # Appending it verbatim would let any '&', '#' or '?' in the page URL
        # corrupt the ``url`` parameter of the oEmbed request.
        oembed_obj = self._download_json(
            'http://www.camdemy.com/oembed/', video_id,
            query={'format': 'json', 'url': url})

        title = oembed_obj['title']
        thumb_url = oembed_obj['thumbnail_url']
        # The file list is looked up in a "video/" folder resolved relative
        # to the thumbnail URL; the media file name is resolved against that
        # same folder.
        video_folder = urllib.parse.urljoin(thumb_url, 'video/')
        file_list_doc = self._download_xml(
            urllib.parse.urljoin(video_folder, 'fileList.xml'),
            video_id, 'Downloading filelist XML')
        file_name = file_list_doc.find('./video/item/fileName').text
        video_url = urllib.parse.urljoin(video_folder, file_name)

        # Some URLs return "No permission or not login" in a webpage despite being
        # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885),
        # so every webpage-derived field below is optional (default=None).
        upload_date = unified_strdate(self._search_regex(
            r'>published on ([^<]+)<', webpage,
            'upload date', default=None))
        view_count = str_to_int(self._search_regex(
            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
            webpage, 'view count', default=None))
        # Prefer the page's meta description, fall back to the oEmbed one.
        description = self._html_search_meta(
            'description', webpage, default=None) or clean_html(
            oembed_obj.get('description'))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb_url,
            'description': description,
            'creator': oembed_obj.get('author_name'),
            'duration': parse_duration(oembed_obj.get('duration')),
            'upload_date': upload_date,
            'view_count': view_count,
        }


class CamdemyFolderIE(InfoExtractor):
    """Playlist extractor for camdemy.com folder pages (``/folder/<id>``)."""

    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
    _TESTS = [{
        # links with trailing slash
        'url': 'http://www.camdemy.com/folder/450',
        'info_dict': {
            'id': '450',
            'title': '信號與系統 2012 & 2011 (Signals and Systems)',
        },
        'playlist_mincount': 145,
    }, {
        # links without trailing slash
        # and multi-page
        'url': 'http://www.camdemy.com/folder/853',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab',
        },
        'playlist_mincount': 20,
    }, {
        # URL that already carries a displayMode parameter; exercises the
        # code path that rewrites the query string
        'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab',
        },
        'playlist_mincount': 20,
    }]

    def _real_extract(self, url):
        """Return a playlist of every ``/media/<id>`` link found in the folder."""
        folder_id = self._match_id(url)

        # Force displayMode=list (overriding any value already present) so
        # that all links are displayed in a single page
        url_parts = urllib.parse.urlparse(url)
        params = dict(urllib.parse.parse_qsl(url_parts.query))
        params['displayMode'] = 'list'
        listing_url = url_parts._replace(
            query=urllib.parse.urlencode(params)).geturl()

        listing = self._download_webpage(listing_url, folder_id)

        # Media links on the listing are site-relative; make them absolute.
        entries = [
            self.url_result('http://www.camdemy.com' + media_path)
            for media_path in re.findall(r"href='(/media/\d+/?)'", listing)
        ]

        # The folder title is read from the page's "keywords" meta tag.
        return self.playlist_result(
            entries, folder_id, self._html_search_meta('keywords', listing))
Commit	Line	Data
8708d764	1	import re
add96eb9	2	import urllib.parse
8708d764 YCH	3
8708d764 YCH	4	from .common import InfoExtractor
08b38d54	5	from ..utils import (
59eaf69e	6	clean_html,
0a147785	7	parse_duration,
08b38d54	8	str_to_int,
59eaf69e	9	unified_strdate,
08b38d54	10	)
8708d764 YCH	11
	12
	13	class CamdemyIE(InfoExtractor):
5886b38d	14	_VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
8708d764 YCH	15	_TESTS = [{
	16	# single file
	17	'url': 'http://www.camdemy.com/media/5181/',
	18	'md5': '5a5562b6a98b37873119102e052e311b',
	19	'info_dict': {
	20	'id': '5181',
	21	'ext': 'mp4',
	22	'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
ec85ded8	23	'thumbnail': r're:^https?://.*\.jpg$',
8708d764	24	'creator': 'ss11spring',
0a147785	25	'duration': 1591,
8708d764	26	'upload_date': '20130114',
08b38d54	27	'view_count': int,
add96eb9	28	},
8708d764 YCH	29	}, {
8708d764 YCH	30	# With non-empty description
59eaf69e	31	# webpage returns "No permission or not login"
8708d764 YCH	32	'url': 'http://www.camdemy.com/media/13885',
	33	'md5': '4576a3bb2581f86c61044822adbd1249',
	34	'info_dict': {
	35	'id': '13885',
	36	'ext': 'mp4',
	37	'title': 'EverCam + Camdemy QuickStart',
ec85ded8	38	'thumbnail': r're:^https?://.*\.jpg$',
59eaf69e	39	'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
8708d764	40	'creator': 'evercam',
0a147785	41	'duration': 318,
add96eb9	42	},
8367d3f3	43	}, {
59eaf69e	44	# External source (YouTube)
8367d3f3	45	'url': 'http://www.camdemy.com/media/14842',
8367d3f3 YCH	46	'info_dict': {
	47	'id': '2vsYQzNIsJo',
	48	'ext': 'mp4',
59eaf69e S	49	'title': 'Excel 2013 Tutorial - How to add Password Protection',
59eaf69e S	50	'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
8367d3f3 YCH	51	'upload_date': '20130211',
8367d3f3 YCH	52	'uploader': 'Hun Kim',
8367d3f3	53	'uploader_id': 'hunkimtutorials',
59eaf69e S	54	},
	55	'params': {
	56	'skip_download': True,
	57	},
8708d764 YCH	58	}]
	59
	60	def _real_extract(self, url):
	61	video_id = self._match_id(url)
59eaf69e S	62
59eaf69e S	63	webpage = self._download_webpage(url, video_id)
8708d764	64
08b38d54	65	src_from = self._html_search_regex(
59eaf69e S	66	r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
59eaf69e S	67	webpage, 'external source', default=None, group='url')
08b38d54 PH	68	if src_from:
08b38d54 PH	69	return self.url_result(src_from)
8367d3f3	70
8708d764 YCH	71	oembed_obj = self._download_json(
	72	'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
	73
0a147785	74	title = oembed_obj['title']
8708d764	75	thumb_url = oembed_obj['thumbnail_url']
add96eb9	76	video_folder = urllib.parse.urljoin(thumb_url, 'video/')
08b38d54	77	file_list_doc = self._download_xml(
add96eb9	78	urllib.parse.urljoin(video_folder, 'fileList.xml'),
59eaf69e	79	video_id, 'Downloading filelist XML')
08b38d54	80	file_name = file_list_doc.find('./video/item/fileName').text
add96eb9	81	video_url = urllib.parse.urljoin(video_folder, file_name)
8708d764	82
59eaf69e S	83	# Some URLs return "No permission or not login" in a webpage despite being
	84	# freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
	85	upload_date = unified_strdate(self._search_regex(
	86	r'>published on ([^<]+)<', webpage,
	87	'upload date', default=None))
	88	view_count = str_to_int(self._search_regex(
	89	r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
	90	webpage, 'view count', default=None))
	91	description = self._html_search_meta(
	92	'description', webpage, default=None) or clean_html(
	93	oembed_obj.get('description'))
8708d764 YCH	94
	95	return {
	96	'id': video_id,
08b38d54	97	'url': video_url,
0a147785	98	'title': title,
8708d764	99	'thumbnail': thumb_url,
59eaf69e	100	'description': description,
0a147785 S	101	'creator': oembed_obj.get('author_name'),
0a147785 S	102	'duration': parse_duration(oembed_obj.get('duration')),
59eaf69e	103	'upload_date': upload_date,
08b38d54	104	'view_count': view_count,
8708d764	105	}
c40feaba YCH	106
	107
	108	class CamdemyFolderIE(InfoExtractor):
92519402	109	_VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
c40feaba YCH	110	_TESTS = [{
	111	# links with trailing slash
	112	'url': 'http://www.camdemy.com/folder/450',
	113	'info_dict': {
	114	'id': '450',
	115	'title': '信號與系統 2012 & 2011 (Signals and Systems)',
	116	},
add96eb9	117	'playlist_mincount': 145,
c40feaba YCH	118	}, {
	119	# links without trailing slash
	120	# and multi-page
	121	'url': 'http://www.camdemy.com/folder/853',
	122	'info_dict': {
	123	'id': '853',
add96eb9	124	'title': '科學計算 - 使用 Matlab',
c40feaba	125	},
add96eb9	126	'playlist_mincount': 20,
c40feaba YCH	127	}, {
	128	# with displayMode parameter. For testing the codes to add parameters
	129	'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
	130	'info_dict': {
	131	'id': '853',
add96eb9	132	'title': '科學計算 - 使用 Matlab',
c40feaba	133	},
add96eb9	134	'playlist_mincount': 20,
c40feaba YCH	135	}]
	136
	137	def _real_extract(self, url):
	138	folder_id = self._match_id(url)
	139
	140	# Add displayMode=list so that all links are displayed in a single page
add96eb9	141	parsed_url = list(urllib.parse.urlparse(url))
add96eb9	142	query = dict(urllib.parse.parse_qsl(parsed_url[4]))
c40feaba	143	query.update({'displayMode': 'list'})
add96eb9	144	parsed_url[4] = urllib.parse.urlencode(query)
add96eb9	145	final_url = urllib.parse.urlunparse(parsed_url)
c40feaba YCH	146
	147	page = self._download_webpage(final_url, folder_id)
	148	matches = re.findall(r"href='(/media/\d+/?)'", page)
	149
	150	entries = [self.url_result('http://www.camdemy.com' + media_path)
	151	for media_path in matches]
	152
	153	folder_title = self._html_search_meta('keywords', page)
	154
	155	return self.playlist_result(entries, folder_id, folder_title)