[yt-dlp.git] / yt_dlp / extractor / camdemy.py

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlencode,
    compat_urlparse,
)
from ..utils import (
    clean_html,
    parse_duration,
    str_to_int,
    unified_strdate,
)


class CamdemyIE(InfoExtractor):
    """Extractor for a single Camdemy media page (``camdemy.com/media/<id>``).

    A page that names an external source is handed back as a URL result for
    that source.  Otherwise the media file location is derived from the
    oEmbed thumbnail URL and the ``fileList.xml`` document stored next to it,
    and the remaining metadata is collected from the oEmbed JSON and,
    where available, from the web page itself.
    """
    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
    _TESTS = [{
        # single file
        'url': 'http://www.camdemy.com/media/5181/',
        'md5': '5a5562b6a98b37873119102e052e311b',
        'info_dict': {
            'id': '5181',
            'ext': 'mp4',
            'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'ss11spring',
            'duration': 1591,
            'upload_date': '20130114',
            'view_count': int,
        }
    }, {
        # With non-empty description
        # webpage returns "No permission or not login"
        'url': 'http://www.camdemy.com/media/13885',
        'md5': '4576a3bb2581f86c61044822adbd1249',
        'info_dict': {
            'id': '13885',
            'ext': 'mp4',
            'title': 'EverCam + Camdemy QuickStart',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
            'creator': 'evercam',
            'duration': 318,
        }
    }, {
        # External source (YouTube)
        'url': 'http://www.camdemy.com/media/14842',
        'info_dict': {
            'id': '2vsYQzNIsJo',
            'ext': 'mp4',
            'title': 'Excel 2013 Tutorial - How to add Password Protection',
            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
            'upload_date': '20130211',
            'uploader': 'Hun Kim',
            'uploader_id': 'hunkimtutorials',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for the media page at ``url``.

        Returns a URL result pointing at the external source when the page
        declares one; otherwise a dict with the direct media URL and the
        metadata gathered from the oEmbed JSON and the web page.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Pages whose "Source:" / "Sources from:" label links elsewhere are
        # not hosted on Camdemy; pass the linked URL on instead.
        src_from = self._html_search_regex(
            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
            webpage, 'external source', default=None, group='url')
        if src_from:
            return self.url_result(src_from)

        # Send the page URL as a properly percent-encoded query value.
        # _VALID_URL does not anchor the end of the URL, so the input may
        # carry its own query string or fragment; concatenating it raw would
        # leak those parameters into (or truncate) the oEmbed request.
        oembed_obj = self._download_json(
            'http://www.camdemy.com/oembed/', video_id,
            query={
                'format': 'json',
                'url': url,
            })

        title = oembed_obj['title']
        thumb_url = oembed_obj['thumbnail_url']
        # The media files live in a "video/" folder resolved relative to the
        # thumbnail URL; fileList.xml in that folder names the actual file.
        video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
        file_list_doc = self._download_xml(
            compat_urlparse.urljoin(video_folder, 'fileList.xml'),
            video_id, 'Downloading filelist XML')
        file_name = file_list_doc.find('./video/item/fileName').text
        video_url = compat_urlparse.urljoin(video_folder, file_name)

        # Some URLs return "No permission or not login" in a webpage despite being
        # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885),
        # so every field scraped from the webpage is optional.
        upload_date = unified_strdate(self._search_regex(
            r'>published on ([^<]+)<', webpage,
            'upload date', default=None))
        view_count = str_to_int(self._search_regex(
            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
            webpage, 'view count', default=None))
        # Prefer the page's meta description; fall back to the oEmbed one.
        description = self._html_search_meta(
            'description', webpage, default=None) or clean_html(
            oembed_obj.get('description'))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb_url,
            'description': description,
            'creator': oembed_obj.get('author_name'),
            'duration': parse_duration(oembed_obj.get('duration')),
            'upload_date': upload_date,
            'view_count': view_count,
        }


class CamdemyFolderIE(InfoExtractor):
    """Extractor for Camdemy folder pages (``camdemy.com/folder/<id>``).

    Requests the folder page with ``displayMode=list`` and returns a playlist
    built from every ``/media/<id>`` link found on it.
    """
    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
    _TESTS = [{
        # links with trailing slash
        'url': 'http://www.camdemy.com/folder/450',
        'info_dict': {
            'id': '450',
            'title': '信號與系統 2012 & 2011 (Signals and Systems)',
        },
        'playlist_mincount': 145
    }, {
        # links without trailing slash
        # and multi-page
        'url': 'http://www.camdemy.com/folder/853',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }, {
        # URL that already carries a displayMode parameter; exercises the
        # code path that rewrites the query string
        'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }]

    def _real_extract(self, url):
        """Return a playlist result holding one URL entry per media link."""
        folder_id = self._match_id(url)

        # Force displayMode=list (overriding any value already present) so
        # that all links are displayed in a single page.
        url_parts = compat_urlparse.urlparse(url)
        params = dict(compat_urlparse.parse_qsl(url_parts.query))
        params['displayMode'] = 'list'
        listing_url = compat_urlparse.urlunparse(
            url_parts._replace(query=compat_urllib_parse_urlencode(params)))

        listing = self._download_webpage(listing_url, folder_id)

        # Media links on the page are site-relative; make them absolute.
        entries = []
        for media_href in re.findall(r"href='(/media/\d+/?)'", listing):
            entries.append(
                self.url_result('http://www.camdemy.com' + media_href))

        # The page's "keywords" meta tag is used as the playlist title.
        return self.playlist_result(
            entries, folder_id, self._html_search_meta('keywords', listing))
Commit	Line	Data
8708d764 YCH	1	import re
	2
	3	from .common import InfoExtractor
08b38d54	4	from ..compat import (
15707c7e	5	compat_urllib_parse_urlencode,
08b38d54 PH	6	compat_urlparse,
	7	)
	8	from ..utils import (
59eaf69e	9	clean_html,
0a147785	10	parse_duration,
08b38d54	11	str_to_int,
59eaf69e	12	unified_strdate,
08b38d54	13	)
8708d764 YCH	14
	15
	16	class CamdemyIE(InfoExtractor):
5886b38d	17	_VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
8708d764 YCH	18	_TESTS = [{
	19	# single file
	20	'url': 'http://www.camdemy.com/media/5181/',
	21	'md5': '5a5562b6a98b37873119102e052e311b',
	22	'info_dict': {
	23	'id': '5181',
	24	'ext': 'mp4',
	25	'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
ec85ded8	26	'thumbnail': r're:^https?://.*\.jpg$',
8708d764	27	'creator': 'ss11spring',
0a147785	28	'duration': 1591,
8708d764	29	'upload_date': '20130114',
08b38d54	30	'view_count': int,
8708d764 YCH	31	}
	32	}, {
	33	# With non-empty description
59eaf69e	34	# webpage returns "No permission or not login"
8708d764 YCH	35	'url': 'http://www.camdemy.com/media/13885',
	36	'md5': '4576a3bb2581f86c61044822adbd1249',
	37	'info_dict': {
	38	'id': '13885',
	39	'ext': 'mp4',
	40	'title': 'EverCam + Camdemy QuickStart',
ec85ded8	41	'thumbnail': r're:^https?://.*\.jpg$',
59eaf69e	42	'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
8708d764	43	'creator': 'evercam',
0a147785	44	'duration': 318,
8708d764	45	}
8367d3f3	46	}, {
59eaf69e	47	# External source (YouTube)
8367d3f3	48	'url': 'http://www.camdemy.com/media/14842',
8367d3f3 YCH	49	'info_dict': {
	50	'id': '2vsYQzNIsJo',
	51	'ext': 'mp4',
59eaf69e S	52	'title': 'Excel 2013 Tutorial - How to add Password Protection',
59eaf69e S	53	'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
8367d3f3 YCH	54	'upload_date': '20130211',
8367d3f3 YCH	55	'uploader': 'Hun Kim',
8367d3f3	56	'uploader_id': 'hunkimtutorials',
59eaf69e S	57	},
	58	'params': {
	59	'skip_download': True,
	60	},
8708d764 YCH	61	}]
	62
	63	def _real_extract(self, url):
	64	video_id = self._match_id(url)
59eaf69e S	65
59eaf69e S	66	webpage = self._download_webpage(url, video_id)
8708d764	67
08b38d54	68	src_from = self._html_search_regex(
59eaf69e S	69	r"class=['\"]srcFrom['\"][^>]>Sources?(?:\s+from)?\s:\s*<a[^>]+(?:href\|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
59eaf69e S	70	webpage, 'external source', default=None, group='url')
08b38d54 PH	71	if src_from:
08b38d54 PH	72	return self.url_result(src_from)
8367d3f3	73
8708d764 YCH	74	oembed_obj = self._download_json(
	75	'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
	76
0a147785	77	title = oembed_obj['title']
8708d764	78	thumb_url = oembed_obj['thumbnail_url']
7e601110	79	video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
08b38d54	80	file_list_doc = self._download_xml(
7e601110	81	compat_urlparse.urljoin(video_folder, 'fileList.xml'),
59eaf69e	82	video_id, 'Downloading filelist XML')
08b38d54 PH	83	file_name = file_list_doc.find('./video/item/fileName').text
08b38d54 PH	84	video_url = compat_urlparse.urljoin(video_folder, file_name)
8708d764	85
59eaf69e S	86	# Some URLs return "No permission or not login" in a webpage despite being
	87	# freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
	88	upload_date = unified_strdate(self._search_regex(
	89	r'>published on ([^<]+)<', webpage,
	90	'upload date', default=None))
	91	view_count = str_to_int(self._search_regex(
	92	r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
	93	webpage, 'view count', default=None))
	94	description = self._html_search_meta(
	95	'description', webpage, default=None) or clean_html(
	96	oembed_obj.get('description'))
8708d764 YCH	97
	98	return {
	99	'id': video_id,
08b38d54	100	'url': video_url,
0a147785	101	'title': title,
8708d764	102	'thumbnail': thumb_url,
59eaf69e	103	'description': description,
0a147785 S	104	'creator': oembed_obj.get('author_name'),
0a147785 S	105	'duration': parse_duration(oembed_obj.get('duration')),
59eaf69e	106	'upload_date': upload_date,
08b38d54	107	'view_count': view_count,
8708d764	108	}
c40feaba YCH	109
	110
	111	class CamdemyFolderIE(InfoExtractor):
92519402	112	_VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
c40feaba YCH	113	_TESTS = [{
	114	# links with trailing slash
	115	'url': 'http://www.camdemy.com/folder/450',
	116	'info_dict': {
	117	'id': '450',
	118	'title': '信號與系統 2012 & 2011 (Signals and Systems)',
	119	},
	120	'playlist_mincount': 145
	121	}, {
	122	# links without trailing slash
	123	# and multi-page
	124	'url': 'http://www.camdemy.com/folder/853',
	125	'info_dict': {
	126	'id': '853',
	127	'title': '科學計算 - 使用 Matlab'
	128	},
	129	'playlist_mincount': 20
	130	}, {
	131	# with displayMode parameter. For testing the codes to add parameters
	132	'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
	133	'info_dict': {
	134	'id': '853',
	135	'title': '科學計算 - 使用 Matlab'
	136	},
	137	'playlist_mincount': 20
	138	}]
	139
	140	def _real_extract(self, url):
	141	folder_id = self._match_id(url)
	142
	143	# Add displayMode=list so that all links are displayed in a single page
7e601110 YCH	144	parsed_url = list(compat_urlparse.urlparse(url))
7e601110 YCH	145	query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
c40feaba	146	query.update({'displayMode': 'list'})
15707c7e	147	parsed_url[4] = compat_urllib_parse_urlencode(query)
7e601110	148	final_url = compat_urlparse.urlunparse(parsed_url)
c40feaba YCH	149
	150	page = self._download_webpage(final_url, folder_id)
	151	matches = re.findall(r"href='(/media/\d+/?)'", page)
	152
	153	entries = [self.url_result('http://www.camdemy.com' + media_path)
	154	for media_path in matches]
	155
	156	folder_title = self._html_search_meta('keywords', page)
	157
	158	return self.playlist_result(entries, folder_id, folder_title)