[yt-dlp.git] / youtube_dlc / extractor / camdemy.py

# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlencode,
    compat_urlparse,
)
from ..utils import (
    clean_html,
    parse_duration,
    str_to_int,
    unified_strdate,
    xpath_text,
)


class CamdemyIE(InfoExtractor):
    """Extractor for single media pages (``camdemy.com/media/<id>``).

    Pages that point at an external source are delegated to whichever
    extractor handles that URL; everything else is resolved through the
    oEmbed JSON endpoint plus the ``fileList.xml`` document located relative
    to the thumbnail URL.
    """
    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
    _TESTS = [{
        # single file
        'url': 'http://www.camdemy.com/media/5181/',
        'md5': '5a5562b6a98b37873119102e052e311b',
        'info_dict': {
            'id': '5181',
            'ext': 'mp4',
            'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'ss11spring',
            'duration': 1591,
            'upload_date': '20130114',
            'view_count': int,
        }
    }, {
        # With non-empty description
        # webpage returns "No permission or not login"
        'url': 'http://www.camdemy.com/media/13885',
        'md5': '4576a3bb2581f86c61044822adbd1249',
        'info_dict': {
            'id': '13885',
            'ext': 'mp4',
            'title': 'EverCam + Camdemy QuickStart',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
            'creator': 'evercam',
            'duration': 318,
        }
    }, {
        # External source (YouTube)
        'url': 'http://www.camdemy.com/media/14842',
        'info_dict': {
            'id': '2vsYQzNIsJo',
            'ext': 'mp4',
            'title': 'Excel 2013 Tutorial - How to add Password Protection',
            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
            'upload_date': '20130211',
            'uploader': 'Hun Kim',
            'uploader_id': 'hunkimtutorials',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return the info dict for one media page, or a URL result.

        When the page carries a ``srcFrom`` link, the linked URL is handed
        back via ``url_result`` and nothing else is downloaded. Otherwise the
        oEmbed JSON supplies title, thumbnail, author and duration, while the
        page itself is only mined for optional fields (upload date, view
        count, description).
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # Media hosted elsewhere is advertised through a "Source(s) from:" link;
        # its href/title attribute holds the real URL.
        src_from = self._html_search_regex(
            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
            webpage, 'external source', default=None, group='url')
        if src_from:
            return self.url_result(src_from)

        # Pass the page URL as a query parameter so it gets percent-encoded;
        # plain concatenation would let a query string or fragment of `url`
        # bleed into the oEmbed request's own parameters.
        oembed_obj = self._download_json(
            'http://www.camdemy.com/oembed/', video_id,
            query={
                'format': 'json',
                'url': url,
            })

        title = oembed_obj['title']
        # The thumbnail URL is mandatory: the video folder and the file list
        # are both resolved relative to it.
        thumb_url = oembed_obj['thumbnail_url']
        video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
        file_list_doc = self._download_xml(
            compat_urlparse.urljoin(video_folder, 'fileList.xml'),
            video_id, 'Downloading filelist XML')
        # fatal=True reports a missing or empty <fileName> as an extractor
        # error instead of failing with AttributeError on None.
        file_name = xpath_text(
            file_list_doc, './video/item/fileName', 'file name', fatal=True)
        video_url = compat_urlparse.urljoin(video_folder, file_name)

        # Some URLs return "No permission or not login" in a webpage despite being
        # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885),
        # so every webpage-derived field below is optional.
        upload_date = unified_strdate(self._search_regex(
            r'>published on ([^<]+)<', webpage,
            'upload date', default=None))
        view_count = str_to_int(self._search_regex(
            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
            webpage, 'view count', default=None))
        # Prefer the page's meta description; fall back to the oEmbed one.
        description = self._html_search_meta(
            'description', webpage, default=None) or clean_html(
            oembed_obj.get('description'))

        return {
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb_url,
            'description': description,
            'creator': oembed_obj.get('author_name'),
            'duration': parse_duration(oembed_obj.get('duration')),
            'upload_date': upload_date,
            'view_count': view_count,
        }


class CamdemyFolderIE(InfoExtractor):
    """Playlist extractor for folder pages (``camdemy.com/folder/<id>``)."""
    _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
    _TESTS = [{
        # links with trailing slash
        'url': 'http://www.camdemy.com/folder/450',
        'info_dict': {
            'id': '450',
            'title': '信號與系統 2012 & 2011 (Signals and Systems)',
        },
        'playlist_mincount': 145
    }, {
        # links without trailing slash
        # and multi-page
        'url': 'http://www.camdemy.com/folder/853',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }, {
        # with displayMode parameter. For testing the codes to add parameters
        'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
        'info_dict': {
            'id': '853',
            'title': '科學計算 - 使用 Matlab'
        },
        'playlist_mincount': 20
    }]

    def _real_extract(self, url):
        """Build a playlist from every ``/media/<id>`` link on the folder page.

        The folder is requested with ``displayMode=list`` and the playlist
        title is taken from the page's ``keywords`` meta tag.
        """
        folder_id = self._match_id(url)

        # Force displayMode=list (overriding any value already present in the
        # URL) so that all links are displayed in a single page.
        url_parts = compat_urlparse.urlparse(url)
        params = dict(compat_urlparse.parse_qsl(url_parts.query))
        params['displayMode'] = 'list'
        list_url = compat_urlparse.urlunparse(
            url_parts._replace(query=compat_urllib_parse_urlencode(params)))

        page = self._download_webpage(list_url, folder_id)

        # Media links appear as site-relative, single-quoted hrefs.
        entries = []
        for media_path in re.findall(r"href='(/media/\d+/?)'", page):
            entries.append(
                self.url_result('http://www.camdemy.com' + media_path))

        return self.playlist_result(
            entries, folder_id, self._html_search_meta('keywords', page))
Commit	Line	Data
8708d764 YCH	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	import re
	5
	6	from .common import InfoExtractor
08b38d54	7	from ..compat import (
15707c7e	8	compat_urllib_parse_urlencode,
08b38d54 PH	9	compat_urlparse,
	10	)
	11	from ..utils import (
59eaf69e	12	clean_html,
0a147785	13	parse_duration,
08b38d54	14	str_to_int,
59eaf69e	15	unified_strdate,
08b38d54	16	)
8708d764 YCH	17
	18
	19	class CamdemyIE(InfoExtractor):
5886b38d	20	_VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
8708d764 YCH	21	_TESTS = [{
	22	# single file
	23	'url': 'http://www.camdemy.com/media/5181/',
	24	'md5': '5a5562b6a98b37873119102e052e311b',
	25	'info_dict': {
	26	'id': '5181',
	27	'ext': 'mp4',
	28	'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
ec85ded8	29	'thumbnail': r're:^https?://.*\.jpg$',
8708d764	30	'creator': 'ss11spring',
0a147785	31	'duration': 1591,
8708d764	32	'upload_date': '20130114',
08b38d54	33	'view_count': int,
8708d764 YCH	34	}
	35	}, {
	36	# With non-empty description
59eaf69e	37	# webpage returns "No permission or not login"
8708d764 YCH	38	'url': 'http://www.camdemy.com/media/13885',
	39	'md5': '4576a3bb2581f86c61044822adbd1249',
	40	'info_dict': {
	41	'id': '13885',
	42	'ext': 'mp4',
	43	'title': 'EverCam + Camdemy QuickStart',
ec85ded8	44	'thumbnail': r're:^https?://.*\.jpg$',
59eaf69e	45	'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
8708d764	46	'creator': 'evercam',
0a147785	47	'duration': 318,
8708d764	48	}
8367d3f3	49	}, {
59eaf69e	50	# External source (YouTube)
8367d3f3	51	'url': 'http://www.camdemy.com/media/14842',
8367d3f3 YCH	52	'info_dict': {
	53	'id': '2vsYQzNIsJo',
	54	'ext': 'mp4',
59eaf69e S	55	'title': 'Excel 2013 Tutorial - How to add Password Protection',
59eaf69e S	56	'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
8367d3f3 YCH	57	'upload_date': '20130211',
8367d3f3 YCH	58	'uploader': 'Hun Kim',
8367d3f3	59	'uploader_id': 'hunkimtutorials',
59eaf69e S	60	},
	61	'params': {
	62	'skip_download': True,
	63	},
8708d764 YCH	64	}]
	65
	66	def _real_extract(self, url):
	67	video_id = self._match_id(url)
59eaf69e S	68
59eaf69e S	69	webpage = self._download_webpage(url, video_id)
8708d764	70
08b38d54	71	src_from = self._html_search_regex(
59eaf69e S	72	r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
59eaf69e S	73	webpage, 'external source', default=None, group='url')
08b38d54 PH	74	if src_from:
08b38d54 PH	75	return self.url_result(src_from)
8367d3f3	76
8708d764 YCH	77	oembed_obj = self._download_json(
	78	'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
	79
0a147785	80	title = oembed_obj['title']
8708d764	81	thumb_url = oembed_obj['thumbnail_url']
7e601110	82	video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
08b38d54	83	file_list_doc = self._download_xml(
7e601110	84	compat_urlparse.urljoin(video_folder, 'fileList.xml'),
59eaf69e	85	video_id, 'Downloading filelist XML')
08b38d54 PH	86	file_name = file_list_doc.find('./video/item/fileName').text
08b38d54 PH	87	video_url = compat_urlparse.urljoin(video_folder, file_name)
8708d764	88
59eaf69e S	89	# Some URLs return "No permission or not login" in a webpage despite being
	90	# freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
	91	upload_date = unified_strdate(self._search_regex(
	92	r'>published on ([^<]+)<', webpage,
	93	'upload date', default=None))
	94	view_count = str_to_int(self._search_regex(
	95	r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
	96	webpage, 'view count', default=None))
	97	description = self._html_search_meta(
	98	'description', webpage, default=None) or clean_html(
	99	oembed_obj.get('description'))
8708d764 YCH	100
	101	return {
	102	'id': video_id,
08b38d54	103	'url': video_url,
0a147785	104	'title': title,
8708d764	105	'thumbnail': thumb_url,
59eaf69e	106	'description': description,
0a147785 S	107	'creator': oembed_obj.get('author_name'),
0a147785 S	108	'duration': parse_duration(oembed_obj.get('duration')),
59eaf69e	109	'upload_date': upload_date,
08b38d54	110	'view_count': view_count,
8708d764	111	}
c40feaba YCH	112
	113
	114	class CamdemyFolderIE(InfoExtractor):
92519402	115	_VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P<id>\d+)'
c40feaba YCH	116	_TESTS = [{
	117	# links with trailing slash
	118	'url': 'http://www.camdemy.com/folder/450',
	119	'info_dict': {
	120	'id': '450',
	121	'title': '信號與系統 2012 & 2011 (Signals and Systems)',
	122	},
	123	'playlist_mincount': 145
	124	}, {
	125	# links without trailing slash
	126	# and multi-page
	127	'url': 'http://www.camdemy.com/folder/853',
	128	'info_dict': {
	129	'id': '853',
	130	'title': '科學計算 - 使用 Matlab'
	131	},
	132	'playlist_mincount': 20
	133	}, {
	134	# with displayMode parameter. For testing the codes to add parameters
	135	'url': 'http://www.camdemy.com/folder/853/?displayMode=defaultOrderByOrg',
	136	'info_dict': {
	137	'id': '853',
	138	'title': '科學計算 - 使用 Matlab'
	139	},
	140	'playlist_mincount': 20
	141	}]
	142
	143	def _real_extract(self, url):
	144	folder_id = self._match_id(url)
	145
	146	# Add displayMode=list so that all links are displayed in a single page
7e601110 YCH	147	parsed_url = list(compat_urlparse.urlparse(url))
7e601110 YCH	148	query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
c40feaba	149	query.update({'displayMode': 'list'})
15707c7e	150	parsed_url[4] = compat_urllib_parse_urlencode(query)
7e601110	151	final_url = compat_urlparse.urlunparse(parsed_url)
c40feaba YCH	152
	153	page = self._download_webpage(final_url, folder_id)
	154	matches = re.findall(r"href='(/media/\d+/?)'", page)
	155
	156	entries = [self.url_result('http://www.camdemy.com' + media_path)
	157	for media_path in matches]
	158
	159	folder_title = self._html_search_meta('keywords', page)
	160
	161	return self.playlist_result(entries, folder_id, folder_title)