[yt-dlp.git] / youtube_dl / extractor / nextmedia.py

# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import parse_iso8601


class NextMediaIE(InfoExtractor):
    """Extractor for news pages on hk.apple.nextmedia.com.

    The page is downloaded once; title, thumbnail and description are read
    through the inherited Open Graph helpers and the media URL is pulled out
    with ``_URL_PATTERN``.  Subclasses customise behaviour by overriding
    ``_URL_PATTERN`` and the individual ``_fetch_*`` hooks.
    """

    # Host dots are escaped so that they only match a literal '.', and both
    # http and https links are accepted.
    _VALID_URL = r'https?://hk\.apple\.nextmedia\.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
        'md5': 'dff9fad7009311c421176d1ac90bfe4f',
        'info_dict': {
            'id': '53109199',
            'ext': 'mp4',
            'title': '【佔領金鐘】50外國領事議員撐場 讚學生勇敢香港有希望',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:28222b9912b6665a21011b034c70fcc7',
            'timestamp': 1415456273,
            'upload_date': '20141108',
        }
    }]

    # Regex whose first group captures the media URL inside the page source.
    _URL_PATTERN = r'\{ url: \'(.+)\' \}'

    def _real_extract(self, url):
        """Download the page at ``url`` and build the info dict from it."""
        news_id = self._match_id(url)
        page = self._download_webpage(url, news_id)
        return self._extract_from_nextmedia_page(news_id, url, page)

    def _extract_from_nextmedia_page(self, news_id, url, page):
        """Assemble the info dict for ``news_id`` from the page source.

        ``url`` is the originally requested URL; it is only used to derive
        ``upload_date`` when no timestamp can be read from ``page``.
        """
        title = self._fetch_title(page)
        video_url = self._search_regex(self._URL_PATTERN, page, 'video url')

        attrs = {
            'id': news_id,
            'title': title,
            'url': video_url,  # ext can be inferred from url
            'thumbnail': self._fetch_thumbnail(page),
            'description': self._fetch_description(page),
        }

        # Prefer the exact timestamp; otherwise fall back to the date that is
        # embedded in the URL.
        timestamp = self._fetch_timestamp(page)
        if timestamp:
            attrs['timestamp'] = timestamp
        else:
            attrs['upload_date'] = self._fetch_upload_date(url)

        return attrs

    def _fetch_title(self, page):
        """Return the title via the inherited ``_og_search_title`` helper."""
        return self._og_search_title(page)

    def _fetch_thumbnail(self, page):
        """Return the thumbnail via the inherited ``_og_search_thumbnail`` helper."""
        return self._og_search_thumbnail(page)

    def _fetch_timestamp(self, page):
        """Return the UNIX timestamp parsed from ``"dateCreated"``, or None.

        The lookup is non-fatal: the timestamp is optional metadata, and the
        caller already falls back to the URL date when this returns a falsy
        value, so a page without ``dateCreated`` must not abort extraction.
        """
        date_created = self._search_regex(
            r'"dateCreated":"([^"]+)"', page, 'created time', fatal=False)
        if not date_created:
            return None
        return parse_iso8601(date_created)

    def _fetch_upload_date(self, url):
        """Return the ``date`` group of ``_VALID_URL`` matched against ``url``."""
        return self._search_regex(self._VALID_URL, url, 'upload date', group='date')

    def _fetch_description(self, page):
        """Return the ``description`` property via ``_og_search_property``."""
        return self._og_search_property('description', page)


class NextMediaActionNewsIE(NextMediaIE):
    """Extractor for hk.dv.nextmedia.com "actionnews" pages.

    The actionnews page is only a landing page: the URL returned by
    ``_og_search_url`` for it is downloaded and handed to the regular
    ``_extract_from_nextmedia_page`` routine of the parent class.
    """

    # Host dots are escaped so that they only match a literal '.', and both
    # http and https links are accepted.
    _VALID_URL = r'https?://hk\.dv\.nextmedia\.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
    _TESTS = [{
        'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
        'md5': '05fce8ffeed7a5e00665d4b7cf0f9201',
        'info_dict': {
            'id': '19009428',
            'ext': 'mp4',
            'title': '【壹週刊】細10年男友偷食　50歲邵美琪再失戀',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659',
            'timestamp': 1421791200,
            'upload_date': '20150120',
        }
    }]

    def _real_extract(self, url):
        """Follow the landing page to the article page and extract from it."""
        news_id = self._match_id(url)
        actionnews_page = self._download_webpage(url, news_id)
        # The article page is located through the landing page's og:url.
        article_url = self._og_search_url(actionnews_page)
        article_page = self._download_webpage(article_url, news_id)
        # The original ``url`` (not ``article_url``) is passed on because it
        # is the one guaranteed to match this class's _VALID_URL for the
        # upload-date fallback.
        return self._extract_from_nextmedia_page(news_id, url, article_page)


class AppleDailyRealtimeNewsIE(NextMediaIE):
    """Extractor for appledaily.com.tw "realtimenews" and "enews" articles.

    Reuses the parent's extraction flow but reads the title and thumbnail
    from page markup instead of Open Graph tags, and never reports a
    timestamp, so ``upload_date`` always comes from the URL.
    """

    # Host dots are escaped so that they only match a literal '.', both http
    # and https links are accepted, and the helper groups are non-capturing
    # because only the named ``date`` and ``id`` groups are ever read.
    _VALID_URL = r'https?://(?:www|ent)\.appledaily\.com\.tw/(?:realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(?:/.*)?'
    _TESTS = [{
        'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
        'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
        'info_dict': {
            'id': '36354694',
            'ext': 'mp4',
            'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:b23787119933404ce515c6356a8c355c',
            'upload_date': '20150128',
        }
    }, {
        'url': 'http://www.appledaily.com.tw/realtimenews/article/strange/20150128/550549/%E4%B8%8D%E6%BB%BF%E8%A2%AB%E8%B8%A9%E8%85%B3%E3%80%80%E5%B1%B1%E6%9D%B1%E5%85%A9%E5%A4%A7%E5%AA%BD%E4%B8%80%E8%B7%AF%E6%89%93%E4%B8%8B%E8%BB%8A',
        'md5': '86b4e9132d158279c7883822d94ccc49',
        'info_dict': {
            'id': '550549',
            'ext': 'mp4',
            'title': '不滿被踩腳　山東兩大媽一路打下車',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
            'upload_date': '20150128',
        }
    }]

    # Same purpose as the parent's pattern, without the inner spaces.
    _URL_PATTERN = r'\{url: \'(.+)\'\}'

    def _fetch_title(self, page):
        """Return the text of the ``<h1 id="h1">`` element."""
        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')

    def _fetch_thumbnail(self, page):
        """Return the argument of ``setInitialImage('...')``.

        The lookup is non-fatal (``fatal=False``), so a page without a
        thumbnail does not abort extraction.
        """
        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)

    def _fetch_timestamp(self, page):
        """Always return None so the caller derives ``upload_date`` from the URL."""
        return None


class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
    """Extractor for appledaily.com.tw "animation" pages.

    Behaves like the realtime-news extractor except that the title and the
    description are both read from the page's ``description`` meta tag.
    """

    # Written as a raw string so that ``\d`` is not treated as a (deprecated)
    # invalid string escape; host dots are escaped so that they only match a
    # literal '.', both http and https links are accepted, and the trailing
    # helper group is non-capturing.
    _VALID_URL = r'https?://www\.appledaily\.com\.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(?:/.*)?'
    _TESTS = [{
        'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
        'md5': '03df296d95dedc2d5886debbb80cb43f',
        'info_dict': {
            'id': '5003671',
            'ext': 'mp4',
            'title': '20正妹熱舞　《刀龍傳說Online》火辣上市',
            'thumbnail': r're:^https?://.*\.jpg$',
            'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd',
            'upload_date': '20150128',
        }
    }, {
        # No thumbnail
        'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/',
        'md5': 'b06182cd386ea7bc6115ec7ff0f72aeb',
        'info_dict': {
            'id': '5003673',
            'ext': 'mp4',
            'title': '半夜尿尿　好像會看到___',
            'description': 'md5:61d2da7fe117fede148706cdb85ac066',
            'upload_date': '20150128',
        },
        'expected_warnings': [
            'video thumbnail',
        ]
    }]

    def _fetch_title(self, page):
        """Return the content of the ``description`` meta tag as the title."""
        # NOTE(review): this reads the same meta tag as _fetch_description;
        # presumably these pages expose no better title source - confirm.
        return self._html_search_meta('description', page, 'news title')

    def _fetch_description(self, page):
        """Return the content of the ``description`` meta tag."""
        return self._html_search_meta('description', page, 'news description')
Commit	Line	Data
206dba27 YCH	1	# coding: utf-8
	2	from __future__ import unicode_literals
	3
	4	from .common import InfoExtractor
	5	from ..utils import parse_iso8601
	6
	7
	8	class NextMediaIE(InfoExtractor):
	9	_VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
	10	_TESTS = [{
	11	'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
	12	'md5': 'dff9fad7009311c421176d1ac90bfe4f',
	13	'info_dict': {
	14	'id': '53109199',
	15	'ext': 'mp4',
	16	'title': '【佔領金鐘】50外國領事議員撐場讚學生勇敢香港有希望',
	17	'thumbnail': 're:^https?://.*\.jpg$',
	18	'description': 'md5:28222b9912b6665a21011b034c70fcc7',
	19	'timestamp': 1415456273,
	20	'upload_date': '20141108',
	21	}
	22	}]
	23
	24	_URL_PATTERN = r'\{ url: \'(.+)\' \}'
	25
	26	def _real_extract(self, url):
	27	news_id = self._match_id(url)
	28	page = self._download_webpage(url, news_id)
	29	return self._extract_from_nextmedia_page(news_id, url, page)
	30
	31	def _extract_from_nextmedia_page(self, news_id, url, page):
	32	title = self._fetch_title(page)
	33	video_url = self._search_regex(self._URL_PATTERN, page, 'video url')
	34
	35	attrs = {
	36	'id': news_id,
	37	'title': title,
	38	'url': video_url, # ext can be inferred from url
	39	'thumbnail': self._fetch_thumbnail(page),
	40	'description': self._fetch_description(page),
	41	}
	42
	43	timestamp = self._fetch_timestamp(page)
	44	if timestamp:
	45	attrs['timestamp'] = timestamp
	46	else:
	47	attrs['upload_date'] = self._fetch_upload_date(url)
	48
	49	return attrs
	50
	51	def _fetch_title(self, page):
	52	return self._og_search_title(page)
	53
	54	def _fetch_thumbnail(self, page):
	55	return self._og_search_thumbnail(page)
	56
	57	def _fetch_timestamp(self, page):
	58	dateCreated = self._search_regex('"dateCreated":"([^"]+)"', page, 'created time')
	59	return parse_iso8601(dateCreated)
	60
	61	def _fetch_upload_date(self, url):
	62	return self._search_regex(self._VALID_URL, url, 'upload date', group='date')
	63
	64	def _fetch_description(self, page):
65	return self._og_search_property('description', page)
66
67
68	class NextMediaActionNewsIE(NextMediaIE):
69	_VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
70	_TESTS = [{
71	'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
72	'md5': '05fce8ffeed7a5e00665d4b7cf0f9201',
73	'info_dict': {
74	'id': '19009428',
75	'ext': 'mp4',
76	'title': '【壹週刊】細10年男友偷食　50歲邵美琪再失戀',
77	'thumbnail': 're:^https?://.*\.jpg$',
78	'description': 'md5:cd802fad1f40fd9ea178c1e2af02d659',
79	'timestamp': 1421791200,
80	'upload_date': '20150120',
81	}
82	}]
83
84	def _real_extract(self, url):
85	news_id = self._match_id(url)
86	actionnews_page = self._download_webpage(url, news_id)
87	article_url = self._og_search_url(actionnews_page)
88	article_page = self._download_webpage(article_url, news_id)
89	return self._extract_from_nextmedia_page(news_id, url, article_page)
90
91
92	class AppleDailyRealtimeNewsIE(NextMediaIE):
93	_VALID_URL = r'http://(www\|ent).appledaily.com.tw/(realtimenews\|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
94	_TESTS = [{
95	'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
96	'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
97	'info_dict': {
98	'id': '36354694',
99	'ext': 'mp4',
100	'title': '周亭羽走過摩鐵陰霾2男陪吃九把刀孤寒看醫生',
101	'thumbnail': 're:^https?://.*\.jpg$',
102	'description': 'md5:b23787119933404ce515c6356a8c355c',
103	'upload_date': '20150128',
104	}
105	}, {
106	'url': 'http://www.appledaily.com.tw/realtimenews/article/strange/20150128/550549/%E4%B8%8D%E6%BB%BF%E8%A2%AB%E8%B8%A9%E8%85%B3%E3%80%80%E5%B1%B1%E6%9D%B1%E5%85%A9%E5%A4%A7%E5%AA%BD%E4%B8%80%E8%B7%AF%E6%89%93%E4%B8%8B%E8%BB%8A',
107	'md5': '86b4e9132d158279c7883822d94ccc49',
108	'info_dict': {
109	'id': '550549',
110	'ext': 'mp4',
111	'title': '不滿被踩腳　山東兩大媽一路打下車',
112	'thumbnail': 're:^https?://.*\.jpg$',
113	'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
114	'upload_date': '20150128',
115	}
116	}]
117
118	_URL_PATTERN = r'\{url: \'(.+)\'\}'
119
120	def _fetch_title(self, page):
121	return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
122
123	def _fetch_thumbnail(self, page):
124	return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
125
126	def _fetch_timestamp(self, page):
127	return None
128
129
130	class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
131	_VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
132	_TESTS = [{
133	'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
134	'md5': '03df296d95dedc2d5886debbb80cb43f',
135	'info_dict': {
136	'id': '5003671',
137	'ext': 'mp4',
138	'title': '20正妹熱舞　《刀龍傳說Online》火辣上市',
139	'thumbnail': 're:^https?://.*\.jpg$',
140	'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd',
141	'upload_date': '20150128',
142	}
143	}, {
144	# No thumbnail
145	'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/',
146	'md5': 'b06182cd386ea7bc6115ec7ff0f72aeb',
147	'info_dict': {
148	'id': '5003673',
149	'ext': 'mp4',
150	'title': '半夜尿尿　好像會看到___',
151	'description': 'md5:61d2da7fe117fede148706cdb85ac066',
152	'upload_date': '20150128',
153	},
154	'expected_warnings': [
155	'video thumbnail',
156	]
157	}]
158
159	def _fetch_title(self, page):
160	return self._html_search_meta('description', page, 'news title')
161
162	def _fetch_description(self, page):
163	return self._html_search_meta('description', page, 'news description')