]>
Commit | Line | Data |
---|---|---|
d5822b96 PH |
1 | import re |
2 | import socket | |
3 | ||
4 | from .common import InfoExtractor | |
5 | from ..utils import ( | |
6 | compat_http_client, | |
7 | compat_str, | |
8 | compat_urllib_error, | |
9 | compat_urllib_parse, | |
10 | compat_urllib_request, | |
11 | ||
12 | ExtractorError, | |
13 | unified_strdate, | |
14 | ) | |
15 | ||
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles videos.arte.tv "Plus7" pages by following the chain of
    reference documents linked from the video page until the HD stream
    URL and its metadata are found.  Live stream pages (URLs matching
    _LIVE_URL) are recognised but not supported yet.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body.

        The body is returned exactly as read from the response (bytes);
        callers that need text must decode it themselves.

        Raises ExtractorError if the request fails or the URL is invalid.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the named groups.

        matchTuples is an iterable of (group_index, key, error_message)
        triples: for each one, the text captured by group_index is stored
        under key in the returned dict.

        Raises ExtractorError if the regex does not match at all, or with
        error_message if the corresponding group did not participate in
        the match.
        """
        page = self.fetch_webpage(url)
        # The response body is bytes, but every pattern used here is a text
        # pattern; Python 3 refuses to mix the two, so decode up front.
        # UTF-8 is the encoding the title used to be decoded with.
        if isinstance(page, bytes):
            page = page.decode('utf-8', 'replace')

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        info = {}
        for (i, key, err) in matchTuples:
            value = mobj.group(i)
            if value is None:
                raise ExtractorError(err)
            info[key] = value

        return info

    # TODO implement Live Stream
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve a Plus7 video page to an info dict.

        Three documents are fetched in turn:
          1. the video page, which embeds a videorefFileUrl parameter;
          2. that reference file, which lists one <video> ref per language;
          3. the per-language document holding id, name, date and HD URL.

        Returns a dict with the keys id, url, uploader, upload_date, title,
        ext, format and player_url.  Raises ExtractorError (via
        grep_webpage) if any step cannot be fetched or parsed.
        """
        # For .../<lang>/videos/<page> the language is third from the end.
        video_lang = url.split('/')[-3]

        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # grep_webpage already yields decoded text, so no .decode() here
            # (text strings have no such method on Python 3).
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract the video described by url.

        Returns a one-element list holding the info dict built by
        extractPlus7Stream.  Raises ExtractorError for live stream URLs,
        which are not supported yet.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

        info = self.extractPlus7Stream(url)
        return [info]