]>
Commit | Line | Data |
---|---|---|
4fcca4bb | 1 | import base64 |
d77c3dfd | 2 | import datetime |
ccf65f9d | 3 | import itertools |
d77c3dfd FV |
4 | import netrc |
5 | import os | |
6 | import re | |
7 | import socket | |
8 | import time | |
d77c3dfd | 9 | import email.utils |
921a1455 | 10 | import xml.etree.ElementTree |
302efc19 | 11 | import random |
12 | import math | |
6324fd1d | 13 | import operator |
de5d66d4 | 14 | import hashlib |
15 | import binascii | |
16 | import urllib | |
d77c3dfd | 17 | |
9e8056d5 | 18 | from .utils import * |
d6983cb4 | 19 | from .extractor.common import InfoExtractor, SearchInfoExtractor |
d5822b96 PH |
20 | |
21 | from .extractor.ard import ARDIE | |
22 | from .extractor.arte import ArteTvIE | |
f5884801 | 23 | from .extractor.bliptv import BlipTVIE, BlipTVUserIE |
ea63e499 | 24 | from .extractor.comedycentral import ComedyCentralIE |
7beb36a5 | 25 | from .extractor.collegehumor import CollegeHumorIE |
219b8130 | 26 | from .extractor.dailymotion import DailymotionIE |
426ff042 | 27 | from .extractor.depositfiles import DepositFilesIE |
82840042 | 28 | from .extractor.eighttracks import EightTracksIE |
15369766 | 29 | from .extractor.escapist import EscapistIE |
a50e1b32 | 30 | from .extractor.facebook import FacebookIE |
8f0578f0 | 31 | from .extractor.funnyordie import FunnyOrDieIE |
9f4e6bba | 32 | from .extractor.gametrailers import GametrailersIE |
9b122384 | 33 | from .extractor.generic import GenericIE |
7aca14a1 PH |
34 | from .extractor.googleplus import GooglePlusIE |
35 | from .extractor.googlesearch import GoogleSearchIE | |
fda7d31a | 36 | from .extractor.infoq import InfoQIE |
79e93125 | 37 | from .extractor.justintv import JustinTVIE |
2c64df03 | 38 | from .extractor.keek import KeekIE |
38cbc40a | 39 | from .extractor.metacafe import MetacafeIE |
80cbb6dd | 40 | from .extractor.mixcloud import MixcloudIE |
33505666 | 41 | from .extractor.mtv import MTVIE |
97d2db01 | 42 | from .extractor.myspass import MySpassIE |
a08dfd27 | 43 | from .extractor.myvideo import MyVideoIE |
5b286728 | 44 | from .extractor.nba import NBAIE |
38cbc40a | 45 | from .extractor.statigram import StatigramIE |
97d6faac | 46 | from .extractor.photobucket import PhotobucketIE |
1183b85f | 47 | from .extractor.pornotube import PornotubeIE |
e10e576f | 48 | from .extractor.rbmaradio import RBMARadioIE |
aad0d6d5 | 49 | from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE |
291a168b | 50 | from .extractor.stanfordoc import StanfordOpenClassroomIE |
462dc88b | 51 | from .extractor.steam import SteamIE |
9fd5ce0c | 52 | from .extractor.ted import TEDIE |
78af8eb1 | 53 | from .extractor.ustream import UstreamIE |
b3d14cbf | 54 | from .extractor.vimeo import VimeoIE |
250f5578 | 55 | from .extractor.worldstarhiphop import WorldStarHipHopIE |
462dc88b | 56 | from .extractor.xnxx import XNXXIE |
cbf46c73 | 57 | from .extractor.xvideos import XVideosIE |
934858ad | 58 | from .extractor.yahoo import YahooIE, YahooSearchIE |
c3c77cec | 59 | from .extractor.youjizz import YouJizzIE |
9c286cfa | 60 | from .extractor.youku import YoukuIE |
0143dc02 | 61 | from .extractor.youporn import YouPornIE |
b05654f0 | 62 | from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE |
d5822b96 | 63 | from .extractor.zdf import ZDFIE |
e30e9318 | 64 | |
d830b7c2 | 65 | |
d77c3dfd | 66 | |
d77c3dfd | 67 | |
d77c3dfd | 68 | |
6de7ef9b | 69 | |
302efc19 | 70 | |
5dc846fa FV |
71 | |
72 | ||
fd873c69 FV |
73 | |
74 | ||
4cc3d074 | 75 | |
0b40544f | 76 | |
21a9c6aa | 77 | |
d0d4f277 | 78 | |
ef0c8d5f | 79 | |
4aeae91f | 80 | |
40634747 | 81 | |
4aeae91f | 82 | |
991ba7fa | 83 | |
991ba7fa | 84 | |
6324fd1d | 85 | |
991ba7fa | 86 | |
991ba7fa | 87 | |
991ba7fa JC |
88 | |
89 | ||
da06e2da | 90 | |
da06e2da | 91 | |
1ad5d872 | 92 | |
class SpiegelIE(InfoExtractor):
    """Information Extractor for spiegel.de videos"""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        The title is read from the video page; the media file name and the
        duration are read from the last child of a per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_doc = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child element of the XML root is used.
        variant = xml.etree.ElementTree.fromstring(xml_doc)[-1]
        media_file = variant.findall('./filename')[0].text
        length = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + media_file,
            # The extension is whatever follows the last dot of the file name.
            'ext': media_file.rpartition('.')[2],
            'title': title,
            'duration': length,
        }]
124 | ||
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com videos"""

    # The scheme is optional; "https?" accepts both http:// and https://.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title value carries a site prefix that is stripped here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are requested with fatal=False
        # (presumably making them optional -- confirm against InfoExtractor).
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
161 | ||
f2cd958c | 162 | |
f2cd958c | 163 | |
class TumblrIE(InfoExtractor):
    """Information Extractor for videos posted on Tumblr blogs"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at url.

        Raises ExtractorError when no video source is found in the page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical /post/ page, whatever form the URL had.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with quotes written as a literal \x22.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster.  The pattern has exactly one capture
        # group so that the poster URL (and not the filler text preceding
        # it) is the value handed back by the search helper.
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            # Drop the backslashes left over from the embedded escaping.
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
197 | ||
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a free track.

        Raises ExtractorError when the page offers no free download or when
        one of the expected pieces of data cannot be located.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download URL')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final track URL')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
243 | ||
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is taken from the mp4 <source> tag of the page.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [result]
7f5bd09b | 271 | |
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Both the media URL and the title are read from the MRSS notice
        downloaded for the video id, not from the page at url itself.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        notice = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')

        # The title is wrapped in a CDATA section inside <title>.
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [result]
e32b06e9 | 298 | |
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The page is always fetched from the www.howcast.com address rebuilt
        from the numeric video id.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The meta "content" attribute may use either quote style, hence
        # the two alternative capture groups.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)

        poster = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
        }
        return [result]
332 | ||
5b0d3cc0 AB |
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The page is always fetched from the https://vine.co/v/<id> address
        rebuilt from the id found in url.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # The second, optional group swallows a query string on the image URL.
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)

        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'uploader': author,
        }
        return [result]
366 | ||
afef36c9 AB |
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Three downloads are chained: the photo page (which yields a secret),
        a first data document (which yields a node id) and a playlist
        document (which yields the stream location).

        Raises ExtractorError when the playlist holds no stream entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        owner_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + owner_id + '/' + video_id,
            video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self',
            video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1',
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is the APP prefix followed by the unescaped FULLPATH.
        app_prefix, full_path = stream.group(1), stream.group(2)
        media_url = app_prefix + unescapeHTML(full_path)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')

        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)

        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
            'uploader_id': owner_id,
        }
        return [result]
415 | ||
45014296 JMF |
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        The numeric video id is not part of the URL; it is read from the
        page and then used to download the XML document naming the file.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')

        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')

        # Only the <file type="high"> entry of the data document is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': description,
        }
        return [result]
84095012 | 454 | |
71e458d4 YUK |
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that only a literal "www." prefix
    # is accepted.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError when the media location is not found in the
        page.  A missing upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The page is fetched from an address rebuilt from the id alone.
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not extracted: it can't be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenated as YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
afef36c9 | 506 | |
157b864a YK |
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at url.

        The track page is requested first; its Set-Cookie header is replayed
        on a second request that returns the JSON naming the media URL.

        Raises ExtractorError if url does not match _VALID_URL or if either
        JSON payload cannot be decoded.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        page_request = compat_urllib_request.Request(url + "?" + query)
        page, handle = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        cookie = handle.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        raw_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            page, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            # Only the first track of the embedded list is used.
            track = json.loads(raw_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported is the one found in the JSON.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        serve_request = compat_urllib_request.Request(
            serve_url, "", {'Content-Type': 'application/json'})
        serve_request.add_header('cookie', cookie)
        raw_song = self._download_webpage(
            serve_request, track_id, u'Downloading metadata')
        try:
            song = json.loads(raw_song)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        result = {
            'id': track_id,
            'url': song[u"url"],
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }
        return [result]
556 | ||
ecb3e676 YK |
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only sets window.location; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is expected to hold exactly two '&'-separated
        # key=value pairs (media URL, then thumbnail URL).  Split on the
        # first '=' only, so a value that itself contains '=' stays whole.
        (final_url, thumbnail_url) = [
            pair.split('=', 1)[1] for pair in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
157b864a | 592 | |
32aa88bc | 593 | |
4aeae91f PH |
def gen_extractors():
    """ Build one fresh instance of each supported extractor.
    Order is significant: a URL is handled by the first extractor in the
    list that matches it, so the catch-all GenericIE has to stay last.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ordered_classes]
93412126 JMF |
663 | |
def get_info_extractor(ie_name):
    """Look up, in this module's globals, the object named ie_name + 'IE'.

    Raises KeyError when no such name exists in the module namespace.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]