]> jfr.im git - yt-dlp.git/blob - youtube_dl/InfoExtractors.py
[Vine] move into own file
[yt-dlp.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vbox7 import Vbox7IE
59 from .extractor.vimeo import VimeoIE
60 from .extractor.vine import VineIE
61 from .extractor.worldstarhiphop import WorldStarHipHopIE
62 from .extractor.xnxx import XNXXIE
63 from .extractor.xvideos import XVideosIE
64 from .extractor.yahoo import YahooIE, YahooSearchIE
65 from .extractor.youjizz import YouJizzIE
66 from .extractor.youku import YoukuIE
67 from .extractor.youporn import YouPornIE
68 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
69 from .extractor.zdf import ZDFIE
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com video pages."""

    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Build the info dict for a single redtube video.

        The numeric id comes from the URL itself; the media URL and the
        title are scraped out of the downloaded page.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext' and 'title'.

        Raises ExtractorError when `url` does not match _VALID_URL.
        NOTE(review): the download/search helpers presumably raise as well
        when the page or a pattern is missing -- confirm in InfoExtractor.
        """
        match = re.match(self._VALID_URL, url)
        if not match:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 stream is looked up in an HTML5 <source> tag.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [info]
132
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""

    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract the mp4 URL and title of an ina.fr video.

        The video id (an 'I' followed by digits) is read from the URL and
        used to fetch the matching MRSS notice from player.ina.fr; both the
        media URL and the title are scraped from that notice.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext' and 'title'.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with the extractor's own error type instead of an
            # AttributeError on `None.group`.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so that only the literal
        # mp4.ina.fr host is accepted.
        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # The title is wrapped in a CDATA section inside <title>.
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
159
class HowcastIE(InfoExtractor):
    """Information extractor for howcast.com videos."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Scrape a howcast.com video page.

        The numeric id is taken from `url`, the canonical page
        'http://www.howcast.com/videos/<id>' is downloaded, and the media
        URL, title, description and thumbnail are pulled out of it.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'description' and 'thumbnail'.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The <meta> content value may be wrapped in either quote style.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # Description and thumbnail are requested with fatal=False
        # (presumably: a miss is tolerated -- confirm in InfoExtractor).
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail_url = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
        }
        return [info]
193
194
class FlickrIE(InfoExtractor):
    """Information extractor for videos hosted on Flickr."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo-page URL to its video stream.

        Three requests are made in sequence: the photo page (for the
        per-photo secret and the metadata), a first XML document (for the
        node id) and a second XML playlist (for the stream location).

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'description', 'thumbnail' and
        'uploader_id'.

        Raises ExtractorError when the playlist has no <STREAM> entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id,
            video_id)

        # The secret is needed by both follow-up requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        node_lookup_url = ''.join((
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ))
        node_xml = self._download_webpage(
            node_lookup_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', node_xml, u'node_id')

        playlist_url = ''.join((
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ))
        playlist_xml = self._download_webpage(
            playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if not stream:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped; the APP prefix is used as-is.
        app, full_path = stream.group(1), stream.group(2)
        media_url = app + unescapeHTML(full_path)

        # Metadata comes from the Open Graph tags of the photo page; the
        # content value may use either quote style.
        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail_url,
            'uploader_id': uploader_id,
        }
        return [info]
243
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com video pages."""

    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract a teamcoco.com video.

        The page at `url` supplies the numeric video id and the Open Graph
        metadata; the media URL is read from a separate XML document at
        'http://teamcoco.com/cvp/2.0/<id>.xml'.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'thumbnail' and 'description'.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the real id is known, the URL slug identifies the download.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        config_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        # Only the <file type="high"> entry is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', config_xml, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'description': description,
        }
        return [info]
282
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""

    # The dot after "www" is escaped so that only a literal "www." prefix
    # is accepted in front of the host name.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract an xHamster movie.

        The numeric id is taken from `url` and the page
        'http://xhamster.com/movies/<id>/.html' is downloaded. The media
        URL is assembled from the 'srv' / 'file' values found in the page;
        title, upload date, uploader and thumbnail are scraped as well.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title', 'upload_date', 'uploader_id' and
        'thumbnail'. 'upload_date' is a YYYYMMDD string, or None (with a
        warning reported) when no date is found in the page.

        Raises ExtractorError when `url` does not match _VALID_URL or when
        the 'srv' / 'file' pair cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fail with the extractor's own error type instead of an
            # AttributeError on `None.group`.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = mobj.group('server')
        media_file = mobj.group('file')
        if not server:
            # Without a server, 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(media_file)
        else:
            video_url = server + '/key=' + media_file
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is deliberately not extracted: it does not appear
        # anywhere in the site UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            # A missing date is not fatal; warn and carry on.
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
334
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""

    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract an mp3 track from a hypem.com track page.

        The track page is requested with 'ax' and 'ts' query parameters;
        its embedded JSON track list yields the id, key, artist and song
        title of the first track. A second request to
        'http://hypem.com/serve/source/<id>/<key>', carrying the cookie
        returned with the first response, yields the final media URL.

        Returns a one-element list holding a dict with the keys 'id',
        'url', 'ext', 'title' and 'artist'.

        Raises ExtractorError when `url` does not match _VALID_URL, when
        either JSON document cannot be parsed, or when the track list is
        empty or not shaped as expected.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie is replayed on the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError, TypeError):
            # Valid JSON, but no usable first track: report it as an
            # extraction failure rather than leaking a raw lookup error.
            raise ExtractorError(u'Hypemachine track list is empty or malformed.')

        key = track[u"key"]
        # From here on the id reported by the site replaces the URL one.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
384
385
386
def gen_extractors():
    """Return a list holding one fresh instance of every supported extractor.

    The order does matter: the first extractor matched is the one handling
    the URL, which is why GenericIE comes last.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
456
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    `ie_name` is the class name without the trailing 'IE'; the class is
    fetched from this module's global namespace, so an unknown name raises
    KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]