]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/imgur.py
Improve extraction (Closes #7918)
[yt-dlp.git] / youtube_dl / extractor / imgur.py
CommitLineData
3bf57053
PH
1from __future__ import unicode_literals
2
3import re
4
5from .common import InfoExtractor
96b96909 6from ..compat import compat_urlparse
3bf57053
PH
7from ..utils import (
8 int_or_none,
9 js_to_json,
10 mimetype2ext,
1a13940c 11 ExtractorError,
3bf57053
PH
12)
13
b88ba053 14
class ImgurIE(InfoExtractor):
    """Extract the video (and optional GIF) formats of a single Imgur item."""

    # imgur.com or i.imgur.com, an optional "gallery/" prefix, then an ID of
    # six or more alphanumeric characters.
    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(gallery/)?(?P<id>[a-zA-Z0-9]{6,})'

    _TESTS = [{
        'url': 'https://i.imgur.com/A61SaA1.gifv',
        'info_dict': {
            'id': 'A61SaA1',
            'ext': 'mp4',
            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
            'description': 'Imgur: The most awesome images on the Internet.',
        },
    }, {
        'url': 'https://imgur.com/A61SaA1',
        'info_dict': {
            'id': 'A61SaA1',
            'ext': 'mp4',
            'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
            'description': 'Imgur: The most awesome images on the Internet.',
        },
    }, {
        'url': 'https://imgur.com/gallery/YcAQlkx',
        'info_dict': {
            'id': 'YcAQlkx',
            'ext': 'mp4',
            'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
            'description': 'Imgur: The most awesome images on the Internet.',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for the Imgur item addressed by ``url``.

        Raises ExtractorError (expected=True) when the page contains no
        ``video-elements`` container.
        """
        video_id = self._match_id(url)
        # Joining the bare ID onto the URL replaces the last path segment,
        # e.g. ".../A61SaA1.gifv" becomes ".../A61SaA1".
        webpage = self._download_webpage(
            compat_urlparse.urljoin(url, video_id), video_id)

        width = int_or_none(self._search_regex(
            r'<param name="width" value="([0-9]+)"',
            webpage, 'width', fatal=False))
        height = int_or_none(self._search_regex(
            r'<param name="height" value="([0-9]+)"',
            webpage, 'height', fatal=False))

        video_elements = self._search_regex(
            r'(?s)<div class="video-elements">(.*?)</div>',
            webpage, 'video elements', default=None)
        if not video_elements:
            raise ExtractorError(
                'No sources found for video %s. Maybe an image?' % video_id,
                expected=True)

        # Each format gets its own copy of the headers so that a later
        # per-format modification cannot leak into the other formats.
        http_headers = {
            'User-Agent': 'youtube-dl (like wget)',
        }

        formats = []
        for m in re.finditer(r'<source\s+src="(?P<src>[^"]+)"\s+type="(?P<type>[^"]+)"', video_elements):
            mime_type = m.group('type')
            formats.append({
                # The MIME subtype ("mp4" of "video/mp4") is the format id.
                'format_id': mime_type.partition('/')[2],
                'url': self._proto_relative_url(m.group('src')),
                'ext': mimetype2ext(mime_type),
                'acodec': 'none',
                'width': width,
                'height': height,
                'http_headers': dict(http_headers),
            })

        # The GIF rendition is a low-preference extra. Neither a missing
        # "videoItem" object, unparsable JSON, nor an absent "gifUrl" may
        # abort an extraction that already found <source> formats.
        gif_json = self._search_regex(
            r'(?s)var\s+videoItem\s*=\s*(\{.*?\})',
            webpage, 'GIF code', fatal=False)
        gifd = None
        if gif_json:
            gifd = self._parse_json(
                gif_json, video_id, transform_source=js_to_json, fatal=False)
        if isinstance(gifd, dict) and gifd.get('gifUrl'):
            formats.append({
                'format_id': 'gif',
                'preference': -10,
                'width': width,
                'height': height,
                'ext': 'gif',
                'acodec': 'none',
                'vcodec': 'gif',
                'container': 'gif',
                'url': self._proto_relative_url(gifd['gifUrl']),
                'filesize': int_or_none(gifd.get('size')),
                'http_headers': dict(http_headers),
            })

        self._sort_formats(formats)

        return {
            'id': video_id,
            'formats': formats,
            'description': self._og_search_description(webpage),
            'title': self._og_search_title(webpage),
        }
8875b3d5
S
109
110
class ImgurAlbumIE(InfoExtractor):
    """Turn an Imgur album/gallery URL into a playlist of its images."""

    # Album IDs are exactly five alphanumeric characters; the negative
    # lookahead rejects longer IDs.
    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(gallery/)?(?P<id>[a-zA-Z0-9]{5})(?![a-zA-Z0-9])'

    _TEST = {
        'url': 'http://imgur.com/gallery/Q95ko',
        'info_dict': {
            'id': 'Q95ko',
        },
        'playlist_count': 25,
    }

    def _real_extract(self, url):
        """Return a playlist of the album's images.

        When the album endpoint reports no data, the ID is handed on as a
        plain ``http://imgur.com/<id>`` URL result instead.
        """
        album_id = self._match_id(url)

        album_img_data = self._download_json(
            'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, album_id)['data']

        # A truthiness test also covers ``"data": null``, on which a
        # ``len()`` call would raise TypeError.
        if not album_img_data:
            return self.url_result('http://imgur.com/%s' % album_id)

        # Images without a hash cannot be addressed and are skipped.
        entries = [
            self.url_result('http://imgur.com/%s' % image['hash'])
            for image in album_img_data['images'] if image.get('hash')]

        return self.playlist_result(entries, album_id)