]>
Commit | Line | Data |
---|---|---|
a00d73c8 EJ |
1 | # encoding: utf-8 |
2 | from __future__ import unicode_literals | |
3 | ||
4 | import re | |
5 | ||
6 | from .common import InfoExtractor | |
7 | from ..utils import ( | |
8 | ExtractorError, | |
9 | compat_html_parser, | |
10 | #compat_urllib_request, | |
11 | #compat_urllib_parse, | |
12 | ) | |
13 | ||
14 | ||
class PatreonHTMLParser(compat_html_parser.HTMLParser):
    """Scrape attachment URL, file extension and title from a creation page.

    The parser keeps two parallel stacks while walking the document: the
    lowercased names of the currently open tags and their attribute dicts.
    Text nodes are then inspected in ``handle_data`` and recorded in
    ``creation_info`` when the chain of enclosing ``div`` elements carries
    the expected CSS classes.

    NOTE: ``handle_endtag`` pops the top entry without comparing tag names,
    so the stacks mirror the document only for well-nested markup.  All
    matching is done relative to the top of the stack.
    """

    # Site root prepended to relative attachment links.
    _PREFIX = 'http://www.patreon.com'

    # Enclosing elements (outermost first) required around an attachment
    # link, and the class each of them must start with.
    _ATTACH_TAGS = 5 * ['div']
    _ATTACH_CLASSES = [
        'fancyboxhidden', 'box photo', 'boxwrapper',
        'hiddendisplay shareinfo', 'attach'
    ]

    # Enclosing elements (outermost first) required around the title element.
    _INFO_TAGS = 4 * ['div']
    _INFO_CLASSES = [
        'fancyboxhidden', 'box photo', 'boxwrapper',
        'hiddendisplay shareinfo'
    ]

    def _match(self, attrs_classes, desired):
        """Return True if every class string starts with its desired value.

        Prefix matching (rather than equality) accepts class attributes
        with extra trailing names, e.g. "boxwrapper double".  Lists of
        different length never match.
        """
        if len(attrs_classes) != len(desired):
            return False
        return all(
            found.startswith(wanted)
            for found, wanted in zip(attrs_classes, desired)
        )

    def _ancestor_classes(self, count):
        """Return the lowercased class strings of the ``count`` ancestors.

        The innermost (current) element is excluded.  A missing or
        valueless ``class`` attribute is treated as an empty string.
        """
        return [
            (attrs.get('class') or '').lower()
            for attrs in self.attrs_stack[-(count + 1):-1]
        ]

    def get_creation_info(self, html_data):
        """Parse ``html_data`` and fill ``self.creation_info``.

        ``creation_info`` may end up with the keys 'url', 'ext' and
        'title'; keys whose markup was not found are simply absent.
        """
        # Drop any input left over from a previous run on this instance.
        self.reset()
        self.tag_stack = []
        self.attrs_stack = []
        self.creation_info = {}
        self.feed(html_data)

    def handle_starttag(self, tag, attrs):
        """Push the opened tag and its attributes onto the stacks."""
        self.tag_stack.append(tag.lower())
        self.attrs_stack.append(dict(attrs))

    def handle_endtag(self, tag):
        """Pop the innermost open tag; ignore a stray end tag."""
        # A closing tag with nothing open would otherwise raise IndexError.
        if self.tag_stack:
            self.tag_stack.pop()
            self.attrs_stack.pop()

    def handle_data(self, data):
        """Record the URL/extension or title carried by a text node."""
        # Check first if this is a creation attachment
        if self.tag_stack[-6:-1] == self._ATTACH_TAGS:
            attach_depth = len(self._ATTACH_TAGS)
            if (self._match(self._ancestor_classes(attach_depth),
                            self._ATTACH_CLASSES)
                    and self.tag_stack[-1] == 'a'):
                href = self.attrs_stack[-1].get('href')
                # An anchor without a usable href carries no download link.
                if href:
                    if href.startswith(('http://', 'https://')):
                        # Already absolute: prefixing would corrupt it.
                        url = href
                    else:
                        url = self._PREFIX + href
                    self.creation_info['url'] = url
                    if '.' in data:
                        # The link text is the file name; keep what follows
                        # the last dot, without surrounding whitespace.
                        self.creation_info['ext'] = (
                            data.strip().rsplit('.', 1)[-1])
        # Next, check if this is within the div containing the creation info
        if self.tag_stack[-5:-1] == self._INFO_TAGS:
            info_depth = len(self._INFO_TAGS)
            if self._match(self._ancestor_classes(info_depth),
                           self._INFO_CLASSES):
                if self.attrs_stack[-1].get('class') == 'utitle':
                    self.creation_info['title'] = data.strip()
72 | ||
73 | ||
class PatreonIE(InfoExtractor):
    """Information extractor for ``patreon.com/creation?hid=...`` pages."""

    IE_NAME = 'patreon'
    # The id ends at the first '&' or '#', so extra query parameters or a
    # fragment after ``hid`` do not become part of the video id.
    _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P<id>[^&#]+)'
    _TESTS = [
        # CSS names with "double" in the name, i.e. "boxwrapper double"
        {
            'url': 'http://www.patreon.com/creation?hid=743933',
            'md5': 'e25505eec1053a6e6813b8ed369875cc',
            'info_dict': {
                'id': '743933',
                'ext': 'mp3',
                'title': 'Episode 166: David Smalley of Dogma Debate',
                'uploader': 'Cognitive Dissonance Podcast',
            },
        },
        {
            'url': 'http://www.patreon.com/creation?hid=754133',
            'md5': '3eb09345bf44bf60451b8b0b81759d0a',
            'info_dict': {
                'id': '754133',
                'ext': 'mp3',
                'title': 'CD 167 Extra',
                'uploader': 'Cognitive Dissonance Podcast',
            },
        },
    ]

    # Currently Patreon exposes download URL via hidden CSS, so login is not
    # needed. Keeping this commented for when this inevitably changes.
    '''
    def _login(self):
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_form = {
            'redirectUrl': 'http://www.patreon.com/',
            'email': username,
            'password': password,
        }

        request = compat_urllib_request.Request(
            'https://www.patreon.com/processLogin',
            compat_urllib_parse.urlencode(login_form).encode('utf-8')
        )
        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)

        if re.search(r'onLoginFailed', login_page):
            raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)

    def _real_initialize(self):
        self._login()
    '''

    def _real_extract(self, url):
        """Download the creation page and build its info dict.

        Returns a dict with 'id', plus 'uploader' when the page names one,
        plus whatever ``PatreonHTMLParser`` found ('url', 'ext', 'title').

        Raises ExtractorError when no attachment URL could be found.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        info_page = self._download_webpage(url, video_id)

        ret = {'id': video_id}

        # The uploader is optional: leave the key out when the page does
        # not contain the "<strong>NAME</strong> is creating" banner.
        uploader_match = re.search(
            r'<strong>(.+)</strong> is creating', info_page)
        if uploader_match is not None:
            ret['uploader'] = uploader_match.group(1)

        parser = PatreonHTMLParser()
        parser.get_creation_info(info_page)
        if not parser.creation_info.get('url'):
            raise ExtractorError('Unable to retrieve creation URL')
        ret.update(parser.creation_info)
        return ret