2 from __future__
import unicode_literals
8 from urllib
import quote
, urlencode
9 from os
.path
import basename
11 from .common
import InfoExtractor
12 from ..utils
import ExtractorError
, compat_urllib_request
, compat_html_parser
14 from ..utils
import compat_urlparse
16 urlunparse
= compat_urlparse
.urlunparse
17 urldefrag
= compat_urlparse
.urldefrag
20 class GroovesharkHtmlParser(compat_html_parser
.HTMLParser
):
22 self
._current
_object
= None
24 compat_html_parser
.HTMLParser
.__init
__(self
)
26 def handle_starttag(self
, tag
, attrs
):
27 attrs
= dict((k
, v
) for k
, v
in attrs
)
29 self
._current
_object
= {'attrs': attrs, 'params': []}
31 self
._current
_object
['params'].append(attrs
)
33 def handle_endtag(self
, tag
):
35 self
.objects
.append(self
._current
_object
)
36 self
._current
_object
= None
39 def extract_object_tags(cls
, html
):
46 class GroovesharkIE(InfoExtractor
):
47 _VALID_URL
= r
'https?://(www\.)?grooveshark\.com/#!/s/([^/]+)/([^/]+)'
49 'url': 'http://grooveshark.com/#!/s/Jolene+Tenth+Key+Remix+Ft+Will+Sessions/6SS1DW?src=5',
50 'md5': 'bbccc50b19daca23b8f961152c1dc95b',
53 'title': 'Jolene (Tenth Key Remix ft. Will Sessions)',
59 do_playerpage_request
= True
60 do_bootstrap_request
= True
62 def _parse_target(self
, target
):
63 uri
= compat_urlparse
.urlparse(target
)
64 hash = uri
.fragment
[1:].split('?')[0]
65 token
= basename(hash.rstrip('/'))
66 return (uri
, hash, token
)
def _build_bootstrap_url(self, target):
    """Build the preload.php URL that asks for the communication token.

    Returns a 2-tuple ``(url, token)``. ``url`` reuses the scheme and
    host of ``target``; ``token`` is the third element returned by
    ``self._parse_target(target)``.
    """
    parsed, fragment_path, song_token = self._parse_target(target)
    # safe='' so that '/' inside the hash is percent-encoded as well.
    escaped_hash = quote(fragment_path, safe='')
    # self.ts is appended as a bare, value-less query component.
    query_string = 'getCommunicationToken=1&hash=%s&%d' % (escaped_hash, self.ts)
    components = (
        parsed.scheme,
        parsed.netloc,
        '/preload.php',
        None,
        query_string,
        None,
    )
    return urlunparse(components), song_token
def _build_meta_url(self, target):
    """Build the preload.php URL that asks for the track metadata.

    Returns a 2-tuple ``(url, token)``. ``url`` reuses the scheme and
    host of ``target``; ``token`` is the third element returned by
    ``self._parse_target(target)``.
    """
    parsed, fragment_path, song_token = self._parse_target(target)
    # safe='' so that '/' inside the hash is percent-encoded as well.
    escaped_hash = quote(fragment_path, safe='')
    # self.ts is appended as a bare, value-less query component.
    query_string = 'hash=%s&%d' % (escaped_hash, self.ts)
    components = (
        parsed.scheme,
        parsed.netloc,
        '/preload.php',
        None,
        query_string,
        None,
    )
    return urlunparse(components), song_token
78 def _build_stream_url(self
, meta
):
79 return urlunparse(('http', meta
['streamKey']['ip'], '/stream.php', None, None, None))
def _build_swf_referer(self, target, obj):
    """Build the URL to use as the referer for the SWF player object.

    The scheme and host are taken from ``target``; the path component
    is ``obj['attrs']['data']``.
    """
    parsed, _hash, _token = self._parse_target(target)
    swf_path = obj['attrs']['data']
    components = (parsed.scheme, parsed.netloc, swf_path, None, None, None)
    return urlunparse(components)
85 def _transform_bootstrap(self
, js
):
86 return re
.split('(?m)^\s*try\s*{', js
)[0] \
87 .split(' = ', 1)[1].strip().rstrip(';')
89 def _transform_meta(self
, js
):
90 return js
.split('\n')[0].split('=')[1].rstrip(';')
92 def _get_meta(self
, target
):
93 (meta_url
, token
) = self
._build
_meta
_url
(target
)
94 self
.to_screen('Metadata URL: %s' % meta_url
)
96 headers
= {'Referer': urldefrag(target)[0]}
97 req
= compat_urllib_request
.Request(meta_url
, headers
=headers
)
98 res
= self
._download
_json
(req
, token
,
99 transform_source
=self
._transform
_meta
)
101 if 'getStreamKeyWithSong' not in res
:
102 raise ExtractorError(
103 'Metadata not found. URL may be malformed, or Grooveshark API may have changed.')
105 if res
['getStreamKeyWithSong'] is None:
106 raise ExtractorError(
107 'Metadata download failed, probably due to Grooveshark anti-abuse throttling. Wait at least an hour before retrying from this IP.',
110 return res
['getStreamKeyWithSong']
112 def _get_bootstrap(self
, target
):
113 (bootstrap_url
, token
) = self
._build
_bootstrap
_url
(target
)
115 headers
= {'Referer': urldefrag(target)[0]}
116 req
= compat_urllib_request
.Request(bootstrap_url
, headers
=headers
)
117 res
= self
._download
_json
(req
, token
, fatal
=False,
118 note
='Downloading player bootstrap data',
119 errnote
='Unable to download player bootstrap data',
120 transform_source
=self
._transform
_bootstrap
)
123 def _get_playerpage(self
, target
):
124 (_
, _
, token
) = self
._parse
_target
(target
)
127 webpage
= self
._download
_webpage
(
129 note
='Downloading player page',
130 errnote
='Unable to download player page',
133 if webpage
is not None:
134 # Search (for example German) error message
135 error_msg
= self
._html
_search
_regex
(
136 r
'<div id="content">\s*<h2>(.*?)</h2>', webpage
,
137 'error message', default
=None)
138 if error_msg
is not None:
139 error_msg
= error_msg
.replace('\n', ' ')
140 raise ExtractorError('Grooveshark said: %s' % error_msg
)
142 if webpage
is not None:
143 o
= GroovesharkHtmlParser
.extract_object_tags(webpage
)
144 return (webpage
, [x
for x
in o
if x
['attrs']['id'] == 'jsPlayerEmbed'])
146 return (webpage
, None)
148 def _real_initialize(self
):
149 self
.ts
= int(time
.time() * 1000) # timestamp in millis
151 def _real_extract(self
, url
):
152 (target_uri
, _
, token
) = self
._parse
_target
(url
)
154 # 1. Fill cookiejar by making a request to the player page
156 if self
.do_playerpage_request
:
157 (_
, player_objs
) = self
._get
_playerpage
(url
)
158 if player_objs
is not None:
159 swf_referer
= self
._build
_swf
_referer
(url
, player_objs
[0])
160 self
.to_screen('SWF Referer: %s' % swf_referer
)
162 # 2. Ask preload.php for swf bootstrap data to better mimic webapp
163 if self
.do_bootstrap_request
:
164 bootstrap
= self
._get
_bootstrap
(url
)
165 self
.to_screen('CommunicationToken: %s' % bootstrap
['getCommunicationToken'])
167 # 3. Ask preload.php for track metadata.
168 meta
= self
._get
_meta
(url
)
170 # 4. Construct stream request for track.
171 stream_url
= self
._build
_stream
_url
(meta
)
172 duration
= int(math
.ceil(float(meta
['streamKey']['uSecs']) / 1000000))
173 post_dict
= {'streamKey': meta['streamKey']['streamKey']}
174 post_data
= urlencode(post_dict
).encode('utf-8')
176 'Content-Length': len(post_data
),
177 'Content-Type': 'application/x-www-form-urlencoded'
179 if swf_referer
is not None:
180 headers
['Referer'] = swf_referer
184 'title': meta
['song']['Name'],
185 'http_method': 'POST',
188 'format': 'mp3 audio',
189 'duration': duration
,
190 'http_post_data': post_data
,
191 'http_headers': headers
,