jfr.im git - yt-dlp.git/blob - yt_dlp/extractor/jiosaavn.py
[ie/jiosaavn] Extract via API and fix playlists (#9656)
[yt-dlp.git] / yt_dlp / extractor / jiosaavn.py
1 import functools
2 import math
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 InAdvancePagedList,
8 clean_html,
9 int_or_none,
10 make_archive_id,
11 smuggle_url,
12 unsmuggle_url,
13 url_basename,
14 url_or_none,
15 urlencode_postdata,
16 )
17 from ..utils.traversal import traverse_obj
18
19
class JioSaavnBaseIE(InfoExtractor):
    """Helpers shared by the JioSaavn extractors: API calls, song metadata and formats."""

    _API_URL = 'https://www.jiosaavn.com/api.php'
    # Values accepted for the `bitrate` extractor argument
    _VALID_BITRATES = {'16', '32', '64', '128', '320'}

    @functools.cached_property
    def requested_bitrates(self):
        """Return the bitrates given via the `bitrate` extractor argument.

        Defaults to ['128', '320'] when the argument is not supplied.
        Raises ValueError if any requested value is not in `_VALID_BITRATES`.
        """
        requested_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn')
        if invalid_bitrates := set(requested_bitrates) - self._VALID_BITRATES:
            # Sort the offending values so the message does not depend on set iteration order
            raise ValueError(
                f'Invalid bitrate(s): {", ".join(sorted(invalid_bitrates))}. '
                + f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}')
        return requested_bitrates

    def _extract_formats(self, song_data):
        """Yield one format dict per requested bitrate.

        `song_data` must provide the 'id' and 'encrypted_media_url' keys.
        A bitrate whose response has no usable 'auth_url' is skipped with a warning.
        """
        for bitrate in self.requested_bitrates:
            media_data = self._download_json(
                self._API_URL, song_data['id'],
                f'Downloading format info for {bitrate}',
                fatal=False, data=urlencode_postdata({
                    '__call': 'song.generateAuthToken',
                    '_format': 'json',
                    'bitrate': bitrate,
                    'url': song_data['encrypted_media_url'],
                }))
            # The request is non-fatal, so media_data may not be a usable dict here
            if not traverse_obj(media_data, ('auth_url', {url_or_none})):
                self.report_warning(f'Unable to extract format info for {bitrate}')
                continue
            ext = media_data.get('type')
            yield {
                'url': media_data['auth_url'],
                'ext': 'm4a' if ext == 'mp4' else ext,
                'format_id': bitrate,
                'abr': int(bitrate),
                'vcodec': 'none',
            }

    def _extract_song(self, song_data, url=None):
        """Build the metadata dict for a song from its API data.

        `url` is only used as a fallback when the API data has no valid 'perma_url'.
        When either is available, its basename becomes 'display_id' and is also
        recorded in '_old_archive_ids'.
        """
        info = traverse_obj(song_data, {
            'id': ('id', {str}),
            'title': ('song', {clean_html}),
            'album': ('album', {clean_html}),
            # Rewrite the size suffix of the image URL to request the 500x500 variant
            'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}),
            'duration': ('duration', {int_or_none}),
            'view_count': ('play_count', {int_or_none}),
            'release_year': ('year', {int_or_none}),
            'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}),
            'webpage_url': ('perma_url', {url_or_none}),
        })
        if webpage_url := info.get('webpage_url') or url:
            info['display_id'] = url_basename(webpage_url)
            info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])]

        return info

    def _call_api(self, type_, token, note='API', params=None):
        """Download and return the `webapi.get` JSON for `token` of the given `type_`.

        `params` is an optional mapping of extra query parameters; they are merged
        last and therefore override the defaults on key collision.
        """
        return self._download_json(
            self._API_URL, token, f'Downloading {note} JSON', f'Unable to download {note} JSON',
            query={
                '__call': 'webapi.get',
                '_format': 'json',
                '_marker': '0',
                'ctx': 'web6dot0',
                'token': token,
                'type': type_,
                # `None` default instead of a shared mutable `{}` default
                **(params or {}),
            })

    def _yield_songs(self, playlist_data):
        """Yield a url_transparent result for every usable entry in `playlist_data['songs']`.

        Entries without an 'id' or without a valid 'perma_url' are skipped.
        """
        # Validate perma_url with url_or_none so `song_info['webpage_url']` below is always present
        for song_data in traverse_obj(
                playlist_data, ('songs', lambda _, v: v['id'] and url_or_none(v['perma_url']))):
            song_info = self._extract_song(song_data)
            url = smuggle_url(song_info['webpage_url'], {
                'id': song_data['id'],
                # A missing value must not abort the whole playlist; JioSaavnSongIE only
                # uses smuggled data when both values are strings and calls the API otherwise
                'encrypted_media_url': song_data.get('encrypted_media_url'),
            })
            yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info)
95
96
class JioSaavnSongIE(JioSaavnBaseIE):
    """Extractor for single JioSaavn song pages."""

    IE_NAME = 'jiosaavn:song'
    _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk',
        'md5': '3b84396d15ed9e083c3106f1fa589c04',
        'info_dict': {
            'id': 'IcoLuefJ',
            'display_id': 'OQsEfQFVUXk',
            'ext': 'm4a',
            'title': 'Leja Re',
            'album': 'Leja Re',
            'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg',
            'duration': 205,
            'view_count': int,
            'release_year': 2018,
            'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'],
            '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'],
        },
    }, {
        'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the song's formats, plus full metadata unless the URL carried smuggled data."""
        url, smuggled = unsmuggle_url(url)
        song_data = traverse_obj(smuggled, {
            'id': ('id', {str}),
            'encrypted_media_url': ('encrypted_media_url', {str}),
        })

        came_from_playlist = 'id' in song_data and 'encrypted_media_url' in song_data
        if came_from_playlist:
            # url_transparent result: the metadata is already supplied by the caller
            info = {'id': song_data['id']}
        else:
            song_data = self._call_api('song', self._match_id(url))['songs'][0]
            info = self._extract_song(song_data, url)

        return {**info, 'formats': list(self._extract_formats(song_data))}
137
138
class JioSaavnAlbumIE(JioSaavnBaseIE):
    """Extractor for JioSaavn album pages."""

    IE_NAME = 'jiosaavn:album'
    _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_',
        'info_dict': {
            'id': 'buIOjYZDrNA_',
            'title': '96',
        },
        'playlist_count': 10,
    }]

    def _real_extract(self, url):
        """Return a playlist of the album's songs, titled with the album's 'title' field."""
        album_token = self._match_id(url)
        album = self._call_api('album', album_token)
        album_title = traverse_obj(album, ('title', {str}))
        return self.playlist_result(self._yield_songs(album), album_token, album_title)
157
158
class JioSaavnPlaylistIE(JioSaavnBaseIE):
    """Extractor for JioSaavn playlists, fetched page by page."""

    IE_NAME = 'jiosaavn:playlist'
    _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__',
        'info_dict': {
            'id': 'LlJ8ZWT1ibN5084vKHRj2Q__',
            'title': 'Mood English',
        },
        'playlist_mincount': 301,
    }, {
        'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-hindi/DVR,pFUOwyXqIp77B1JF,A__',
        'info_dict': {
            'id': 'DVR,pFUOwyXqIp77B1JF,A__',
            'title': 'Mood Hindi',
        },
        'playlist_mincount': 801,
    }]
    # Number of songs requested per API page
    _PAGE_SIZE = 50

    def _fetch_page(self, token, page):
        """Download the playlist API page numbered `page`."""
        paging = {'p': page, 'n': self._PAGE_SIZE}
        return self._call_api('playlist', token, f'playlist page {page}', paging)

    def _entries(self, token, first_page_data, page):
        """Yield the songs of one page.

        `page` is treated as zero-based: 0 reuses the already downloaded
        `first_page_data`, any other value requests API page `page + 1`.
        """
        if page:
            page_data = self._fetch_page(token, page + 1)
        else:
            page_data = first_page_data
        yield from self._yield_songs(page_data)

    def _real_extract(self, url):
        """Return a paged playlist; the page count is derived from the first page's 'list_count'."""
        token = self._match_id(url)
        first_page = self._fetch_page(token, 1)
        page_count = math.ceil(int(first_page['list_count']) / self._PAGE_SIZE)

        entries = InAdvancePagedList(
            functools.partial(self._entries, token, first_page), page_count, self._PAGE_SIZE)
        return self.playlist_result(entries, token, traverse_obj(first_page, ('listname', {str})))