yt_dlp/extractor/brainpop.py

   1 import json
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     classproperty,
   7     int_or_none,
   8     traverse_obj,
   9     urljoin,
  10 )
  11
  12
  13 class BrainPOPBaseIE(InfoExtractor):
  14     _NETRC_MACHINE = 'brainpop'
  15     _ORIGIN = ''  # So that _VALID_URL doesn't crash
  16     _LOGIN_ERRORS = {
  17         1502: 'The username and password you entered did not match.',  # LOGIN_FAILED
  18         1503: 'Payment method is expired.',  # LOGIN_FAILED_ACCOUNT_NOT_ACTIVE
  19         1506: 'Your BrainPOP plan has expired.',  # LOGIN_FAILED_ACCOUNT_EXPIRED
  20         1507: 'Terms not accepted.',  # LOGIN_FAILED_TERMS_NOT_ACCEPTED
  21         1508: 'Account not activated.',  # LOGIN_FAILED_SUBSCRIPTION_NOT_ACTIVE
  22         1512: 'The maximum number of devices permitted are logged in with your account right now.',  # LOGIN_FAILED_LOGIN_LIMIT_REACHED
  23         1513: 'You are trying to access your account from outside of its allowed IP range.',  # LOGIN_FAILED_INVALID_IP
  24         1514: 'Individual accounts are not included in your plan. Try again with your shared username and password.',  # LOGIN_FAILED_MBP_DISABLED
  25         1515: 'Account not activated.',  # LOGIN_FAILED_TEACHER_NOT_ACTIVE
  26         1523: 'That username and password won\'t work on this BrainPOP site.',  # LOGIN_FAILED_NO_ACCESS
  27         1524: 'You\'ll need to join a class before you can login.',  # LOGIN_FAILED_STUDENT_NO_PERIOD
  28         1526: 'Your account is locked. Reset your password, or ask a teacher or administrator for help.',  # LOGIN_FAILED_ACCOUNT_LOCKED
  29     }
  30
  31     @classproperty
  32     def _VALID_URL(cls):
  33         root = re.escape(cls._ORIGIN).replace(r'https:', r'https?:').replace(r'www\.', r'(?:www\.)?')
  34         return rf'{root}/(?P<slug>[^/]+/[^/]+/(?P<id>[^/?#&]+))'
  35
  36     def _assemble_formats(self, slug, format_id, display_id, token='', extra_fields={}):
  37         formats = []
  38         formats = self._extract_m3u8_formats(
  39             f'{urljoin(self._HLS_URL, slug)}.m3u8?{token}',
  40             display_id, 'mp4', m3u8_id=f'{format_id}-hls', fatal=False)
  41         formats.append({
  42             'format_id': format_id,
  43             'url': f'{urljoin(self._VIDEO_URL, slug)}?{token}',
  44         })
  45         for f in formats:
  46             f.update(extra_fields)
  47         return formats
  48
  49     def _extract_adaptive_formats(self, data, token, display_id, key_format='%s', extra_fields={}):
  50         formats = []
  51         additional_key_formats = {
  52             '%s': {},
  53             'ad_%s': {
  54                 'format_note': 'Audio description',
  55                 'source_preference': -2
  56             }
  57         }
  58         for additional_key_format, additional_key_fields in additional_key_formats.items():
  59             for key_quality, key_index in enumerate(('high', 'low')):
  60                 full_key_index = additional_key_format % (key_format % key_index)
  61                 if data.get(full_key_index):
  62                     formats.extend(self._assemble_formats(data[full_key_index], full_key_index, display_id, token, {
  63                         'quality': -1 - key_quality,
  64                         **additional_key_fields,
  65                         **extra_fields
  66                     }))
  67         return formats
  68
  69     def _perform_login(self, username, password):
  70         login_res = self._download_json(
  71             'https://api.brainpop.com/api/login', None,
  72             data=json.dumps({'username': username, 'password': password}).encode(),
  73             headers={
  74                 'Content-Type': 'application/json',
  75                 'Referer': self._ORIGIN
  76             }, note='Logging in', errnote='Unable to log in', expected_status=400)
  77         status_code = int_or_none(login_res['status_code'])
  78         if status_code != 1505:
  79             self.report_warning(
  80                 f'Unable to login: {self._LOGIN_ERRORS.get(status_code) or login_res.get("message")}'
  81                 or f'Got status code {status_code}')
  82
  83
  84 class BrainPOPIE(BrainPOPBaseIE):
  85     _ORIGIN = 'https://www.brainpop.com'
  86     _VIDEO_URL = 'https://svideos.brainpop.com'
  87     _HLS_URL = 'https://hls.brainpop.com'
  88     _CDN_URL = 'https://cdn.brainpop.com'
  89     _TESTS = [{
  90         'url': 'https://www.brainpop.com/health/conflictresolution/martinlutherkingjr/movie?ref=null',
  91         'md5': '3ead374233ae74c7f1b0029a01c972f0',
  92         'info_dict': {
  93             'id': '1f3259fa457292b4',
  94             'ext': 'mp4',
  95             'title': 'Martin Luther King, Jr.',
  96             'display_id': 'martinlutherkingjr',
  97             'description': 'md5:f403dbb2bf3ccc7cf4c59d9e43e3c349',
  98         },
  99     }, {
 100         'url': 'https://www.brainpop.com/science/space/bigbang/',
 101         'md5': '9a1ff0e77444dd9e437354eb669c87ec',
 102         'info_dict': {
 103             'id': 'acae52cd48c99acf',
 104             'ext': 'mp4',
 105             'title': 'Big Bang',
 106             'display_id': 'bigbang',
 107             'description': 'md5:3e53b766b0f116f631b13f4cae185d38',
 108         },
 109         'skip': 'Requires login',
 110     }]
 111
 112     def _real_extract(self, url):
 113         slug, display_id = self._match_valid_url(url).group('slug', 'id')
 114         movie_data = self._download_json(
 115             f'https://api.brainpop.com/api/content/published/bp/en/{slug}/movie?full=1', display_id,
 116             'Downloading movie data JSON', 'Unable to download movie data')['data']
 117         topic_data = traverse_obj(self._download_json(
 118             f'https://api.brainpop.com/api/content/published/bp/en/{slug}?full=1', display_id,
 119             'Downloading topic data JSON', 'Unable to download topic data', fatal=False),
 120             ('data', 'topic'), expected_type=dict) or movie_data['topic']
 121
 122         if not traverse_obj(movie_data, ('access', 'allow')):
 123             reason = traverse_obj(movie_data, ('access', 'reason'))
 124             if 'logged' in reason:
 125                 self.raise_login_required(reason, metadata_available=True)
 126             else:
 127                 self.raise_no_formats(reason, video_id=display_id)
 128         movie_feature = movie_data['feature']
 129         movie_feature_data = movie_feature['data']
 130
 131         formats, subtitles = [], {}
 132         formats.extend(self._extract_adaptive_formats(movie_feature_data, movie_feature_data.get('token', ''), display_id, '%s_v2', {
 133             'language': movie_feature.get('language') or 'en',
 134             'language_preference': 10
 135         }))
 136         for lang, localized_feature in traverse_obj(movie_feature, 'localization', default={}, expected_type=dict).items():
 137             formats.extend(self._extract_adaptive_formats(localized_feature, localized_feature.get('token', ''), display_id, '%s_v2', {
 138                 'language': lang,
 139                 'language_preference': -10
 140             }))
 141
 142         # TODO: Do localization fields also have subtitles?
 143         for name, url in movie_feature_data.items():
 144             lang = self._search_regex(
 145                 r'^subtitles_(?P<lang>\w+)$', name, 'subtitle metadata', default=None)
 146             if lang and url:
 147                 subtitles.setdefault(lang, []).append({
 148                     'url': urljoin(self._CDN_URL, url)
 149                 })
 150
 151         return {
 152             'id': topic_data['topic_id'],
 153             'display_id': display_id,
 154             'title': topic_data.get('name'),
 155             'description': topic_data.get('synopsis'),
 156             'formats': formats,
 157             'subtitles': subtitles,
 158         }
 159
 160
 161 class BrainPOPLegacyBaseIE(BrainPOPBaseIE):
 162     def _parse_js_topic_data(self, topic_data, display_id, token):
 163         movie_data = topic_data['movies']
 164         # TODO: Are there non-burned subtitles?
 165         formats = self._extract_adaptive_formats(movie_data, token, display_id)
 166
 167         return {
 168             'id': topic_data['EntryID'],
 169             'display_id': display_id,
 170             'title': topic_data.get('name'),
 171             'alt_title': topic_data.get('title'),
 172             'description': topic_data.get('synopsis'),
 173             'formats': formats,
 174         }
 175
 176     def _real_extract(self, url):
 177         slug, display_id = self._match_valid_url(url).group('slug', 'id')
 178         webpage = self._download_webpage(url, display_id)
 179         topic_data = self._search_json(
 180             r'var\s+content\s*=\s*', webpage, 'content data',
 181             display_id, end_pattern=';')['category']['unit']['topic']
 182         token = self._search_regex(r'ec_token\s*:\s*[\'"]([^\'"]+)', webpage, 'video token')
 183         return self._parse_js_topic_data(topic_data, display_id, token)
 184
 185
 186 class BrainPOPJrIE(BrainPOPLegacyBaseIE):
 187     _ORIGIN = 'https://jr.brainpop.com'
 188     _VIDEO_URL = 'https://svideos-jr.brainpop.com'
 189     _HLS_URL = 'https://hls-jr.brainpop.com'
 190     _CDN_URL = 'https://cdn-jr.brainpop.com'
 191     _TESTS = [{
 192         'url': 'https://jr.brainpop.com/health/feelingsandsel/emotions/',
 193         'md5': '04e0561bb21770f305a0ce6cf0d869ab',
 194         'info_dict': {
 195             'id': '347',
 196             'ext': 'mp4',
 197             'title': 'Emotions',
 198             'display_id': 'emotions',
 199         },
 200     }, {
 201         'url': 'https://jr.brainpop.com/science/habitats/arctichabitats/',
 202         'md5': 'b0ed063bbd1910df00220ee29340f5d6',
 203         'info_dict': {
 204             'id': '29',
 205             'ext': 'mp4',
 206             'title': 'Arctic Habitats',
 207             'display_id': 'arctichabitats',
 208         },
 209         'skip': 'Requires login',
 210     }]
 211
 212
 213 class BrainPOPELLIE(BrainPOPLegacyBaseIE):
 214     _ORIGIN = 'https://ell.brainpop.com'
 215     _VIDEO_URL = 'https://svideos-esl.brainpop.com'
 216     _HLS_URL = 'https://hls-esl.brainpop.com'
 217     _CDN_URL = 'https://cdn-esl.brainpop.com'
 218     _TESTS = [{
 219         'url': 'https://ell.brainpop.com/level1/unit1/lesson1/',
 220         'md5': 'a2012700cfb774acb7ad2e8834eed0d0',
 221         'info_dict': {
 222             'id': '1',
 223             'ext': 'mp4',
 224             'title': 'Lesson 1',
 225             'display_id': 'lesson1',
 226             'alt_title': 'Personal Pronouns',
 227         },
 228     }, {
 229         'url': 'https://ell.brainpop.com/level3/unit6/lesson5/',
 230         'md5': 'be19c8292c87b24aacfb5fda2f3f8363',
 231         'info_dict': {
 232             'id': '101',
 233             'ext': 'mp4',
 234             'title': 'Lesson 5',
 235             'display_id': 'lesson5',
 236             'alt_title': 'Review: Unit 6',
 237         },
 238         'skip': 'Requires login',
 239     }]
 240
 241
 242 class BrainPOPEspIE(BrainPOPLegacyBaseIE):
 243     IE_DESC = 'BrainPOP Español'
 244     _ORIGIN = 'https://esp.brainpop.com'
 245     _VIDEO_URL = 'https://svideos.brainpop.com'
 246     _HLS_URL = 'https://hls.brainpop.com'
 247     _CDN_URL = 'https://cdn.brainpop.com/mx'
 248     _TESTS = [{
 249         'url': 'https://esp.brainpop.com/ciencia/la_diversidad_de_la_vida/ecosistemas/',
 250         'md5': 'cb3f062db2b3c5240ddfcfde7108f8c9',
 251         'info_dict': {
 252             'id': '3893',
 253             'ext': 'mp4',
 254             'title': 'Ecosistemas',
 255             'display_id': 'ecosistemas',
 256             'description': 'md5:80fc55b07e241f8c8f2aa8d74deaf3c3',
 257         },
 258     }, {
 259         'url': 'https://esp.brainpop.com/espanol/la_escritura/emily_dickinson/',
 260         'md5': '98c1b9559e0e33777209c425cda7dac4',
 261         'info_dict': {
 262             'id': '7146',
 263             'ext': 'mp4',
 264             'title': 'Emily Dickinson',
 265             'display_id': 'emily_dickinson',
 266             'description': 'md5:2795ad87b1d239c9711c1e92ab5a978b',
 267         },
 268         'skip': 'Requires login',
 269     }]
 270
 271
 272 class BrainPOPFrIE(BrainPOPLegacyBaseIE):
 273     IE_DESC = 'BrainPOP Français'
 274     _ORIGIN = 'https://fr.brainpop.com'
 275     _VIDEO_URL = 'https://svideos.brainpop.com'
 276     _HLS_URL = 'https://hls.brainpop.com'
 277     _CDN_URL = 'https://cdn.brainpop.com/fr'
 278     _TESTS = [{
 279         'url': 'https://fr.brainpop.com/sciencesdelaterre/energie/sourcesdenergie/',
 280         'md5': '97e7f48af8af93f8a2be11709f239371',
 281         'info_dict': {
 282             'id': '1651',
 283             'ext': 'mp4',
 284             'title': 'Sources d\'énergie',
 285             'display_id': 'sourcesdenergie',
 286             'description': 'md5:7eece350f019a21ef9f64d4088b2d857',
 287         },
 288     }, {
 289         'url': 'https://fr.brainpop.com/francais/ecrire/plagiat/',
 290         'md5': '0cf2b4f89804d0dd4a360a51310d445a',
 291         'info_dict': {
 292             'id': '5803',
 293             'ext': 'mp4',
 294             'title': 'Plagiat',
 295             'display_id': 'plagiat',
 296             'description': 'md5:4496d87127ace28e8b1eda116e77cd2b',
 297         },
 298         'skip': 'Requires login',
 299     }]
 300
 301
 302 class BrainPOPIlIE(BrainPOPLegacyBaseIE):
 303     IE_DESC = 'BrainPOP Hebrew'
 304     _ORIGIN = 'https://il.brainpop.com'
 305     _VIDEO_URL = 'https://svideos.brainpop.com'
 306     _HLS_URL = 'https://hls.brainpop.com'
 307     _CDN_URL = 'https://cdn.brainpop.com/he'
 308     _TESTS = [{
 309         'url': 'https://il.brainpop.com/category_9/subcategory_150/subjects_3782/',
 310         'md5': '9e4ea9dc60ecd385a6e5ca12ccf31641',
 311         'info_dict': {
 312             'id': '3782',
 313             'ext': 'mp4',
 314             'title': 'md5:e993632fcda0545d9205602ec314ad67',
 315             'display_id': 'subjects_3782',
 316             'description': 'md5:4cc084a8012beb01f037724423a4d4ed',
 317         },
 318     }]