[yt-dlp.git] / yt_dlp / extractor / franceinter.py

from .common import InfoExtractor
from ..utils import month_by_name


class FranceInterIE(InfoExtractor):
    """Extractor for programme pages under ``franceinter.fr/emissions/``.

    The page HTML is downloaded once; the media URL, the Open Graph metadata
    and the broadcast date are all scraped from it.
    """

    _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'

    _TEST = {
        'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
        'md5': '9e54d7bdb6fdc02a841007f8a975c094',
        'info_dict': {
            'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
            'ext': 'mp3',
            'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
            'description': 'md5:401969c5d318c061f86bda1fa359292b',
            'thumbnail': r're:^https?://.*\.jpg',
            'upload_date': '20160907',
        },
    }

    @staticmethod
    def _parse_french_date(date_str):
        """Turn ``'<day> <French month name> <year>'`` into ``YYYYMMDD``.

        ``date_str`` is the text captured by the upload-date regex in
        ``_real_extract`` (or None when that regex did not match).

        Returns None when ``date_str`` is empty/None, does not split into
        exactly three whitespace-separated parts, or names a month for which
        ``month_by_name`` yields a falsy result.
        """
        if not date_str:
            return None

        parts = date_str.split()
        if len(parts) != 3:
            return None
        day, month_name, year = parts

        month = month_by_name(month_name, lang='fr')
        if not month:
            # An unrecognised month used to be formatted as "00", producing an
            # impossible date such as "20160007". Omitting the date is safer
            # than reporting one that cannot be parsed.
            return None

        # The year is kept verbatim (the regex captures exactly four digits);
        # month and day are zero-padded to two digits each.
        return '%s%02d%02d' % (year, month, int(day))

    def _real_extract(self, url):
        """Scrape the page at ``url`` and return its info dict.

        The returned dict carries ``id``, ``title``, ``description``,
        ``thumbnail``, ``upload_date`` and a single-entry ``formats`` list.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # The media URL sits in the ``data-url`` attribute of the first
        # <button> following the "page-diffusion" <div>. No ``fatal=False`` is
        # passed here, unlike the upload-date lookup below.
        video_url = self._search_regex(
            r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
            webpage, 'video url', group='url')

        title = self._og_search_title(webpage)
        description = self._og_search_description(webpage)
        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)

        # The broadcast date is optional metadata, so the lookup is non-fatal
        # and a miss simply leaves ``upload_date`` as None.
        upload_date = self._parse_french_date(self._search_regex(
            r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
            webpage, 'upload date', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': upload_date,
            'formats': [{
                'url': video_url,
                # No video codec is declared for this format; the test above
                # expects an mp3 file.
                'vcodec': 'none',
            }],
        }
Commit	Line	Data
bf6705f5	1	from .common import InfoExtractor
0002962f	2	from ..utils import month_by_name
677b3ce8 PH	3
677b3ce8 PH	4
bf6705f5	5	class FranceInterIE(InfoExtractor):
a942d6cb	6	_VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P<id>[^?#]+)'
a942d6cb	7
677b3ce8	8	_TEST = {
c51a7f0b S	9	'url': 'https://www.franceinter.fr/emissions/affaires-sensibles/affaires-sensibles-07-septembre-2016',
c51a7f0b S	10	'md5': '9e54d7bdb6fdc02a841007f8a975c094',
611c1dd9	11	'info_dict': {
c51a7f0b	12	'id': 'affaires-sensibles/affaires-sensibles-07-septembre-2016',
532f5bff	13	'ext': 'mp3',
c51a7f0b S	14	'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
c51a7f0b S	15	'description': 'md5:401969c5d318c061f86bda1fa359292b',
70c5802b	16	'thumbnail': r're:^https?://.*\.jpg',
c51a7f0b	17	'upload_date': '20160907',
677b3ce8 PH	18	},
677b3ce8 PH	19	}
bf6705f5	20
677b3ce8	21	def _real_extract(self, url):
2db58069	22	video_id = self._match_id(url)
26844eb5	23
677b3ce8	24	webpage = self._download_webpage(url, video_id)
532f5bff	25
a942d6cb	26	video_url = self._search_regex(
0002962f S	27	r'(?s)<div[^>]+class=["\']page-diffusion["\'][^>]*>.*?<button[^>]+data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
0002962f S	28	webpage, 'video url', group='url')
a942d6cb	29
	30	title = self._og_search_title(webpage)
	31	description = self._og_search_description(webpage)
70c5802b	32	thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
a942d6cb	33
0002962f	34	upload_date_str = self._search_regex(
b2a027fc	35	r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
0002962f S	36	webpage, 'upload date', fatal=False)
	37	if upload_date_str:
	38	upload_date_list = upload_date_str.split()
	39	upload_date_list.reverse()
52dc8a9b	40	upload_date_list[1] = '%02d' % (month_by_name(upload_date_list[1], lang='fr') or 0)
c51a7f0b	41	upload_date_list[2] = '%02d' % int(upload_date_list[2])
0002962f S	42	upload_date = ''.join(upload_date_list)
	43	else:
	44	upload_date = None
532f5bff	45
677b3ce8 PH	46	return {
677b3ce8 PH	47	'id': video_id,
532f5bff S	48	'title': title,
532f5bff S	49	'description': description,
70c5802b	50	'thumbnail': thumbnail,
0002962f	51	'upload_date': upload_date,
677b3ce8 PH	52	'formats': [{
	53	'url': video_url,
	54	'vcodec': 'none',
	55	}],
677b3ce8	56	}