[yt-dlp.git] / youtube_dl / extractor / mixcloud.py

from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
)
from ..utils import (
    ExtractorError,
    HEADRequest,
    str_to_int,
    parse_iso8601,
)


class MixcloudIE(InfoExtractor):
    """Information extractor for single cloudcast pages on mixcloud.com.

    The page only exposes a preview URL.  The full track URL is derived from
    it by rewriting the path and then probing numbered ``stream<N>`` hosts
    with HEAD requests until one of them answers.
    """

    # Group 1 is the uploader slug, group 2 the cloudcast slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
    IE_NAME = 'mixcloud'

    _TESTS = [{
        'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
        'info_dict': {
            'id': 'dholbach-cryptkeeper',
            'ext': 'mp3',
            'title': 'Cryptkeeper',
            'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
            'uploader': 'Daniel Holbach',
            'uploader_id': 'dholbach',
            'upload_date': '20111115',
            'timestamp': 1321359578,
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
            'like_count': int,
        },
    }, {
        'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
        'info_dict': {
            'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
            'ext': 'm4a',
            'title': 'Electric Relaxation vol. 3',
            'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
            'uploader': 'Daniel Drumz',
            'uploader_id': 'gillespeterson',
            'thumbnail': r're:https?://.*\.jpg',
            'view_count': int,
            'like_count': int,
        },
    }]

    def _get_url(self, track_id, template_url):
        """Return the first candidate URL that answers a HEAD request.

        ``template_url`` must contain exactly one ``%d`` placeholder, which
        is filled with the server numbers 0..29 in turn.  ``track_id`` is
        passed on to ``_request_webpage`` together with a progress note.

        Returns the working URL, or None if every candidate raised
        ExtractorError.
        """
        server_count = 30
        for i in range(server_count):
            url = template_url % i
            try:
                # We only want to know if the request succeeds;
                # don't download the whole file
                self._request_webpage(
                    HEADRequest(url), track_id,
                    'Checking URL %d/%d ...' % (i + 1, server_count))
                return url
            except ExtractorError:
                # This server does not have the file; try the next one.
                pass

        return None

    def _real_extract(self, url):
        """Extract the audio URL and metadata for one cloudcast page.

        Returns an info dict with the keys ``id``, ``title``, ``url``,
        ``description``, ``thumbnail``, ``uploader``, ``uploader_id``,
        ``timestamp``, ``view_count`` and ``like_count``.

        Raises ExtractorError if no working track URL can be derived from
        the preview URL found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        uploader = mobj.group(1)
        cloudcast_name = mobj.group(2)
        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))

        webpage = self._download_webpage(url, track_id)

        preview_url = self._search_regex(
            r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
        song_url = preview_url.replace('/previews/', '/c/originals/')
        # Build a '%d' template for the server number.  Literal percent signs
        # (e.g. percent-encoded path components) are escaped first, and only
        # one placeholder is inserted, so that ``template_url % i`` in
        # _get_url can neither choke on stray directives nor run out of
        # arguments.
        template_url, placeholder_count = re.subn(
            r'(stream\d*)', 'stream%d', song_url.replace('%', '%%'), count=1)
        if not placeholder_count:
            # Without a 'stream' token there is no server number to vary.
            raise ExtractorError('Unable to extract track url')
        final_song_url = self._get_url(track_id, template_url)
        if final_song_url is None:
            # Fall back to the m4a variant of the same track.
            self.to_screen('Trying with m4a extension')
            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
            final_song_url = self._get_url(track_id, template_url)
        if final_song_url is None:
            raise ExtractorError('Unable to extract track url')

        # Matches the opening of the play-button <span> up to (but not
        # including) the attribute of interest, skipping any attributes
        # that precede it.
        PREFIX = (
            r'<span class="play-button[^"]*?"'
            r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
        title = self._html_search_regex(
            PREFIX + r'm-title="([^"]+)"', webpage, 'title')
        thumbnail = self._proto_relative_url(self._html_search_regex(
            PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',
            fatal=False))
        # From here on ``uploader`` is the display name, not the URL slug.
        uploader = self._html_search_regex(
            PREFIX + r'm-owner-name="([^"]+)"',
            webpage, 'uploader', fatal=False)
        uploader_id = self._search_regex(
            r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
        description = self._og_search_description(webpage)
        like_count = str_to_int(self._search_regex(
            [r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
             r'/favorites/?">([0-9]+)<'],
            webpage, 'like count', fatal=False))
        view_count = str_to_int(self._search_regex(
            [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
             r'/listeners/?">([0-9,.]+)</a>'],
            webpage, 'play count', fatal=False))
        timestamp = parse_iso8601(self._search_regex(
            r'<time itemprop="dateCreated" datetime="([^"]+)">',
            webpage, 'upload date', default=None))

        return {
            'id': track_id,
            'title': title,
            'url': final_song_url,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'timestamp': timestamp,
            'view_count': view_count,
            'like_count': like_count,
        }
Commit	Line	Data
d0390a0c PH	1	from __future__ import unicode_literals
d0390a0c PH	2
80cbb6dd	3	import re
80cbb6dd PH	4
80cbb6dd PH	5	from .common import InfoExtractor
1cc79574	6	from ..compat import (
abb82f1d	7	compat_urllib_parse,
1cc79574 PH	8	)
1cc79574 PH	9	from ..utils import (
baa7b197	10	ExtractorError,
dbc1366b	11	HEADRequest,
b80505a4	12	str_to_int,
57c7411f	13	parse_iso8601,
80cbb6dd PH	14	)
	15
	16
	17	class MixcloudIE(InfoExtractor):
8b286571	18	_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
d0390a0c	19	IE_NAME = 'mixcloud'
80cbb6dd	20
58ba6c01	21	_TESTS = [{
d0390a0c	22	'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
d0390a0c	23	'info_dict': {
abb82f1d JMF	24	'id': 'dholbach-cryptkeeper',
abb82f1d JMF	25	'ext': 'mp3',
d0390a0c PH	26	'title': 'Cryptkeeper',
	27	'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
	28	'uploader': 'Daniel Holbach',
	29	'uploader_id': 'dholbach',
	30	'upload_date': '20111115',
57c7411f PH	31	'timestamp': 1321359578,
	32	'thumbnail': 're:https?://.*\.jpg',
	33	'view_count': int,
	34	'like_count': int,
19e1d359	35	},
58ba6c01 S	36	}, {
	37	'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
	38	'info_dict': {
	39	'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
	40	'ext': 'm4a',
	41	'title': 'Electric Relaxation vol. 3',
	42	'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
	43	'uploader': 'Daniel Drumz',
	44	'uploader_id': 'gillespeterson',
	45	'thumbnail': 're:https?://.*\.jpg',
	46	'view_count': int,
	47	'like_count': int,
	48	},
	49	}]
80cbb6dd	50
62a164e7 PH	51	def _get_url(self, track_id, template_url):
	52	server_count = 30
	53	for i in range(server_count):
	54	url = template_url % i
80cbb6dd	55	try:
baa7b197 JMF	56	# We only want to know if the request succeed
baa7b197 JMF	57	# don't download the whole file
62a164e7 PH	58	self._request_webpage(
	59	HEADRequest(url), track_id,
	60	'Checking URL %d/%d ...' % (i + 1, server_count + 1))
80cbb6dd	61	return url
baa7b197	62	except ExtractorError:
62a164e7	63	pass
80cbb6dd PH	64
	65	return None
	66
80cbb6dd PH	67	def _real_extract(self, url):
80cbb6dd PH	68	mobj = re.match(self._VALID_URL, url)
19e1d359 JMF	69	uploader = mobj.group(1)
19e1d359 JMF	70	cloudcast_name = mobj.group(2)
abb82f1d	71	track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
dd2535c3	72
19e1d359	73	webpage = self._download_webpage(url, track_id)
19e1d359	74
dd2535c3	75	preview_url = self._search_regex(
58ba6c01	76	r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
5ffecde7	77	song_url = preview_url.replace('/previews/', '/c/originals/')
19e1d359	78	template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
62a164e7	79	final_song_url = self._get_url(track_id, template_url)
cbfc4702 JMF	80	if final_song_url is None:
	81	self.to_screen('Trying with m4a extension')
	82	template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
62a164e7	83	final_song_url = self._get_url(track_id, template_url)
cbfc4702	84	if final_song_url is None:
57c7411f PH	85	raise ExtractorError('Unable to extract track url')
	86
	87	PREFIX = (
9c1aa1d6	88	r'<span class="play-button[^"]*?"'
57c7411f PH	89	r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
	90	title = self._html_search_regex(
	91	PREFIX + r'm-title="([^"]+)"', webpage, 'title')
	92	thumbnail = self._proto_relative_url(self._html_search_regex(
	93	PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',
	94	fatal=False))
	95	uploader = self._html_search_regex(
	96	PREFIX + r'm-owner-name="([^"]+)"',
	97	webpage, 'uploader', fatal=False)
	98	uploader_id = self._search_regex(
	99	r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
	100	description = self._og_search_description(webpage)
b80505a4 S	101	like_count = str_to_int(self._search_regex(
	102	[r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
	103	r'/favorites/?">([0-9]+)<'],
57c7411f	104	webpage, 'like count', fatal=False))
b80505a4 S	105	view_count = str_to_int(self._search_regex(
	106	[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
	107	r'/listeners/?">([0-9,.]+)</a>'],
57c7411f PH	108	webpage, 'play count', fatal=False))
	109	timestamp = parse_iso8601(self._search_regex(
	110	r'<time itemprop="dateCreated" datetime="([^"]+)">',
b80505a4	111	webpage, 'upload date', default=None))
19e1d359 JMF	112
	113	return {
	114	'id': track_id,
57c7411f	115	'title': title,
19e1d359	116	'url': final_song_url,
57c7411f PH	117	'description': description,
	118	'thumbnail': thumbnail,
	119	'uploader': uploader,
	120	'uploader_id': uploader_id,
	121	'timestamp': timestamp,
	122	'view_count': view_count,
	123	'like_count': like_count,
19e1d359	124	}