]> jfr.im git - yt-dlp.git/blame - yt_dlp/downloader/mhtml.py
[youtube] Improve video upload date handling (#3029)
[yt-dlp.git] / yt_dlp / downloader / mhtml.py
CommitLineData
cdb19aa4 1# coding: utf-8
2from __future__ import unicode_literals
3
4import io
5import quopri
6import re
7import uuid
8
9from .fragment import FragmentFD
10from ..utils import (
11 escapeHTML,
12 formatSeconds,
13 srt_subtitles_timecode,
14 urljoin,
15)
16from ..version import __version__ as YT_DLP_VERSION
17
18
class MhtmlFD(FragmentFD):
    """Fragment downloader that bundles image fragments into one MHTML file.

    The output is a ``multipart/related`` MIME document: the first part is a
    generated HTML page with one ``<figure>`` per fragment, and each following
    part carries one downloaded image that the page references via a
    ``cid:`` URL.
    """

    FD_NAME = 'mhtml'

    _STYLESHEET = """\
html, body {
    margin: 0;
    padding: 0;
    height: 100vh;
}

html {
    overflow-y: scroll;
    scroll-snap-type: y mandatory;
}

body {
    scroll-snap-type: y mandatory;
    display: flex;
    flex-flow: column;
}

body > figure {
    max-width: 100vw;
    max-height: 100vh;
    scroll-snap-align: center;
}

body > figure > figcaption {
    text-align: center;
    height: 2.5em;
}

body > figure > img {
    display: block;
    margin: auto;
    max-width: 100%;
    max-height: calc(100vh - 5em);
}
"""
    # Minify the stylesheet at class-creation time: first collapse every run
    # of whitespace to a single space, then drop the spaces that do not
    # separate two identifier-like tokens.
    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)

    @staticmethod
    def _escape_mime(s):
        """Return *s* as a UTF-8 "Q"-encoded word (``=?utf-8?Q?...?=``).

        The text is quoted-printable encoded in header mode; any byte below
        0x20 that is still present afterwards is additionally written as
        ``=XX`` so that the result contains no raw control characters.
        """
        return '=?utf-8?Q?' + (b''.join(
            bytes((b,)) if b >= 0x20 else b'=%02X' % b
            for b in quopri.encodestring(s.encode('utf-8'), header=True)
        )).decode('us-ascii') + '?='

    def _gen_cid(self, i, fragment, frag_boundary):
        """Return the Content-ID for fragment number *i* (zero-based).

        The ID is built only from the index and the MIME boundary;
        *fragment* itself is not used.
        """
        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)

    def _gen_stub(self, *, fragments, frag_boundary, title):
        """Build the HTML page that displays every fragment as a slide.

        @param fragments      sequence of fragment dicts; a numeric
                              'duration' entry is used for the caption
                              timestamps when available
        @param frag_boundary  MIME boundary string, used to derive each
                              image's ``cid:`` reference
        @param title          page title (HTML-escaped here)
        @returns              the HTML document as a str
        """
        output = io.StringIO()

        output.write((
            '<!DOCTYPE html>'
            '<html>'
            '<head>'
            ''  '<meta name="generator" content="yt-dlp {version}">'
            ''  '<title>{title}</title>'
            ''  '<style>{styles}</style>'
            '<body>'
        ).format(
            version=escapeHTML(YT_DLP_VERSION),
            styles=self._STYLESHEET,
            title=escapeHTML(title)
        ))

        # Running start time of the current slide. It becomes None as soon as
        # one fragment lacks a usable duration; from then on the addition
        # below raises TypeError and the plain caption is used instead.
        t0 = 0
        for i, frag in enumerate(fragments):
            output.write('<figure>')
            try:
                t1 = t0 + frag['duration']
                output.write((
                    '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
                ).format(
                    num=i + 1,
                    t0=srt_subtitles_timecode(t0),
                    t1=srt_subtitles_timecode(t1),
                    duration=formatSeconds(frag['duration'], msec=True)
                ))
            except (KeyError, ValueError, TypeError):
                t1 = None
                output.write((
                    '<figcaption>Slide #{num}</figcaption>'
                ).format(num=i + 1))
            output.write('<img src="cid:{cid}">'.format(
                cid=self._gen_cid(i, frag, frag_boundary)))
            output.write('</figure>')
            t0 = t1

        return output.getvalue()

    def real_download(self, filename, info_dict):
        """Download all fragments and write them to *filename* as MHTML.

        @param filename   destination file name, stored in the download
                          context
        @param info_dict  must provide 'fragments', 'format_id' and 'url';
                          'fragment_base_url', 'title' and 'webpage_url' are
                          used when present
        @returns          True; fragments whose download fails are skipped
        """
        fragment_base_url = info_dict.get('fragment_base_url')
        # In test mode only the first fragment is processed.
        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']
        title = info_dict.get('title', info_dict['format_id'])
        origin = info_dict.get('webpage_url', info_dict['url'])

        ctx = {
            'filename': filename,
            'total_frags': len(fragments),
        }

        # NOTE(review): this helper is not defined in this class (presumably
        # inherited from FragmentFD) and appears to populate
        # ctx['dest_stream'] and ctx['fragment_index'], which are read below.
        self._prepare_and_start_frag_download(ctx, info_dict)

        # setdefault keeps an 'extra_state' that is already present in ctx,
        # so an existing boundary and header flag are reused, not regenerated.
        extra_state = ctx.setdefault('extra_state', {
            'header_written': False,
            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
        })

        frag_boundary = extra_state['mime_boundary']

        if not extra_state['header_written']:
            stub = self._gen_stub(
                fragments=fragments,
                frag_boundary=frag_boundary,
                title=title
            )

            ctx['dest_stream'].write((
                'MIME-Version: 1.0\r\n'
                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'Subject: {title}\r\n'
                'Content-type: multipart/related; '
                ''  'boundary="{boundary}"; '
                ''  'type="text/html"\r\n'
                'X.yt-dlp.Origin: {origin}\r\n'
                '\r\n'
                '--{boundary}\r\n'
                'Content-Type: text/html; charset=utf-8\r\n'
                'Content-Length: {length}\r\n'
                '\r\n'
                '{stub}\r\n'
            ).format(
                origin=origin,
                boundary=frag_boundary,
                # The stub is written UTF-8 encoded and may contain non-ASCII
                # characters (the title, the en dash in the captions), so the
                # length has to be counted in bytes, not in characters.
                length=len(stub.encode('utf-8')),
                title=self._escape_mime(title),
                stub=stub
            ).encode('utf-8'))
            extra_state['header_written'] = True

        for i, fragment in enumerate(fragments):
            # Skip fragments at or below the index recorded in the context.
            if (i + 1) <= ctx['fragment_index']:
                continue

            fragment_url = urljoin(fragment_base_url, fragment['path'])
            success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
            if not success:
                continue

            # Sniff the image type from its magic bytes; JPEG is the fallback.
            mime_type = b'image/jpeg'
            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
                mime_type = b'image/png'
            if frag_content.startswith((b'GIF87a', b'GIF89a')):
                mime_type = b'image/gif'
            # frag_content is bytes, so the tag must be compared as bytes;
            # comparing against a str never matches.
            if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
                mime_type = b'image/webp'

            frag_header = io.BytesIO()
            frag_header.write(
                b'--%b\r\n' % frag_boundary.encode('us-ascii'))
            frag_header.write(
                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
            frag_header.write(
                b'Content-type: %b\r\n' % mime_type)
            frag_header.write(
                b'Content-length: %u\r\n' % len(frag_content))
            frag_header.write(
                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
            # _gen_stub tolerates fragments without a usable duration, so do
            # the same here and simply omit the header in that case.
            duration = fragment.get('duration')
            if isinstance(duration, (int, float)):
                frag_header.write(
                    b'X.yt-dlp.Duration: %f\r\n' % duration)
            frag_header.write(b'\r\n')
            self._append_fragment(
                ctx, frag_header.getvalue() + frag_content + b'\r\n')

        # Closing delimiter of the multipart document.
        ctx['dest_stream'].write(
            b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
        self._finish_frag_download(ctx, info_dict)
        return True