]> jfr.im git - yt-dlp.git/blame - yt_dlp/downloader/mhtml.py
[cleanup] Minor fixes (See desc)
[yt-dlp.git] / yt_dlp / downloader / mhtml.py
CommitLineData
cdb19aa4 1import io
2import quopri
3import re
4import uuid
5
6from .fragment import FragmentFD
f8271158 7from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
cdb19aa4 8from ..version import __version__ as YT_DLP_VERSION
9
10
class MhtmlFD(FragmentFD):
    """Fragment downloader that packs image fragments into one MHTML file.

    The output is a ``multipart/related`` MIME document: the first part is a
    generated HTML page with one ``<figure>`` per fragment, and every following
    part is one downloaded image, referenced from the page through its
    ``Content-ID``.

    NOTE(review): the ``_prepare_and_start_frag_download``,
    ``_download_fragment``, ``_read_fragment``, ``_append_fragment`` and
    ``_finish_frag_download`` helpers, as well as ``self.params``, are not
    defined in this class; presumably they come from ``FragmentFD``.
    """

    FD_NAME = 'mhtml'

    _STYLESHEET = """\
html, body {
    margin: 0;
    padding: 0;
    height: 100vh;
}

html {
    overflow-y: scroll;
    scroll-snap-type: y mandatory;
}

body {
    scroll-snap-type: y mandatory;
    display: flex;
    flex-flow: column;
}

body > figure {
    max-width: 100vw;
    max-height: 100vh;
    scroll-snap-align: center;
}

body > figure > figcaption {
    text-align: center;
    height: 2.5em;
}

body > figure > img {
    display: block;
    margin: auto;
    max-width: 100%;
    max-height: calc(100vh - 5em);
}
"""
    # Minify the stylesheet: first collapse every whitespace run to a single
    # space, then drop the spaces that do not separate two identifier-like
    # tokens (word characters or '-').
    _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
    _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)

    @staticmethod
    def _escape_mime(s):
        """Return *s* wrapped as a ``=?utf-8?Q?...?=`` encoded word.

        The UTF-8 bytes of *s* are quoted-printable encoded in header mode;
        any byte below 0x20 still present in that output is additionally
        written as ``=XX`` so the result is plain printable ASCII.
        """
        return '=?utf-8?Q?' + (b''.join(
            bytes((b,)) if b >= 0x20 else b'=%02X' % b
            for b in quopri.encodestring(s.encode(), header=True)
        )).decode('us-ascii') + '?='

    def _gen_cid(self, i, fragment, frag_boundary):
        """Return the Content-ID for fragment number *i* (zero-based).

        The ID is built from the index and the MIME boundary only; *fragment*
        is accepted but not used.
        """
        return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)

    def _gen_stub(self, *, fragments, frag_boundary, title):
        """Build the HTML page that displays all fragments as slides.

        Each fragment becomes a ``<figure>`` holding a caption and an
        ``<img>`` whose ``src`` is the ``cid:`` URL of the matching MIME part.
        When a fragment has a usable ``duration``, the caption also shows the
        running start/end timecodes.

        Returns the page as a ``str``.
        """
        output = io.StringIO()

        output.write((
            '<!DOCTYPE html>'
            '<html>'
            '<head>'
            ''  '<meta name="generator" content="yt-dlp {version}">'
            ''  '<title>{title}</title>'
            ''  '<style>{styles}</style>'
            '<body>'
        ).format(
            version=escapeHTML(YT_DLP_VERSION),
            styles=self._STYLESHEET,
            title=escapeHTML(title)
        ))

        t0 = 0
        for i, frag in enumerate(fragments):
            output.write('<figure>')
            try:
                t1 = t0 + frag['duration']
                output.write((
                    '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
                ).format(
                    num=i + 1,
                    t0=srt_subtitles_timecode(t0),
                    t1=srt_subtitles_timecode(t1),
                    duration=formatSeconds(frag['duration'], msec=True)
                ))
            except (KeyError, ValueError, TypeError):
                # No usable duration. Setting t1 to None makes the addition
                # above raise TypeError for every later fragment too, so all
                # following slides get the plain caption as well.
                t1 = None
                output.write((
                    '<figcaption>Slide #{num}</figcaption>'
                ).format(num=i + 1))
            output.write('<img src="cid:{cid}">'.format(
                cid=self._gen_cid(i, frag, frag_boundary)))
            output.write('</figure>')
            t0 = t1

        return output.getvalue()

    def real_download(self, filename, info_dict):
        """Download all fragments and write them to *filename* as MHTML.

        Reads ``fragments``, ``format_id`` and ``url`` from *info_dict*
        (``title``, ``webpage_url`` and ``fragment_base_url`` are optional).
        With the ``test`` param set, only the first fragment is processed.

        Fragments whose download does not succeed are skipped.
        Always returns ``True`` once the closing boundary has been written.
        """
        fragment_base_url = info_dict.get('fragment_base_url')
        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']
        title = info_dict.get('title', info_dict['format_id'])
        origin = info_dict.get('webpage_url', info_dict['url'])

        ctx = {
            'filename': filename,
            'total_frags': len(fragments),
        }

        # NOTE(review): presumably this populates ctx['dest_stream'] and
        # ctx['fragment_index'] (both read below) and may restore a saved
        # ctx['extra_state'] when resuming - confirm against FragmentFD.
        self._prepare_and_start_frag_download(ctx, info_dict)

        # Keep an already present extra_state so that the same MIME boundary
        # is used for the whole file and the header is written only once.
        extra_state = ctx.setdefault('extra_state', {
            'header_written': False,
            'mime_boundary': str(uuid.uuid4()).replace('-', ''),
        })

        frag_boundary = extra_state['mime_boundary']
        boundary_bytes = frag_boundary.encode('us-ascii')

        if not extra_state['header_written']:
            stub = self._gen_stub(
                fragments=fragments,
                frag_boundary=frag_boundary,
                title=title
            )

            ctx['dest_stream'].write((
                'MIME-Version: 1.0\r\n'
                'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
                'Subject: {title}\r\n'
                'Content-type: multipart/related; '
                ''  'boundary="{boundary}"; '
                ''  'type="text/html"\r\n'
                'X.yt-dlp.Origin: {origin}\r\n'
                '\r\n'
                '--{boundary}\r\n'
                'Content-Type: text/html; charset=utf-8\r\n'
                'Content-Length: {length}\r\n'
                '\r\n'
                '{stub}\r\n'
            ).format(
                origin=origin,
                boundary=frag_boundary,
                # Count encoded bytes, not characters: the stub is written as
                # UTF-8 and may contain non-ASCII text (the en dash in timed
                # captions, the title). The image parts below likewise report
                # their length in bytes.
                length=len(stub.encode()),
                title=self._escape_mime(title),
                stub=stub
            ).encode())
            extra_state['header_written'] = True

        for i, fragment in enumerate(fragments):
            # Skip fragments that were already handled.
            if (i + 1) <= ctx['fragment_index']:
                continue

            fragment_url = fragment.get('url')
            if not fragment_url:
                assert fragment_base_url
                fragment_url = urljoin(fragment_base_url, fragment['path'])

            success = self._download_fragment(ctx, fragment_url, info_dict)
            if not success:
                continue
            frag_content = self._read_fragment(ctx)

            # Sniff the image type from its magic bytes; JPEG is the fallback.
            mime_type = b'image/jpeg'
            if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
                mime_type = b'image/png'
            if frag_content.startswith((b'GIF87a', b'GIF89a')):
                mime_type = b'image/gif'
            if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
                mime_type = b'image/webp'

            frag_header = io.BytesIO()
            frag_header.write(
                b'--%b\r\n' % boundary_bytes)
            frag_header.write(
                b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
            frag_header.write(
                b'Content-type: %b\r\n' % mime_type)
            frag_header.write(
                b'Content-length: %u\r\n' % len(frag_content))
            frag_header.write(
                b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
            # The duration is optional (_gen_stub tolerates its absence too),
            # so leave the header out instead of failing on a fragment that
            # has already been downloaded.
            duration = fragment.get('duration')
            if duration is not None:
                frag_header.write(
                    b'X.yt-dlp.Duration: %f\r\n' % duration)
            frag_header.write(b'\r\n')
            self._append_fragment(
                ctx, frag_header.getvalue() + frag_content + b'\r\n')

        # Closing delimiter of the multipart document.
        ctx['dest_stream'].write(
            b'--%b--\r\n\r\n' % boundary_bytes)
        self._finish_frag_download(ctx, info_dict)
        return True