1 # SPDX-FileCopyrightText: 2015 Eric Larson
3 # SPDX-License-Identifier: Apache-2.0
"""
The httplib2 algorithms ported for use with requests.
"""
import calendar
import logging
import re
import time

from email.utils import parsedate_tz

from pip._vendor.requests.structures import CaseInsensitiveDict

from .cache import DictCache, SeparateBodyBaseCache
from .serialize import Serializer
20 logger
= logging
.getLogger(__name__
)
22 URI
= re
.compile(r
"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
24 PERMANENT_REDIRECT_STATUSES
= (301, 308)
28 """Parses a URI using the regex given in Appendix B of RFC 3986.
30 (scheme, authority, path, query, fragment) = parse_uri(uri)
32 groups
= URI
.match(uri
).groups()
33 return (groups
[1], groups
[3], groups
[4], groups
[6], groups
[8])
class CacheController(object):
    """An interface to see if request should cached or not."""

    def __init__(
        self, cache=None, cache_etags=True, serializer=None, status_codes=None
    ):
        # Default to a simple in-memory dict cache when none is supplied.
        self.cache = DictCache() if cache is None else cache
        self.cache_etags = cache_etags
        self.serializer = serializer or Serializer()
        # Status codes eligible for caching; may be overridden per call.
        self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)

    @classmethod
    def _urlnorm(cls, uri):
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)

        scheme = scheme.lower()
        authority = authority.lower()

        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    @classmethod
    def cache_url(cls, uri):
        """Return the normalized cache key for the given URI."""
        return cls._urlnorm(uri)

    def parse_cache_control(self, headers):
        """Parse the Cache-Control header into a {directive: value} dict.

        Unknown directives are ignored; valueless directives map to None.
        """
        known_directives = {
            # https://tools.ietf.org/html/rfc7234#section-5.2
            "max-age": (int, True),
            "max-stale": (int, False),
            "min-fresh": (int, True),
            "no-cache": (None, False),
            "no-store": (None, False),
            "no-transform": (None, False),
            "only-if-cached": (None, False),
            "must-revalidate": (None, False),
            "public": (None, False),
            "private": (None, False),
            "proxy-revalidate": (None, False),
            "s-maxage": (int, True),
        }

        # Accept either header-name casing; plain dicts may be passed in.
        cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))

        retval = {}

        for cc_directive in cc_headers.split(","):
            if not cc_directive.strip():
                continue

            parts = cc_directive.split("=", 1)
            directive = parts[0].strip()

            try:
                typ, required = known_directives[directive]
            except KeyError:
                logger.debug("Ignoring unknown cache-control directive: %s", directive)
                continue

            if not typ or not required:
                retval[directive] = None
            if typ:
                try:
                    retval[directive] = typ(parts[1].strip())
                except IndexError:
                    # Directive was given without a value.
                    if required:
                        logger.debug(
                            "Missing value for cache-control " "directive: %s",
                            directive,
                        )
                except ValueError:
                    logger.debug(
                        "Invalid value for cache-control directive " "%s, must be %s",
                        directive,
                        typ.__name__,
                    )

        return retval

    def cached_request(self, request):
        """
        Return a cached response if it exists in the cache, otherwise
        return False.
        """
        cache_url = self.cache_url(request.url)
        logger.debug('Looking up "%s" in the cache', cache_url)
        cc = self.parse_cache_control(request.headers)

        # Bail out if the request insists on fresh data
        if "no-cache" in cc:
            logger.debug('Request header has "no-cache", cache bypassed')
            return False

        if "max-age" in cc and cc["max-age"] == 0:
            logger.debug('Request header has "max_age" as 0, cache bypassed')
            return False

        # Request allows serving from the cache, let's see if we find something
        cache_data = self.cache.get(cache_url)
        if cache_data is None:
            logger.debug("No cache entry available")
            return False

        if isinstance(self.cache, SeparateBodyBaseCache):
            body_file = self.cache.get_body(cache_url)
        else:
            body_file = None

        # Check whether it can be deserialized
        resp = self.serializer.loads(request, cache_data, body_file)
        if not resp:
            logger.warning("Cache entry deserialization failed, entry ignored")
            return False

        # If we have a cached permanent redirect, return it immediately. We
        # don't need to test our response for other headers b/c it is
        # intrinsically "cacheable" as it is Permanent.
        #
        # See:
        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
        #
        # Client can try to refresh the value by repeating the request
        # with cache busting headers as usual (ie no-cache).
        if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
            msg = (
                "Returning cached permanent redirect response "
                "(ignoring date and etag information)"
            )
            logger.debug(msg)
            return resp

        headers = CaseInsensitiveDict(resp.headers)
        if not headers or "date" not in headers:
            if "etag" not in headers:
                # Without date or etag, the cached response can never be used
                # and should be deleted.
                logger.debug("Purging cached response: no date or etag")
                self.cache.delete(cache_url)
            logger.debug("Ignoring cached response: no date")
            return False

        now = time.time()
        date = calendar.timegm(parsedate_tz(headers["date"]))
        current_age = max(0, now - date)
        logger.debug("Current age based on date: %i", current_age)

        # TODO: There is an assumption that the result will be a
        # urllib3 response object. This may not be best since we
        # could probably avoid instantiating or constructing the
        # response until we know we need it.
        resp_cc = self.parse_cache_control(headers)

        # determine freshness
        freshness_lifetime = 0

        # Check the max-age pragma in the cache control header
        if "max-age" in resp_cc:
            freshness_lifetime = resp_cc["max-age"]
            logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)

        # If there isn't a max-age, check for an expires header
        elif "expires" in headers:
            expires = parsedate_tz(headers["expires"])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)
                logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)

        # Determine if we are setting freshness limit in the
        # request. Note, this overrides what was in the response.
        if "max-age" in cc:
            freshness_lifetime = cc["max-age"]
            logger.debug(
                "Freshness lifetime from request max-age: %i", freshness_lifetime
            )

        if "min-fresh" in cc:
            min_fresh = cc["min-fresh"]
            # adjust our current age by our min fresh
            current_age += min_fresh
            logger.debug("Adjusted current age from min-fresh: %i", current_age)

        # Return entry if it is fresh enough
        if freshness_lifetime > current_age:
            logger.debug('The response is "fresh", returning cached response')
            logger.debug("%i > %i", freshness_lifetime, current_age)
            return resp

        # we're not fresh. If we don't have an Etag, clear it out
        if "etag" not in headers:
            logger.debug('The cached response is "stale" with no etag, purging')
            self.cache.delete(cache_url)

        # return the original handler
        return False

    def conditional_headers(self, request):
        """Return validator headers (If-None-Match / If-Modified-Since)
        derived from any cached response for this request."""
        cache_url = self.cache_url(request.url)
        resp = self.serializer.loads(request, self.cache.get(cache_url))
        new_headers = {}

        if resp:
            headers = CaseInsensitiveDict(resp.headers)

            if "etag" in headers:
                new_headers["If-None-Match"] = headers["ETag"]

            if "last-modified" in headers:
                new_headers["If-Modified-Since"] = headers["Last-Modified"]

        return new_headers

    def _cache_set(self, cache_url, request, response, body=None, expires_time=None):
        """
        Store the data in the cache.
        """
        if isinstance(self.cache, SeparateBodyBaseCache):
            # We pass in the body separately; just put a placeholder empty
            # string in the metadata.
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, b""),
                expires=expires_time,
            )
            self.cache.set_body(cache_url, body)
        else:
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, body),
                expires=expires_time,
            )

    def cache_response(self, request, response, body=None, status_codes=None):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        #                handle byte range requests
        cacheable_status_codes = status_codes or self.cacheable_status_codes
        if response.status not in cacheable_status_codes:
            logger.debug(
                "Status code %s not in %s", response.status, cacheable_status_codes
            )
            return

        response_headers = CaseInsensitiveDict(response.headers)

        if "date" in response_headers:
            date = calendar.timegm(parsedate_tz(response_headers["date"]))
        else:
            date = 0

        # If we've been given a body, our response has a Content-Length, that
        # Content-Length is valid then we can check to see if the body we've
        # been given matches the expected size, and if it doesn't we'll just
        # skip trying to cache it.
        if (
            body is not None
            and "content-length" in response_headers
            and response_headers["content-length"].isdigit()
            and int(response_headers["content-length"]) != len(body)
        ):
            return

        cc_req = self.parse_cache_control(request.headers)
        cc = self.parse_cache_control(response_headers)

        cache_url = self.cache_url(request.url)
        logger.debug('Updating cache with response from "%s"', cache_url)

        # Delete it from the cache if we happen to have it stored there
        no_store = False
        if "no-store" in cc:
            no_store = True
            logger.debug('Response header has "no-store"')
        if "no-store" in cc_req:
            no_store = True
            logger.debug('Request header has "no-store"')
        if no_store and self.cache.get(cache_url):
            logger.debug('Purging existing cache entry to honor "no-store"')
            self.cache.delete(cache_url)
        if no_store:
            return

        # https://tools.ietf.org/html/rfc7234#section-4.1:
        # A Vary header field-value of "*" always fails to match.
        # Storing such a response leads to a deserialization warning
        # during cache lookup and is not allowed to ever be served,
        # so storing it can be avoided.
        if "*" in response_headers.get("vary", ""):
            logger.debug('Response header has "Vary: *"')
            return

        # If we've been given an etag, then keep the response
        if self.cache_etags and "etag" in response_headers:
            expires_time = 0
            if response_headers.get("expires"):
                expires = parsedate_tz(response_headers["expires"])
                if expires is not None:
                    expires_time = calendar.timegm(expires) - date

            # Keep etag-validated entries around for at least two weeks.
            expires_time = max(expires_time, 14 * 86400)

            logger.debug("etag object cached for {0} seconds".format(expires_time))
            logger.debug("Caching due to etag")
            self._cache_set(cache_url, request, response, body, expires_time)

        # Add to the cache any permanent redirects. We do this before looking
        # that the Date headers.
        elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
            logger.debug("Caching permanent redirect")
            self._cache_set(cache_url, request, response, b"")

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif "date" in response_headers:
            date = calendar.timegm(parsedate_tz(response_headers["date"]))
            # cache when there is a max-age > 0
            if "max-age" in cc and cc["max-age"] > 0:
                logger.debug("Caching b/c date exists and max-age > 0")
                expires_time = cc["max-age"]
                self._cache_set(
                    cache_url,
                    request,
                    response,
                    body,
                    expires_time,
                )

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif "expires" in response_headers:
                if response_headers["expires"]:
                    expires = parsedate_tz(response_headers["expires"])
                    if expires is not None:
                        expires_time = calendar.timegm(expires) - date
                    else:
                        expires_time = None

                    logger.debug(
                        "Caching b/c of expires header. expires in {0} seconds".format(
                            expires_time
                        )
                    )
                    self._cache_set(
                        cache_url,
                        request,
                        response,
                        body,
                        expires_time,
                    )

    def update_cached_response(self, request, response):
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        cache_url = self.cache_url(request.url)

        cached_response = self.serializer.loads(request, self.cache.get(cache_url))

        if not cached_response:
            # we didn't have a cached response
            return response

        # Lets update our headers with the headers from the new request:
        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
        #
        # The server isn't supposed to send headers that would make
        # the cached body invalid. But... just in case, we'll be sure
        # to strip out ones we know that might be problmatic due to
        # typical assumptions.
        excluded_headers = ["content-length"]

        cached_response.headers.update(
            dict(
                (k, v)
                for k, v in response.headers.items()
                if k.lower() not in excluded_headers
            )
        )

        # we want a 200 b/c we have content via the cache
        cached_response.status = 200

        # update our cache
        self._cache_set(cache_url, request, cached_response)

        return cached_response