=head1 NAME
LWP::Protocol::cachedhttp - Regular http prococol with controlable caching.
=cut
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
package LWP::Protocol::cachedhttp;
our $VERSION = '0.1';
=head1 SYNOPSIS
use LWP::UserAgent;
use LWP::Protocol::cachedhttp;
LWP::Protocol::cachedhttp::init(BasePath => '/tmp/cache');
my $ua = LWP::UserAgent->new();
my $resp = $ua->get( 'cachedhttp://www.google.com' );
my $request = HTTP::Request->new( GET => 'cachedhttp://www.google.com' );
$resp = $ua->get( $request );
=head1 DESCRIPTION
A variant of the http-protocol that enables the caller to specify the caching behaviour.
Caching only happens on GET requests.
This is NOT a HTTP-compliant cache. It is intended to allow the calling program to override the
requested cache behaviour from the server. Hence, all Cache-Control (and family) headers are mostly
ignored.
=over
=cut
use strict;
use Carp;
use vars qw(@ISA);
require LWP::Protocol;
@ISA = qw(LWP::Protocol);
require LWP::Protocol::http;
use Digest::MD5 qw/md5_hex/;
use File::Temp qw/tempfile/;
sub isotime {
my( $time ) = @_;
$time = time unless defined $time;
my( $sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst ) = gmtime $time;
$year += 1900;
return sprintf "%04d%02d%02dT%02d%02d%02d", $year, $mon, $mday, $hour, $min, $sec;
}
=item LWP::Protocol::cachedhttp::init( Option => value, ... )
Initializes the cache. Options are:
=over
=item BaseDir
Specifies the directory where the old cache can be found and/or
where newly cached objects will be stored. Set this to C to get normal http behaviour.
=item Verbose
Set this to a true value to print a line for each request giving the URL and the action taken.
=item CacheFor
How long to cache items for in seconds. Setting this to 0 effectively disables caching.
Default is 86400 seconds (1 day).
Note that an expired cache-object is only deleted when the C is called.
This is done automatically in the C block.
=back
=cut
my $verbose = 0;
my $cachedir = undef;
my $default_cachefor = 86400;
sub init {
my %opt = @_;
while( my($k,$v) = each %opt ) {
if( $k eq "BasePath" ) {
$cachedir = $v;
} elsif( $k eq "Verbose" ) {
$verbose = $v;
} elsif( $k eq "CacheFor" ) {
$default_cachefor = $v;
} else {
carp "Unknown option '$k' with value '$v'";
}
}
}
=back
=head1 REQUESTS
Requests are issued as normally, but they use the special cachedhttp:// protocol.
The cacheing is controlled by a set of headers in the request. They all start with
C. These headers are removed before the request is sent to the server.
These control headers are:
=over
=item X-CachedHttp-CacheFor
Indicates how long the response should be cached in seconds. A value of 0 effectively prevents a request
to be cached. The default is set using the C call, but this allows it to be overridden for every request.
This only has effect when the server is contacted.
=item X-CachedHttp-AllowCache
Default is true. A false value prevents the cache to be used for this request. This forces the behaviour of a cache mis.
Note that the server response is still cached as normal.
=item X-CachedHttp-ExpiresAge
When a request has a cached response, this overrides the cached Expires header: the cached entry is assumed to
expire the specified number of seconds after it was fetched. When a cached response is Expired
(original Expires-header or this override) the server is querried to verify the validity of the cached response.
This is especially usefull to force a cache of pages wich do not return an Expires-header. Also make sure that
you cache the page for long enough: CacheFor defaults to 1 day, setting ExpiresAge longer than 1 day in this
case is useless, since the responses will be removed from cache after 1 day.
=back
The returned response has some extra headers added to identify the cache behaviour:
=over
=item X-CachedHttp-Fetched
The ISO timestamp of when this response was last fetched from the server. Note that the actual content may be
older if the server re-validated the cached entry.
=item X-CachedHttp-Expires
The ISO timestamp indicating at what moment this entry is no longer valid. This does not mean that the entry will
be deleted from cache (see CacheUntil). Expired entries are reused when the server re-validates them or you force
them to expire later by using the ExpiresAge header in the request.
=item X-CachedHttp-CacheUntil
The ISO timestamp indicating at what moment this entry is removed from the cache. Note that the actual deletion
of the file will probably happen later.
=item X-Source
The source of the response. Will be 'server', 'cache' or 'validated-cache' (the content is from cache, but it was
re-validated by the server).
=item X-Reason
The reason why the source was chosen. Possibilities are: 'not supported', 'cache not allowed', 'cache mis',
'cache valid', 'cache assume' (when ExpireAge override kicks in), 'cache expired'.
The 'not supported' reason is given for a variety of problems including POST requests, Range-headers, ...
=back
=cut
use Data::Dumper;
sub request {
my($self, $request, $proxy, $arg, $size, $timeout) = @_;
my $uri = $request->uri;
$uri =~ s/^cachedhttp:/http:/;
$request->uri($uri);
my $cachefor = $default_cachefor;
$cachefor = $request->header('X-CachedHttp-CacheFor') if defined $request->header('X-CachedHttp-CacheFor');
$request->header('X-CachedHttp-CacheFor' => undef);
my $allowcache = 1;
$allowcache = $request->header('X-CachedHttp-AllowCache') if defined $request->header('X-CachedHttp-AllowCache');
$request->header('X-CachedHttp-AllowCache' => undef);
my $expiresage = $request->header('X-CachedHttp-ExpiresAge');
$request->header('X-CachedHttp-ExpiresAge' => undef);
$cachefor = int($cachefor);
$allowcache = int($allowcache);
$expiresage = int($expiresage) if defined $expiresage;
my $response = undef;
my $need_verify = 0;
my %extra_header;
if( $request->method ne "GET" ) {
$extra_header{'X-CachedHttp-Reason'} = 'not supported';
} elsif( defined $request->header("Range") ) {
$extra_header{'X-CachedHttp-Reason'} = 'not supported';
} elsif( $allowcache ) {
$extra_header{'X-CachedHttp-Reason'} = 'cache mis';
$response = _get_from_cache($request);
if( defined $response ) {
# cache hit, let's see if its still usable
$extra_header{'X-CachedHttp-Source'} = 'cache';
if( $response->header('X-CachedHttp-CacheUntil') lt isotime() ) {
# Cached entry is no longer cached, but we only just found this out ;-)
$response = undef;
} elsif( $response->header('X-CachedHttp-Expires') ge isotime() ) {
# Cached entry is still valid, use it
$extra_header{'X-CachedHttp-Reason'} = 'cache valid';
$need_verify = 0;
} elsif( defined $expiresage and $response->header('X-CachedHttp-Fetched') ge isotime(time - $expiresage) ) {
# fetched less than $expiresage ago
$extra_header{'X-CachedHttp-Reason'} = 'cache assume';
$need_verify = 0;
} else {
$extra_header{'X-CachedHttp-Reason'} = 'cache expired';
$need_verify = 1;
}
}
} else {
$extra_header{'X-CachedHttp-Reason'} = 'cache not allowed';
}
if( $need_verify ) {
carp "request() : verify not yet implemented";
# TODO
$response = undef;
}
unless( defined $response ) {
my $http = LWP::Protocol::http->new();
$response = $http->request($request, $proxy, $arg, $size, $timeout);
my $expires = time;
$expires = HTTP::Date::str2time($response->header('Expires')) if defined $response->header('Expires');
if( defined $response->header('Cache-Control') ) {
$response->header('Cache-Control') =~ m/max-age=(\d+)/;
$expires = time + $1;
}
$extra_header{'X-CachedHttp-Fetched'} = isotime();
$extra_header{'X-CachedHttp-Source'} = 'server';
$extra_header{'X-CachedHttp-CacheUntil'} = isotime(time + $cachefor);
$extra_header{'X-CachedHttp-Expires'} = isotime($expires);
}
while( my($h,$v) = each %extra_header ) {
$response->header($h, $v);
}
_write_to_cache($response) if defined $cachedir && $cachefor;
print STDERR $request->uri, ": ",
$extra_header{'X-CachedHttp-Reason'}, " - ",
$extra_header{'X-CachedHttp-Source'}, "\n" if $verbose;
return $response;
}
sub remove_old_entries {
for my $fn (glob "$cachedir/*") {
open my $fh, "<", $fn;
my $cacheuntil = "";
$cacheuntil = <$fh> until( !defined $cacheuntil || $cacheuntil =~ m/^X-CachedHttp-CacheUntil: (\d{8}T\d{6})$/ );
close $fh;
unlink $fn if $1 lt isotime();
}
}
sub _write_to_cache {
my ($response) = @_;
# work in a temporary file
my ($fh, $tfn) = tempfile("part.XXXXXXXXXX", DIR => $cachedir);
# BUG: if we crash here, the tempfile is left dangling
# solution: add "UNLINK => 1" to the call and somehow call link (later on) with the inode number instead of the file
print $fh "Url: ", $response->request->uri, "\n";
my @cache_headers = qw/Content-Type Content-Encoding
Content-Length Last-Modified
X-CachedHttp-Fetched X-CachedHttp-CacheUntil/;
for my $header (@cache_headers) {
print $fh "$header: ", $response->header($header), "\n" if defined $response->header($header);
}
print $fh "\n";
print $fh $response->content;
my $fn = "$cachedir/" . _request_hash($response->request);
# now atomically move the file to its intended name
# BUG: possible race condition if someone has renamed our temporary file
unless( link($tfn, $fn) ) {
carp "Hash collision for $fn";
my $suffix = 0;
while( !link($tfn, "$fn.$suffix") ) {
$suffix++;
}
}
unlink $tfn;
}
sub _get_from_cache {
my ($request) = @_;
my @fn = glob "$cachedir/" . _request_hash($request) . "*";
my $fh;
for my $fn (@fn) {
open $fh, "<", "$fn";
my $url = <$fh>; chomp $url;
croak "Invalid entry in cache directory: $fn" unless $url =~ m/^Url: ([^ ]*)$/;
if( $1 eq $request->uri ) {
unlink $fn;
last;
}
close $fh;
$fh = undef;
}
return undef unless defined $fh;
my $response = HTTP::Response->new(200);
$response->message("OK");
$response->request($request);
while( my $line = <$fh> ) {
chomp $line;
last if $line eq "";
$line =~ m/^([-_a-zA-Z0-9]*): (.*)$/;
$response->header($1 => $2);
}
local $/; # slurp mode, make <$fh> read all remaining lines at once
$response->content(<$fh>);
return $response;
}
sub _request_hash {
my ($request) = @_;
return md5_hex( $request->uri() );
}
END {
remove_old_entries();
}
1;