#!/usr/bin/perl

# Skynet grabber

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.



use strict;
use warnings;

use Getopt::Long 2.36 qw(GetOptionsFromString);
use LWP::UserAgent;
use HTML::TreeBuilder;
use Date::Manip;
use XMLTV;

use LWP::Protocol::cachedhttp;

sub uniq(@) {
	# filters out duplicates from the array, preserves order
	my %seen;
	return grep !$seen{$_}++, @_;
}

sub make_dates ($$) {
	my ($start, $days) = @_;
	my @date;
	# makes an array of dates (in YYYY-MM-DD) format
	# starting from $start, $days days long
	
	my $date = ParseDate($start);
	do {
		push @date, UnixDate($date, "%Y-%m-%d");
		$date = Date_GetNext($date, undef, 0, "00:00");
	} while( --$days );
	return @date;
}

sub file_put_contents ($\$) {
	my ($filename, $content) = @_;
	# creates a file "$filename" and put $$content in it
	
	open my $fh, ">", $filename or return undef;
	print $fh $$content;
	close $fh;

	return "ok";
}

sub file_get_contents($) {
	my ($filename) = @_;
	# reads the file into a scalar

	open my $fh, "<", $filename or return undef;
	my @lines = <$fh>;
	close $fh;

	return join "\n", @lines;
}

my @download_done;
my $parallel_grab = 0;
sub queue_download ($%) {
	my ($req, %option) = @_;
	# add the given request to the download queue
	# the result will be stored in the @download_done array
	#   $download_done[] = { 'url' => $url, 'status' => 'ok', 'content' => $content };
	#   $download_done[] = { 'url' => $url, 'status' => 'failed' };
	#
	# options:
	#   'extra' => { }    add this to the returned hash

	$option{'extra'} = {} if not exists $option{'extra'};

	my $url = $req->uri;
	
	if( $parallel_grab ) {
		# TODO
	} else {
		my $dl = download($req);
		if( defined $dl ) {
			my %item = %{ $option{'extra'} };
			$item{'url'} = $url;
			$item{'status'} = 'ok';
			$item{'content'} = $dl;
			push @download_done, \%item;
		} else {
			push @download_done, { 'url' => $url, 'status' => 'failed' }
		}
	}
}

sub get_next_downloaded () {
	sleep 5;
	return shift @download_done if( @download_done );
	# no download done, wait for one
	# TODO
}

sub download ($) {
	my ($req) = @_;
	# Download the given url
	# returns the data is a hash:
	#   { 'headers' => $headers, 'content' => $content }
	# or undef if the download failed

	my $ua = LWP::UserAgent->new();
	my $resp = $ua->request($req);
	return undef unless $resp->is_success;

	return 'content' => $resp->content;
}

sub channel_req ($$) {
	my ($channel, $date) = @_;
	# generates the HTTP request to get the channel programming for the given date
	my $req = HTTP::Request->new(GET =>
		"cachedhttp://www.skynet.be/entertainment-nl/tv/kanalen?channelid=$channel&date=$date");
	$req->header('X-CachedHttp-CacheFor' => 0); # refetch every time
	return $req;
}

sub parse_channel ($) {
	my ($page) = @_;
	my @prog;
	# parses the channel listings
	# returns an array of program-id's

	my $tree = HTML::TreeBuilder->new_from_content($page);
	my @item = $tree->look_down('id', 'programsByDate')->content_list;	# returns the <li>'s for each program

	while( my $item = shift @item ) {
		my $id = $item->look_down('_tag', 'a')->{'href'}; # find the <a> tag inside and return its href property
		$id =~ m+programkey=([^&/]*)+; # filter out the programkey
		push @prog, $1;
	}

	$tree->delete;

	return @prog;
}

sub guess_date($$) {
	my ($day, $month) = @_;

	my %maanden = ('Januari' => 1,
			'Februari' => 2,
			'Maart' => 3,
			'April' => 4,
			'Mei' => 5,
			'Juni' => 6,
			'Juli' => 7,
			'Augustus' => 8,
			'September' => 9,
			'Oktober' => 10,
			'November' => 11,
			'December' => 12 );
	$month = $maanden{$month};

	my $year = (localtime(time))[5]+1900; # get current year
	my $date = ParseDate("$year-$month-$day");

	if( Date_Cmp($date, ParseDate("yesterday 00:00")) < 0 ) {
		# $date is before yesterday, we are probably end of december
		# and the $date is in january
		$year++;
		$date = ParseDate("$year-$month-$day");
	}

	return UnixDate($date, "%Y%m%d");
}

sub find_stop_date ($$$) {
	my ($start_date, $start_time, $stop_time) = @_;
	# calculates the stop date, if the program crosses midnight

	$start_time =~ s/(\d\d)(\d\d)/$1:$2/;
	$stop_time =~ s/(\d\d)(\d\d)/$1:$2/;
	my $date = Date_GetNext("$start_date $start_time", undef, 1, $stop_time);
	return UnixDate($date, "%Y%m%d");
}

sub program_req ($) {
	my ($prog) = @_;
	# generates the HTTP request to get the program info
	my $req = HTTP::Request->new(GET => 
		"cachedhttp://www.skynet.be/entertainment-nl/tv/tv-gids/detail?programkey=$prog");
	$req->header('X-CachedHttp-ExpiresAge' => 5*24*60*60);
	$req->header('X-CachedHttp-CacheFor' => 5*24*60*60);
	return $req;
}

my %chan_to_grab;
my %cat_map;
sub parse_program ($) {
	my ($page) = @_;
	my %info;
	# parse the program page
	# returns a hash with the xmltv-info in it

	my $tree = HTML::TreeBuilder->new_from_content($page);
	my $details = $tree->look_down('_tag' => 'div', 'class' => 'box programDetails');
	if( !defined $details ) {
		print STDERR "Couldn't find '<div class=\"box programDetails\">'\n";
		return {};
	}

	my $title = $details->look_down('_tag' => 'h2');
	$info{'title'} = [ [ $title->content->[0] ] ];

	my $subtitle = $details->look_down('_tag' => 'h3');
	$info{'sub-title'} = [ [ $subtitle->content->[0] ] ] if defined $subtitle;
	
	my $channel_link = $details->look_down('_tag' => 'a', 'href' => qr/\?channelid=/);
	(my $channelid = $channel_link->{'href'}) =~ s/^.*channelid=([^&]*).*$/$1/;
	$info{'channel'} = $chan_to_grab{ $channelid }->{'id'};

	my $when = ($channel_link->parent->content_list)[0];
	$when =~ m/^[^ ]* (\d+) ([^ ]*) van (\d?\d):(\d\d) tot (\d?\d):(\d\d) /;
	my $start = "$3$4";
	$start = '0' . $start if length($start) == 3;
	my $stop = "$5$6";
	$stop = '0' . $stop if length($stop) == 3;
	my $date = guess_date($1, $2);

	$info{'start'} = $date . $start;
	$info{'stop'} = find_stop_date($date, $start, $stop) . $stop;

	my $belgacomtv = $details->look_down('_tag' => 'a', 'href' => qr/belgacomtv/);
	my $desc_tree = $belgacomtv->parent()->right();
	my $desc = '';
	while( $desc_tree->{'_tag'} eq 'p' && !defined($desc_tree->look_down('_tag' => 'strong')) ) {
		# stop when reaching anything else than <p> items
		# or when the <p> contains a <strong>, in which case it'll be the Cast
		$desc .= "\n" . ( $desc_tree->content_list() )[0] if defined $desc_tree->content;
		$desc_tree = $desc_tree->right();
	}
	$desc =~ s/^\n//s; # remove first newline
	$info{'desc'} = [ [ $desc ] ] if $desc ne '';

	my $info = $details->look_down('_tag' => 'div', 'class' => 'info');
	for my $infoitem ($info->content_list) {
		my $field = $infoitem->content->[0]->content->[0];
		my $data = $infoitem->content->[1]; $data =~ s/^ //;

		if( $field =~ m/^Regisseur:/ ) {
			$info{'credits'}{'director'} = [ split /, ?/, $data ];
		} elsif( $field =~ m/^Jaar:/ ) {
			$info{'date'} = $data;
		} elsif( $field =~ m/^Genre:/ ) {
			$info{'category'} = [ map { $_ = $cat_map{$_} if exists $cat_map{$_}; [ $_ ] } uniq split / - /, $data ];
		} elsif($field =~ m/^Land:/ ) {
			$info{'country'} = [ [ $data ] ];
		} elsif( $field =~ m/^Originele taal:/ ) {
			$info{'orig-language'} = [ $data ];
		} elsif( $field =~ m/^Taal:/ ) {
			$info{'language'} = [ $data ];
		}
	}
	{ # cast is (for whatever reason) listed ABOVE the info-box
		my $cast = $info->left();
		if( defined $cast->look_down('_tag' => 'strong') ) {
			my $field = $cast->content->[0]->content->[0];
			my $data = $cast->content->[1]; $data =~ s/^ //;
			if( $field =~ m/^Cast:/ ) {
				$info{'credits'}{'actor'} = [ split /, ?/, $data ];
			}
		}
	}

	$tree->delete;

	return %info;
}

sub filter_time ($$$$$) {
	my ($prog_start, $prog_stop, $from, $to, $pass) = @_;
	my @passed;
	# filters the program
	# $prog_start and $prog stop are YYYYMMDDHHMM strings indicating when the program is
	# $from and $to are HHMM strings indicating a daily repeating time period. 
	#   $from must be before $to; else switch $from and $to and invert $pass
	# $pass tells what to do: 
	#   a true value filters the program so it remains INSIDE the specified times
	#   a false value filters the program so it remains OUTSIDE the specified times
	# the return value is an array containing the two-element (start, stop) array.
	
	my $prog_start_time = substr $prog_start, 8;
	my $prog_stop_time = substr $prog_stop, 8;

	# first find out if the start of the program is passed or filtered
	# passing if the start time is in from-to and $pass is true
	my $passing = ( ( $from le $prog_start_time and $prog_start_time le $to ) xor !$pass );
	
	# convert to HH:MM (needed for Date_GetNext)
	$from =~ s/(\d\d)(\d\d)/$1:$2/;
	$to =~ s/(\d\d)(\d\d)/$1:$2/;

	# now advance the time until we reach the $prog_stop
	# stop at each $from and $to time
	my $time = $prog_start;
	while( $time lt $prog_stop ) {	# YYYYMMDDHHMM is comparable with regular operators
		my $next_time = Date_GetNext($time, undef, 1, ( ($passing xor !$pass) ? $to : $from) );
		$next_time =~ s/:\d\d$//;	# strip seconds
		$next_time =~ s/[-: ]//g;	# strip markup
		$next_time = $prog_stop if $next_time gt $prog_stop;

		push @passed, [ $time, $next_time ] if $passing;
		$time = $next_time;
		$passing = not $passing; # invert the flag
	}

	return @passed;
}

sub usage () {
	print STDERR "$0 [options]\n",
			"\n",#678901234567890123456789012345678901234567890123456789012345678901234567890
			"  --config file       Read file as if it was on the command line\n",
			"                      Watch out for infinite loops!!!!!\n",
			"  --cachedir dir      Use dir as a cache dir\n",
			"  --output file       Where to save the output (default stdout)\n",
			"  --channel-list      Prints out the channel list and exits\n",
			"  --grab-channel id=name[=display]\n",
			"                      Grab a channel (use multiple times)\n",
			"                      id is the ID used on the website (use --channel-list to\n",
			"                      obtain them)\n",
			"                      name is the xmltv id to use\n",
			"                      display is the xmltv display-name. Defaults to [name]\n",
			"  --grab-date date    Grab the data for the given date in YYYY-MM-DD format\n",
			"                      (use multiple times)\n",
			"  --grab-days days    Grab [days] days, starting from today\n",
			"                      This clears all previous --grab-date entries\n",
			"  --parallel-grab N   Use N children to do the grabbing\n",
			"                      N=0 (default) means all downloads will be done synchroneous\n",
			"  --cat-map orig=new  Map category 'orig' to 'new'\n",
			"  --join base=overlay\@from-to\n",
			"                      Join the two channels 'base' and 'overlay' by replacing\n",
			"                      base's programming from-to the given timeframe with overlay's\n",
			"                      from and to are in HHMM format and may span midnight\n",
			"  --verbose           Output debug messages\n",
			"  --help              This message\n",
			"\n",
			"Multiple channels can be combined into a single XMLTV channel by mapping them\n",
			"to the same 'name' value. (The display-name is taken from the last entry).\n",
			"They can also be joined by the --join command, which will filter first.\n",
			"";
}

# Parse options
my $cachedir = undef;
my $outfile = undef;
my @date_to_grab;
my $verbose = 0;
my @join;
my $retval = 0;

my %cmdline_option = (	'cachedir=s' => \$cachedir,
			'output=s' => \$outfile,
			'channel-list' => sub { die("TODO") },
			'grab-channel=s' => \%chan_to_grab,
			'grab-date=s' => \@date_to_grab,
			'grab-days=i' => sub { @date_to_grab = make_dates("today", $_[1]); },
			'parallel-grab=i' => \$parallel_grab,
			'cat-map=s' => \%cat_map,
			'join=s' => \@join,
			'verbose!' => \$verbose,
			'help' => sub { usage; exit 1; },
			);
$cmdline_option{'config=s'} = sub { my $cnfg = file_get_contents($_[1]);
				    die("Couldn't read config file $_[1]") if not defined $cnfg;
				    GetOptionsFromString($cnfg, %cmdline_option); };	# add this later, so we can reference %options

GetOptions(%cmdline_option);

LWP::Protocol::cachedhttp::init(BasePath => $cachedir, Verbose => $verbose);

# process options
while( my ($k, $v) = each(%chan_to_grab) ) {
	# convert "$id" => "$name=$display"
	# into "id" => { "name" => $name, "display" => $display }
	my ($name, $display) = split /=/, $v, 2;
	$display = $name if not defined $display;
	$chan_to_grab{$k} = { 'id' => $name, 'display-name' => [ [ $display ] ] };
}
foreach my $join (@join) {
	$join =~ m/^(.*?)=(.*?)@(\d\d\d\d)-(\d\d\d\d)$/
		or die("Invalid syntax for --join $join");
	$join = { 'base' => $1, 'overlay' => $2 };
	if( $3 lt $4 ) {
		# normal
		$join->{'start'} = $3;
		$join->{'stop'} = $4;
		$join->{'invert'} = 0;
	} else {
		# spans midnight, invert it
		$join->{'start'} = $4;
		$join->{'stop'} = $3;
		$join->{'invert'} = 1;
	}
}

# start grabber children
print STDERR "parallel-grab not supported yet, disabling\n" if $parallel_grab;
$parallel_grab = 0;
for my $child (1..$parallel_grab) {
	# TODO
}

# prepare XMLTV output
my $xmltv_out;
if( defined $outfile ) {
	open my $output, ">", $outfile or die("Couldn't open output file $outfile");
	$xmltv_out = new XMLTV::Writer('OUTPUT' => $output);
} else {
	$xmltv_out = new XMLTV::Writer();
}
$xmltv_out->start({});

# iterate over channels, fetch listings
while( my ($id, $info) = each %chan_to_grab ) {
	$xmltv_out->write_channel( $info );
	foreach my $date (@date_to_grab) {
		queue_download( channel_req($id, $date), 'extra' => { 'type' => 'listing' } );
	}
}

# parse downloads
while( my $dl = get_next_downloaded ) {
	if( $dl->{'status'} eq 'failed'  ) {
		print STDERR "Failed to DL " . $dl->{'url'} . "\n";
		$retval = 1;
	} else {
		print STDERR "Parsing ", $dl->{'url'}, "\n" if $verbose;
		if( $dl->{'type'} eq 'listing' ) {
			my @prog = parse_channel( $dl->{'content'} );
			foreach my $prog (@prog) {
				queue_download( program_req($prog), 'extra' => { 'type' => 'prog' } );
			}

		} elsif( $dl->{'type'} eq 'prog' ) {
			my %prog = parse_program( $dl->{'content'} );
			#my %prog = eval { parse_program( $dl->{'content'} ) };
			#if( $@ ) {
				# error occured; it's already printed, so we just clean up
			#	$retval = 1;
			#	next;
			#}
			# see if it is joined
			my $prog_start = $prog{'start'};
			my $prog_stop = $prog{'stop'};
			my @time = ( [$prog_start, $prog_stop] );
			for my $join (@join) {
				if( $prog{'channel'} eq $join->{'base'} ) {
					# see if this program falls OUTSIDE from-to
					@time = filter_time( $prog_start, $prog_stop, $join->{'start'}, $join->{'stop'},  $join->{'invert'} );
				} elsif( $prog{'channel'} eq $join->{'overlay'} ) {
					$prog{'channel'} = $join->{'base'};
					# see if this program falls INSIDE from-to
					@time = filter_time( $prog_start, $prog_stop, $join->{'start'}, $join->{'stop'}, !$join->{'invert'} );
				}
			}
			# add a programme for each (possible split) time
			# and adapt the title
			my $title = $prog{'title'}->[0]->[0];
			for my $i ( 0..(@time-1) ) {
				$prog{'start'} = $time[$i]->[0];
				$prog{'stop'} = $time[$i]->[1];

				my $t = $title;
				$t = "|< " . $t unless $time[$i]->[0] eq $prog_start;
				$t = $t . " >|" unless $time[$i]->[1] eq $prog_stop;
				$prog{'title'} = [ [ $t ] ];

				#print STDERR $prog{'start'} . " - " . $prog{'stop'} . " : " . $prog{'title'}->[0]->[0] . "\n";
				$xmltv_out->write_programme( \%prog );
			}
		} else {
			die("Unknown download: " . $dl->{'type'});
		}
	}
}


$xmltv_out->end();

exit $retval;

