#!/usr/bin/perl

# taco - a mass downloader of instrument approach information

# Program identification, shown in the --help text.
$taco_version = "0.0.0.1";
$taco_date= "September 10, 2007";
$taco_developer="Frank Stutzman (taco-dev\@stutzman.com)";

# Set up some constants.

# FreeBSD installs wget under /usr/local/bin; most Linux distributions put
# it in /usr/bin.  Detect the platform from $^O rather than hard-coding it,
# so the same script runs unmodified on both.
$freebsd = ($^O eq 'freebsd') ? 1 : 0;

# Try the platform's usual location first, then the other well-known one.
# If neither is executable, fall back to the bare command name so that the
# PATH gets searched when the command is run.
my @wget_candidates = $freebsd
    ? ("/usr/local/bin/wget", "/usr/bin/wget")
    : ("/usr/bin/wget", "/usr/local/bin/wget");
($wget) = grep { -x $_ } @wget_candidates;
$wget = "wget" unless defined $wget;

$wget_file = $wget . " -q -O ";   # quiet download to a named file
$wget_pipe = $wget_file ."- ";    # quiet download to stdout (read via a pipe)


$no_to_mins=1;         #flag denoting we don't want the take-off minimums
$no_alt_mins=1;        #flag denoting we don't want the alternate minimums
$no_apt_diag=1;        #flag denoting we don't want the airport diagrams
$index_only=0;         #don't actually download the files, just list them
$id_code_style="ICAO"; #use ICAO airport ID codes as default

$main_naco_url="http://www.naco.faa.gov/";
$naco_index_url=$main_naco_url . "accessible.asp?xml=naco/online/d_tpp";

# These are the template URLs that we substitute parameters into.  The
# upper-case words (AIRPORT_ID, CODE_STYLE, STATE_ABBR, VERSION) are
# placeholders.  The ampersands carry a backslash because the finished URL
# is handed to wget through the shell.
$naco_url{'airport'}=$main_naco_url . "digital_tpp_search_acc.asp?fldIdent=AIRPORT_ID\\&fld_ident_type=CODE_STYLE\\&ver=VERSION\\&btnSubmit=Complete+Search";
$naco_url{'state'}= $main_naco_url . "digital_tpp_search_acc.asp?fldCodeShowAllState=Y\\&st=STATE_ABBR\\&ver=VERSION\\&btnSubmit=Complete+Search";

# Parse the command line arguments.
#
# Every argument is lower-cased in place and then classified:
#   --index-only     sets $index_only
#   --faa-codes      sets $id_code_style to "FAA"
#   airport=<id>     appended to @search_item / @search_value
#   state=<abbr>     appended to @search_item / @search_value
#   city=<city,st>   recognized, but skipped with a warning (no URL
#                    template exists for it)
#   --help, -h       prints the manual below and exits
# Anything else is reported on STDERR and ignored.
#
# Tags are pushed onto the arrays rather than stored at the argument's own
# position; storing by position leaves undefined holes whenever an option
# appears before or between tags.
for my $arg (@ARGV) {
   $arg =~ tr/A-Z/a-z/;          #down shift the case on everything

   if ($arg =~ /^--index-only$/) {
       $index_only = 1;
       next;
   }

   if ($arg =~ /^--faa-codes$/) {
       $id_code_style="FAA";
       next;
   }

   if ($arg =~ /^(airport|state)=(\w+)/) {
       push @search_item,  $1;
       push @search_value, $2;
       next;
   }

   if ($arg =~ /^city=\w+,\w\w/) {
       warn "taco: the city= tag is not implemented yet; ignoring \"$arg\"\n";
       next;
   }

#like anything that ya might want to use more than once, we need some
#documentation.  We will put it right here in the program so it doesn't
#get lost

   if ($arg =~ /^-+h(?:elp)?$/) {
print <<XDOC;
NAME
	taco -mass download of instrument approach information from
	$main_naco_url

SYNOPSIS
	taco [OPTION]... [airport=id|city=city,state|state=state_abbr]...

VERSION
	$taco_version    $taco_date
	
DESCRIPTION
	The $main_naco_url web site is structured in such a 
	way as to make it diffcult to download multipul instrument 
	approach PDF files.  Taco attempts to fix this. It downloads 
	the PDF files specified by the user and a creates HTML formatted
	index files that identify each PDF.

ARGUEMENTS
	Each arguement is of the form the following tags:
           
            airport=<ICAO or FAA airport Identifer>
            city=<cityname,state_abbreviation>
            state=<state_abbreviation>
	    
	The "airport=" tag signifies that that the instrument approach
	information is for the specified airport to be downloaded.

	The "city=" tag signifies that all instrument procedures for 
	a particular city is to be downloaded.  Note that this tag 
	requires that a comma be used to separate the name of the city 
	from the state 	abbreviation.  Additionally, no spaces can be 
	used.	For example, "city=seattle,WA" works just fine, but 
	"city=seattle, WA" would not.  Moot issue at this point as 
	this tag is not not implemented yet (see bugs)

	The "state=" tag takes the standard two letter abbreviation 
	for a state and downloads all the instrument procedures in the 
	state (you might want to look at the --index-only option before 
	attempting this).

	Tags and their values are case in-sensitive.

	Multiple mixed tags and options  are allowed (see examples).

	Calling taco with out any arguements causes it to print out
	the current revision number and effective dates and then exit.

OPTIONS 
	--index-only
		Generate the index of files that would be downloaded for
		given set of tags, but don't actually download anything.

	--faa-codes
		Use FAA airport identifiers.  The default is ICAO.
EXAMPLES
	Download all the the instrument procedures for Portland 
	International airport.
		taco airport=kpdx
	
	Download all the instrument procedures for the Portland 
	International, Seattle	International and San Francisco 
	International, using FAA identifiers
		taco airport=pdx airport=sea --faa-codes airport=sfo

	Download all of the instrument approaches for Portland 
	International and the entire state of Washington.
		taco airport=kpdx state=WA

	Don't download anything but get a index of all the IFR 
	information for The state of Texas minimums.
		taco state=Tx --index-only
BUGS
	Almost too many to list.  The most glaring are:

		The "city=" tag is not implemented.  Doing so
		is more difficult than first imagined.

		As in the 0.0.0.-1 release, the downloaded files 
		still poorly named.  The html index file that is 
		generated is a start in resolving this issue.
		
		This version lost some abilities to filter out some
		types of information (notably, the take-off minimums,
		alternate miniums and airport diagrams).  Hopefully,
		those will re-appear in a subsequent version.

		The index file generated is hard-coded to be 
		"index-<tag type>-<tag value>.html" in the current 
		directory.  Probably ought to be a command line option 
		to allow the user to specify a different name.
REVISIONS
        Sometime in late August 2007, NACO changed their urls slightly
	on the links to the IAPs.  What they actually did was drop
	a leading redundant "/".  This broke taco.  Note that something
	simular to this has a high probabilty of happening again.  See 
	the comments down were the urls are rewritten (don't forget
        to check where the 2nd and subsequent index pages are touch).
        Anyway I guess this is enough to warrent version 0.0.0.1

PLATFORMS
	Taco was developed on several different Linux distributions
	and one fairly generic FreeBSD system.  It should work on
	most any unix-eque system that has a vaguely current version
	of perl.  As it relies heavily on reading pipes to wget, it 
	probably doesn't have whisper of a chance of working under 
	windows without a big rewrite.  There is a report of it
        working under cgywin under windows (Thanks, Greg!)

ANSWERS TO QUESTIONS SOMEBODY IS GOING TO ASK
	Because taco rhymes with NACO and thats good enough for me.
	
	It is unlikely downloading based on routes will ever be 
	implemented. Thats starting to look like real work.  If 
	you can deal with a windows based solution for routes, 
	see www.postriver.com

AUTHOR
	$taco_developer

REPORTING BUGS
	If it makes you feel better, you can send your bug reports 
	to the above email address.  If you are desparate to have 
	bugs fixed though, I suggest buying a book on perl and doing
	it yourself.  I already have a life.

COPYRIGHT/WARNINGS
	Copyright, Copyleft, copy it however you like.
	
	This  is  free software; There is NO warranty; not even for 
	MERCHANTABILITY or  FITNESS  FOR  A  PARTICULAR
	PURPOSE.

	And whatever you do don't be a dolt and trust your life
	to the bad programming of some guy on the internet that
	never has gotten rid of the flashing 12:00 on his VCR.
	
SEE ALSO
	Not much.  Maybe the postriver.com thing mentioned above.

XDOC
	exit;

   }

   # Nothing above claimed this argument; say so instead of silently
   # dropping what may be a typo.
   warn "taco: ignoring unrecognized argument \"$arg\" (try --help)\n";
}
       
# Go to the main NACO index page and parse out the revision number and the
# effective period.  Sets the globals $rev_number, $eff_date and $end_date,
# which later steps interpolate into URLs and index headers.

# The command still runs through the shell (single command string); the
# three-argument form only keeps the open mode separate from the command.
open(my $site, '-|', "$wget_pipe $naco_index_url")
  || die "unable to open pipe to $wget_pipe to get $naco_index_url: $!\n";

# Every matching link overwrites the previous values, so if the page
# carries more than one such link the last one wins.
while (<$site>) {
    ($rev_number, $eff_date, $end_date) = ($1,$2,$3)
      if (/<a href=\"\/digital_tpp.asp\?ver=(\d\d\d\d)\&amp;eff=(\d\d-\d\d-\d\d\d\d)\&amp;end=(\d\d-\d\d-\d\d\d\d)/);
}

# Opening a pipe succeeds as long as the fork does; a wget failure only
# shows up as a false return from close (with the exit status in $?).
close($site)
  || warn "taco: fetching $naco_index_url failed: "
        . ($! ? "$!" : "wget exit status " . ($? >> 8)) . "\n";

# Without a revision number every URL built later would be malformed, so
# stop here instead of carrying on with empty values.
die "unable to find the revision number and effective dates on $naco_index_url\n"
  unless defined $rev_number;

# With no airport/state tags there is nothing to index or download; report
# the current revision and its effective dates, as the help text promises.
print "Current revision is $rev_number, effective from $eff_date to $end_date\n"
  unless @search_item;

# For each tag, build the NACO search URL, fetch the first results page,
# work out how many pages there are, fetch the rest, and write the rows of
# every results table into "index-<tag type>-<tag value>.html" in the
# current directory.  PDF links are rewritten to bare file names on the
# way through, so the index works next to the downloaded files.

# Rewrite one line of a results table: a link of the form
# <a href="d-tpp/NNNN/NAME.PDF" (with or without a leading "/") becomes
# <a href=NAME.PDF, and the generic link text "PDF" is replaced by the file
# name.  Lines without such a link are returned unchanged.
# Watch these string searches; if NACO changes their urls this is likely
# to break.
sub taco_localize_pdf_link {
    my ($line) = @_;
    if ($line =~ /<a href=\"\/?d-tpp\/\d{4}\/(.*\.PDF)\"/) {
        my $apr_name = $1;
        $line =~ s/<a href=\"\/?d-tpp\/\d{4}\/.*\.PDF\"/<a href=$apr_name/;
        $line =~ s/>PDF<\/a>/>$apr_name<\/a>/;
    }
    return $line;
}

# Drop entries that cannot be processed: undefined slots in the tag arrays
# and tag types that have no URL template.  The arrays are compacted in
# place so that anything reading them afterwards sees only usable tags.
{
    my (@usable_items, @usable_values);
    for my $n (0 .. $#search_item) {
        my ($item, $value) = ($search_item[$n], $search_value[$n]);
        next unless defined $item && defined $value;
        unless (exists $naco_url{$item}) {
            warn "taco: don't know how to search by \"$item\"; skipping $item=$value\n";
            next;
        }
        push @usable_items,  $item;
        push @usable_values, $value;
    }
    @search_item  = @usable_items;
    @search_value = @usable_values;
}

for my $i (0 .. $#search_item) {
    # first we have to open up our output file for the index
    my $index_file = "index-$search_item[$i]-$search_value[$i].html";
    open(my $index_fh, '>', $index_file)
        || die "unable to open $index_file for writing: $!\n";
    print {$index_fh} <<HEADER;
<head>
    <title>Instrument Procedures for the $search_value[$i] $search_item[$i]</title>
</head>
<body>
<center><h2>This is for revision $rev_number<br>Effective from $eff_date to $end_date</h2></center>
HEADER

    print "\nGenerating index file for $search_item[$i]: $search_value[$i]\n";

    # now to munge together a url for the files we are looking for
    my $url = $naco_url{$search_item[$i]};  # get the right template
    $url =~ s/VERSION/$rev_number/;         # stuff in the revision number
    $url =~ s/CODE_STYLE/$id_code_style/;   # stuff in the ID code style
    # Tag types are strings, so they must be compared with "eq"; a numeric
    # "==" treats both sides as 0 and is always true.
    $url =~ s/AIRPORT_ID/$search_value[$i]/ if ($search_item[$i] eq 'airport');
    $url =~ s/STATE_ABBR/$search_value[$i]/ if ($search_item[$i] eq 'state');

    # Reset for every tag: a results page without a "Page 1 of N" line must
    # not inherit the page count of the previous tag.
    my $total_pages = 0;

    open(my $page, '-|', "$wget_pipe $url")
        || die "unable to open pipe to $wget_pipe to get $url:$!\n";

    my $table_started = 0;
    while (<$page>) {
        s/\r/\n/g;     # turn the ^M characters used by windows into newlines
        if (/^\s*<!-- Begin Results Layout -->/) {
            $table_started = 1;   # found the start of the table we want
            next;
        }
        if ($table_started) {
            if (/^\s*<\/table>/) {    # this is the end of the table
                $table_started = 0;
                next;
            }
            # this is where the contents of the table spew out
            print {$index_fh} taco_localize_pdf_link($_);
        }
        if (/^\s*<p class=\"right\">\(Page 1 of (\d+)\)/) {
            $total_pages = $1;   # this is how many pages there are
            last;
        }
    }
    # The loop may stop before wget has finished writing, so the status of
    # this close says nothing useful and is not checked.
    close($page);

    # now we go get all the other pages
    $url =~ s/btnSubmit\=Complete\+Search$/page=999/;   # set up the url for paging

    for my $j (2 .. $total_pages) {
        $url =~ s/page=\d+$/page=$j/;
        open($page, '-|', "$wget_pipe $url")
            || die "unable to open pipe to $wget_pipe to get $url:$!\n";
        $table_started = 0;

        while (<$page>) {
            s/\r/\n/g;  # turn the ^M characters used by windows into newlines
            if (/^\s*<caption>\d+ Document\(s\):<\/caption>/) {
                # on subsequent pages the rows start right after the caption
                $table_started = 1;
                next;
            }
            if ($table_started) {
                if (/^\s*<\/table>/) {   # ok, it's the end of the table
                    $table_started = 0;
                    last;
                }
                # and spit out everything in-between
                print {$index_fh} taco_localize_pdf_link($_);
            }
        }
        close($page);   # may be an early close; status not checked
    }

    # OK that's everything, let's terminate the table
    print {$index_fh} "</table>\n";
    close($index_fh)
        || die "unable to finish writing $index_file: $!\n";
}

# Ok, we've got all the indexes.  If the --index-only option is set we can
# quit at this point.

exit if ($index_only);

# Alrighty then, off to get the files.

# First, open up all those index files we just created and parse them for
# the file names.  This is probably a bit of a stupid way of doing this, but
# we had to create the index files anyway.

for ($i=0; $i <= $#search_item; $i++) {
    # an undefined slot has no tag behind it, hence no links to collect
    next unless defined $search_item[$i] && defined $search_value[$i];

    my $index_file = "index-$search_item[$i]-$search_value[$i].html";
    open(my $index_fh, '<', $index_file)
        || die "unable to open $index_file: $!\n";

    # This is kinda inefficient, but by stuffing the file names into an
    # associative array we get rid of duplicate file names and avoid
    # downloading the same file twice or more.
    while (<$index_fh>) {
        $file_list{$1}=$i if (/<a href\=(.*\.PDF) title/);
    }
    close($index_fh);
}

# OK, everything is cleaned up; let the downloading begin.

foreach my $pdf_name (sort keys %file_list) {
    # The names come out of HTML fetched from the network, so don't let one
    # of them point outside the current directory.
    if ($pdf_name =~ m{^/} || $pdf_name =~ m{(?:^|/)\.\.(?:/|$)}) {
        warn "taco: refusing to download suspicious file name \"$pdf_name\"\n";
        next;
    }

    my $pdf_url = $main_naco_url . "d-tpp/$rev_number/$pdf_name";
    print "\tdownloading $pdf_name\n";

    # List form of system: wget is run directly, without a shell, so nothing
    # in the file name can be interpreted as shell syntax.
    my $status = system($wget, '-q', '-O', $pdf_name, $pdf_url);
    if ($status != 0) {
        warn "taco: download of $pdf_name failed: "
            . ($? == -1 ? "could not run $wget: $!"
                        : "wget exit status " . ($? >> 8)) . "\n";
    }
}

# tht tht tthats all folks	    






