#!/usr/local/bin/perl -w
# ===========================================================================
# Generic tool that, given a list of genes (and excluded terms), searches
#  Entrez Gene for all aliases of each gene and then queries PubMed to see how
#  the number of publications has changed over time.
# ===========================================================================
use LWP::Simple;
use XML::Parser;
use XML::Simple;	# module to parse XML data which is the response to the query
use Data::Dumper;	# module for visualization of parsed XML data; not obligatory for function but was used for development

use Getopt::Long; # to parse the command line options.

my $query_ending  = "[sym] AND (\"mus musculus\"[Organism] OR \"homo sapiens\"[Organism])";
my $report = "xml";
my $email = "ruth.isserlin\@utoronto.ca";
my $tool = "publication_distribution";

#url for the ncbi e-utils
my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";

#the different queries
my $esearch_pubmed = "$utils/esearch.fcgi?" .
              "db=Pubmed&usehistory=n&email=$email&tool=$tool&term=";

my $esearch_gene = "$utils/esearch.fcgi?" .
              "db=Gene&usehistory=y&email=$email&tool=$tool&term=";



#if you want to limit the pubmed search to anything (for example [Title]) add it to constraints
my $constraints = "";

#variables to hold command line arguments
my (
	$genes_filename,
	$exclusions_filename,
	$output_dirname,
	$output_filename,
	$restrict,
);

GetOptions (
	"genes|g=s"		=> \$genes_filename,
	"exclusions|e=s"	=> \$exclusions_filename,
	"outputdir|o=s"	=> \$output_dirname,
      "outputfile|f=s"  => \$output_filename,
	"restrict|r=s"	=> \$restrict,
);

#open the output file
open(PUBS, ">$output_dirname/$output_filename") or die "Error opening $output_dirname/$output_filename : $!\n";

#print Header
print(PUBS "Initial Gene Query\tPrimary Name\tAliases\t1950-1979\t1980-1984\t1985-1989\t1990-1994\t1995-2000\t2001-2005\t2006-2010\t1980\t1981\t1982\t1983\t1984\t1985\t1986\t1987\t1988\t1989\t1990\t1991\t1992\t1993\t1994\t1995\t1996\t1997\t1998\t1999\t2000\t2001\t2002\t2003\t2004\t2005\t2006\t2007\t2008\t2009\t2010\tPubmed Query\n");
	

#get all the names we want to exclude
# There are aliases that can bring back loads of pubmed hits, for example gene name "MR", that you might
# want to exclude from pubmed search
open(IGNORE_IN, "$exclusions_filename") or die "error opening $exclusions_filename : $!\n";
my @ignore_list = ();

while(<IGNORE_IN>){
	chomp;
	my @cur = split(/\n/);
	
	push @ignore_list, $cur[0];	
}

#open the file that stores all the gene symbols for nuclear receptors
#get the gene Names to do each pubmed search
open(Names_IN, "$genes_filename") or die "Error opening $genes_filename : $!\n";


#go through each name (in the genes_file) 
# search entrez for aliases 
# make sure none of the aliases are in the exclusion list
# make sure that each individual search term returns results when it is searched in pubmed.  Because if no
#     results are returned by pubmed for a particular term pubmed expands it to try and get hits.
#	for example the term "orphan nuclear hormone receptor 1" is not found as a quoted term so pubmed
#	translates it to:("child, orphaned"[MeSH Terms] OR ("child"[All Fields] AND "orphaned"[All Fields]) 
# 		OR "orphaned child"[All Fields] OR "orphan"[All Fields]) AND ("receptors, cytoplasmic and 
#		nuclear"[MeSH Terms] OR ("receptors"[All Fields] AND "cytoplasmic"[All Fields] AND 
#		"nuclear"[All Fields]) OR "cytoplasmic and nuclear receptors"[All Fields] OR ("nuclear"[All Fields] 
#		AND "hormone"[All Fields] AND "receptor"[All Fields]) OR "nuclear hormone receptor"[All Fields]) 
#		AND 1[All Fields]
# construct the base pubmed query
# and then search pubmed for the number of publications
while (<Names_IN>){
	chomp;
	my @name  = split(/\n/);
	print "$name[0]\n";

	#search entrez for aliases.
	# Get the entrez Entry in XML format and parse out all the aliases and alternate names		
	my $query = $name[0].$query_ending; 

	my $esearch_result = get($esearch_gene . $query);

	$esearch_result =~ 
	m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count    = $1;
	my $QueryKey = $2;
	my $WebEnv   = $3;


	my $retstart;
	my $retmax=1;
	
	my $pubmed_query = "";
	@aliases = ();
	@alternate_names = ();

	#go through all records returned by Entrez, there could be one mouse and one human
	for($retstart = 0; $retstart < $Count; $retstart += $retmax) {
  
  		my $efetch = "$utils/efetch.fcgi?" .
               		"rettype=$report&retmode=text&retstart=$retstart&retmax=$retmax&" .
               		"db=Gene&query_key=$QueryKey&WebEnv=$WebEnv";
	
  		my $efetch_result = get($efetch);
 
  		$tag = "";
  		$primaryName = "";
  		
  		$p3 = new XML::Parser(ErrorContext => 2);

  		$p3->setHandlers(Start => \&startElement,
                           End => \&endElement,
                           Char => \&characterData,Default => \&default);
		
		#use XML parse function to parse the results returned by Entrez.
  		eval{$p3->parse($efetch_result);};
  		if($@){
			my $error = $@;
			print "$error\n";
			next;
  		}
  		
	}
	
	#start constructing the pubmed query
	$pubmed_query = "((\"" . $primaryName ."\"";
	
	#go through each of the aliases in the list.  Only add it to the query if the individual
	# query returns somethings and it is not in the exclusion list.
	foreach $aliases (@aliases){
						
		#before doing a search check to see if the name is in the exclude list.
		my $ignore = "no";
		foreach $a (@ignore_list){
			#one of the novel
			if($a eq $aliases){
				$ignore = "yes";
			}

		}
			
		if($ignore eq "no"){
			#for each gene name do a separate pubmed query and get the counts
			#if the individual query comes back with one of the below warnings then don't 
			#add it to the search.
			
			my $notFound = "";

			$pubmed_name = "\"" . $aliases . "\"" . $constraints;
			$esearch_result = get($esearch_pubmed . $pubmed_name);
 			
			#tag within the pubmed search results that indicate the quoted phrase was not found
			# and that pubmed will try and expand it.
			$esearch_result =~ 
  				m|<QuotedPhraseNotFound>(.*)</QuotedPhraseNotFound>|s;

			$notFound = $1;

			if($notFound ne $pubmed_name){
						
				$pubmed_query = $pubmed_query . " OR \"" . $aliases . "\"" . $constraints;
					

			}
		}
  	}
		
		
	#finished adding all the gene names, now add the alternate names
	# For more restrictive search change change to AND here.
	$pubmed_query = $pubmed_query . ") OR ( ";
	

	foreach $alternate_names (@alternate_names){
			
		#before doing a search check to see if the name is in the exclude list.
		$ignore = "no";
		foreach $a (@ignore_list){
			#one of the novel
			if($a eq $alternate_names){
				$ignore = "yes";
			}

		}
			
		if($ignore eq "no"){
			#for each gene name do a separate pubmed query and get the counts
			my $notFound = "";
			$pubmed_name = "\"" . $alternate_names . "\"" . $constraints;
			$esearch_result = get($esearch_pubmed . $pubmed_name);
 			
			#tag within the pubmed search results that indicate the quoted phrase was not found
			# and that pubmed will try and expand it.										
			$esearch_result =~ 
  				m|<QuotedPhraseNotFound>(.*)</QuotedPhraseNotFound>|s;

			$notFound = $1;
		
			if($notFound ne $pubmed_name){										
				$pubmed_query = $pubmed_query . " \"" . $alternate_names . "\"" . $constraints ." OR ";
			}
		}
  	}


	#add an extra name to the RXRs
	if($name[0] eq "NR2B1"){
		$pubmed_query = $pubmed_query . " \"RXR alpha\"" . $constraints." OR \"RXRalpha\"". $constraints;
	}
	#add an extra name to the RXRs
	if($name[0] eq "NR2B2"){
		$pubmed_query = $pubmed_query . " \"RXR beta\"" . $constraints." OR \"RXRbeta\"" . $constraints;
	}
	#add an extra name to the RXRs
	if($name[0] eq "NR2B3"){
		$pubmed_query = $pubmed_query . " \"RXR gamma\"" . $constraints." OR \"RXRgamma\"". $constraints;
	}

      #add the generic name "thyroid hormone receptor" to one of the receptors.
	if($name[0] eq "NR1A2"){
		$pubmed_query = $pubmed_query . " (\"thyroid hormone receptor\" AND \"beta\")". $constraints;
	}
	if($name[0] eq "NR1A1"){
		$pubmed_query = $pubmed_query . " (\"thyroid hormone receptor\" AND \"alpha\")". $constraints;
	}


	$pubmed_query = $pubmed_query . ")) and \"". $restrict . "\"" . $constraints;
		
	#create a file to output all the pmids to 
	#do the search without any year restriction

	$esearch_result = get($esearch_pubmed . $pubmed_query);
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_all    = $1;
	my $Querykey = $2;
	my $WebKey = $3;
		

	my $efetch_pmids = "$utils/efetch.fcgi?" .
             		"db=pubmed&usehistory=y&rettype=uilist&retmode=text&WebEnv=$WebKey&query_key=$QueryKey";
	
		
	my $efetch_pmids_results = get($efetch_pmids);
		
	#create a directory in the output directory called PMIDs to contain the PMIDs
	mkdir "$output_dirname/PMIDS";

	#open a file the name of the main search
	open(PMIDS, ">$output_dirname/PMIDS/$name[0].txt");
		
	my @pmids  = split(/\n/, $efetch_pmids_results);
	foreach $pmids (@pmids){
			print PMIDS "PMID(".$pmids.") OR \n";
	}
	close PMIDS;


	##########################################
	# Do pubmed searches for years 1950-2010 in 5 year subsets
	# and individually for year 1980-2010
	#########################################

	#search 1950-1979
	$pubmed_query_year = $pubmed_query . " and 1950:1979[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 
	$esearch_result =~ 
  	m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_195079    = $1;

	#search 1980-1984
	$pubmed_query_year = $pubmed_query . " and 1980:1984[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_198084    = $1;

	#search 1985-1989
	$pubmed_query_year = $pubmed_query . " and 1985:1989[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_198589    = $1;

	#search 1990-1994
	$pubmed_query_year = $pubmed_query . " and 1990:1994[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_199094    = $1;
	
	#1995-2000
	$pubmed_query_year = $pubmed_query . " and 1995:2000[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_19952000    = $1;

	#and then search by each year individually
	# 1980
	$pubmed_query_year = $pubmed_query . " and 1980[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1980    = $1;

	# 1981
	$pubmed_query_year = $pubmed_query . " and 1981[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1981    = $1;

	# 1982
	$pubmed_query_year = $pubmed_query . " and 1982[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1982    = $1;

	# 1983
	$pubmed_query_year = $pubmed_query . " and 1983[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1983    = $1;	
		
	# 1984
	$pubmed_query_year = $pubmed_query . " and 1984[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1984    = $1;

	# 1985
	$pubmed_query_year = $pubmed_query . " and 1985[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1985    = $1;

	# 1986
	$pubmed_query_year = $pubmed_query . " and 1986[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1986    = $1;

	# 1987
	$pubmed_query_year = $pubmed_query . " and 1987[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1987    = $1;	

	# 1988
	$pubmed_query_year = $pubmed_query . " and 1988[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1988    = $1;

	# 1989
	$pubmed_query_year = $pubmed_query . " and 1989[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1989    = $1;		

	# 1990
	$pubmed_query_year = $pubmed_query . " and 1990[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1990    = $1;

	# 1991
	$pubmed_query_year = $pubmed_query . " and 1991[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1991    = $1;

	# 1992
	$pubmed_query_year = $pubmed_query . " and 1992[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1992    = $1;

	# 1993
	$pubmed_query_year = $pubmed_query . " and 1993[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1993    = $1;	

	# 1994
	$pubmed_query_year = $pubmed_query . " and 1994[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1994    = $1;


	# 1995
	$pubmed_query_year = $pubmed_query . " and 1995[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1995    = $1;


	# 1996
	$pubmed_query_year = $pubmed_query . " and 1996[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1996    = $1;

	# 1997
	$pubmed_query_year = $pubmed_query . " and 1997[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1997    = $1;	

	# 1998
	$pubmed_query_year = $pubmed_query . " and 1998[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1998    = $1;

	# 1999
	$pubmed_query_year = $pubmed_query . " and 1999[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_1999    = $1;

	# 2000
	$pubmed_query_year = $pubmed_query . " and 2000[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2000    = $1;

	# 2001
	$pubmed_query_year = $pubmed_query . " and 2001[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2001    = $1;

	# 2002
	$pubmed_query_year = $pubmed_query . " and 2002[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2002    = $1;
	
	# 2003
	$pubmed_query_year = $pubmed_query . " and 2003[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2003    = $1;
	# 2004
	$pubmed_query_year = $pubmed_query . " and 2004[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2004    = $1;

	# 2005
	$pubmed_query_year = $pubmed_query . " and 2005[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2005    = $1;

	# 2006
	$pubmed_query_year = $pubmed_query . " and 2006[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2006    = $1;

	# 2007
	$pubmed_query_year = $pubmed_query . " and 2007[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2007    = $1;

	# 2008
	$pubmed_query_year = $pubmed_query . " and 2008[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2008    = $1;

	# 2009
	$pubmed_query_year = $pubmed_query . " and 2009[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2009    = $1;

	# 2010
	$pubmed_query_year = $pubmed_query . " and 2010[dp]";
	$esearch_result = get($esearch_pubmed . $pubmed_query_year);
 	$esearch_result =~ 
  		m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

	my $Count_2010    = $1;

	my $Count_20002005 = ($Count_2001+$Count_2002+$Count_2003+$Count_2004+$Count_2005);
	my $count_20062010 = ($Count_2006+$Count_2007+$Count_2008+$Count_2009+$Count_2010);

	print(PUBS "$name[0]\t$primaryName\t@aliases\t$Count_195079\t$Count_198084\t$Count_198589\t$Count_199094\t$Count_19952000\t$Count_20002005\t$count_20062010\t$Count_1980\t$Count_1981\t$Count_1982\t$Count_1983\t$Count_1984\t$Count_1985\t$Count_1986\t$Count_1987\t$Count_1988\t$Count_1989\t$Count_1990\t$Count_1991\t$Count_1992\t$Count_1993\t$Count_1994\t$Count_1995\t$Count_1996\t$Count_1997\t$Count_1998\t$Count_1999\t$Count_2000\t$Count_2001\t$Count_2002\t$Count_2003\t$Count_2004\t$Count_2005\t$Count_2006\t$Count_2007\t$Count_2008\t$Count_2009\t$Count_2010\t$pubmed_query\n");
	

}

close(PUBS);

#Methods used by XML parser to get the fields of interest.
# Gene-ref_locus = primary name (is assigned to primary name variable)  Each record should only contain one primary name
# Gene-ref_syn_E = aliases (is assigned to an array of aliases)
# Prot-ref_name_E - alternate names ( is assigned to an array of alternate names) 
sub startElement {
       my( $parseinst, $element, %attrs ) = @_;
       SWITCH: {
              if ($element eq "Gene-ref_locus") {
                     $tag = "Gene-ref_locus";
                     last SWITCH;
              }
	      if ($element eq "Gene-ref_syn_E") {
                     $tag = "Gene-ref_syn_E";
                     last SWITCH;
              }
	     if ($element eq "Prot-ref_name_E") {
                     $tag = "Prot-ref_name_E";
                     last SWITCH;
              }
		                 
             
       }
}
sub endElement {
       my( $parseinst, $element ) = @_;
       if ($element eq "Gene-ref_locus") {
              #print "\n";
       } elsif ($element eq "Gene-ref_syn_E") {
              #print "\n";
       }elsif ($element eq "Prot-ref_name_E") {
              #print "\n";
       }  
}
sub characterData {
       my( $parseinst, $data ) = @_;
       if (($tag eq "Gene-ref_locus")) {
              $data =~ s/\n|\t//g;
	      $primaryName=$data;
	      $tag = "";
       }
       if (($tag eq "Gene-ref_syn_E")) {
              $data =~ s/\n|\t//g;
	      push(@aliases,$data) ;
	      $tag = "";
       }
	if (($tag eq "Prot-ref_name_E")) {
              $data =~ s/\n|\t//g;
	      push(@alternate_names,$data) ;
	      $tag = "";
       }
}

sub default {
       my( $parseinst, $data ) = @_;
       # you could do something here
}


sub usage {
    print <<EOF
USAGE:
    ./query_entrez.pl --exclusions|-e exc.txt --genes|-g genes.txt --outputdir|-o outputdir --outputfile|-f outputfile

DESCRIPTION:
    Given a list of genes (and exclusion terms) query entrez gene to get all the aliases
    of the given gene.  Construct a pubmed query with the gene name and all aliases (excluding
    any term in the exclusion list) and query pubmed for the years 1980-2010.

OPTIONS:
    --genes, -g 
	  The path to a tab-delimited file, where
		1st column = gene name
    --exclusions, -e
        The path to a tab-delimited file, where
            1st column = term      # corresponding to a term/name/alias to excluded from the pubmed search
            
    --outputdir, -o
	  The path to output the results to.  If doesn't exist a new
        file will be created. 

	--outputfile, -f
	  The name fo the output file.  If doesn't exist a new
        file will be created. 


EOF
}
