#!/usr/bin/env perl

####################################################################
#
# File: bibweb
# Author: John H. Palmieri <palmieri@math.washington.edu>
#         URL: http://www.math.washington.edu/~palmieri/Bibweb/
# Version: 0.63 of Tue Dec  3 11:18:27 PST 2019
# Description: retrieve bibliographical information from MathSciNet
#              automatically
# Copyright (c) 1997-2019 John H. Palmieri
# License: distributed under GNU General Public License -- see below.
#
####################################################################
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# (see file COPYING) along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.
#
####################################################################
#
# Command line options:
#    FILE        use FILE.aux as input, FILE.bib as output
#    -i FILE     specify FILE as input (aux) file.
#    -o FILE     specify FILE as output (bib) file.  If FILE ends in
#                 ".bib", write to FILE; otherwise, write to FILE.bib
#    -c REF      looks up REF, rather than using an auxfile for input
#    -m NUM      return at most NUM entries (values of NUM above 50
#                 are rounded down to 50)
#    -e WEB_SITE use WEB_SITE for MathSciNet search
#    -std        write output to STDOUT (the screen, ordinarily)
#    -sep CHAR   use CHAR to delimit fields in long citation format,
#                instead of semicolon (;)
#    -lax        use % to comment lines in bibtex
#    -h          print brief help message
#
# If you use only one of the -i and -o options, bibweb makes a guess
# as to what the other file should be.
####################################################################

use LWP::UserAgent;
use HTTP::Request::Common;

# Without the following, I get the error
#
# "Can't verify SSL peers without knowing which Certificate Authorities to trust"
#
# There are two options: use Mozilla::CA or set the environment variable
# PERL_LWP_SSL_VERIFY_HOSTNAME to 0. The former seems safer, so try that first.
# Probe for Mozilla::CA.  The eval yields 1 only when the require
# succeeds, so $mozca is true exactly when the module could be loaded.
my $mozca = eval { require Mozilla::CA; 1 };

# Module not available: fall back to the environment-variable route
# described above.
if (not $mozca) {
    $ENV{PERL_LWP_SSL_VERIFY_HOSTNAME} = '0';
}

# Program identity and built-in defaults.  The option variables below
# may be overridden by the command line when it is parsed.
$bibtex   = 'bibtex';    # external command run on the aux file
$thisprog = 'bibweb';
$version  = '0.63';

# MathSciNet site: the MATHSCINET_SITE environment variable wins when
# it is set to a non-empty value; otherwise use the main AMS site.
# good choices for e_math: 'www.ams.org', 'ams.rice.edu',
#   'ams.mathematik.uni-bielefeld.de', 'ams.mpim-bonn.mpg.de',
#   'ams.u-strasbg.fr', 'ams.impa.br'
$e_math = $ENV{MATHSCINET_SITE} || 'www.ams.org';

$use_stdout   = 0;
$debug        = 0;
$lax_comments = 0;
$max_matches  = 20;

# Help text.  The -m line states the cap that the script actually
# applies (values above 50 are reduced to 50), not "rounded up".
$usage = <<EOT;
This is $thisprog, version $version.
To run bibtex on FILE, and then scan MathSciNet for missing
  bibliographical information: $0 [options] FILE
  options can be:
    -i FILE     use \'FILE\' as input (aux) file
    -o FILE     use \'FILE\' as output (bib) file
    -c REF      looks up single citation \'REF\', instead of using
                 an auxfile for input
    -m NUM      return at most NUM entries (capped at 50)
    -e WEB_SITE  use WEB_SITE for MathSciNet search
    -std        write output to STDOUT (the screen, ordinarily)
    -sep CHAR   use CHAR to delimit fields in long citation format,
                instead of semicolon (;)
    -lax        use % to comment lines in bibtex
    -h          print this help message
EOT

# Parse the command line.  Each option spec maps onto the global that
# the rest of the script reads; anything left in @ARGV afterwards is
# the positional FILE argument.
use Getopt::Long;

my @option_spec = (
    "input|i=s"       => \$auxfile,
    "output|o=s"      => \$bibfile,
    "stdout|std|s"    => \$use_stdout,
    "cite|c=s"        => \$only_cite,
    "max|m=i"         => \$max_matches,
    "emath|e=s"       => \$e_math,
    "separator|sep=s" => \$new_separator,
    "lax"             => \$lax_comments,
    "debug|D"         => \$debug,
    "help|h"          => \$help,
);

# A malformed command line and an explicit -h both end the run with
# the usage text.
my $options_ok = GetOptions(@option_spec);
die "$usage\n" unless $options_ok;
die "$usage\n" if $help;

# Resolve the MathSciNet site to use, tell the user which one it is, and
# remember the redirect string that belongs to a recommended site.

# Expand one of the short site aliases to its host name.
# Any other value (including undef) is returned unchanged.
sub _mathscinet_host_for {
    my ($site) = @_;
    my %host_for = (
        ams       => 'www.ams.org',
        rice      => 'ams.rice.edu',
        bielefeld => 'ams.mathematik.uni-bielefeld.de',
        bonn      => 'ams.mpim-bonn.mpg.de',
        strasbg   => 'ams.u-strasbg.fr',
        impa      => 'ams.impa.br',
    );
    return $site unless defined $site and exists $host_for{$site};
    return $host_for{$site};
}

# Return the redirect string of a recommended host, or undef when HOST
# is not one of the recommended sites.  The lookup is an exact string
# match, so a name that merely resembles a recommended host (the old
# pattern let "." match any character) is no longer accepted.
sub _mathscinet_redirect_for {
    my ($host) = @_;
    my %redirect_for = (
        'www.ams.org'                     => 'Providence%2C+RI+USA',
        'ams.rice.edu'                    => 'Houston%2C+TX+USA',
        'ams.mathematik.uni-bielefeld.de' => 'Bielefeld%2C+Germany',
        'ams.mpim-bonn.mpg.de'            => 'Bonn%2C+Germany',
        'ams.u-strasbg.fr'                => 'Strasbourg%2C+France',
        'ams.impa.br'                     => 'Rio+de+Janeiro%2C+Brazil',
    );
    return undef unless defined $host and exists $redirect_for{$host};
    return $redirect_for{$host};
}

$e_math = _mathscinet_host_for($e_math);

{
    my $known_redirect = _mathscinet_redirect_for($e_math);
    if (defined $known_redirect) {
        print "Using \`$e_math\' for the MathSciNet search.\n";
        $redirect = $known_redirect;
    }
    else {
        # Not fatal: an unlisted site is still tried.
        print "Warning: Your choice of \`$e_math\' for the MathSciNet site\n" .
            "is not one of the recommended choices.  Proceeding anyway...\n\n";
    }
}

# Work out the aux and bib file names, then either prepare a single
# -c lookup or run bibtex on the aux file and collect its output.

# Fill in whichever of AUX / BIB was not given from the positional FILE
# argument, and make sure a bib name ends in ".bib".
# Returns the list (AUX, BIB); a name that cannot be derived from FILE
# is returned as it came in (possibly undef).
sub _default_file_names {
    my ($aux, $bib, $file) = @_;
    if (defined $file and length $file) {
        # "paper.aux" as FILE should give "paper.bib", not "paper.aux.bib"
        (my $base = $file) =~ s/\.aux$//;
        $aux = $file       unless $aux;
        $bib = "$base.bib" unless $bib;
    }
    $bib = "$bib.bib" if $bib and $bib !~ m/\.bib$/;
    return ($aux, $bib);
}

# The positional FILE is only consumed when -i and -o were not both given.
$file = shift(@ARGV) unless ($auxfile and $bibfile);
($auxfile, $bibfile) = _default_file_names($auxfile, $bibfile, $file);

if ($only_cite) {
    # Drop one pair of matching quotes wrapped around the key (they
    # survive on command lines where the shell does not remove them).
    $only_cite =~ s/\A(['"])(.*)\1\z/$2/s;
    # One dummy element, so that the main loop runs exactly once.
    @bibtex_output = (1);
    # Without an output file name the result goes to STDOUT.
    $use_stdout = 1
        if (not defined $bibfile or $bibfile eq "" or $bibfile eq ".bib");
}
else {
    $auxfile = '' unless defined $auxfile;
    $auxfile =~ s/\.aux$//;

    # With only one of -i / -o given, guess the other name from it.
    if ($auxfile and not $bibfile) { $bibfile = "$auxfile.bib" }
    if ($bibfile and not $auxfile) { ($auxfile = $bibfile) =~ s/\.bib$//; }

    die "No input file given.\n\n$usage\n" unless length $auxfile;
    unless (-e "$auxfile.aux") { die "Couldn't read $auxfile.aux\n" }

    unless ($use_stdout) {
        # Opened here so that an unwritable bib file is reported before
        # any search is made.
        open(BIBFILE, '>>', $bibfile)
            or die "Couldn't open $bibfile: $!\n";
        print "Appending output to $bibfile . . . \n\n";
    }

    # Run bibtex without going through the shell, so that file names
    # containing spaces or shell metacharacters reach it intact.
    # $bibtex is split on whitespace in case it carries options.
    open(my $bibtex_pipe, '-|', split(' ', $bibtex), $auxfile)
        or die "Couldn't run $bibtex: $!\n";
    @bibtex_output = <$bibtex_pipe>;
    # The exit status is deliberately ignored, as it was with the
    # backticks this replaces; only the captured output is used.
    close($bibtex_pipe);
}

# Derived settings: cap on the number of matches, the field separator
# for long-format citations, and the comment markers written to the
# bib file.

# Turn the -sep argument into a regex-safe separator.  Returns ';' when
# no separator was given.  The test is on definedness and length rather
# than truth, so that a separator of "0" is honoured.
sub _separator_pattern {
    my ($separator) = @_;
    return ';' unless defined $separator and length $separator;
    return quotemeta($separator);
}

# More than 50 matches are never listed, so larger values of -m are
# reduced to 50.
if ($max_matches > 50) {
    $max_matches = 50;
    print "Rounding -m argument down to $max_matches.\n\n";
}

$semicolon = _separator_pattern($new_separator);

# With -lax, comments are written as lines of percent signs; otherwise
# every comment line starts with the marker '@comment  '.
if ($lax_comments) {
    $bibtex_short_comment = '%%';
    $bibtex_long_comment = '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%';
}
else {
    $bibtex_short_comment = '@comment  ';
    $bibtex_long_comment = $bibtex_short_comment;
}
$bsc = $bibtex_short_comment;

# Main loop: for every citation bibtex could not resolve (or for the
# single key given with -c), query MathSciNet and record the outcome
# through bib_print.

# Join the "-"-separated words of one citation field with "+and+".
# An undefined field gives the empty string.
sub _join_search_terms {
    my ($terms) = @_;
    return '' unless defined $terms;
    return join('+and+', split(/-/, $terms));
}

# Split a citation key into search fields.
#   CITATION   the key, e.g. "smith-algebra-1990" (short format) or
#              "smith;k-theory;annals;1990" (long format)
#   SEPARATOR  regex-safe field separator of the long format
# Returns a hash reference with the keys author, title, journal, misc
# and year; fields that do not occur are empty strings.
sub _parse_citation {
    my ($citation, $separator) = @_;
    my %field = map { $_ => '' } qw(author title journal misc year);

    $citation =~ s/-and-/-/g;
    $citation =~ tr/./,/;

    if ($citation =~ m/$separator/) {
        # long format: authors, titles, journals, year
        my ($authors, $titles, $journals, $year) =
            split(/$separator/, $citation);
        $field{author}  = _join_search_terms($authors);
        $field{title}   = _join_search_terms($titles);
        $field{journal} = _join_search_terms($journals);
        $field{year}    = $year if defined $year;
    }
    else {
        # short format: author, then free-text words, optionally
        # ending in a year
        my ($author, $rest) = split(/-/, $citation, 2);
        $field{author} = $author if defined $author;
        $rest = '' unless defined $rest;
        if ($rest =~ /([<>=]?\d+)\Z/) {
            $field{year} = $1;
            $rest = substr($rest, 0, $-[0]);    # text before the year
        }
        $field{misc} = _join_search_terms($rest);
    }
    return \%field;
}

# Build the search URL for HOST from the fields of _parse_citation.
# The year is split off the key but is not sent as a restriction: the
# query always carries "dr=all" and an empty "arg3".
sub _build_search_url {
    my ($host, $field) = @_;
    return "http://$host/mathscinet/search/publications.html?" .
        "pg4=AUCN&s4=" . $field->{author} .
        "&co4=AND&pg5=TI&s5=" . $field->{title} .
        "&co5=AND&pg6=JOUR&s6=" . $field->{journal} .
        "&co6=AND&pg7=ALLF&s7=" . $field->{misc} .
        "&Submit=Search&dr=all&yrop=eq&arg3=&" .
        "&fmt=bibtex";
}

# Return the contents of the MRNUMBER field of a BibTeX entry, or the
# empty string when the entry has no such field.
sub _extract_mr_number {
    my ($entry) = @_;
    return $entry =~ /MRNUMBER = \{([^}]*)\}/ ? $1 : '';
}

my $ua;    # created on first use and shared by all requests

foreach my $warning (@bibtex_output) {
    # get citation
    my ($citation_orig, $citation);
    if ($only_cite) {
        $citation_orig = $citation = $only_cite;
        $citation =~ tr/,/./;
    }
    elsif ($warning =~ /^Warning--I didn\'t find a database entry for \"([^\"]*)\"$/) {
        $citation_orig = $citation = $1;
        next if check_bibfile($citation_orig);
    }
    else { next; }

    # split citation into author, etc. and construct URL
    my $field = _parse_citation($citation, $semicolon);
    my $url = _build_search_url($e_math, $field);

    bib_print("", "$bibtex_long_comment \n") unless $use_stdout;
    bib_print("working on citation \'$citation_orig\' \n",
              "$bsc citation \'$citation_orig\' \n");

    if ($debug) { print "debug information: URL is \n$url\n" }

    # get response from MathSciNet
    $ua ||= LWP::UserAgent->new;
    my $response = $ua->request(GET($url));

    unless ($response->is_success) {
        # Report the failure instead of skipping the citation silently.
        warn "Request to $e_math failed: " . $response->status_line . "\n";
        next;
    }

    my $page = $response->as_string;
    my @entries = ($page =~ m/\s*<pre>\n([^<]*)\n\s*<\/pre>/g);
    my $matches = scalar(@entries);

    if ($page =~ /No Records Selected/ or $matches == 0) {
        # A page without any <pre> entry is reported as "no matches"
        # as well, rather than producing no output at all.
        bib_print("No matches found; please revise your search criteria.\n\n",
                  "$bsc No matches found. \n$bsc\n");
    }
    elsif ($matches == 1) {
        # get mr number, insert comment, and replace with citation
        my $entry = $entries[0];
        my $mr = _extract_mr_number($entry);
        bib_print("", "$bsc Math Reviews number: $mr \n");
        # the first "MR..." token up to a comma is replaced by the
        # user's own citation key
        $entry =~ s/MR[^,]*/$citation_orig/;
        bib_print("", "$entry\n");
    }
    elsif ($matches <= $max_matches) {
        # several candidates: list them all, each with its MR number
        foreach my $entry (grep { /[a-zA-Z]/ } @entries) {
            my $mr = _extract_mr_number($entry);
            bib_print("", "$bsc Math Reviews number: $mr \n");
            bib_print("", "$entry\n");
        }
    }
    elsif ($matches < 50) {
        # too many matches found
        bib_print("More than $max_matches matches found " .
                  "($matches); " .
                  "please refine your search criteria,\n" .
                  "or use the -m option. \n\n",
                  "$bsc More than $max_matches matches found " .
                  "($matches). \n$bsc\n");
    }
    else {
        # way too many matches found
        # (open question kept from the original: can max_matches be
        # larger than 50??)
        bib_print("At least $matches matches found; " .
                  "please refine your search criteria.\n\n",
                  "$bsc At least $matches matches found. \n$bsc\n");
    }
}

close(BIBFILE);

# run bibtex again, to make use of new keys
unless ($only_cite) {
    # List form with an explicit program name: the shell is never
    # involved, so an aux file name containing spaces or shell
    # metacharacters is passed on intact.  $bibtex is split on
    # whitespace in case it carries options.
    my @bibtex_cmd = (split(' ', $bibtex), $auxfile);
    exec { $bibtex_cmd[0] } @bibtex_cmd
        or die "Couldn't run $bibtex: $!\n";
}

########## simple subroutines

# Scan the bib file for the comment line that records an earlier
# search for a citation, to see if you've looked for it before.
#   $_[0]   citation key, exactly as it appears in the aux file
# Reads the globals $bibfile (file to scan) and $bsc (comment marker).
# Returns 1 and prints a note when the key has been searched for
# before, 0 otherwise, including when the file does not exist or
# cannot be read.
# A lexical file handle is used so that the BIBFILE handle of the main
# script is left alone.
sub check_bibfile {
    my ($citation) = @_;
    return 0 unless -e $bibfile;

    # Both the marker and the key are matched literally.
    my $searched_marker = qr/\Q$bsc\E\s*citation\s*'\Q$citation\E'/;

    my $bib_in;
    unless (open($bib_in, '<', $bibfile)) {
        warn "Couldn't read $bibfile: $!\n";
        return 0;
    }
    my $seen = 0;
    while (my $entry_line = <$bib_in>) {
        if ($entry_line =~ $searched_marker) {
            $seen = 1;
            last;
        }
    }
    close($bib_in);

    print "You've searched for $citation before.\n\n" if $seen;
    return $seen;
}

# Print a message to the screen and a line to the bib file, unless
# writing to STDOUT.
#   $_[0]   text for the screen (may be empty)
#   $_[1]   text for the bib file
# Reads the globals $use_stdout and $bibfile.  When $use_stdout is set,
# only the bib-file text is printed, to STDOUT.  Otherwise the bib file
# is opened for appending, written and closed again on every call, so
# that a failed write is reported and no other handle is touched.
# Dies if the bib file cannot be opened or written.
sub bib_print {
    my ($screen_text, $file_text) = @_;

    if ($use_stdout) {
        print $file_text;
        return;
    }

    open(my $bib_out, '>>', $bibfile) or die "Couldn't open $bibfile\n";
    print $screen_text;
    print {$bib_out} $file_text;
    # buffered write errors only show up when the handle is closed
    close($bib_out) or die "Couldn't write to $bibfile: $!\n";
    return;
}
