#!/usr/bin/env perl
# -----------------------------
#  Perl 5.8 or later required
# -----------------------------
require 5.008;

$main_info = $ARGV[0];
$infofile_encoding = ":utf8";

binmode STDOUT, $infofile_encoding;

$unit_separator = "";

$item_cnt = 0;
$section_cnt = 0;

# ------------------------------------------------------------------
# PART 1. BUILD INDEX FOR @DEFFN AND @DEFVR ITEMS
# ------------------------------------------------------------------

# (1.1)  Build index tables.

# (1.1a) Scan the *.info-* files for unit separator characters;
#        those mark the start of each texinfo node.
#        Build a hash table which associates the node name with the filename
#        and byte offset (NOT character offset) of the unit separator.
#
#        Do NOT use the indirect table + tag table (generated by makeinfo),
#        because those tables give character offsets; we want byte offsets.
#        It is easier to construct a byte offset table by hand,
#        rather than attempting to fix up the character offsets.
#        (Which are strange anyway.)

open (FH, "<" . $infofile_encoding, $main_info);
read (FH, $stuff, -s FH);

# check which version of makeinfo produced $main_info
# for purposes of bug workaround
($makeinfo_major_version, $makeinfo_minor_version) = 
    $stuff =~ /makeinfo version (\d+)\.(\d+)/;
# print STDERR "makeinfo version $makeinfo_major_version . $makeinfo_minor_version\n";

$filename = $main_info;
push @info_filenames, $filename;

while ($stuff =~ m/\G.*?(?=\n$unit_separator)/cgsm) {
    $offset = pos $stuff;

    if ($stuff =~ m/^File:.*?Node: (.*?),/csgm) {
        $node_name = $1;
        $last_node_name = $node_name;
    }

    # print ";; IN SEC 1.1a, SEARCH MAIN INFO; NODE NAME=$node_name, FILENAME=$filename, OFFSET=$offset\n";
    $node_offset{$node_name} = [($filename, int($offset))];
}

close $FH;

open (FH, "<" . $infofile_encoding, $main_info);
read (FH, $stuff, -s FH);

while ($stuff =~ m/^($main_info-\d+): (\d+)/cgsm) {
    $filename = $1;
    push @info_filenames, $filename;

    open FH2, "<" . $infofile_encoding, $filename;
    read FH2, $stuff2, -s FH2;

    while ($stuff2 =~ m/\G.*?(?=\n$unit_separator)/cgsm) {
        $offset = pos $stuff2;

        if ($stuff2 =~ m/^File:.*?Node: (.*?),/csgm) {
            $node_name = $1;
            $last_node_name = $node_name;
        }

        # print ";; IN SEC 1.1a, SEARCH SUBSIDIARY INFO; NODE NAME=$node_name, FILENAME=$filename, OFFSET=$offset\n";
        $node_offset{$node_name} = [($filename, int($offset))];
    }

    close $FH2;
}

close FH;

# (1.1b) Read the info index, which gives the node name and number of lines offset
#        for each indexed item. 

# ASSUME THAT THE INFO INDEX IS THE LAST NODE.
# (GETTING THE NODE NAME FROM THE COMMAND LINE IS PROBLEMATIC.)
$index_node_name = $last_node_name;

($index_filename, $index_node_offset) = @{$node_offset{$index_node_name}};
# print ";; IN SEC 1.1b, INDEX NODE NAME=$index_node_name, INDEX FILENAME=$index_filename, INDEX NODE OFFSET=$index_node_offset\n";

open (FH, "<" . $infofile_encoding, $index_filename);
read (FH, $stuff, -s FH);

while ($stuff =~ m/^File:.*?Node: $index_node_name/icgsm) {
    while ($stuff =~ m/\G.*?^\* (?!Menu)(\S+|[^:]+):\s+(.*?)\.\s+\(line\s+(\d+)\)/cgsm) {
        $topic_name = $1;
        $node_name = $2;
        $lines_offset = $3;
        # print ";; IN SEC 1.1b, TOPIC NAME=$topic_name, NODE NAME=$node_name, LINES OFFSET=$lines_offset\n";
        $topic_locator{$topic_name} = [($node_name, $lines_offset)];
    }
}

close FH;

# (1.2)  Translate node name and number of lines offset into file name and byte offset
#        for each indexed item.
#        Also find the length of each item.

foreach $key (sort keys %topic_locator) {
    ($node_name, $lines_offset) = @{$topic_locator{$key}};
    ($filename, $character_offset) = @{$node_offset{$node_name}};
    $byte_offset = seek_lines($filename, $character_offset, $lines_offset);

    open FH, "<" . $infofile_encoding, $filename;
    seek FH, $byte_offset, 0;
    read FH, $stuff, -s FH;
    if ($stuff =~ m/(.*?)(?:\n\n(?= -- )|\n(?=[0-9])|(?=$unit_separator))/cgsm) {
        $text_length = length $1;
    }
    else {
        # Eat everything up til end of file.
        $stuff =~ m/(.*)/cgsm;
        $text_length = length $1;
    }
    close FH;

    # print ";; IN SEC 1.2, KEY=$key, NODE NAME=$node_name, FILENAME=$filename, BYTE OFFSET=$byte_offset, TEXT LENGTH=$text_length\n";
    $topic_locator{$key} = [($node_name, $filename, $byte_offset, $text_length)];
}

# (1.3)  Generate Lisp code. The functions in info.lisp expect this stuff.

print "(in-package :cl-info)\n";

#        Pairs of the form (<index topic> . (<filename> <byte offset> <length> <node name>))

print "(let (\n";
print "(deffn-defvr-pairs '(\n";
print "; CONTENT: (<INDEX TOPIC> . (<FILENAME> <BYTE OFFSET> <LENGTH IN CHARACTERS> <NODE NAME>))\n";

foreach $key (sort keys %topic_locator) {
    $item_cnt++;
    my $sanitized_key = $key;
    $sanitized_key =~ s/"/\\"/g;
    my $file_name = $topic_locator{$key}[1];
    my $byte_offset = $topic_locator{$key}[2];
    my $nchars = $topic_locator{$key}[3];
    my $node_name = $topic_locator{$key}[0];
    if ($sanitized_key eq '' or $file_name eq '' or $byte_offset < 0 or $nchars < 0 or $node_name eq '') {
        print STDERR "build_index.pl: something seems wrong for key=\"$sanitized_key\"; emit it anyway.\n";
        print STDERR "build_index.pl: sanitized_key=\"$sanitized_key\", file_name=\"$file_name\", byte_offset=$byte_offset, nchars=$nchars, node_name=\"$node_name\"\n";
        print ";; build_index.pl: something seems wrong for this next item\n";
    }
    print "(\"$sanitized_key\" . (\"$file_name\" $byte_offset $nchars \"$node_name\"))\n";
}

print "))\n";

# ------------------------------------------------------------------
# PART 2. BUILD INDEX FOR @NODE ITEMS
# ------------------------------------------------------------------

# (2.1)  Search for 'mmm.nnn' at the start of a line,
#        and take each one of those to be the start of a node.
#
#        We could use the node table ($node_offset here), but we don't.

#        (a) The node table indexes nodes which contain only menus.
#            We don't want those because they have no useful text.
#
#        (b) The offset stated in the node table tells the location
#            of the "File: ..." header. We would have to cut off that stuff.
#
#        (c) Offsets computed by makeinfo are character offsets,
#            so we would have to convert those to byte offsets.
#            (But we have to do that anyway, so I guess there's no
#            advantage either way on that point.)

for $filename (@info_filenames) {

    open (FH, "<" . $infofile_encoding, $filename);
    read (FH, $stuff, -s FH);

    while ($stuff =~ m/\G(.*?)(?=^\d+\.\d+ .*?\n)/cgsm) {

        # Since FH was opened with $infofile_encoding,
        # pos returns a CHARACTER offset.
        $begin_node_offset = pos($stuff);

        if ($stuff =~ m/((^\d+\.\d+) (.*?)\n)/cgsm) {
            $node_title = $3;
            $node_length = length $1;
        }

        # Node text ends at a unit separator character,
        # or at the end of the file.

        if ($stuff =~ m/\G(.*?)($unit_separator)/cgsm) {
            $node_length += length $1;
        }
        else {
            $stuff =~ m/\G(.*)/csgm;
            $node_length += length $1;
        }

        $node_locator{$node_title} = [($filename, $begin_node_offset, $node_length)];
    }

    close FH;
}

# Translate character offsets to byte offsets.

foreach $node_title (sort keys %node_locator) {
    ($filename, $begin_node_offset, $node_length) = @{$node_locator{$node_title}};
    open FH, "<" . $infofile_encoding, $filename;
    read FH, $stuff, $begin_node_offset;
    my $begin_node_offset_bytes = tell FH;
    close FH;

    $node_locator{$node_title} = [($filename, $begin_node_offset_bytes, $node_length)];
}

# (2.2)  Generate Lisp code.
#
#        Pairs of the form (<node name> . (<filename> <byte offset> <length>))

print "(section-pairs '(\n";
print "; CONTENT: (<NODE NAME> . (<FILENAME> <BYTE OFFSET> <LENGTH IN CHARACTERS>))\n";

foreach $node_title (sort keys %node_locator) {
    $section_cnt++;
    ($filename, $begin_node_offset, $length) = @{$node_locator{$node_title}};
    my $sanitized_title = $node_title;
    $sanitized_title =~ s/"/\\"/g;
    if ($sanitized_title eq '' or $filename eq '' or $begin_node_offset < 0 or $length < 0) {
        print STDERR "build_index.pl: something seems wrong for title=\"$sanitized_title\"; emit it anyway.\n";
        print STDERR "build_index.pl: sanitized_title=\"$sanitized_title\", filename=\"$filename\", begin_node_offset=$begin_node_offset, length=$length\n";
        print ";; build_index.pl: something seems wrong for this next item\n";
    }
    print "(\"$sanitized_title\" . (\"$filename\" $begin_node_offset ", $length, "))\n";
}

print ")))\n";

#        Construct hashtables from the lists given above.

print "(load-info-hashtables (maxima::maxima-load-pathname-directory) deffn-defvr-pairs section-pairs))\n";

# (2.3)  Do we have any items or sections?
#
#        Warn if no index items or secions found. 

($item_cnt+$section_cnt)>0 || 
    print STDERR "WARNING: Empty index. Not sure what's going on.\n";

# ------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------

sub seek_lines {
    my ($filename, $character_offset, $lines_offset) = @_;
    open FH, "<" . $infofile_encoding, $filename;
    read FH, $stuff, $character_offset;

    # MAKEINFO BUG: LINE OFFSET IS LINE NUMBER OF LAST LINE IN FUNCTION DEFINITION
    # (BUT WE NEED THE FIRST LINE OF THE FUNCTION DEFINITION)
    #
    # EXAMPLE. THE PROBLEM IS THAT THE FUNCTION DEFINITION IS BROKEN ACROSS TWO
    # OR MORE LINES (NOT THAT THERE ARE MULTIPLE FUNCTION DEFINITIONS):
    #  -- Function: setup_autoload (<filename>, <function_1>, ...,
    #            <function_n>)
    #
    # BUG IS PRESENT IN MAKEINFO 4.8, NOT PRESENT IN MAKEINFO 5.1
    
    my $x;
    if ($makeinfo_major_version == 4) {
	$x = -1;
	my $x_maybe;

	for (1 .. $lines_offset + 1) {
	    $x_maybe = tell FH;
	    my $line = <FH>;
	    if ($line =~ /^ -- \S/) {
		$x = $x_maybe;
	    }
	}
	
	if ($x == -1) {
	    # We didn't encounter any match for "^ -- \S".
	    $x = $x_maybe;
	}
    } else {
	# VERSION WITHOUT BUG WORKAROUND,
	# FOR MAKEINFO VERSION 5
	<FH> for 1 .. $lines_offset;
	$x = tell FH;
    }

    close FH;
    return $x;
}
