#!/usr/bin/perl
#
# Copyright (C) 2009 Jason R. Blevins
#
# Version: 2009-06-07
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

use strict;
use warnings;

# Use this to allow a buffer between the maximum known good catid and
# the point at which to begin assigning new catids to fix duplicates.
my $offset = 10000;

my %goodcats;
my %gooddups;
my %badcats;
my %baddups;
my %fixedcats;

if ($#ARGV + 1 < 2) {
    die "usage: perl catidfix.pl good.rdf.u8 bad.rdf.u8 > catids.txt\n";
}

my $goodfn = $ARGV[0];
my $badfn = $ARGV[1];

# Process both the good and bad RDFs
warn "Reading last known good RDF dump...\n";
read_rdf($goodfn, \%goodcats, \%gooddups);

# Test: %gooddups should be empty
if (scalar(keys(%gooddups)) > 0) {
    warn "Duplicates in good RDF dump:\n";
    print_dups(\%gooddups);
    die "Error: supposedly good RDF had duplicate catids.\n"
}

warn "\nReading most recent bad RDF dump...\n";
read_rdf($badfn, \%badcats, \%baddups);

# warn "Duplicates in bad RDF dump:\n";
# print_dups(\%baddups);

# Fix duplicates
fix_dups(\%goodcats, \%badcats, \%baddups, \%fixedcats);

# Print results
print_cats(\%fixedcats);

# Reads a compressed RDF dump (content.rdf.u8.gz) and returns
# lists of unique category ids and duplicate category ids.
sub read_rdf {
    my $filename = shift;
    my $catsref = shift;
    my $dupsref = shift;

    my $cat;
    my $catid;
    my $count = 0;

    open (RDF, $filename) or die $!;

    # To process gzipped files, use this insted
    #open (RDF, "gzip -dc $filename |") or die $!;

    my $line;
    while (<RDF>) {

        # Look for a category
        if (m!^<Topic r:id="(.*)">$!) {
            $cat = $1;

            # Increment counter
            $count = $count + 1;

            # Look for the catid
            $_ = <RDF>;
            if (m!^\s*<catid>([0-9]+)</catid>$!) {
                $catid = $1;
            } else {
                die "Error processing RDF: catid not found for category $cat\n";
            }

            # Is the catid unique?
            if ($catsref->{$catid}) {
                $dupsref->{$cat} = $catid;
            } else {
                $catsref->{$catid} = $cat;
            }
        }
    }

    # Move duplicate categories from %cats to %dups
    my %dupids;
    foreach (keys(%$dupsref)) {
        $dupids{$dupsref->{$_}} = 1;
    }
    foreach $catid (keys %dupids) {
        $cat = $catsref->{$catid};
        $dupsref->{$cat} = $catid;
        delete($catsref->{$catid});
    }

    # Print results
    my $numdupid = scalar(keys %dupids);
    my $numdupcats = scalar(keys %$dupsref);
    warn "Processed $count categories.\n";
    warn "Duplicate catids: $numdupid.\n";
    warn "Categories with invalid ids: $numdupcats.\n";

    close (RDF);
}

# Use both good and bad RDFs to choose unique IDs for duplicates
sub fix_dups {
    my $goodcatsref = shift;
    my $badcatsref = shift;
    my $dupsref = shift;
    my $fixedref = shift;

    my $maxid = 0;
    my $cat;
    my $catid;

    # Copy unique categories from badcatsref
    foreach $catid (keys %$badcatsref) {
        $fixedref->{$catid} = $badcatsref->{$catid};

        # Track the maximum used catid
        if ($catid > $maxid) {
            $maxid = $catid;
        }
    }

    # Determine the original category for each duplicate ID
    foreach $cat (keys %$dupsref) {
        $catid = $dupsref->{$cat};
        if ($goodcatsref->{$catid}) {
            if ($goodcatsref->{$catid} eq $cat) {
                #warn "Original catid $catid: $cat\n";
                $fixedref->{$catid} = $goodcatsref->{$catid};
                delete($dupsref->{$cat});

                # Track the maximum used catid
                if ($catid > $maxid) {
                    $maxid = $catid;
                }
            } else {
                #warn "Catid $catid not originally assigned to $cat\n";
            }
        } else {
            #warn "No original category with catid $catid found!\n";
        }
    }

    # Print the largest catid
    warn "Largest good catid in use: $maxid\n";

    # Add $offset to $maxid, just in case
    $catid = $maxid + $offset;

    # Assign unique catids for duplicates
    foreach $cat (keys %$dupsref) {
        $catid = $catid + 1;
        $fixedref->{$catid} = $cat;
    }
}

# Print ids and categories
sub print_cats {
    my $catsref = shift;
    my @ids = sort { $a <=> $b } keys %$catsref; # numeric sort
    foreach my $catid (@ids) {
        print "$catid $catsref->{$catid}\n";
    }
}

# Print duplicate ids and categories
sub print_dups {
    my $dupsref = shift;
    foreach my $cat (keys %$dupsref) {
        warn "$dupsref->{$cat} $cat\n";
    }
}