#!/usr/bin/perl # # Copyright (C) 2009 Jason R. Blevins # # Version: 2009-06-07 # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. use strict; use warnings; # Use this to allow a buffer between the maximum known good catid and # the point at which to begin assigning new catids to fix duplicates. my $offset = 10000; my %goodcats; my %gooddups; my %badcats; my %baddups; my %fixedcats; if ($#ARGV + 1 < 2) { die "usage: perl catidfix.pl good.rdf.u8 bad.rdf.u8 > catids.txt\n"; } my $goodfn = $ARGV[0]; my $badfn = $ARGV[1]; # Process both the good and bad RDFs warn "Reading last known good RDF dump...\n"; read_rdf($goodfn, \%goodcats, \%gooddups); # Test: %gooddups should be empty if (scalar(keys(%gooddups)) > 0) { warn "Duplicates in good RDF dump:\n"; print_dups(\%gooddups); die "Error: supposedly good RDF had duplicate catids.\n" } warn "\nReading most recent bad RDF dump...\n"; read_rdf($badfn, \%badcats, \%baddups); # warn "Duplicates in bad RDF dump:\n"; # print_dups(\%baddups); # Fix duplicates fix_dups(\%goodcats, \%badcats, \%baddups, \%fixedcats); # Print results print_cats(\%fixedcats); # Reads a compressed RDF dump (content.rdf.u8.gz) and returns # lists of unique category ids and duplicate category ids. sub read_rdf { my $filename = shift; my $catsref = shift; my $dupsref = shift; my $cat; my $catid; my $count = 0; open (RDF, $filename) or die $!; # To process gzipped files, use this insted #open (RDF, "gzip -dc $filename |") or die $!; my $line; while () { # Look for a category if (m!^$!) { $cat = $1; # Increment counter $count = $count + 1; # Look for the catid $_ = ; if (m!^\s*([0-9]+)$!) { $catid = $1; } else { die "Error processing RDF: catid not found for category $cat\n"; } # Is the catid unique? if ($catsref->{$catid}) { $dupsref->{$cat} = $catid; } else { $catsref->{$catid} = $cat; } } } # Move duplicate categories from %cats to %dups my %dupids; foreach (keys(%$dupsref)) { $dupids{$dupsref->{$_}} = 1; } foreach $catid (keys %dupids) { $cat = $catsref->{$catid}; $dupsref->{$cat} = $catid; delete($catsref->{$catid}); } # Print results my $numdupid = scalar(keys %dupids); my $numdupcats = scalar(keys %$dupsref); warn "Processed $count categories.\n"; warn "Duplicate catids: $numdupid.\n"; warn "Categories with invalid ids: $numdupcats.\n"; close (RDF); } # Use both good and bad RDFs to choose unique IDs for duplicates sub fix_dups { my $goodcatsref = shift; my $badcatsref = shift; my $dupsref = shift; my $fixedref = shift; my $maxid = 0; my $cat; my $catid; # Copy unique categories from badcatsref foreach $catid (keys %$badcatsref) { $fixedref->{$catid} = $badcatsref->{$catid}; # Track the maximum used catid if ($catid > $maxid) { $maxid = $catid; } } # Determine the original category for each duplicate ID foreach $cat (keys %$dupsref) { $catid = $dupsref->{$cat}; if ($goodcatsref->{$catid}) { if ($goodcatsref->{$catid} eq $cat) { #warn "Original catid $catid: $cat\n"; $fixedref->{$catid} = $goodcatsref->{$catid}; delete($dupsref->{$cat}); # Track the maximum used catid if ($catid > $maxid) { $maxid = $catid; } } else { #warn "Catid $catid not originally assigned to $cat\n"; } } else { #warn "No original category with catid $catid found!\n"; } } # Print the largest catid warn "Largest good catid in use: $maxid\n"; # Add $offset to $maxid, just in case $catid = $maxid + $offset; # Assign unique catids for duplicates foreach $cat (keys %$dupsref) { $catid = $catid + 1; $fixedref->{$catid} = $cat; } } # Print ids and categories sub print_cats { my $catsref = shift; my @ids = sort { $a <=> $b } keys %$catsref; # numeric sort foreach my $catid (@ids) { print "$catid $catsref->{$catid}\n"; } } # Print duplicate ids and categories sub print_dups { my $dupsref = shift; foreach my $cat (keys %$dupsref) { warn "$dupsref->{$cat} $cat\n"; } }