#!/GWD/bioinfo/apps/bin/perl # # Purpose: Used as shim between GoogieSpell and "aspell". # Cleans up the incoming text for aspell (Removes TWiki words, HTML, URL's, TML Variables and Macros) # Then converts the aspell suggestions into the expected Google XML # # Author: Craig Meyer Oct 31, 2006 # use strict; use FileHandle; BEGIN { use vars qw( $CONTENT $ASPELL_CMD $XML_HDR $SPELLRESULT $CHANGE %UnEscape $WikiWordRegex $DEBUG $TEMP_FILE $DEBUG_FILE ); $DEBUG = 0; # set to 1 for more information in /tmp/Debug_spell_$$ # $DEBUG = 1; # This turns on debug $ASPELL_CMD = qq[/opt/TWWfsw/TWW/bin/aspell --pipe --mode=none --sug-mode=normal]; $CONTENT = qq[Content-Type: text/xml\n]; $XML_HDR = qq[]; $SPELLRESULT = qq[]; $CHANGE = qq[%s]; %UnEscape = ( LT => '<', GT => '>', AMP => '&', QUOT => qq["], ); $TEMP_FILE = "/tmp/Googie_Spell_$$.txt"; $DEBUG_FILE = "/tmp/Googie_Debug_$$.txt"; # Would use definition in TWiki.pm, but then we load too much code $WikiWordRegex = qr/[A-Z]+[a-z]+[A-Z]+[A-Za-z0-9]*/o; } # BEGIN sub untaintUnchecked { my ( $string ) = @_; if ( defined( $string) && $string =~ /^(.*)$/ ) { return $1; } return $string; # Can't happen. } sub UnEscape { my($text) = @_; $text =~ s/&([a-zA-Z]+);/ defined($UnEscape{uc($1)}) ? $UnEscape{uc($1)} : '?????' /ge; # too much # $text =~ s/%([A-Fa-f0-9]{2})/ chr(hex($1)) /ge; return( $text ); } # UnEscape sub make_choices { my($str) = @_; return( join("\t", split(/, /, $str)) ); } # make_choices sub Main { my($text, $xml, %form, $debug); $ENV{PATH} = untaintUnchecked($ENV{PATH}); # Needed for running in taint mode if( $DEBUG > 0 ){ $debug = new FileHandle(">$DEBUG_FILE") or die "Can't open debug\n"; print $debug 'StartTime: ', `date`, "\n"; } read (STDIN, $xml, $ENV{'CONTENT_LENGTH'}); if( $DEBUG > 0 ){ print $debug "Received: [$xml]\n"; } if( $xml =~ m/(.*)<\/text>/s ){ $text = $1; } else { print STDERR "Unable to parse [$xml]\n"; exit(0); } # # Remove any HTML $text = UnEscape($text); $text =~ s/(<[^>]+>)/ ' ' x length($1)/ges; # Remove URL's $text =~ s/(https?:\/\/.*?\s)/ ' ' x length($1)/ges; # Remove %TWIKIVARIABLE% $text =~ s/(%[A-Z]+?%)/ ' ' x length($1)/ges; # Remove %TWIKIMACROS{ $text =~ s/(%[A-Z]+?{)/ ' ' x length($1)/ges; # Remove WikiWords $text =~ s/($WikiWordRegex)/ ' ' x length($1)/ges; $text =~ s/twiki/ /gsi; # Remove all TWiki's my($hout); $hout = new FileHandle(">$TEMP_FILE") or die "Can't open tmp [$TEMP_FILE]\n"; print $hout "!\n"; # puts aspell into Terse mode foreach ( split(/[\r\n]+/, $text) ){ # escape each line, so leading characters don't confuse aspell print $hout '^', $_, "\n"; } $hout->close(); if( $DEBUG > 0 ){ print $debug "Raw text: [$text]\n"; } my($hin, $offset, $nlines, @lines, $word, $word_len, $choices, $i); $hin = new FileHandle("cat $TEMP_FILE | $ASPELL_CMD |") or die "Can't open pipe\n"; $nlines = <$hin>; # Eat 1st line header @lines = <$hin>; # read in everything $hin->close(); # put out Headers; Content-Header, XML_HDR, and spellresult print STDOUT $CONTENT, "\n", $XML_HDR, "\n", sprintf($SPELLRESULT, length($text)) ,"\n"; $offset = 0; $nlines = scalar(@lines); for($i=0; $i < $nlines; $i++ ){ if( $lines[$i] =~ m/^& ([^ ]+) \d+ \d+: (.*)$/ ){ $word = $1; $word_len = length($word); $choices = make_choices($2); $offset = index($text, $word, $offset); print STDOUT sprintf($CHANGE, $offset, $word_len, 1, $choices), "\n"; print $debug $lines[$i], sprintf($CHANGE, $offset, $word_len, 1, $choices), "\n" if( $DEBUG > 0 ); $offset += $word_len; } elsif( $DEBUG > 0 ){ print $debug "*** UnExpected aspell line [$lines[$i]]\n"; } } print STDOUT "\n"; $debug->close() if( $DEBUG > 0); unlink($TEMP_FILE) if( $DEBUG <= 0 ); } # Main &Main();