diff options
Diffstat (limited to 'AntiSpoof/maintenance/generateEquivset.php')
-rw-r--r-- | AntiSpoof/maintenance/generateEquivset.php | 170 |
1 files changed, 170 insertions, 0 deletions
<?php
/**
 * Maintenance script that compiles equivset.in (located next to this script)
 * into three generated files in the same directory:
 *  - equivset.txt: human-readable list of equivalence sets, one per line
 *  - equivset.php: a PHP array literal mapping each character to its set name
 *  - equivset.ser: the same map keyed and valued by codepoint, serialized
 */

$IP = getenv( 'MW_INSTALL_PATH' );
if ( $IP === false ) {
	$IP = __DIR__ . '/../../..';
}
require_once( "$IP/maintenance/Maintenance.php" );

class GenerateEquivset extends Maintenance {
	/**
	 * Parse equivset.in and write equivset.txt, equivset.php and equivset.ser.
	 *
	 * Each non-comment input line must match the form
	 *   HEXLEFT CHARLEFT => [HEXRIGHT] CHARRIGHT [# comment]
	 * The hex numbers are validated against the literal characters; entries
	 * that fail validation are reported and skipped, and the script finishes
	 * with a non-zero status passed to error().
	 *
	 * NOTE(review): the code relies on $this->error( ..., 1 ) aborting the
	 * script when a file cannot be opened -- confirm against
	 * Maintenance::error().
	 */
	public function execute() {
		$dir = __DIR__;

		$endl = "\n";

		$lines = file( "$dir/equivset.in" );
		if ( !$lines ) {
			$this->error( "Unable to open equivset.in\n", 1 );
		}

		$setsFile = fopen( "$dir/equivset.txt", 'w' );
		if ( !$setsFile ) {
			$this->error( "Unable to open equivset.txt for writing\n", 1 );
		}

		fwrite( $setsFile, <<<EOT
# This file is generated by generateEquivset.php
# It shows sets of equivalent characters, one set per line, with characters
# separated by whitespace. This file is not used by MediaWiki, rather it is
# intended as a human-readable version of equivset.php, for debugging and
# review purposes.

EOT
		);

		$outputFile = fopen( "$dir/equivset.php", 'w' );
		if ( !$outputFile ) {
			$this->error( "Unable to open equivset.php for writing\n", 1 );
		}

		# The opening tag is split in two so that it is never seen as a tag here
		fwrite( $outputFile, "<?" . "php$endl" . <<<EOT
# This file is generated by generateEquivset.php
# It contains a map of characters, encoded in UTF-8, such that running strtr()
# on a string with this map will cause confusable characters to be reduced to
# a canonical representation. The same array is also available in serialized
# form, in equivset.ser.

EOT
		);

		$serializedFile = fopen( "$dir/equivset.ser", 'w' );
		if ( !$serializedFile ) {
			$this->error( "Unable to open equivset.ser for writing\n", 1 );
		}

		# \s matches \xa0 in non-unicode mode, which is not what we want
		# So we need to make our own whitespace class
		$sp = '[\ \t]';

		# Entry syntax: HEXLEFT CHARLEFT => [HEXRIGHT] CHARRIGHT [# comment]
		$pattern = "/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ (?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x";

		$lineNum = 0;
		# Map of character => name of the set it belongs to
		$setsByChar = array();
		# Map of set name => list of member characters (canonical one first)
		$sets = array();
		$exitStatus = 0;

		foreach ( $lines as $line ) {
			++$lineNum;

			# Whether the line ends with a null character. This is tested on
			# the raw line, before trim(): file() keeps the line terminator,
			# so the NUL is expected just before the final byte.
			$mapToEmpty = ( strpos( $line, "\0" ) === strlen( $line ) - 2 );

			$line = trim( $line );

			# Filter blank lines and comments. Compare against '' explicitly:
			# a truthiness test would also silently drop a line reading "0".
			if ( $line === '' || $line[0] === '#' ) {
				continue;
			}

			# Process line
			if ( !preg_match( $pattern, $line, $m ) ) {
				$this->output( "Error: invalid entry at line $lineNum: $line\n" );
				$exitStatus = 1;
				continue;
			}
			$error = false;

			if ( $mapToEmpty ) {
				# The left character maps to the empty string; no validation
				$m['charright'] = '';
			} else {
				# The left hex number must encode the left character
				if ( codepointToUtf8( hexdec( $m['hexleft'] ) ) != $m['charleft'] ) {
					$actual = utf8ToCodepoint( $m['charleft'] );
					if ( $actual === false ) {
						$this->output( "Bytes: " . strlen( $m['charleft'] ) . "\n" );
						$this->output( bin2hex( $line ) . "\n" );
						$hexForm = bin2hex( $m['charleft'] );
						$this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n" );
					} else {
						$this->output( "Error: left number ({$m['hexleft']}) does not match left character ($actual) " .
							"at line $lineNum: $line\n" );
					}
					$error = true;
				}
				# The right hex number is optional; validate it when present
				if ( !empty( $m['hexright'] ) && codepointToUtf8( hexdec( $m['hexright'] ) ) != $m['charright'] ) {
					$actual = utf8ToCodepoint( $m['charright'] );
					if ( $actual === false ) {
						$hexForm = bin2hex( $m['charright'] );
						# Report the right character here: it is the one whose
						# bytes are shown in $hexForm
						$this->output( "Invalid UTF-8 character \"{$m['charright']}\" ($hexForm) at line $lineNum: $line\n" );
					} else {
						$this->output( "Error: right number ({$m['hexright']}) does not match right character ($actual) " .
							"at line $lineNum: $line\n" );
					}
					$error = true;
				}
				if ( $error ) {
					$exitStatus = 1;
					continue;
				}
			}

			# Find the set for the right character, add a new one if necessary
			if ( isset( $setsByChar[$m['charright']] ) ) {
				$setName = $setsByChar[$m['charright']];
			} else {
				# New set, named after its canonical (right-hand) character
				$setName = $m['charright'];
				$sets[$setName] = array( $m['charright'] );
				$setsByChar[$setName] = $setName;
			}

			# Add the left character to the set
			$sets[$setName][] = $m['charleft'];
			$setsByChar[$m['charleft']] = $setName;
		}

		# Sets output
		foreach ( $sets as $members ) {
			fwrite( $setsFile, implode( ' ', $members ) . $endl );
		}

		# Map output
		$output = var_export( $setsByChar, true );
		$output = str_replace( "\n", $endl, $output );
		fwrite( $outputFile, '$equivset = ' . "$output;$endl" );

		# Serialized codepoint map; the empty string is kept as-is on both sides
		$codepointMap = array();
		foreach ( $setsByChar as $char => $setName ) {
			$key = $char === '' ? '' : utf8ToCodepoint( $char );
			$value = $setName === '' ? '' : utf8ToCodepoint( $setName );

			$codepointMap[ $key ] = $value;
		}
		fwrite( $serializedFile, serialize( $codepointMap ) );

		fclose( $setsFile );
		fclose( $outputFile );
		fclose( $serializedFile );

		$text = 'Finished';
		if ( $exitStatus > 0 ) {
			$text .= ' with errors';
		}
		$this->error( $text, $exitStatus );
	}
}

$maintClass = "GenerateEquivset";
require_once( DO_MAINTENANCE );