summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'AntiSpoof/maintenance/generateEquivset.php')
-rw-r--r-- AntiSpoof/maintenance/generateEquivset.php 170
1 files changed, 170 insertions, 0 deletions
diff --git a/AntiSpoof/maintenance/generateEquivset.php b/AntiSpoof/maintenance/generateEquivset.php
new file mode 100644
index 00000000..ba539729
--- /dev/null
+++ b/AntiSpoof/maintenance/generateEquivset.php
@@ -0,0 +1,170 @@
+<?php
+
+# Resolve the install path: use MW_INSTALL_PATH when it is set, otherwise
+# fall back to three directory levels above this script's directory.
+$IP = getenv( 'MW_INSTALL_PATH' );
+$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;
+require_once "$IP/maintenance/Maintenance.php";
+
+/** Maintenance script that regenerates equivset.txt, equivset.php and equivset.ser from equivset.in. */
+class GenerateEquivset extends Maintenance {
+	/**
+	 * Parse equivset.in (in this script's directory) and write the three generated
+	 * files next to it. Invalid entries are reported and skipped; they make the
+	 * status passed to the final error() call non-zero but do not stop generation.
+	 */
+	public function execute() {
+		$dir = __DIR__;
+		$endl = "\n";
+
+		$lines = file( "$dir/equivset.in" );
+		if ( !$lines ) {
+			$this->error( "Unable to open equivset.in\n", 1 );
+		}
+
+		$setsFile = fopen( "$dir/equivset.txt", 'w' );
+		if ( !$setsFile ) {
+			$this->error( "Unable to open equivset.txt for writing\n", 1 );
+		}
+
+		fwrite( $setsFile, <<<EOT
+# This file is generated by generateEquivset.php
+# It shows sets of equivalent characters, one set per line, with characters
+# separated by whitespace. This file is not used by MediaWiki, rather it is
+# intended as a human-readable version of equivset.php, for debugging and
+# review purposes.
+
+EOT
+		);
+
+		$outputFile = fopen( "$dir/equivset.php", 'w' );
+		if ( !$outputFile ) {
+			$this->error( "Unable to open equivset.php for writing\n", 1 );
+		}
+
+		fwrite( $outputFile, "<?" . "php$endl" . <<<EOT
+# This file is generated by generateEquivset.php
+# It contains a map of characters, encoded in UTF-8, such that running strtr()
+# on a string with this map will cause confusable characters to be reduced to
+# a canonical representation. The same array is also available in serialized
+# form, in equivset.ser.
+
+EOT
+		);
+
+		$serializedFile = fopen( "$dir/equivset.ser", 'w' );
+		if ( !$serializedFile ) {
+			$this->error( "Unable to open equivset.ser for writing\n", 1 );
+		}
+
+		# \s matches \xa0 in non-unicode mode, which is not what we want
+		# So we need to make our own whitespace class
+		$sp = '[\ \t]';
+
+		$lineNum = 0;
+		$setsByChar = array(); # character => name of the set it belongs to
+		$sets = array(); # set name => list of member characters
+		$exitStatus = 0;
+
+		foreach ( $lines as $line ) {
+			++$lineNum;
+
+			# A trailing NUL (checked before trim(), which strips it) means "map to empty"
+			$mapToEmpty = ( substr( rtrim( $line, "\r\n" ), -1 ) === "\0" );
+			$line = trim( $line );
+
+			# Filter blank lines and comments (strictly, so a bare "0" is not treated as blank)
+			if ( $line === '' || $line[0] === '#' ) {
+				continue;
+			}
+
+			# Process line
+			if ( !preg_match(
+				"/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ (?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x", $line, $m ) )
+			{
+				$this->output( "Error: invalid entry at line $lineNum: $line\n" );
+				$exitStatus = 1;
+				continue;
+			}
+			$error = false;
+			if ( $mapToEmpty ) {
+				$m['charright'] = '';
+			} else {
+				# Strict !== so numeric strings such as "1" and "01" are not treated as equal
+				if ( codepointToUtf8( hexdec( $m['hexleft'] ) ) !== $m['charleft'] ) {
+					$actual = utf8ToCodepoint( $m['charleft'] );
+					if ( $actual === false ) {
+						$this->output( "Bytes: " . strlen( $m['charleft'] ) . "\n" );
+						$this->output( bin2hex( $line ) . "\n" );
+						$hexForm = bin2hex( $m['charleft'] );
+						$this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n" );
+					} else {
+						$this->output( "Error: left number ({$m['hexleft']}) does not match left character ($actual) " .
+							"at line $lineNum: $line\n" );
+					}
+					$error = true;
+				}
+				if ( !empty( $m['hexright'] ) && codepointToUtf8( hexdec( $m['hexright'] ) ) !== $m['charright'] ) {
+					$actual = utf8ToCodepoint( $m['charright'] );
+					if ( $actual === false ) {
+						$hexForm = bin2hex( $m['charright'] );
+						# Quote the right-hand character: it is the one that failed to decode
+						$this->output( "Invalid UTF-8 character \"{$m['charright']}\" ($hexForm) at line $lineNum: $line\n" );
+					} else {
+						$this->output( "Error: right number ({$m['hexright']}) does not match right character ($actual) " .
+							"at line $lineNum: $line\n" );
+					}
+					$error = true;
+				}
+				if ( $error ) {
+					$exitStatus = 1;
+					continue;
+				}
+			}
+
+			# Find the set for the right character, add a new one if necessary
+			if ( isset( $setsByChar[$m['charright']] ) ) {
+				$setName = $setsByChar[$m['charright']];
+			} else {
+				# New set
+				$setName = $m['charright'];
+				$sets[$setName] = array( $m['charright'] );
+				$setsByChar[$setName] = $setName;
+			}
+
+			# Add the left character to the set
+			$sets[$setName][] = $m['charleft'];
+			$setsByChar[$m['charleft']] = $setName;
+		}
+
+		# Sets output
+		foreach ( $sets as $members ) {
+			fwrite( $setsFile, implode( ' ', $members ) . $endl );
+		}
+
+		# Map output
+		$output = var_export( $setsByChar, true );
+		$output = str_replace( "\n", $endl, $output );
+		fwrite( $outputFile, '$equivset = ' . "$output;$endl" );
+
+		# Serialized codepoint map
+		$codepointMap = array();
+		foreach ( $setsByChar as $char => $setName ) {
+			$key = $char === '' ? '' : utf8ToCodepoint( $char );
+			$value = $setName === '' ? '' : utf8ToCodepoint( $setName );
+			$codepointMap[ $key ] = $value;
+		}
+		fwrite( $serializedFile, serialize( $codepointMap ) );
+
+		fclose( $setsFile );
+		fclose( $outputFile );
+		fclose( $serializedFile );
+
+		$text = $exitStatus > 0 ? 'Finished with errors' : 'Finished';
+		$this->error( $text, $exitStatus );
+	}
+}
+
+$maintClass = 'GenerateEquivset'; # presumably read by the file named by DO_MAINTENANCE (confirm)
+require_once DO_MAINTENANCE;
+