1 files changed, 170 insertions, 0 deletions
diff --git a/AntiSpoof/maintenance/generateEquivset.php b/AntiSpoof/maintenance/generateEquivset.php
new file mode 100644
index 00000000..ba539729
--- /dev/null
+++ b/AntiSpoof/maintenance/generateEquivset.php
@@ -0,0 +1,170 @@
+<?php
+
+# Resolve the install path holding maintenance/Maintenance.php: MW_INSTALL_PATH
+# when that variable is set, otherwise three directories above this one.
+$IP = getenv( 'MW_INSTALL_PATH' );
+$IP = ( $IP === false ) ? __DIR__ . '/../../..' : $IP;
+require_once "$IP/maintenance/Maintenance.php";
+
+/**
+ * Maintenance script that compiles equivset.in into equivset.txt (readable
+ * sets), equivset.php (strtr() map) and equivset.ser (serialized codepoint map).
+ */
+class GenerateEquivset extends Maintenance {
+	/**
+	 * Validate every entry of equivset.in and write the generated files beside it.
+	 * Bad entries are reported, skipped, and set the status given to error() to 1.
+	 */
+	public function execute() {
+		$dir = __DIR__;
+		$endl = "\n";
+
+		$lines = file( "$dir/equivset.in" );
+		if ( !$lines ) {
+			$this->error( "Unable to open equivset.in\n", 1 );
+		}
+
+		$setsFile = fopen( "$dir/equivset.txt", 'w' );
+		if ( !$setsFile ) {
+			$this->error( "Unable to open equivset.txt for writing\n", 1 );
+		}
+
+		fwrite( $setsFile, <<<EOT
+# This file is generated by generateEquivset.php
+# It shows sets of equivalent characters, one set per line, with characters
+# separated by whitespace. This file is not used by MediaWiki, rather it is
+# intended as a human-readable version of equivset.php, for debugging and
+# review purposes.
+
+EOT
+		);
+
+		$outputFile = fopen( "$dir/equivset.php", 'w' );
+		if ( !$outputFile ) {
+			$this->error( "Unable to open equivset.php for writing\n", 1 );
+		}
+
+		fwrite( $outputFile, "<?" . "php$endl" . <<<EOT
+# This file is generated by generateEquivset.php
+# It contains a map of characters, encoded in UTF-8, such that running strtr()
+# on a string with this map will cause confusable characters to be reduced to
+# a canonical representation. The same array is also available in serialized
+# form, in equivset.ser.
+
+EOT
+		);
+
+		$serializedFile = fopen( "$dir/equivset.ser", 'w' );
+		if ( !$serializedFile ) {
+			$this->error( "Unable to open equivset.ser for writing\n", 1 );
+		}
+
+		# \s matches \xa0 in non-unicode mode, which is not what we want
+		# So we need to make our own whitespace class
+		$sp = '[\ \t]';
+		# Entry shape: HEX CHAR => [HEX] CHAR [# comment]; built once, outside the loop
+		$entryRegex = "/^(?P<hexleft> [A-F0-9]+) $sp+ (?P<charleft> .+?) $sp+ => $sp+ (?:(?P<hexright> [A-F0-9]+) $sp+|) (?P<charright> .+?) $sp* (?: \#.*|) $ /x";
+
+		$lineNum = 0;
+		$setsByChar = array();
+		$sets = array();
+		$exitStatus = 0;
+
+		foreach ( $lines as $line ) {
+			++$lineNum;
+
+			# Whether the line ends with a null character (CRLF or a missing final newline is fine)
+			$mapToEmpty = ( substr( rtrim( $line, "\r\n" ), -1 ) === "\0" );
+
+			$line = trim( $line );
+
+			# Filter comments
+			if ( !$line || $line[0] == '#' ) {
+				continue;
+			}
+
+			# Process line
+			if ( !preg_match( $entryRegex, $line, $m ) ) {
+				$this->output( "Error: invalid entry at line $lineNum: $line\n" );
+				$exitStatus = 1;
+				continue;
+			}
+			$error = false;
+
+			if ( $mapToEmpty ) {
+				$m['charright'] = '';
+			} else {
+				if ( codepointToUtf8( hexdec( $m['hexleft'] ) ) != $m['charleft'] ) {
+					$actual = utf8ToCodepoint( $m['charleft'] );
+					if ( $actual === false ) {
+						$this->output( "Bytes: " . strlen( $m['charleft'] ) . "\n" );
+						$this->output( bin2hex( $line ) . "\n" );
+						$hexForm = bin2hex( $m['charleft'] );
+						$this->output( "Invalid UTF-8 character \"{$m['charleft']}\" ($hexForm) at line $lineNum: $line\n" );
+					} else {
+						$this->output( "Error: left number ({$m['hexleft']}) does not match left character ($actual) at line $lineNum: $line\n" );
+					}
+					$error = true;
+				}
+				if ( !empty( $m['hexright'] ) && codepointToUtf8( hexdec( $m['hexright'] ) ) != $m['charright'] ) {
+					$actual = utf8ToCodepoint( $m['charright'] );
+					if ( $actual === false ) {
+						$hexForm = bin2hex( $m['charright'] );
+						# Quote the right-hand character: it is the one that failed to decode
+						$this->output( "Invalid UTF-8 character \"{$m['charright']}\" ($hexForm) at line $lineNum: $line\n" );
+					} else {
+						$this->output( "Error: right number ({$m['hexright']}) does not match right character ($actual) at line $lineNum: $line\n" );
+					}
+					$error = true;
+				}
+				if ( $error ) {
+					$exitStatus = 1;
+					continue;
+				}
+			}
+
+			# Find the set for the right character, add a new one if necessary
+			if ( isset( $setsByChar[$m['charright']] ) ) {
+				$setName = $setsByChar[$m['charright']];
+			} else {
+				# New set
+				$setName = $m['charright'];
+				$sets[$setName] = array( $m['charright'] );
+				$setsByChar[$setName] = $setName;
+			}
+
+			# Add the left character to the set
+			$sets[$setName][] = $m['charleft'];
+			$setsByChar[$m['charleft']] = $setName;
+		}
+
+		# Sets output
+		foreach ( $sets as $members ) {
+			fwrite( $setsFile, implode( ' ', $members ) . $endl );
+		}
+
+		# Map output
+		$output = var_export( $setsByChar, true );
+		$output = str_replace( "\n", $endl, $output );
+		fwrite( $outputFile, '$equivset = ' . "$output;$endl" );
+
+		# Serialized codepoint map
+		$codepointMap = array();
+		foreach ( $setsByChar as $char => $setName ) {
+			$key = $char === '' ? '' : utf8ToCodepoint( $char );
+			$value = $setName === '' ? '' : utf8ToCodepoint( $setName );
+			$codepointMap[ $key ] = $value;
+		}
+		fwrite( $serializedFile, serialize( $codepointMap ) );
+
+		fclose( $setsFile );
+		fclose( $outputFile );
+		fclose( $serializedFile );
+
+		$this->error( $exitStatus > 0 ? 'Finished with errors' : 'Finished', $exitStatus );
+	}
+}
+
+# Publish the class name in $maintClass, then include the file named by DO_MAINTENANCE (defined outside this file).
+$maintClass = 'GenerateEquivset';
+require_once DO_MAINTENANCE;