diff options
Diffstat (limited to 'distindex/read-index.pl')
-rw-r--r-- | distindex/read-index.pl | 43 |
1 files changed, 0 insertions, 43 deletions
#!/usr/bin/perl
# Reconstructed from the deleted-file diff of distindex/read-index.pl
# (git index c279750..0000000, file mode 100644).
#
# Reads an index dump and groups its "File-<id>-<field>: <value>" lines
# into one record per file id:
#
#     $documents{<id>}{<field>} = <value>
#
# Lines that do not follow that layout are reported on STDERR and skipped.
use strict;
use warnings;

my $filename = "sample.out";

# Three-argument open: the mode is explicit and can never be taken from
# the file name. $! says why the open failed.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

my %documents;
while (my $line = <$fh>) {
    # Strip the line terminator so the last line of a file that lacks a
    # trailing newline is parsed like every other line.
    chomp $line;

    # The capture variables are only meaningful after a successful match,
    # so a malformed line must be skipped rather than stored.
    unless ($line =~ /File-([^-]+)-([^:]+): (.*)/) {
        warn "skipping malformed line $. in $filename\n";
        next;
    }

    my $fileid = $1; # numeric or "dist"
    my $field  = $2; # string, non-empty
    my $value  = $3; # string, may be empty
    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";

    # Autovivification creates the per-file hash on first use, so no
    # separate "first field" branch is needed.
    $documents{$fileid}{$field} = $value;
}
close($fh);


# Fields for indexing.

# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize filenames by their parts.
# I would split on [/.-_] at least. Technically, using
# (\W|_|\d) as the class of split characters might be reasonable.