summaryrefslogtreecommitdiff
blob: 3041a1c4a3635937ca0c201b90927f8909618123 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/perl
use strict;
use warnings;

use Lucene;
use File::Basename;

# Lucene stuff by Robin H. Johnson <robbat2@gentoo.org>


# Input file: one "File-<fileid>-<field>: <value>" record per line.
my $filename = "sample.out";

# Three-argument open so the filename is never interpreted as a mode or a
# pipe; $! is included so a failure says why the file could not be opened.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

# %rawdocs maps fileid => { field => value, ... }.
my %rawdocs;
while (my $line = <$fh>) {
    # Strip the line terminator up front so that a final line without a
    # trailing newline is parsed like every other line.
    chomp $line;

    # Only use the captures when the match succeeded: a failed match leaves
    # $1..$3 untouched, so an unchecked match would record a malformed line
    # under undefined or left-over values.
    next unless $line =~ /File-([^-]+)-([^:]+): (.*)/;

    my $fileid = $1; # numeric or "dist"
    my $field  = $2; # string, non-empty
    my $value  = $3; # string, may be empty

    # Autovivification creates the per-file hash on first use; a repeated
    # field for the same fileid overwrites the earlier value.
    $rawdocs{$fileid}{$field} = $value;
}
close($fh);


# Fields for indexing.

# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize filenames into their parts.
# I would split on "/", ".", "-" and "_" at least. Technically, using
# (\W|_|\d) as the class of split characters might be reasonable.

# Directory that holds the on-disk index.
my $index_dir = "data";

# Create the index directory if it is not there yet.  An already existing
# directory is fine; any other failure is reported here instead of surfacing
# later as an obscure error from the Lucene layer.
unless (-d $index_dir) {
    mkdir $index_dir or die "could not create index directory $index_dir: $!";
}

# Constructors are called as Class->new(...) rather than with indirect
# object syntax ("new Class(...)"), which perl can mis-parse.
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();
my $store = Lucene::Store::FSDirectory->getDirectory($index_dir, 0);
# NOTE(review): the trailing 1 is presumably the "create a new index" flag;
# confirm against the Lucene module documentation.
my $writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);

# Index writer tuning; the values are carried over unchanged.
$writer->setMergeFactor(100);
$writer->setUseCompoundFile(0);
$writer->setMaxFieldLength(2048);
$writer->setMinMergeDocs(10);
$writer->setMaxMergeDocs(100);

# Add Documents here
# Build the Lucene document for one record parsed from the input file.
#
# Arguments:
#   $distfile - distfile name; stored in the "distfile" field of every
#               document.
#   $rawdoc   - hash reference of field => value pairs for the record.
#
# Returns the populated Lucene::Document.
#
# A record with a true "isdistfile" value gets the fields origin, cat, pn,
# cpv (Text) and pv, pr, pf (Keyword) when they are present.  Any other
# record gets "path", "filename" and "directory" derived from its "name".
# The md5, sha1, mtime and size fields are added to both kinds of record
# when present.
sub createdoc {
	my ($distfile, $rawdoc) = @_;

	# Normalise the flag to 1/0; a missing or undefined value counts as 0.
	my $isdist = $rawdoc->{isdistfile} ? 1 : 0;

	my $doc = Lucene::Document->new;
	$doc->add(Lucene::Document::Field->Text("distfile", $distfile));
	$doc->add(Lucene::Document::Field->Keyword("isdistfile", $isdist));

	if ($isdist) {
		for my $f (qw(origin cat pn cpv)) {
			$doc->add(Lucene::Document::Field->Text($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
		}
		for my $f (qw(pv pr pf)) {
			$doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
		}
	} else {
		my $name = $rawdoc->{name};
		if (defined $name) {
			$doc->add(Lucene::Document::Field->Text("path", $name));
			$doc->add(Lucene::Document::Field->Text("filename", basename($name)));
			$doc->add(Lucene::Document::Field->Text("directory", dirname($name)));
		} else {
			# basename()/dirname() croak on an undefined path, which would
			# abort the whole run; index the remaining fields instead.
			warn "createdoc: record has no name, skipping path fields\n";
		}
	}

	for my $f (qw(md5 sha1 mtime size)) {
		$doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
	}
	return $doc;
}

# Name of the distfile, taken from the "dist" record.  Check with exists
# first: reading $rawdocs{dist}{name} directly would autovivify an empty
# "dist" record, which the loop below would then try to index.
my $distfile = exists $rawdocs{dist} ? $rawdocs{dist}{name} : undef;
defined $distfile
	or die "input has no File-dist-name record\n";

# Index every record, printing each file id as progress output.  The keys
# are sorted so the documents are added in a reproducible order.
foreach my $f (sort keys %rawdocs) {
	print "$f\n";
	$writer->addDocument(createdoc($distfile, $rawdocs{$f}));
}

# End of Document adding
$writer->optimize();
$writer->close;
undef $writer;