author    Hans de Graaff <hans@degraaff.org>  2012-10-26 13:00:04 +0200
committer Hans de Graaff <hans@degraaff.org>  2012-10-26 13:00:04 +0200
commit    7670cccf083e25676804c503582091a3eadb00cf (patch)
tree      8aa8d9a6a760e2aa10c3d2f30df315f34c70ffc7 /lib/gorg
parent    Initial commit (diff)
Import distributed 0.6.4 release. (tag: 0.6.4)
Diffstat (limited to 'lib/gorg')
-rw-r--r--  lib/gorg/base.rb             602
-rw-r--r--  lib/gorg/cache.rb            493
-rwxr-xr-x  lib/gorg/cgi-bin/gorg.cgi     45
-rwxr-xr-x  lib/gorg/cgi-bin/search.cgi   50
-rw-r--r--  lib/gorg/cgi.rb              198
-rwxr-xr-x  lib/gorg/fcgi-bin/gorg.fcgi   61
-rw-r--r--  lib/gorg/log.rb               56
-rw-r--r--  lib/gorg/search.rb           444
-rw-r--r--  lib/gorg/www.rb              207
9 files changed, 2156 insertions, 0 deletions
diff --git a/lib/gorg/base.rb b/lib/gorg/base.rb
new file mode 100644
index 0000000..c3851a9
--- /dev/null
+++ b/lib/gorg/base.rb
@@ -0,0 +1,602 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+module Gorg
+ Version = "0.6"
+end
+
+# Some required stuff for gorg
+require 'time'
+
+require 'gorg/xsl'
+require 'gorg/log'
+require 'gorg/cache'
+require 'timeout'
+require 'cgi'
+require 'stringio'
+require 'zlib'
+require 'ipaddr'
+
+
+module Gorg
+
+ def xproc(path, params, list=false, printredirect=false)
+ # Process file through xslt passing params to the processor
+ # path should be the absolute path of the file, i.e. not relative to DocumentRoot
+ #
+ # Since 0.4, path can also be a string containing
+ # the actual xml to be processed
+ #
+ # Use default stylesheet if none can be found in the file
+ # Return a list of files read by the processor (useful to do caching) if requested
+ #
+ # Return an error condition and, hopefully, some useful output
+ # Do not raise any exception
+ # In most cases, an error will result in no output but
+ # the xslt processor can consider some errors as warnings and
+ # return the best result it could come up with along with a warning
+ # e.g. if a file used in a document() function cannot be found,
+ # the xslt processor will return some output and a warning.
+ # It's up to the caller to decide whether to use the output or b0rk
+ #
+ # The return value is an array of 2 to 4 items: [{}, "", [[]], []]
+ # 1. hash with error information, its keys are
+ # 1.a "xmlErrCode" 0 is no error, -9999 means an exception has been raised in this block (unlikely),
+ # anything else is an error code (see /usr/include/libxml2/libxml/xmlerror.h)
+ # 1.b "xmlErrLevel" again, from libxml2, 0==OK, 1==Warning, 2==Error, 3==Fatal
+  #    1.c "xmlErrMsg" again, from libxml2, some explanation about what went wrong
+ # 2. output from xsltprocessor (or error message from a raised exception)
+ # 3. list of files that the xslt processor accessed if the list was requested,
+ # paths are absolute, i.e. not relative to your docroot.
+ # Each entry is an array [access type, path] with access_type being
+ # "r" for read, "w" for written (with exsl:document) or "o" for other (ftp:// or http://)
+ # 4. array of CGI::Cookie to be sent back
+ #
+ # Examples: [{"xmlErrMsg"=>"blah warning blah", "xmlErrCode"=>1509, "xmlErrLevel"=>1}, "This is the best XSLT could do!", nil]
+ # [{"xmlErrCode"=>0}, "Result of XSLT processing. Well done!", ["/etc/xml/catalog","/var/www/localhost/htdocs/doc/en/index.xml","/var/www/localhost/htdocs/dtd/guide.dtd"]]
+
+ xsltproc = Gorg::XSL.new
+ xsltproc.xroot = $Config["root"]
+ # Grab strings from xsl:message
+ xslMessages = []
+ # Does the caller want a list of accessed files?
+ xsltproc.xtrack = list; filelist = Array.new
+ # Process .xml file with stylesheet(s) specified in file, or with default stylesheet
+ xsltproc.xml = path
+ # Look for stylesheet href (there can be more than one)
+ regexp = Regexp.new('<\?xml-stylesheet.*href="([^"]*)".*')
+ l = $Config["headXSL"] ; styles = Array.new
+ if FileTest.file?(path) then
+ # Path is indeed a file name
+ IO.foreach(path) { |line|
+ styles << $1 if regexp.match(line)
+ break if (l-=1) == 0
+ }
+ else
+ # Scan xml for stylesheet names
+ path.each { |line| styles << $1 if regexp.match(line) }
+ end
+ # Use default stylesheet if none were found in the doc
+ styles << $Config["defaultXSL"] if styles.length == 0
+ # Add params, we expect a hash of {param name => param value,...}
+ xsltproc.xparams = params
+ # Process through list of stylesheets
+ firstErr = {}
+ while xsltproc.xsl = styles.shift
+ xsltproc.process
+ filelist += xsltproc.xfiles if xsltproc.xtrack?
+ # Break and raise 301 on redirects
+ xsltproc.xmsg.each { |r|
+ if r =~ /Redirect=(.+)/ then
+ if printredirect then
+ STDERR.puts "Location: #{$1}"
+ else
+ raise Gorg::Status::MovedPermanently.new($1)
+ end
+ end
+ }
+ xslMessages += xsltproc.xmsg
+ # Remember 1st warning / error
+ firstErr = xsltproc.xerr if firstErr["xmlErrLevel"].nil? && xsltproc.xerr["xmlErrLevel"] > 0
+ # B0rk on error, an exception should have been raised by the lib, but, er, well, you never know
+ break if xsltproc.xerr["xmlErrLevel"] > 1
+ xsltproc.xml = xsltproc.xres
+ end
+ # Keep 1st warning / error if there has been one
+ firstErr = xsltproc.xerr if firstErr["xmlErrLevel"].nil?
+ # Return values
+ [ firstErr, xsltproc.xres, (filelist.uniq if xsltproc.xtrack?), xslMessages ]
+ rescue => ex
+ if ex.respond_to?(:errCode) then
+ # One of ours (Gorg::Status::HTTPStatus)
+ # Propagate exception
+ raise
+ else
+ debug "in xproc exception handler: #{ex.inspect} // #{xsltproc.xerr.inspect}"
+ # Return exception message and an error hash as expected from the xslt processor
+ # Use error codes that the xslt lib might have returned
+ [ if (xsltproc.xerr["xmlErrCode"]||-1) == 0 then
+ { "xmlErrMsg" => ex.to_s,
+ "xmlErrCode" => 9999,
+ "xmlErrLevel" => 3
+ }
+ else
+ { "xmlErrMsg" => xsltproc.xerr["xmlErrMsg"] || ex.to_s,
+ "xmlErrCode" => xsltproc.xerr["xmlErrCode"],
+ "xmlErrLevel" => xsltproc.xerr["xmlErrLevel"]
+ }
+ end ,
+ ex.to_s,
+ (filelist.uniq if xsltproc.xtrack?)
+ ]
+ end
+ end
+
+ # HTTP status codes and html output
+ module Status
+ class HTTPStatus < StandardError
+ def html(err="")
+ <<-EOR
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<HTML>
+<HEAD><TITLE>#{errSts}</TITLE></HEAD>
+<BODY>
+<H1>#{errLabel}</H1>
+<font color="#FF0000">#{err}</font>
+<HR>
+</BODY>
+</HTML>
+ EOR
+ end
+ def errSts
+ "#{errCode} #{errLabel}"
+ end
+ # Default is unknown error
+ def errLabel
+ "Undefined Error"
+ end
+ def errCode
+ 999
+ end
+ def header
+ {'Status' => errSts}
+ end
+ end
+
+ class NotModified < HTTPStatus
+ def initialize(stat)
+ # 304 needs to send ETag and Last-Modified back
+ @mstat=stat
+ end
+ def header
+ {'Last-Modified' => @mstat.mtime.httpdate.dup, 'ETag' => makeETag(@mstat).dup}.merge(super)
+ end
+ def html
+ ""
+ end
+ def errLabel
+ "Not Modified"
+ end
+ def errCode
+ 304
+ end
+ end
+
+ class MovedPermanently < HTTPStatus
+ def initialize(loc)
+ # 301 needs to send Location:
+ @location=loc
+ end
+ def errLabel
+ "Moved Permanently"
+ end
+ def errCode
+ 301
+ end
+ def header
+ {'Location' => @location}.merge(super)
+ end
+ def html
+ # RFC says "should" not "must" add a body
+ ""
+ end
+ def html301 # Not used
+ <<-EO301
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<html><head>
+<title>301 Moved Permanently</title>
+</head><body>
+<h1>Moved Permanently</h1>
+<p>The document has moved <a href="#{@location}">here</a>.</p>
+</body></html>
+ EO301
+ end
+ end
+
+ class Forbidden < HTTPStatus
+ def errLabel
+ "Forbidden"
+ end
+ def errCode
+ 403
+ end
+ end
+
+ class NotFound < HTTPStatus
+ def errLabel
+ "Not Found"
+ end
+ def errCode
+ 404
+ end
+ end
+
+ class NotAllowed < HTTPStatus
+ def errLabel
+ "Method Not Allowed"
+ end
+ def header
+ {'Allow'=>'GET,HEAD'}.merge(super)
+ end
+ def errCode
+ 405
+ end
+ end
+
+ class SysError < HTTPStatus
+ def errLabel
+ "Internal Server Error"
+ end
+ def errCode
+ 500
+ end
+ end
+ end #Status module
+
+
+ def gorgInit
+ # Initialize gorg, i.e. read config file, init cache, ...
+ # Simply build a hash of params => value in a global variable called $Config
+
+ # Set up default values
+ $Config = { "AppName" => "gorg", # Used for syslog entries, please keep 'gorg' (cannot be changed in config file)
+ "root" => nil, # No root dir by default (cgi uses DOCUMENT_ROOT from its environment)
+ "port" => 8000, # Used for stand-alone web server (WEBrick)
+ "headXSL" => 12, # Only read 12 lines in xml files to identify required stylesheets
+ "defaultXSL" => nil, # No default stylesheet, how could I guess?
+ "cacheDir" => nil, # No cache by default. Directory must exist and be writable.
+ "cacheTTL" => 0, # Number of seconds after which a document is considered too old, 0=never
+              "cacheSize" => 40,      # in MegaBytes, max size of cache, used when autocleaning
+              "zipLevel" => 2,        # Compression level used for gzip support (HTTP accept_encoding) (0-9, 0=none, 9=max)
+ "maxFiles" => 9999, # Max number of files in a single directory in the cache tree
+ "cacheTree" => 0, # Use same tree as on site in cache, 0 = disabled
+ "cacheWash" => 0, # Clean cache automatically and regularly when a store into the cache occurs. 0 = disabled
+ # gorg cleans up if random(param_value) < 10. It will only clean same dir it caches to, not whole tree.
+ # i.e. a value<=10 means at every call (not a good idea), 100 means once/10 stores, 1000 means once/100 stores
+ "logLevel" => 4, # INFO, be slightly verbose by default (messages go to syslog) OFF, FATAL, ERROR, WARN, INFO, DEBUG = 0, 1, 2, 3, 4, 5
+ "passthru" => true, # Allow return of requested file without processing it if passthru="anything but 0" is passed
+              "acceptCookies" => false, # Allow cookies in & out of transforms
+ "linkParam" => "link", # Pass pathname of requested file in 'link' param to xsl transform
+ "HTTP_HOST" => nil, # Pass host value from HTTP header to xsl transform
+ "accessLog" => "syslog",# or a filename or STDERR, used to report hits from WEBrick, not used by cgi's
+ "autoKill" => 0, # Only used by fastCGI, exit after so many requests (0 means no, <=1000 means 1000). Just in case you fear memory leaks.
+ "in/out" => [], # (In/Ex)clude files from indexing
+              "mounts" => [],         # Extra mounts for stand-alone server
+ "listen" => "127.0.0.1" # Let webrick listen on given IP
+ }
+ # Always open syslog
+ @syslog = Gorg::Log::MySyslog.new($Config["AppName"])
+ $Log = Gorg::Log::MyLog.new(@syslog, 5) # Start with max
+
+ # Check for config file
+ configf = ENV["GORG_CONF"]||"/etc/gorg/gorg.conf"
+ raise "Cannot find config file (#{configf})" unless FileTest.file?(configf) and FileTest.readable?(configf)
+ file = IO.read(configf)
+ parseConfig($Config, file)
+
+ # Init cache
+ Cache.init($Config) if $Config["cacheDir"]
+
+ # Set requested log level
+ $Log.level = $Config["logLevel"]
+ rescue
+ error("Gorg::init failed: #{$!}")
+ STDERR.puts("Gorg::init failed: #{$!}")
+ exit(1)
+ end
+
+ def scanParams(argv)
+ # Scan argv for --param paramName paramValue sequences
+ # params are removed from argv
+ # Return a hash of {"name" => "value"}
+ h = Hash.new
+ while idx = argv.index('--param')
+ break if argv.length <= idx+2 # We need at least 2 more args after --param
+ argv.delete_at(idx) # Remove --param from argv
+ name = argv.delete_at(idx) # Remove param name from argv
+ value = argv.delete_at(idx) # Remove param value from argv
+ h[name] = value # Add entry in result
+ end
+
+ h if h.length > 0
+ end
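+
+  # Example (illustrative values): with ARGV = ["--param", "style", "printable", "guide.xml"],
+  # scanParams(ARGV) returns {"style"=>"printable"} and leaves ARGV == ["guide.xml"]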
+
+ private
+ def parseConfig(h, config)
+ config.each {|line|
+ line.strip!
+ next if line.length == 0 or line[0,1] == '#' # Skip blank lines and comments
+ raise "Invalid Configuration (#{line})" unless line =~ /^([a-zA-Z_]*)\s*=\s*/
+ param = $1
+ value = $'
+ # If value starts with ' or ", it ends with a similar sign and does not accept any in the value, no escaping... We keep it simple
+ # otherwise, it ends with EOL or first space
+ if value =~ /["'](.*)['"]/ then
+ value = $1
+ end
+ value.strip!
+ raise "No value for #{param}" unless value.length > 0
+ # Check param / value (only syntactical checks here)
+ case param.downcase
+ when "root"
+ h["root"] = value
+ when "port"
+ h["port"] = value.to_i
+ when "passthru"
+ h["passthru"] = value.squeeze != "0"
+ when "acceptcookies"
+ h["acceptCookies"] = value.squeeze == "1"
+ when "linkparam"
+ if value =~ /^\s*([a-zA-Z]+)\s*$/ then
+ h["linkParam"] = $1
+ else
+ h["linkParam"] = nil
+ end
+ when "httphost"
+ hosts = value.squeeze(" ")
+ case hosts
+ when /^0?$/
+ hh = nil
+ when "*"
+ hh = ["*"]
+ else
+ hh = hosts.split(" ")
+ # Add IPs
+ hosts.split(" ").each { |ho|
+ begin
+ hh += TCPSocket.gethostbyname(ho)[3..-1] if ho != '*'
+ rescue
+ # Ignore
+ nil
+ end
+ }
+ hh.uniq!
+ end
+ h["httphost"] = hh
+ when "headxsl"
+ h["headXSL"] = value.to_i
+ when "defaultxsl"
+ h["defaultXSL"] = value
+ when "cachedir"
+ h["cacheDir"] = value
+ when "cachettl"
+ h["cacheTTL"] = value.to_i
+ when "cachesize"
+ h["cacheSize"] = value.to_i
+ when "maxfiles"
+ h["maxFiles"] = value.to_i
+ when "cachetree"
+ h["cacheTree"] = value.squeeze != "0"
+ when "ziplevel"
+ if value =~ /^\s*([0-9])\s*$/ then
+ h["zipLevel"] = $1.to_i
+ else
+ h["zipLevel"] = 2
+ end
+ when "cachewash"
+ h["cacheWash"] = value.to_i
+ when "loglevel"
+ h["logLevel"] = value.to_i
+ when "accesslog"
+ h["accessLog"] = value
+ when "autokill"
+ h["autoKill"] = value.to_i
+ when "listen"
+ begin
+ ip = IPAddr.new(value)
+ h["listen"] = ip.to_s
+ rescue
+ h["listen"] = "127.0.0.1"
+ end
+ when "dbconnect"
+ h["dbConnect"] = value
+ when "dbuser"
+ h["dbUser"] = value
+ when "dbpassword"
+ h["dbPassword"] = value
+ when "exclude"
+ h["in/out"] << [false, Regexp.new(value)]
+ when "include"
+ h["in/out"] << [true, Regexp.new(value)]
+ when "fpath_to_lang"
+ h["flang"] = Regexp.new(value)
+ when "xpath_to_lang"
+ h["xlang"] = value
+ when "mount"
+ if value =~ /^([^\s]+)\s+ON\s+(.+)$/i then
+ h["mounts"] << [$1, $2]
+ end
+ else
+ raise "Unknown parameter (#{param})"
+ end
+ }
+ rescue
+ raise "Could not parse config file: #{$!}"
+ end
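+
+  # A gorg.conf the parser above would accept (values purely illustrative):
+  #
+  #   root       = "/var/www/localhost/htdocs"
+  #   defaultXSL = "/xsl/guide.xsl"
+  #   cacheDir   = "/var/cache/gorg"
+  #   cacheTTL   = 86400
+  #   zipLevel   = 2
+  #   listen     = 127.0.0.1
+  #   mount      = /images ON /var/www/images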
+
+ # Utilities
+ def contentType(aMsg)
+ # Find the Content-Type=xxx/yyy line in aMsg
+ # from the Meta file in the cache
+ ct = nil
+ aMsg.each { |s|
+ if s =~ /^Content-Type:(.+)$/ then
+ ct = $1
+ break
+ end
+ }
+ ct
+ end
+
+ def setContentType(data)
+ # Set content-type according to x(ht)ml headers
+ charset = nil
+ if data =~ /^<\?xml .*encoding=['"](.+)['"]/i then
+ charset = $1 if $1
+ # XML / XHTML
+ if data[0..250] =~ /^<\!DOCTYPE\s+html/i then
+ # XHTML
+ ct = 'application/xhtml+xml'
+ else
+ # XML
+ ct = 'text/xml'
+ end
+ if charset then
+ ct << "; charset=#{charset}"
+ end
+ elsif data =~ /^<\!DOCTYPE\s+html\sPUBLIC\s(.+DTD XHTML)?/i then
+ # (X)HTML
+ if $1 then
+ # XHTML
+ ct = 'application/xhtml+xml'
+ else
+ # HTML
+ ct = 'text/html'
+ end
+ elsif data =~ /<html/i then
+ # HTML
+ ct = 'text/html'
+ else
+ # TXT
+ ct = 'text/plain'
+ end
+ ct
+ end
+
+ def makeCookies(aMsg)
+ # Make an array of CGI::Cookie objects
+ # msg is expected to be an array of strings like 'Set-Cookie(name)value=param'
+ # (output by the xsl transform with xsl:message)
+ cookies = Hash.new
+ aMsg.each { |s|
+ if s =~ /^Set-Cookie\(([^\)]+)\)([a-zA-Z0-9_-]+)=(.+)$/ then
+ # $1 = cookie name $2 = key $3 = value
+ if cookies.has_key?($1) then
+ cookies[$1] << "#{$2}=#{$3}"
+ else
+ cookies[$1] = ["#{$2}=#{$3}"]
+ end
+ end
+ }
+ if cookies.length > 0 then
+ # Make CGI::Cookie objects
+ cookies.map { |k,v|
+ CGI::Cookie.new('name' => k, 'value' => v, 'expires' => Time.now + 3600*24*30)
+ }
+ else
+ nil
+ end
+ end
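+
+  # Example: a stylesheet emitting <xsl:message>Set-Cookie(prefs)lang=en</xsl:message>
+  # arrives here as 'Set-Cookie(prefs)lang=en' and yields one CGI::Cookie named
+  # 'prefs' with value ['lang=en'], expiring in 30 days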
+
+ def cookies_to_params(cookies)
+ # Turn array of CGI::Cookie objects into a Hash of key=>value
+ # cookies is a hash, forget the keys,
+ # each value should be an array of strings, each string should be like 'param=value'
+ h = {}
+ cookies.values.each { |v|
+ if v.class==String and v =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then
+ h[$1] = $2
+ elsif v.class==Array then
+ v.each { |vv|
+ if vv.class==String and vv =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then
+ h[$1] = $2
+ end
+ }
+ elsif v.class==CGI::Cookie then
+ v.value.each { |vv|
+ if vv.class==String and vv =~ /^([a-zA-Z0-9_-]+)=(.+)$/ then
+ h[$1] = $2
+ end
+ }
+ end
+ }
+ h
+ rescue
+ error "Could not parse cookies (#{$!}) "
+ {}
+ end
+
+ def notModified?(fstat, etags, ifmodsince)
+ # Decide whether file has been modified according to either etag, last mod timestamp or both
+ # If both If-None-Match and If-Modified-Since request header fields are present,
+ # they have to be tested both
+ res = false
+ if fstat then
+ a = etags.to_a
+ if ifmodsince && etags then
+ res = (ifmodsince >= fstat.mtime) && (a.include?(makeETag(fstat)) || a.include?('*'))
+ elsif etags
+ res = a.include?(makeETag(fstat)) || a.include?('*')
+ elsif ifmodsince
+ res = ifmodsince >= fstat.mtime
+ end
+ end
+ # Return result
+ res
+ end
+
+ def split_header_etags(str)
+ # Split header values expected as "value1", "value2", ... into an array of strings
+ str.scan(/((?:"(?:\\.|[^"])+?"|[^",]+)+)(?:,\s*|\Z)/xn).collect{|v| v[0].strip }
+ end
+
+ def makeETag(st)
+ # Format file stat object into an ETag using its size & mtime
+ # Parameter can either be a filename or a stat object
+ st = File.stat(st) unless st.respond_to?(:ino)
+ sprintf('"%x-%x"', st.size, st.mtime.to_i)
+ end
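+
+  # e.g. a 1234-byte file with mtime 0x4f0ca3c2 gives the ETag '"4d2-4f0ca3c2"'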
+
+ def gzip(data, level)
+ gz = ""
+ io = StringIO.new(gz)
+ gzw = Zlib::GzipWriter.new(io, level)
+ gzw.write data
+ gzw.close
+ gz
+ end
+
+ def gunzip(data)
+ io = StringIO.new(data)
+ gzw = Zlib::GzipReader.new(io)
+ gunz = gzw.read
+ gzw.close
+ gunz
+ end
+
+end
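
A minimal sketch of driving xproc outside a web server; the paths and params below are hypothetical, and it assumes a valid /etc/gorg/gorg.conf for gorgInit:

    require 'gorg/base'
    include Gorg

    gorgInit
    err, body, files, msgs = xproc('/var/www/doc/en/index.xml', {'style' => 'printable'}, true)
    if err['xmlErrLevel'] > 1
      STDERR.puts err['xmlErrMsg']
    else
      puts body
      files.each { |access, path| puts "#{access} #{path}" }  # 'r'/'w'/'o' access + filename
    end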
diff --git a/lib/gorg/cache.rb b/lib/gorg/cache.rb
new file mode 100644
index 0000000..543b6a2
--- /dev/null
+++ b/lib/gorg/cache.rb
@@ -0,0 +1,493 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+# Cache a bit of data based on
+# . a path name as received by a webserver e.g.
+# . a list of parameters as received by a webserver e.g.
+# . a list of files it depends on
+
+require "parsedate"
+require "fileutils"
+require "find"
+require "digest"
+require "digest/md5"
+
+module Gorg
+
+CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."
+
+module Cache
+ def Cache.init(config)
+ @@lockfile = ".cache.cleaner.lock"
+ @cacheDir = nil
+ if FileTest.directory?(config["cacheDir"])
+ if FileTest.writable?(config["cacheDir"])
+ @cacheDir = config["cacheDir"].chomp("/")
+ else
+ warn "Cache directory not writable"
+ end
+ else
+ warn "Invalid cache directory"
+ end
+
+ # Time-To-Live in seconds, cached items older than that will be considered too old
+ @zipLevel = config["zipLevel"]
+ @zip = @zipLevel > 0 ? ".gz" : ""
+ @ttl = config["cacheTTL"]
+ @cacheTree = config["cacheTree"]
+ @maxFiles = config["maxFiles"] # Max number of files in a single directory
+ @maxSize = config["cacheSize"]*1024*1024 # Now in bytes
+ @washNumber = config["cacheWash"] # Clean cache dir after a store operation whenever rand(@washNumber) < 10
+ @lastCleanup = Time.new-8e8 # Remember last time we started a cleanup so we don't pile them up
+ end
+
+ def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
+ # objPath is typically a requested path passed from a web request but it
+ # can be just any string. It is not checked against any actual files on the file system
+ #
+ # objParam is expected to be a hash or any object whose iterator yields two values
+ #
+ # 2 filenames are built with the arguments and should give
+ # the name of a metafile and a result file
+ # if the result file is older than @ttl seconds, hit fails
+ # The metafile is then checked for dependencies
+ # It contains a list of filenames along with their size and mtime separated by ;;
+
+ # etag and ifmodsince are used in a webserver context
+ # etag is defined if an ETag was part of an If-None-Match request field
+ # etag can be an array or a single string
+ # If the current ETag of the meta file matches, no data is returned (webserver should return a 304)
+ #
+ # ifmodsince is a time object passed on an If-Modified-Since request field
+ # If the creation date of the meta file is earlier, no data is returned (webserver should return a 304)
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Reminder: filenames are full path, no need to prepend dirname
+ dirname, basename, filename, metaname = makeNames(objPath, objParam)
+
+ raise "Cache subdir does not exist" unless FileTest.directory?(dirname)
+
+ # Hit the cache
+ meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
+ raise "Empty/No meta file" if meta.nil? || meta.length < 1
+
+ fstat = File.stat(filename) if filename && FileTest.file?(filename)
+ raise "Empty/No data file" if fstat.nil?
+
+ # Check the timestamps of files in the metadata
+ meta = meta.split("\n")
+ raise "I did not write that meta file" unless CacheStamp == meta.shift
+ mline = meta.shift
+ while mline and mline !~ /^;;extra meta$/ do
+ f, s, d = mline.split(";;")
+ if s.to_i < 0
+ # File did not exist when cache entry was created
+ raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
+ else
+ # File did exist when cache entry was created, is it still there?
+ raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)
+
+ fst = File.stat(f)
+ raise "Size of #{f} has changed from #{fst.size} to #{s.to_i}" unless fst.size == s.to_i
+ raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
+ end
+ mline = meta.shift
+ end
+ if mline =~ /^;;extra meta$/ then
+ extrameta = meta.dup
+ else
+ extrameta = []
+ end
+
+ if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
+ raise Gorg::Status::NotModified.new(fstat)
+ end
+
+ file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
+ raise "Empty/No data file" if file.nil? || file.length < 1
+
+ # Is the data file too old
+ raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl
+
+ # Update atime of files, ignore failures as files might have just been removed
+ begin
+ t = Time.new
+ File.utime(t, fstat.mtime, filename)
+ File.utime(t, mstat.mtime, metaname)
+ rescue
+ nil
+ end
+
+ # If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
+ # The file is left (un)compressed, it's returned as it was stored
+ [file, fstat, extrameta]
+
+ rescue Gorg::Status::NotModified
+ # Nothing changed, should return a 304
+ debug("Client cache is up-to-date")
+ raise
+ rescue
+ # cache hit fails if anything goes wrong, no exception raised
+ debug("Cache hit on #{objPath} failed: (#{$!})")
+ nil
+ end
+
+
+ def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
+ # Store data in cache so it can be retrieved based on the objPath and objParams
+ # deps should contain a list of files that the object depends on
+    # as returned by our xsl processor, i.e. an array of [access_type, path] where
+    # access_type can be "r", "w", or "o" for respectively read, write, other.
+
+ # Define content-type
+ ct = setContentType(data)
+ extrameta << "Content-Type:#{ct}"
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Cache only if no remote objects (ftp:// or http://) in list of used files
+ if deps && deps.detect{|f| f[0] =~ /^o$/i }
+ debug "#{objPath} not cached because it needs remote resources"
+ return nil
+ end
+
+ dirname, basename, filename, metaname = makeNames(objPath, objParam)
+
+ FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)
+
+ # Write Meta file to a temp file (with .timestamp.randomNumber appended)
+ metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"
+
+ # Data might need to be just a link to another .Data file
+ # if we find another requested path with different params but
+ # with identical MD5 sums
+ # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
+ # in its name that we can hard link to.
+ # e.g. A moron hits for 10 full handbooks with toto=1..10 in the URI,
+ # we'd end up with 10 identical large copies. With links we have only one
+
+ # Old versions are expected to be cleaned up by the cacheWash() routine
+ # A Dir.glob() to find the previous ones would be too expensive
+
+ # Compute MD5 digest
+ md5 = Digest::MD5.hexdigest(data)
+
+ # Compress data if required
+ if @zipLevel > 0 then
+ bodyZ = data = gzip(data, @zipLevel)
+ else
+ bodyZ = nil
+ end
+
+ # Set mtime of data file to latest mtime of all required files
+ # so that caching can work better because mtimes will be
+ # identical on all webnodes whereas creation date of data
+ # would be different on all nodes.
+ maxmtime = Time.now-8e8
+ fstat = nil
+
+ begin
+ timeout(10){
+ File.open("#{metaname_t}", "w") {|fmeta|
+ fmeta.puts(CacheStamp)
+ # Write filename;;size;;mtime for each file in deps[]
+ deps.each {|ffe|
+ ftype = ffe[0]
+ fdep = ffe[1]
+ if FileTest.file?(fdep)
+ s = File.stat(fdep)
+ fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
+ maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
+ else
+ # A required file does not exist, use size=-1 and old timestamp
+ # so that when the file comes back, the cache notices a difference
+ # and no cache miss gets triggered as long as file does not exist
+ fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
+ end
+ }
+ fmeta.puts ";;extra meta"
+ extrameta.each { |m| fmeta.puts m }
+ }
+ # Get exclusive access to the cache directory while moving files and/or creating data files
+ File.open(dirname) { |lockd|
+ while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
+ # Timeout does not occur on a blocking lock
+ # Try a non-bloking one repeatedly for a few seconds until timeout occurs or lock is granted
+ # We are in a timeout block, remember
+ sleep 0.1
+ end
+ # Remove previous Data
+ FileUtils.rm_rf(filename)
+
+ # mv temp meta file to meta file
+ FileUtils.mv(metaname_t, metaname)
+
+ # We keep a data file for the same requested path, with different params,
+ # but which ends up with same MD5 sum, i.e. identical results because of unused params
+ linkname = "#{basename}.#{md5}#{@zip}"
+ if FileTest.file?(linkname) then
+ # Data file already there, link to it
+ File.link(linkname, filename)
+ else
+ # Write data file and set its mtime to latest of all files it depends on
+ File.open("#{filename}", "w") {|fdata| fdata.write(data)}
+ # Create link
+ File.link(filename, linkname)
+ end
+ # mtime might need to be updated, or needs to be set
+ # e.g. when a dependency had changed but result files is identical
+ # This is needed to keep Last-Modified dates consistent across web nodes
+ File.utime(Time.now, maxmtime, filename)
+ fstat = File.stat(filename)
+ }
+ }
+ ensure
+ FileUtils.rm_rf(metaname_t)
+ end
+
+ # Do we clean the cache?
+ washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10
+
+ # Return stat(datafile) even if it's just been removed by washCache
+ # because another web node might still have it or will have it.
+ # Anyway, the cached item would be regenerated on a later request
+ # and a 304 would be returned if still appropriate at the time.
+
+ # Return fstat of data file (for etag...) and zipped file
+ [fstat, bodyZ]
+
+ rescue Timeout::Error, StandardError =>ex
+ if ex.class.to_s =~ /timeout::error/i then
+ warn("Timeout in cache store operation")
+ else
+ warn("Cache store error (#{$!})")
+ end
+ # Clean up before leaving
+ FileUtils.rm_rf(filename||"")
+ FileUtils.rm_rf(metaname||"")
+ nil # return nil so that caller can act if a failed store really is a problem
+ end
+
+
+ def Cache.washCache(dirname, tmout=30, cleanTree=false)
+ # Clean cache entries that are either too old compared to TTL (in seconds)
+ # or reduce total size to maxSize (in MB)
+    # cleanTree means to descend into subdirectories instead of cleaning
+    # dirname only; unused *.Data.[md5] files that are no longer referenced
+    # (because the source changed and generated a new *.Data.[md5] file) get removed either way
+
+    # tmout is the maximum time (in seconds) spent in here
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Also ignore request if dirname not equal to @cacheDir or under it
+ return nil unless dirname[0, @cacheDir.length] == @cacheDir
+
+ # Also ignore request if dirname does not exist yet
+ return nil unless FileTest.directory?(dirname)
+
+ # Also return if less than a minute has elapsed since latest cleanup
+ t0 = Time.new
+ return nil if t0 - @lastCleanup < 60
+
+ # Remember for next time
+ @lastCleanup = t0
+
+ Dir.chdir(dirname) { |d|
+ # Recreate lock file if it's been lost
+ unless File.exist?(@@lockfile)
+ File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")}
+ end
+
+ # Grab lockfile
+ File.open(@@lockfile) { |lockf|
+ if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
+ infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
+ info(infoMsg)
+ puts infoMsg if cleanTree
+
+ timeout(tmout) {
+ totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
+ if totalSize >= 0 then
+ # Size == -1 means dir was locked, throwing an exception would have been nice :)
+ infoMsg = if cleanTree then
+ "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
+ else
+ "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
+ end
+ info(infoMsg)
+ puts infoMsg if cleanTree
+ end
+ }
+ else
+        # Locked dir, another process is busy cleaning up
+ debug("#{dirname} locked, skipping")
+ puts("#{dirname} locked, skipping") if cleanTree
+ end # of lock test
+ } # end of File.open(@@lockfile), close & release lock automatically
+ }
+ rescue Timeout::Error
+ info("Timeout while cleaning #{dirname}")
+ puts("Timeout while cleaning #{dirname}") if cleanTree
+ rescue StandardError =>ex
+ error("Error while cleaning cache: #{ex}")
+ puts("Error while cleaning cache: #{ex}") if cleanTree
+ end
+
+
+ private
+
+ def Cache.washDir(dirname, cleanTree)
+ # Clean up cache starting from dirname and in subdirectories if cleanTree is true
+ # Return [newSize in bytes, # deleted files, # scanned directories]
+ size = nDeleted = nDirectories = 0
+
+ Dir.chdir(dirname) { |d|
+ hIno = Hash.new(0) # hash of file inodes with more than one link
+ lst = Array.new # array of file names, atime, ...
+ ttl = @ttl
+ ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!
+
+ # Get list of files sorted on their dirname+atime
+ Find.find('.') { |f|
+ begin
+ unless f =~ /^\.$|#{@@lockfile}/ # ignore "." and lockfile
+ ff = File.stat(f)
+ if ff.directory? then
+ Find.prune unless cleanTree
+ elsif ff.file? and f =~ /Meta|Data/ then
+ hIno[ff.ino] = ff.nlink if ff.nlink > 1
+ # List of files has [name, atime, size, # links, inode]
+ lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
+ end
+ end
+ rescue
+ nil # File.stat can fail because file could have been deleted, ignore error
+ end
+ }
+
+ # Compute total size
+ size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }
+
+      # Delete old *.Data.[md5] files that are not being referenced anymore
+ lst.each { |a|
+ if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(.gz)?$/ then
+ # Data file with no more links pointing to it
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ size -= a[2]
+ a[3] = 0 # Mark as deleted
+ end
+ }
+
+ # Sort all files on atime
+ lst.sort!{ |a1, a2| a1[1] <=> a2[1] }
+
+ t0 = Time.new
+ # Clean until size < maxSize _AND_ atime more recent than TTL
+ lst.each { |a|
+ break if size < @maxSize and t0-a[1] < ttl
+ next if a[3] < 1 # Already deleted in previous step
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ # Total size -= file size IF last link to data
+ if a[3] == 1 || hIno[a[4]] <= 1 then
+ size -= a[2]
+ end
+ hIno[a[4]] -= 1 if hIno[a[4]] > 0
+ a[3] = 0 # Mark as deleted by setting nlinks to 0
+ }
+
+ # Remove deleted files from array
+ lst.reject! { |a| a[3] < 1 }
+
+
+ # Sort files per directory to enforce maxFiles
+ if cleanTree then
+ # Split the array in an array per directory
+ # and keep the files sorted on atime in each directory
+ slst = Hash.new
+ lst.length.times {
+ a = lst.shift
+ d = File.dirname(a[0])
+ if slst[d] then
+ slst[d] << a
+ else
+ slst[d] = [a]
+ end
+ }
+ else
+ # If not cleaning whole tree, we have only a single dir
+ slst = {"." => lst}
+ end
+
+ nDirectories = slst.length
+
+ slst.each { |d, lst|
+ # Remove oldest files so that we have less than @maxFiles in it
+ if lst.length >= @maxFiles then
+          # Remove enough to leave at most 90% of @maxFiles so we don't clean up only a handful of files repeatedly
+ (lst.length - 9*@maxFiles/10).times {
+ if a = lst.shift then
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ # Total size -= file size IF last link to data
+ if a[3] == 1 || hIno[a[4]] <= 1 then
+ size -= a[2]
+ end
+ hIno[a[4]] -= 1 if hIno[a[4]] > 0
+ end
+ }
+ end
+ }
+ } #end of chdir
+ [size, nDeleted, nDirectories]
+ end
+
+
+ def Cache.makeNames(obj, params)
+ # Build meta filename and data filename from arguments
+ #
+ # obj is broken into a path and a filename with appended params
+ # e.g. /proj/en/index.xml?style=printable becomes /proj/en and index.xml+printable+yes
+ # or .#proj#en#index.xml+printable+yes
+ # depending on cacheTree param value
+
+ # .Meta and .Data are appended respectively to the meta filename and data filename
+ # Base is the filename without appending params, e.g. .#proj#en#index.xml.Data
+ if @cacheTree then
+ # Use a path and a file
+ dir = "#{@cacheDir}#{File.dirname(obj)}"
+ base = f = File.basename(obj)
+ else
+ # Convert full path into a single filename
+ dir = @cacheDir
+ base = f = ".#{obj.gsub(/\//,'#')}"
+ end
+
+ f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
+ # Remove funky chars and squeeze duplicates into single chars
+ f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")
+
+ # Return names for Data and Meta files, and just the filepath (e.g. #proj#en#index.xml)
+ [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
+ end
+end
+
+end
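
The cache is driven with a hit-then-store protocol. A hedged sketch of the intended flow (error handling elided; etags and ifmodsince are hypothetical request values):

    path, params = '/doc/en/index.xml', {'style' => 'printable'}
    begin
      body, fstat, extrameta = Gorg::Cache.hit(path, params, etags, ifmodsince) || []
    rescue Gorg::Status::NotModified
      # client copy is current -> reply 304
    end
    unless body
      err, body, files, meta = xproc($Config['root'] + path, params, true)
      fstat, bodyZ = Gorg::Cache.store(body, path, params, files, meta)
    end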
diff --git a/lib/gorg/cgi-bin/gorg.cgi b/lib/gorg/cgi-bin/gorg.cgi
new file mode 100755
index 0000000..3c75dbc
--- /dev/null
+++ b/lib/gorg/cgi-bin/gorg.cgi
@@ -0,0 +1,45 @@
+#! /usr/bin/ruby
+
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+require 'cgi'
+
+require 'gorg/cgi'
+
+if ARGV.length == 1 and ['-F', '--filter'].include?(ARGV[0]) then
+  # The cgi does not accept any params, unlike gorg;
+  # only test for -F or --filter being there and nothing else
+ do_Filter unless STDIN.tty?
+else
+ # Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT
+ class CGI
+ public :env_table
+ end
+
+ include Gorg
+
+ # Config file is named in env var. GORG_CONF, or possibly REDIRECT_GORG_CONF
+ ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"]
+
+ gorgInit
+ STDERR.close
+
+ cgi = CGI.new
+ do_CGI(cgi)
+end
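
Besides CGI operation, the -F/--filter switch reads one XML document from stdin and writes the transform result to stdout. A hypothetical driver from Ruby (assumes gorg.cgi is on the PATH and a usable config):

    xml    = File.read('guide.xml')
    result = IO.popen('gorg.cgi --filter', 'r+') { |io|
      io.write(xml)
      io.close_write
      io.read
    }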
diff --git a/lib/gorg/cgi-bin/search.cgi b/lib/gorg/cgi-bin/search.cgi
new file mode 100755
index 0000000..396001e
--- /dev/null
+++ b/lib/gorg/cgi-bin/search.cgi
@@ -0,0 +1,50 @@
+#! /usr/bin/ruby
+
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+require 'cgi'
+require 'gorg/search'
+
+# Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT
+class CGI
+ public :env_table
+end
+
+include Gorg
+
+# Config file is named in env var. GORG_CONF, or possibly REDIRECT_GORG_CONF
+# ENV["PATH"] is used as a dirty, hackish workaround for a limitation of
+# webrick's cgi handler: environment variables can't be passed to cgi's
+# (REDIRECT_)GORG_CONF should be defined when running cgi's under apache
+ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"]||ENV["PATH"]
+
+gorgInit
+cgi = CGI.new
+
+# Params
+#
+# l = language code (no param defaults to en, an empty param defaults to any)
+# q = query string
+# p = page number in search result (0 < p < 1e6)
+# s = page size (9 < s <= 120)
+# b = boolean search (y|Y|1 means yes, anything else no)
+
+gs = GDig::GSearch.new
+gs.do_CGI(cgi)
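
For instance, a request such as /cgi-bin/search.cgi?q=portage&l=en&p=2&s=20 would return the second page of English results for "portage", twenty hits per page (an illustrative URL, using the parameters documented above).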
diff --git a/lib/gorg/cgi.rb b/lib/gorg/cgi.rb
new file mode 100644
index 0000000..dfe8451
--- /dev/null
+++ b/lib/gorg/cgi.rb
@@ -0,0 +1,198 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# Process CGI request, either from cgi or fcgi
+
+require "gorg/base"
+
+module Gorg
+ def do_Filter(tmout=30, params=nil)
+ # Read STDIN, transform, spit result out
+ timeout(tmout) {
+ # Give it a few seconds to read it all, then timeout
+ xml = STDIN.read
+ err, body, filelist = xproc(xml, params, false, true)
+ if err["xmlErrLevel"] > 0 then
+ STDERR.puts("#{err.collect{|e|e.join(':')}.join("\n")}")
+ elsif (body||"").length < 1 then
+ # Some transforms can yield empty content
+ STDERR.puts("Empty body")
+ else
+ STDOUT.puts(body)
+ end
+ }
+ rescue Timeout::Error, StandardError =>ex
+ # Just spew it out
+ STDERR.puts(ex)
+ end
+
+ def do_CGI(cgi)
+ header = Hash.new
+ if cgi.path_info.nil? || cgi.env_table["REQUEST_URI"].index("/#{File.basename($0)}/")
+ # Sorry, I'm not supposed to be called directly, e.g. /cgi-bin/gorg.cgi/bullshit_from_smartass_skriptbaby
+ raise Gorg::Status::Forbidden
+ elsif cgi.request_method == "OPTIONS"
+ cgi.out('Allow'=>'GET,HEAD'){""}
+ elsif cgi.request_method == "HEAD" or cgi.request_method == "GET"
+ # lighttp is b0rked despite what they say :(
+ # PATH_INFO == "" and PATH_TRANSLATED == nil
+ if cgi.path_info.length > 0 then
+ # Apache, or any web browser that works
+ path_info = cgi.path_info
+ else
+ # lighttp, use SCRIPT_NAME instead
+ path_info = cgi.env_table['SCRIPT_NAME']
+ end
+ query = Hash.new
+ cgi.params.each{ |p, v| query[p] = v.to_s}
+ # Get DOCUMENT_ROOT from environment
+ $Config["root"] = cgi.env_table['DOCUMENT_ROOT']
+
+ xml_file = cgi.path_translated||(cgi.env_table['DOCUMENT_ROOT']+cgi.env_table['SCRIPT_NAME'])
+ if not FileTest.file?(xml_file)
+ # Should have been checked by apache, check anyway
+ raise Gorg::Status::NotFound
+ else
+ # Process request
+ # Parse If-None-Match and If-Modified-Since request header fields if any
+ inm=ims=nil
+ begin
+ inm = split_header_etags(cgi.env_table['HTTP_IF_NONE_MATCH']) if cgi.env_table['HTTP_IF_NONE_MATCH']
+ ims = Time.parse(cgi.env_table['HTTP_IF_MODIFIED_SINCE']) if cgi.env_table['HTTP_IF_MODIFIED_SINCE']
+ ims = nil if ims > Time.now # Dates later than current must be ignored
+ rescue
+ # Just ignore ill-formated data
+ nil
+ end
+ if $Config['passthru'] && query["passthru"] && query["passthru"] != "0" then
+ # passthru allowed by config and requested by visitor, return file as text/plain
+ debug("Passthru granted for #{path_info}")
+ mstat = File.stat(xml_file)
+ raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims)
+ body = IO.read(xml_file)
+ header['type'] = 'text/plain'
+ # If client accepts gzip encoding and we support it, return gzipped file
+ if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then
+ body = gzip(body, $Config["zipLevel"])
+ header['Content-Encoding'] = "gzip"
+ header['Vary'] = "Accept-Encoding"
+ end
+ else
+ # Get cookies and add them to the parameters
+ if $Config["acceptCookies"] then
+ # Add cookies to our params
+ query.merge!(cookies_to_params(cgi.cookies))
+ end
+
+ if $Config["httphost"] then
+ # Add HTTP_HOST to stylesheet params
+ query["httphost"] = if $Config["httphost"][0] == '*' then
+ cgi.host||""
+ elsif $Config["httphost"].include?('*') then
+ $Config["httphost"][0]
+ elsif $Config["httphost"].include?(cgi.host) then
+ $Config["httphost"][0]
+ else
+ cgi.host||""
+ end
+ end
+
+ xml_query = query.dup # xml_query==params passed to the XSL, query=>metadata in cache
+ if $Config["linkParam"] then
+ xml_query[$Config["linkParam"]] = path_info
+ end
+
+ bodyZ = nil # Compressed version
+ body, mstat, extrameta = Cache.hit(path_info, query, inm, ims)
+ if body.nil? then
+ # Cache miss, process file and cache result
+ err, body, filelist, extrameta = xproc(xml_file, xml_query, true)
+ if err["xmlErrLevel"] > 0 then
+ raise "#{err.collect{|e|e.join(':')}.join('<br/>')}"
+ elsif (body||"").length < 1 then
+ # Some transforms can yield empty content (handbook?part=9&chap=99)
+ # Consider this a 404
+ raise Gorg::Status::NotFound
+ else
+ # Cache the output if all was OK
+ mstat, bodyZ = Cache.store(body, path_info, query, filelist, extrameta)
+ debug("Cached #{path_info}, mstat=#{mstat.inspect}")
+ # Check inm & ims again as they might match if another web node had
+ # previously delivered the same data
+ if notModified?(mstat, inm, ims) and extrameta.join !~ /set-cookie/i
+ raise Gorg::Status::NotModified.new(mstat)
+ end
+ end
+ else
+ if $Config["zipLevel"] > 0 then
+ bodyZ = body
+ body = nil
+ end
+ end
+ # If client accepts gzip encoding and we support it, return gzipped file
+ if bodyZ and $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then
+ body = bodyZ
+ header['Content-Encoding'] = "gzip"
+ header['Vary'] = "Accept-Encoding"
+ else
+ unless body then
+ # We need to unzip bodyZ into body, i.e. we cached zipped data but client does not support gzip
+ body = gunzip(bodyZ)
+ end
+ end
+ # Add cookies to http header
+ cookies = makeCookies(extrameta)
+ if cookies then
+ header['cookie'] = cookies
+ end
+ # Add Content-Type to header
+ ct = contentType(extrameta)
+ if ct then
+ # Turn application/xhtml+xml into text/html if browser does not accept it
+ if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then
+ header['type'] = "text/html#{$1}"
+ else
+ header['type'] = ct
+ end
+ else
+ header['type'] = 'text/plain'
+ end
+ end
+ # Add ETag & Last-Modified http headers
+ # NB: it's simply mstat(file.xml) when passthru=1
+ if mstat then
+ header['ETag'] = makeETag(mstat)
+ header['Last-Modified'] = mstat.mtime.httpdate
+ end
+ end
+ cgi.out(header){body}
+ else # Not a HEAD or GET
+ raise Gorg::Status::NotAllowed
+ end
+ rescue => ex
+ if ex.respond_to?(:errCode) then
+ # One of ours (Gorg::Status::HTTPStatus)
+ cgi.out(ex.header){ex.html}
+ else
+ # Some ruby exceptions occurred, make it a 500
+ syserr = Gorg::Status::SysError.new
+ cgi.out('Status'=>syserr.errSts){syserr.html(ex)}
+ error("do_CGI() failed: #{$!}")
+ end
+ end
+end
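
The conditional-GET handling above reduces to a four-step exchange, sketched here with a hypothetical ETag:

    # 1. first reply carries  ETag: "4d2-4f0ca3c2"  and a Last-Modified date
    # 2. the client revalidates with  If-None-Match: "4d2-4f0ca3c2"
    # 3. notModified?(mstat, inm, ims) is true, so Gorg::Status::NotModified is raised
    # 4. the rescue clause sends the exception's header (Status 304, ETag,
    #    Last-Modified) with an empty body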
diff --git a/lib/gorg/fcgi-bin/gorg.fcgi b/lib/gorg/fcgi-bin/gorg.fcgi
new file mode 100755
index 0000000..1f81cf2
--- /dev/null
+++ b/lib/gorg/fcgi-bin/gorg.fcgi
@@ -0,0 +1,61 @@
+#! /usr/bin/ruby
+
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+require 'cgi'
+require 'fcgi'
+
+# Overload read_from_cmdline to avoid crashing when request method
+# is neither GET/HEAD/POST. Default behaviour is to read input from
+# STDIN. Not really useful when your webserver gets OPTIONS / :-(
+class CGI
+ module QueryExtension
+ def read_from_cmdline
+ ''
+ end
+ end
+end
+
+
+require 'gorg/cgi'
+
+include Gorg
+
+gorgInit
+STDERR.close
+
+# Should I commit suicide after a while, life can be so boring!
+ak47 = $Config["autoKill"]||0
+
+countReq = 0; t0 = Time.new
+# Process CGI requests sent by the fastCGI engine
+FCGI.each_cgi do |cgi|
+ countReq += 1
+ do_CGI(cgi)
+ # Is it time to leave?
+ # If maximum number of requests has been exceeded _AND_ at least 1 full minute has gone by
+ if ak47 > 0 && countReq >= ak47 && Time.new - t0 > 60 then
+ info("Autokill : #{countReq} requests have been processed in #{Time.new-t0} seconds")
+ Process.kill("USR1",$$)
+ else
+ # Garbage Collect regularly to help keep memory
+ # footprint low enough without costing too much time.
+ GC.start if countReq%50==0
+ end
+end
diff --git a/lib/gorg/log.rb b/lib/gorg/log.rb
new file mode 100644
index 0000000..4ef05d6
--- /dev/null
+++ b/lib/gorg/log.rb
@@ -0,0 +1,56 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# Write logging info for our little gorg
+
+require 'syslog'
+require 'webrick/log'
+
+module Gorg
+ # Make log functions available as if we were inside a log instance
+ # If no $Log global variable has been initialized, do nothing
+ def fatal(msg) $Log.fatal(msg) if $Log; end
+ def error(msg) $Log.error(msg) if $Log; end
+ def warn(msg) $Log.warn(msg) if $Log; end
+ def info(msg) $Log.info(msg) if $Log; end
+ def debug(msg) $Log.debug(msg) if $Log; end
+
+ module Log
+
+ class MyLog < WEBrick::BasicLog
+ # Interface to WEBrick log system
+ # Not much to add at this time ;-)
+ end
+
+ class MySyslog
+ # Interface to syslog
+ def initialize(appname)
+ # Open syslog if not already done (only one open is allowed)
+ @@syslog = Syslog.open(appname) unless defined?(@@syslog)
+ # Make sure messages get through (WEBrick has its own filter)
+ @@syslog.mask = Syslog::LOG_UPTO(Syslog::LOG_ERR)
+ end
+
+ def <<(str)
+ # WEBrick's logging requires the << method
+ # Just forward string to syslog
+ @@syslog.err(str)
+ end
+ end
+ end
+end
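
Wiring the two classes together mirrors what gorgInit does; the level 4 (INFO) here is illustrative:

    syslog = Gorg::Log::MySyslog.new('gorg')   # opens syslog once per process
    $Log   = Gorg::Log::MyLog.new(syslog, 4)   # WEBrick::BasicLog writing via <<
    include Gorg
    info('gorg is up')                         # forwarded to syslog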
diff --git a/lib/gorg/search.rb b/lib/gorg/search.rb
new file mode 100644
index 0000000..c90448a
--- /dev/null
+++ b/lib/gorg/search.rb
@@ -0,0 +1,444 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+require 'dbi'
+require 'yaml'
+require 'gorg/base'
+require 'cgi'
+
+module GDig
+ class GFile
+
+ def initialize(root, f, xlang)
+ @root = root
+ @fname = f
+ @xpath2lang = xlang
+ end
+
+ def txt
+ unless @txt then
+ @txt, @lang = txtifyFile
+ end
+ @txt
+ end
+
+ def lang
+ unless @lang then
+ @txt, @lang = txtifyFile
+ end
+ @lang
+ end
+
+ private
+
+ def txtifyFile
+ x=Gorg::XSL.new
+ x.xsl = <<EOXSL
+<?xml version="1.0" encoding="UTF-8"?>
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:output encoding="UTF-8" method="text" indent="no"/>
+ <xsl:template match="/">
+EOXSL
+ if (@xpath2lang||"").length > 1 then
+ x.xsl << <<EOXSL
+ <xsl:if test="#{@xpath2lang}">
+ <xsl:value-of select="concat('%%LANG%%', #{@xpath2lang}, '%%&#x0A;')"/>
+ </xsl:if>
+EOXSL
+ end
+ x.xsl << <<EOXSL
+ <xsl:apply-templates/>
+ </xsl:template>
+ <xsl:template match="*">
+ <xsl:apply-templates select="@*"/>
+ <xsl:apply-templates/>
+ </xsl:template>
+ <xsl:template match="@*">
+ <xsl:value-of select="concat(' ',.,' ')"/>
+ </xsl:template>
+ </xsl:stylesheet>
+EOXSL
+ x.xroot = @root
+ x.xml = @fname
+ x.process
+
+ if x.xerr and x.xerr["xmlErrLevel"] >= 3 then
+ raise x.xerr["xmlErrMsg"]
+ end
+
+ t = x.xres
+ if t =~ /^%%LANG%%([^%]+)%%/ then
+ l = $1
+ t = $'.strip
+ else
+ l = nil
+ end
+ t << @fname
+ [t.squeeze("\n"), l]
+ end
+ end
+
+ class DBFile
+ attr_reader :fid, :webname
+ def initialize(dbh, webname, localname)
+ @dbh = dbh
+ @webname = webname
+ @localname = localname
+ @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where path = ?", webname)
+ if @row then
+ @fid = @row['id']
+ else
+ @fid = nil
+ end
+ end
+
+ def DBFile.remove(dbh, fid)
+ if fid then
+ dbh.do("delete from files where id=#{fid}")
+ end
+ end
+
+ def uptodate?
+ if @fid then
+ unless @row then
+ @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where id=#{@fid}")
+ end
+ if (fstat=File.stat(@localname)) and @row then
+ @row['timestamp']==fstat.mtime.to_s and @row['size']==fstat.size
+ else
+ false
+ end
+ end
+ end
+
+ def update(blob, lang)
+ fstat=File.stat(@localname)
+ if @fid then
+ # update
+ sql = "update files set lang = ?, txt = ?, timestamp = ?, size = ? where id=#{@fid}"
+ @dbh.do(sql, lang, blob, fstat.mtime.to_s, fstat.size)
+ else
+ # insert new one
+ sql = "insert into files (path, lang, txt, timestamp, size) values (?, ?, ?, ?, ?)"
+ @dbh.do(sql, webname, lang, blob, fstat.mtime.to_s, fstat.size)
+ if id=@dbh.select_one("select last_insert_id()") then
+ @fid = id[0]
+ else
+ @fid = nil
+ end
+ end
+ end
+ end
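+
+  # The statements above assume a MySQL table shaped roughly like this
+  # (reconstructed from the queries; no schema is shipped with the release):
+  #
+  #   CREATE TABLE files (
+  #     id        INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  #     path      VARCHAR(255) NOT NULL,
+  #     lang      VARCHAR(16),
+  #     txt       MEDIUMTEXT,
+  #     timestamp VARCHAR(32),
+  #     size      INT,
+  #     FULLTEXT  (txt)
+  #   ) ENGINE=MyISAM;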
+
+ class GSearch
+ attr_reader :dbh, :searchTxt, :searchResult
+ include Gorg
+
+ def initialize
+ @dbh = DBI.connect($Config['dbConnect'], $Config['dbUser'], $Config['dbPassword'])
+ @dbh['AutoCommit'] = true
+ end
+
+ def indexDir
+ wipe = false
+ scanDir { |webName, localName|
+ begin
+ dbf = GDig::DBFile.new(@dbh, webName, localName)
+ unless dbf.uptodate? then
+ gf = GFile.new($Config['root'], webName, $Config['xlang'])
+ blob = gf.txt
+ lang = gf.lang
+ if (lang||"").length < 1 then
+ # No lang attribute, see if we can use the filename
+ if $Config['flang'] and $Config['flang'].match(webName) then
+ lang = $Config['flang'].match(webName)[1]
+ end
+ end
+ dbf.update(blob, lang)
+ wipe = true
+ debug "#{Time.new.to_i} #{webName} indexed"
+ end
+ rescue Exception => e
+ error "Failed to index #{webName} : #{e.to_s}"
+ end
+ }
+ wipeSearches if wipe
+ end
+
+ def cleanup
+ # Remove files from db either because
+ # they should now be excluded or because they do not exist anymore
+ wipe = false
+ @dbh.select_all('select id, path from files') { |row|
+ if not fileMatch(row[1]) or not File.file?($Config['root']+row[1]) then
+ DBFile.remove(@dbh, row[0])
+ debug "GDig::GSearch: #{row[1]} removed"
+ wipe = true
+ end
+ }
+ wipeSearches if wipe
+ end
+
+ def do_CGI(cgi)
+ $Config["root"] = cgi.env_table['DOCUMENT_ROOT']||$Config["root"]
+ query = {}
+ # Get cookies
+ if $Config["acceptCookies"] then
+ # Add cookies to our params
+ query = cookies_to_params(cgi.cookies)
+ end
+ # Add URI params that are not used by search engine (p,q,l,s)
+ cgi.params.each{ |p, v| query[p] = v.to_s}
+
+ # Choose language
+ if cgi.has_key?("l") then
+ lang = cgi["l"]
+ elsif query.has_key?("SL") then
+ lang = query["SL"]
+ else
+ lang = nil
+ end
+
+ # Perform search
+ search(cgi["q"], lang)
+
+ if cgi.has_key?("p") and cgi["p"] =~ /^[0-9]{1,5}$/ then
+ p = cgi["p"].to_i
+ else
+ p = 1
+ end
+
+ if cgi.has_key?("s") and cgi["s"] =~ /^[0-9]{2,3}$/ then
+ s = cgi["s"].to_i
+ elsif query.has_key?("PL") and query["PL"] =~ /^[0-9]{2,3}$/ then
+ s = query["PL"].to_i
+ else
+ s = 20
+ end
+ s = 120 if s > 120
+
+ xml = xmlResult(p,s)
+ header = {}; body = ""
+ if cgi.has_key?("passthru") and $Config["passthru"] then
+ header = {'type' => 'text/plain'}
+ body = xml
+ else
+ if $Config["linkParam"] then
+ query[$Config["linkParam"]] = cgi.script_name
+ end
+ if $Config["httphost"] then
+ # Add HTTP_HOST to stylesheet params
+ query["httphost"] = if $Config["httphost"][0] == '*' then
+ cgi.host||""
+ elsif $Config["httphost"].include?('*') then
+ $Config["httphost"][0]
+ elsif $Config["httphost"].include?(cgi.host) then
+ $Config["httphost"][0]
+ else
+ cgi.host
+ end
+ end
+
+ err, body, filelist, extra = xproc(xml, query, false)
+ if err["xmlErrLevel"] > 0 then
+ raise "#{err.collect{|e|e.join(':')}.join('<br/>')}"
+ end
+ cookies = makeCookies(extra)
+ ct = setContentType(body)
+ # Turn application/xhtml+xml into text/html if browser does not accept it
+ if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then
+ header = {'type' => "text/html#{$1}"}
+ else
+ header = {'type' => ct}
+ end
+
+ # Add cookies to http header
+ if cookies then
+ header['cookie'] = cookies
+ end
+ end
+ # If client accepts gzip encoding and we support it, return gzipped file
+ if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then
+ body = gzip(body, $Config["zipLevel"])
+ header['Content-Encoding'] = "gzip"
+ header['Vary'] = "Accept-Encoding"
+ end
+ cgi.out(header){body}
+ rescue => ex
+ syserr = Gorg::Status::SysError.new
+ cgi.out('Status'=>syserr.errSts){syserr.html(ex)}
+ error("GSearch::do_CGI() failed: #{$!}")
+ end
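+
+    # Example requests handled above (illustrative values):
+    #   search.cgi?q=portage&l=en&p=2&s=50  -> English hits, page 2, 50 per page
+    #   search.cgi?q=portage&passthru=1     -> raw XML result, if enabled in config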
+
+ def search(str, lang)
+ @searchTxt = str
+ @searchResult = nil
+ if (lang||"") == "" then
+ @searchLang = '%'
+ else
+ @searchLang = lang
+ end
+ if str =~ /(^|\s)(([+<)(>~-][^+<)(>~-]+)|([^+<)(>~-]+\*))(\s|$)/ then
+ @searchBool = "Y"
+ boolClause = "in boolean mode"
+ else
+ @searchBool = "N"
+ boolClause = ""
+ end
+ if @searchTxt.length > 0 then
+ @searchResult = loadSearch
+ unless @searchResult then
+ @searchResult = []
+ # Perform full text search
+ sql = <<EOSQL
+select id, path, lang, match (txt) against ( ? ) as score
+from files
+where lang like ? and match (txt) against ( ? #{boolClause} )
+order by score desc
+EOSQL
+ @dbh.select_all(sql, @searchTxt, @searchLang, @searchTxt).each { |r| @searchResult << [r[0],r[1],r[2],r[3]] }
+ saveSearch
+ end
+ end
+ @searchResult
+ end
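+
+    # Note: search() switches to MySQL boolean mode when the query uses
+    # operators or a trailing wildcard, e.g. "+gentoo -bsd" or "instal*";
+    # plain queries such as "gentoo install" use natural-language mode.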
+
+ def xmlResult(page=1, pageLength=25)
+ # <search page="p" pages="n">
+ # <for>search string</for>
+ # <found link="/path/to/file.xml" lang="fr">
+ # blah blah <b>word2</b> bleh
+ # </found>
+ pageLength = 20 if pageLength < 1
+ xml = "<?xml version='1.0' encoding='UTF-8'?>\n\n"
+
+ if @searchResult and @searchResult.length >= 1 then
+ removeDeadFiles
+ nPages = @searchResult.length / pageLength #/
+ nPages += 1 unless 0 == @searchResult.length.modulo(pageLength)
+ page = nPages if page > nPages
+ page = 1 if page < 1
+
+ xml << "<search page='#{page}' pages='#{nPages}' pageLength='#{pageLength}' lang='#{xmlEscape(@searchLang)}' bool='#{@searchBool}'>\n"
+ xml << xmlSearchFor
+ @searchResult[(page-1)*pageLength..page*pageLength-1].each { |r|
+ xml << " <found link='#{r[1]}' lang='#{r[2]}' score='#{r[3]}'>\n"
+ xml << xmlBlobSample(r[0]) << "\n"
+ xml << " </found>\n"
+ }
+ else
+ xml << "<search page='0' pages='0'>\n"
+ xml << xmlSearchFor
+ end
+ xml << "</search>\n"
+ end
+
+ def scanDir
+ Dir.chdir($Config['root']) {
+ `find -L . -type f`.split("\n").each{ |localFile|
+ if File.file?(localFile) then
+ webFile = localFile[1..-1]
+ if fileMatch(webFile) then
+ yield [webFile, File.expand_path(localFile)]
+ end
+ end
+ }
+ }
+ end
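+
+    # e.g. with $Config['root'] == "/var/www" and a file ./doc/index.xml on
+    # disk, scanDir yields ["/doc/index.xml", "/var/www/doc/index.xml"]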
+
+ private
+
+ def xmlBlobSample(fileID)
+ blob = ""
+ r = @dbh.select_one("select txt from files where id = #{fileID}")
+ if r then
+ blob = r[0]
+ # Find first matching word and extract some text around it
+ stxt = @searchTxt.tr('`.,\'"\-_+~<>/?;:[]{}+|\\)(*&^%\$\#@!', ' ').split(' ')
+ regs = stxt.collect { |w| Regexp.new(w, true, 'U') }
+ ix = nil
+      regs.each { |re| break if (ix = blob.index(re)) }
+ if ix then
+ if ix < 80 then
+ x = 0
+ else
+ x = blob[0,ix-60].rindex(/[ ,\.]/)
+ x = 0 unless x
+ end
+ y = blob.index(/[,\. ]/, ix+80)
+ y = -1 unless y
+ blob = xmlEscape(blob[x..y])
+ # Mark up sought words
+        regs.each { |re| blob.gsub!(re){|t| "<b>#{t}</b>"} }
+ else
+        # No match: show the beginning of the text, cut at the first
+        # word boundary past 120 characters (or everything if shorter)
+        x = blob.index(/[ ,\.]/, 120) || -1
+        blob = xmlEscape(blob[0..x])
+ end
+ end
+ blob
+ end
+
+ def xmlEscape(str)
+ if str
+ str.gsub('&','&amp;').gsub('>','&gt;').gsub('<','&lt;')
+ else
+        # Nothing to escape, return an empty string
+        ""
+ end
+ end
+
+ def loadSearch
+ if @searchTxt then
+ r = @dbh.select_one("select result from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool)
+ if r then
+ YAML::load(r[0])
+ end
+ end
+ end
+
+ def saveSearch
+ if @searchTxt then
+ @dbh.do("delete from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool)
+ @dbh.do("insert into savedsearches (words, lang, bool, result) values(?, ?, ?, ?)", @searchTxt, @searchLang, @searchBool, @searchResult.to_yaml)
+ end
+ end
+
+ def wipeSearches
+ @dbh.do("delete from savedsearches")
+ end
+
+ def fileMatch(f)
+ $Config['in/out'].each { |inout|
+ return inout[0] if inout[1].match(f)
+ }
+ false
+ end
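+
+    # $Config['in/out'] is expected to hold [flag, regexp] pairs; the flag of
+    # the first pattern matching the path decides, and no match means "skip"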
+
+ def removeDeadFiles
+ if @searchResult then
+ @searchResult.reject!{ |r| not File.file?($Config['root']+r[1]) }
+ end
+ end
+
+ def xmlSearchFor
+ " <for>#{xmlEscape(@searchTxt)}</for>\n" if @searchTxt
+ end
+
+ end
+
+end
diff --git a/lib/gorg/www.rb b/lib/gorg/www.rb
new file mode 100644
index 0000000..eb0c8fa
--- /dev/null
+++ b/lib/gorg/www.rb
@@ -0,0 +1,207 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# Run the stand-alone webserver and serve gentoo.org
+
+require 'gorg/base'
+require 'webrick'
+require 'cgi'
+
+class GentooServlet < WEBrick::HTTPServlet::FileHandler
+ include Gorg
+
+ def do_GET(req, res)
+ hit = "#{$Config["root"]}#{req.path}"
+ cacheName = req.path
+ if FileTest.directory?(hit) and FileTest.exist?(hit+"/index.xml") then
+ # Use $URI/index.xml for directories that have an index.xml file
+ hit << "/index.xml"
+ cacheName << "/index.xml"
+ end
+ hit.squeeze!('/')
+ cacheName.squeeze!('/')
+ if FileTest.directory?(hit) then
+ super # Use default FileHandler for directories that have no index.xml
+ else
+      if hit !~ /\.(xml|rdf|rss)$/ then
+ super # Use default FileHandler if not an xml file
+ else
+ if not FileTest.exist?(hit) then
+ super # Use default FileHandler to handle 404 (file does not exist)
+ else
+ # Parse If-None-Match and If-Modified-Since request header fields if any
+ ims=inm=nil
+ begin
+ ims = Time.parse(req['if-modified-since']) if req['if-modified-since']
+ inm = split_header_etags(req['if-none-match']) if req['if-none-match']
+ rescue
+            # Just ignore ill-formatted data
+ nil
+ end
+ begin
+ res['Charset'] = 'UTF-8'
+ # Process xml file or return xml file if passthru=1
+ if $Config['passthru'] && req.query && req.query["passthru"] && req.query["passthru"] != "0" then
+ # passthru allowed by config and requested by visitor, return file as text/plain
+ mstat = File.stat(hit)
+ raise Gorg::Status::NotModified.new(mstat) if notModified?(mstat, inm, ims)
+ debug("Passthru granted for #{hit}")
+ body = IO.read(hit)
+ # If client accepts gzip encoding and we support it, return gzipped file
+ if $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then
+ res.body = gzip(body, $Config["zipLevel"])
+ res['Content-Encoding'] = "gzip"
+ res['Vary'] = "Accept-Encoding"
+ else
+ res.body = body
+ end
+ res['Content-Type'] = 'text/plain'
+ else
+ query_params = req.query.dup
+ # Get cookies and add them to the parameters
+ if $Config["acceptCookies"] then
+              # We need CGI::Cookie objects to be compatible with our cgi modules (stupid WEBrick)
+              ck = req.raw_header.find{|l| l =~ /^cookie: /i}
+              if ck then
+                # $' holds whatever followed "cookie: " in the header matched above
+                query_params.merge!(cookies_to_params(CGI::Cookie.parse($'.strip)))
+ debug "query params are " + query_params.inspect
+ end
+ end
+ if $Config["httphost"] then
+          # Add HTTP_HOST to stylesheet params; '*' as first entry passes the
+          # request host through, otherwise the canonical (first) entry is preferred
+ query_params["httphost"] = if $Config["httphost"][0] == '*' then
+ req.host||""
+ elsif $Config["httphost"].include?('*') then
+ $Config["httphost"][0]
+ elsif $Config["httphost"].include?(req.host) then
+ $Config["httphost"][0]
+ else
+ req.host||""
+ end
+ end
+
+ bodyZ = nil
+ body, mstat, extrameta = Gorg::Cache.hit(cacheName, query_params, inm, ims)
+ if body.nil? then
+ xml_query = query_params.dup
+ if $Config["linkParam"] then
+ xml_query[$Config["linkParam"]] = req.path
+ end
+ # Cache miss, process file and cache result
+ err, body, filelist, extrameta = xproc(hit, xml_query, true)
+ warn("#{err.collect{|e|e.join(':')}.join('; ')}") if err["xmlErrLevel"] == 1
+ error("#{err.collect{|e|e.join(':')}.join('; ')}") if err["xmlErrLevel"] > 1
+ # Display error message if any, just like the cgi/fcgi versions
+ raise ("#{err.collect{|e|e.join(':')}.join('<br/>')}") if err["xmlErrLevel"] > 0
+ # Cache output
+ mstat, bodyZ = Gorg::Cache.store(body, cacheName, query_params, filelist, extrameta)
+ else
+ if $Config["zipLevel"] > 0 then
+ bodyZ = body
+ body = nil
+ end
+ end
+ # If client accepts gzip encoding and we support it, return gzipped file
+ if bodyZ and $Config["zipLevel"] > 0 and (req.accept_encoding.include?("gzip") or req.accept_encoding.include?("x-gzip")) then
+ res.body = bodyZ
+ res['Content-Encoding'] = "gzip"
+ res['Vary'] = "Accept-Encoding"
+ else
+ if body then
+ res.body = body
+ else
+ # We need to unzip bodyZ into body, i.e. we cached zipped data but client does not support gzip
+ res.body = gunzip(bodyZ)
+ end
+ end
+ # Add cookies to http header
+ cookies = makeCookies(extrameta)
+ if cookies then
+ cookies.each{|c| res.cookies << c.to_s}
+ end
+ # Add Content-Type to header
+ ct = contentType(extrameta).split(';')[0]
+ if ct then
+ # Turn application/xhtml+xml into text/html if browser does not accept it
+ if req.accept.to_s !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then
+ res['Content-Type'] = "text/html#{$1}"
+ else
+ res['Content-Type'] = ct
+ end
+ else
+ res['Content-Type'] = 'text/plain'
+ end
+ end
+ if mstat then
+ res['ETag'] = makeETag(mstat)
+ res['Last-Modified'] = mstat.mtime.httpdate
+ end
+ rescue => ex
+ if ex.respond_to?(:errCode) then
+ # One of ours (Gorg::Status::HTTPStatus)
+ res.body = ex.html
+ res.status = ex.errCode
+ ex.header.each {|k,v| res[k]=v unless k =~ /status|cookie/i}
+ else
+ # Some ruby exceptions occurred, make it a syserr
+ syserr = Gorg::Status::SysError.new
+ res.body = syserr.html(ex)
+ res.status = syserr.errCode
+ end
+ end
+ end
+ end
+ end
+ end
+end
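+
+# Note: the servlet above only post-processes .xml/.rdf/.rss files;
+# directories without an index.xml, other file types and missing files
+# all fall through to the standard WEBrick FileHandler via super.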
+
+###
+#|# Start Here
+###
+
+def www
+ # Log accesses to either stderr, syslog or a file
+ if $Config["accessLog"] == "syslog"
+    # Use syslog again, with our own format based on the default but without timestamp
+ access_log = [ [ @syslog, "HIT %h \"%r\" %s %b" ] ]
+ STDERR.close
+ elsif $Config["accessLog"] == "stderr"
+    # Log to stderr, use our own format based on the default but without timestamp
+ access_log = [ [ STDERR, "HIT %h \"%r\" %s %b" ] ]
+ else
+ # Open file and use it, if it's not writable, tough!
+ access_log_stream = File.open($Config["accessLog"], "a")
+ access_log = [ [ access_log_stream, WEBrick::AccessLog::COMBINED_LOG_FORMAT ] ]
+ STDERR.close
+ end
+
+ s = WEBrick::HTTPServer.new( :BindAddress => $Config["listen"], :AccessLog=>access_log, :Logger => $Log, :Port => $Config["port"], :CGIPathEnv => ENV["GORG_CONF"])
+
+ # Mount directories
+ $Config["mounts"].each { |m|
+ s.mount(m[0], WEBrick::HTTPServlet::FileHandler, m[1])
+ }
+ s.mount("/", GentooServlet, $Config["root"])
+
+ # Start server
+ trap("INT"){ s.shutdown }
+
+ puts "\n\nStarting the Gorg web server on #{$Config['listen']}:#{$Config['port']}\n\nHit Ctrl-C or type \"kill #{$$}\" to stop it\n\n"
+
+ s.start
+end
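+
+# Illustrative startup (assumption: the gorg launcher loads the configuration
+# into $Config and sets up $Log before calling www):
+#
+#   require 'gorg/www'
+#   www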