Diffstat (limited to 'lib/gorg/cache.rb')
-rw-r--r--  lib/gorg/cache.rb | 493
1 files changed, 493 insertions, 0 deletions
diff --git a/lib/gorg/cache.rb b/lib/gorg/cache.rb
new file mode 100644
index 0000000..543b6a2
--- /dev/null
+++ b/lib/gorg/cache.rb
@@ -0,0 +1,493 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+# Cache a bit of data based on
+# . a path name as received by a webserver, e.g. /proj/en/index.xml
+# . a list of parameters as received by a webserver, e.g. printable=yes
+# . a list of files the cached data depends on
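+#
+# Illustrative usage sketch (assumed, not part of the original code; the
+# config values are made-up examples of the keys read by Cache.init below):
+#
+#   Gorg::Cache.init("cacheDir"  => "/var/cache/gorg",
+#                    "zipLevel"  => 2,      # gzip level, 0 disables compression
+#                    "cacheTTL"  => 86400,  # seconds, 0 means no expiry
+#                    "cacheTree" => true,   # mirror the document tree on disk
+#                    "maxFiles"  => 1000,   # max files per cache directory
+#                    "cacheSize" => 200,    # MB
+#                    "cacheWash" => 1000)   # wash after ~1% of store operations
+#
+#   if (hit = Gorg::Cache.hit("/proj/en/index.xml", {"printable" => "yes"}))
+#     data, fstat, extrameta = hit
+#   else
+#     data = render_document()  # hypothetical generator
+#     Gorg::Cache.store(data, "/proj/en/index.xml", {"printable" => "yes"})
+#   end
+#
+#   (Cache.hit raises Gorg::Status::NotModified when the client cache is up-to-date)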
+
+require "parsedate"
+require "fileutils"
+require "find"
+require "digest"
+require "digest/md5"
+
+module Gorg
+
+CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."
+
+module Cache
+ def Cache.init(config)
+ @@lockfile = ".cache.cleaner.lock"
+ @cacheDir = nil
+ if FileTest.directory?(config["cacheDir"])
+ if FileTest.writable?(config["cacheDir"])
+ @cacheDir = config["cacheDir"].chomp("/")
+ else
+ warn "Cache directory not writable"
+ end
+ else
+ warn "Invalid cache directory"
+ end
+
+    @zipLevel = config["zipLevel"]
+    @zip = @zipLevel > 0 ? ".gz" : ""
+    # Time-To-Live in seconds, cached items older than that will be considered too old
+    @ttl = config["cacheTTL"]
+ @cacheTree = config["cacheTree"]
+ @maxFiles = config["maxFiles"] # Max number of files in a single directory
+ @maxSize = config["cacheSize"]*1024*1024 # Now in bytes
+ @washNumber = config["cacheWash"] # Clean cache dir after a store operation whenever rand(@washNumber) < 10
+ @lastCleanup = Time.new-8e8 # Remember last time we started a cleanup so we don't pile them up
+ end
+
+ def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
+ # objPath is typically a requested path passed from a web request but it
+ # can be just any string. It is not checked against any actual files on the file system
+ #
+ # objParam is expected to be a hash or any object whose iterator yields two values
+ #
+ # 2 filenames are built with the arguments and should give
+ # the name of a metafile and a result file
+ # if the result file is older than @ttl seconds, hit fails
+ # The metafile is then checked for dependencies
+ # It contains a list of filenames along with their size and mtime separated by ;;
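+    # e.g. a dependency line, as written by Cache.store below (path and values are made up):
+    #   /var/www/doc/en/index.xml;;4321;;Thu Nov 11 11:11:11 UTC 2004;;r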
+
+    # etags and ifmodsince are used in a webserver context
+    # etags is defined if an ETag was part of an If-None-Match request field
+    # etags can be an array or a single string
+    # If the current ETag of the data file matches, no data is returned (webserver should return a 304)
+    #
+    # ifmodsince is a Time object passed on an If-Modified-Since request field
+    # If the data file has not been modified since then, no data is returned (webserver should return a 304)
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+    # Reminder: filenames are full paths, no need to prepend dirname
+ dirname, basename, filename, metaname = makeNames(objPath, objParam)
+
+ raise "Cache subdir does not exist" unless FileTest.directory?(dirname)
+
+ # Hit the cache
+ meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
+ raise "Empty/No meta file" if meta.nil? || meta.length < 1
+
+ fstat = File.stat(filename) if filename && FileTest.file?(filename)
+ raise "Empty/No data file" if fstat.nil?
+
+ # Check the timestamps of files in the metadata
+ meta = meta.split("\n")
+ raise "I did not write that meta file" unless CacheStamp == meta.shift
+ mline = meta.shift
+ while mline and mline !~ /^;;extra meta$/ do
+ f, s, d = mline.split(";;")
+ if s.to_i < 0
+ # File did not exist when cache entry was created
+ raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
+ else
+ # File did exist when cache entry was created, is it still there?
+ raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)
+
+ fst = File.stat(f)
+        raise "Size of #{f} has changed from #{s.to_i} to #{fst.size}" unless fst.size == s.to_i
+ raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
+ end
+ mline = meta.shift
+ end
+ if mline =~ /^;;extra meta$/ then
+ extrameta = meta.dup
+ else
+ extrameta = []
+ end
+
+ if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
+ raise Gorg::Status::NotModified.new(fstat)
+ end
+
+ file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
+ raise "Empty/No data file" if file.nil? || file.length < 1
+
+    # Is the data file too old?
+ raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl
+
+ # Update atime of files, ignore failures as files might have just been removed
+ begin
+ t = Time.new
+ File.utime(t, fstat.mtime, filename)
+ File.utime(t, mstat.mtime, metaname)
+ rescue
+ nil
+ end
+
+ # If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
+ # The file is left (un)compressed, it's returned as it was stored
+ [file, fstat, extrameta]
+
+ rescue Gorg::Status::NotModified
+ # Nothing changed, should return a 304
+ debug("Client cache is up-to-date")
+ raise
+ rescue
+ # cache hit fails if anything goes wrong, no exception raised
+ debug("Cache hit on #{objPath} failed: (#{$!})")
+ nil
+ end
+
+
+ def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
+ # Store data in cache so it can be retrieved based on the objPath and objParams
+    # deps should contain a list of files that the object depends on,
+    # as returned by our xsl processor, i.e. an array of [access_type, path] where
+    # access_type can be "r", "w", or "o" for respectively read, write, other.
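+    #
+    # e.g. (illustrative paths):
+    #   deps = [["r", "/var/www/doc/en/index.xml"], ["r", "/var/www/xsl/guide.xsl"]]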
+
+ # Define content-type
+ ct = setContentType(data)
+ extrameta << "Content-Type:#{ct}"
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Cache only if no remote objects (ftp:// or http://) in list of used files
+ if deps && deps.detect{|f| f[0] =~ /^o$/i }
+ debug "#{objPath} not cached because it needs remote resources"
+ return nil
+ end
+
+ dirname, basename, filename, metaname = makeNames(objPath, objParam)
+
+ FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)
+
+ # Write Meta file to a temp file (with .timestamp.randomNumber appended)
+ metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"
+
+ # Data might need to be just a link to another .Data file
+ # if we find another requested path with different params but
+ # with identical MD5 sums
+ # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
+ # in its name that we can hard link to.
+ # e.g. A moron hits for 10 full handbooks with toto=1..10 in the URI,
+ # we'd end up with 10 identical large copies. With links we have only one
+
+ # Old versions are expected to be cleaned up by the cacheWash() routine
+ # A Dir.glob() to find the previous ones would be too expensive
+
+ # Compute MD5 digest
+ md5 = Digest::MD5.hexdigest(data)
+
+ # Compress data if required
+ if @zipLevel > 0 then
+ bodyZ = data = gzip(data, @zipLevel)
+ else
+ bodyZ = nil
+ end
+
+ # Set mtime of data file to latest mtime of all required files
+ # so that caching can work better because mtimes will be
+ # identical on all webnodes whereas creation date of data
+ # would be different on all nodes.
+ maxmtime = Time.now-8e8
+ fstat = nil
+
+ begin
+ timeout(10){
+ File.open("#{metaname_t}", "w") {|fmeta|
+ fmeta.puts(CacheStamp)
+ # Write filename;;size;;mtime for each file in deps[]
+ deps.each {|ffe|
+ ftype = ffe[0]
+ fdep = ffe[1]
+ if FileTest.file?(fdep)
+ s = File.stat(fdep)
+ fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
+ maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
+ else
+ # A required file does not exist, use size=-1 and old timestamp
+ # so that when the file comes back, the cache notices a difference
+ # and no cache miss gets triggered as long as file does not exist
+ fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
+ end
+ }
+ fmeta.puts ";;extra meta"
+ extrameta.each { |m| fmeta.puts m }
+ }
+ # Get exclusive access to the cache directory while moving files and/or creating data files
+ File.open(dirname) { |lockd|
+ while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
+ # Timeout does not occur on a blocking lock
+          # Try a non-blocking one repeatedly for a few seconds until timeout occurs or lock is granted
+ # We are in a timeout block, remember
+ sleep 0.1
+ end
+ # Remove previous Data
+ FileUtils.rm_rf(filename)
+
+ # mv temp meta file to meta file
+ FileUtils.mv(metaname_t, metaname)
+
+ # We keep a data file for the same requested path, with different params,
+ # but which ends up with same MD5 sum, i.e. identical results because of unused params
+ linkname = "#{basename}.#{md5}#{@zip}"
+ if FileTest.file?(linkname) then
+ # Data file already there, link to it
+ File.link(linkname, filename)
+ else
+ # Write data file and set its mtime to latest of all files it depends on
+ File.open("#{filename}", "w") {|fdata| fdata.write(data)}
+ # Create link
+ File.link(filename, linkname)
+ end
+ # mtime might need to be updated, or needs to be set
+        # e.g. when a dependency had changed but the result file is identical
+ # This is needed to keep Last-Modified dates consistent across web nodes
+ File.utime(Time.now, maxmtime, filename)
+ fstat = File.stat(filename)
+ }
+ }
+ ensure
+ FileUtils.rm_rf(metaname_t)
+ end
+
+ # Do we clean the cache?
+ washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10
+
+ # Return stat(datafile) even if it's just been removed by washCache
+ # because another web node might still have it or will have it.
+ # Anyway, the cached item would be regenerated on a later request
+ # and a 304 would be returned if still appropriate at the time.
+
+ # Return fstat of data file (for etag...) and zipped file
+ [fstat, bodyZ]
+
+  rescue Timeout::Error, StandardError => ex
+    if ex.is_a?(Timeout::Error) then
+ warn("Timeout in cache store operation")
+ else
+ warn("Cache store error (#{$!})")
+ end
+ # Clean up before leaving
+ FileUtils.rm_rf(filename||"")
+ FileUtils.rm_rf(metaname||"")
+ nil # return nil so that caller can act if a failed store really is a problem
+ end
+
+
+ def Cache.washCache(dirname, tmout=30, cleanTree=false)
+    # Remove cache entries that are either too old compared to TTL (in seconds)
+    # or that push the total size beyond maxSize (in MB)
+    # cleanTree tells us to recurse into subdirectories; *.Data.[md5] files
+    # that are no longer referenced, because their source has changed and
+    # generated a new *.Data.[md5] file, are removed as well
+
+    # tmout is the maximum time (in seconds) spent in here
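+    # Illustrative calls (assumed):
+    #   washCache(dirname, 10)           # quick pass on one directory, as store() does
+    #   washCache(@cacheDir, 300, true)  # full sweep of the whole cache tree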
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Also ignore request if dirname not equal to @cacheDir or under it
+ return nil unless dirname[0, @cacheDir.length] == @cacheDir
+
+ # Also ignore request if dirname does not exist yet
+ return nil unless FileTest.directory?(dirname)
+
+ # Also return if less than a minute has elapsed since latest cleanup
+ t0 = Time.new
+ return nil if t0 - @lastCleanup < 60
+
+ # Remember for next time
+ @lastCleanup = t0
+
+ Dir.chdir(dirname) { |d|
+ # Recreate lock file if it's been lost
+ unless File.exist?(@@lockfile)
+ File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")}
+ end
+
+ # Grab lockfile
+ File.open(@@lockfile) { |lockf|
+ if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
+ infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
+ info(infoMsg)
+ puts infoMsg if cleanTree
+
+ timeout(tmout) {
+ totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
+ if totalSize >= 0 then
+ # Size == -1 means dir was locked, throwing an exception would have been nice :)
+ infoMsg = if cleanTree then
+ "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
+ else
+ "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
+ end
+ info(infoMsg)
+ puts infoMsg if cleanTree
+ end
+ }
+ else
+        # Locked dir, another process is busy cleaning it up
+ debug("#{dirname} locked, skipping")
+ puts("#{dirname} locked, skipping") if cleanTree
+ end # of lock test
+ } # end of File.open(@@lockfile), close & release lock automatically
+ }
+ rescue Timeout::Error
+ info("Timeout while cleaning #{dirname}")
+ puts("Timeout while cleaning #{dirname}") if cleanTree
+  rescue StandardError => ex
+ error("Error while cleaning cache: #{ex}")
+ puts("Error while cleaning cache: #{ex}") if cleanTree
+ end
+
+
+ private
+
+ def Cache.washDir(dirname, cleanTree)
+ # Clean up cache starting from dirname and in subdirectories if cleanTree is true
+ # Return [newSize in bytes, # deleted files, # scanned directories]
+ size = nDeleted = nDirectories = 0
+
+ Dir.chdir(dirname) { |d|
+ hIno = Hash.new(0) # hash of file inodes with more than one link
+ lst = Array.new # array of file names, atime, ...
+ ttl = @ttl
+ ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!
+
+ # Get list of files sorted on their dirname+atime
+ Find.find('.') { |f|
+ begin
+ unless f =~ /^\.$|#{@@lockfile}/ # ignore "." and lockfile
+ ff = File.stat(f)
+ if ff.directory? then
+ Find.prune unless cleanTree
+ elsif ff.file? and f =~ /Meta|Data/ then
+ hIno[ff.ino] = ff.nlink if ff.nlink > 1
+ # List of files has [name, atime, size, # links, inode]
+ lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
+ end
+ end
+ rescue
+ nil # File.stat can fail because file could have been deleted, ignore error
+ end
+ }
+
+      # Compute total size; each entry adds size/nlink so hard-linked data is counted only once overall
+ size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }
+
+      # Delete old *.Data.[md5] files that are not referenced anymore
+ lst.each { |a|
+        if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(\.gz)?$/ then
+ # Data file with no more links pointing to it
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ size -= a[2]
+ a[3] = 0 # Mark as deleted
+ end
+ }
+
+ # Sort all files on atime
+ lst.sort!{ |a1, a2| a1[1] <=> a2[1] }
+
+ t0 = Time.new
+ # Clean until size < maxSize _AND_ atime more recent than TTL
+ lst.each { |a|
+ break if size < @maxSize and t0-a[1] < ttl
+ next if a[3] < 1 # Already deleted in previous step
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ # Total size -= file size IF last link to data
+ if a[3] == 1 || hIno[a[4]] <= 1 then
+ size -= a[2]
+ end
+ hIno[a[4]] -= 1 if hIno[a[4]] > 0
+ a[3] = 0 # Mark as deleted by setting nlinks to 0
+ }
+
+ # Remove deleted files from array
+ lst.reject! { |a| a[3] < 1 }
+
+
+ # Sort files per directory to enforce maxFiles
+ if cleanTree then
+ # Split the array in an array per directory
+ # and keep the files sorted on atime in each directory
+ slst = Hash.new
+ lst.length.times {
+ a = lst.shift
+ d = File.dirname(a[0])
+ if slst[d] then
+ slst[d] << a
+ else
+ slst[d] = [a]
+ end
+ }
+ else
+ # If not cleaning whole tree, we have only a single dir
+ slst = {"." => lst}
+ end
+
+ nDirectories = slst.length
+
+ slst.each { |d, lst|
+ # Remove oldest files so that we have less than @maxFiles in it
+ if lst.length >= @maxFiles then
+        # Remove enough to leave up to 90% of @maxFiles so we don't clean up only a handful of files repeatedly
+ (lst.length - 9*@maxFiles/10).times {
+ if a = lst.shift then
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ # Total size -= file size IF last link to data
+ if a[3] == 1 || hIno[a[4]] <= 1 then
+ size -= a[2]
+ end
+ hIno[a[4]] -= 1 if hIno[a[4]] > 0
+ end
+ }
+ end
+ }
+ } #end of chdir
+ [size, nDeleted, nDirectories]
+ end
+
+
+ def Cache.makeNames(obj, params)
+ # Build meta filename and data filename from arguments
+ #
+    # obj is broken into a path and a filename with appended params
+    # e.g. /proj/en/index.xml?printable=yes becomes /proj/en and index.xml+printable+yes
+    # or .#proj#en#index.xml+printable+yes
+    # depending on the cacheTree param value
+
+    # .Meta and .Data are appended respectively to the meta filename and data filename
+    # Base is the filename without the appended params, e.g. .#proj#en#index.xml.Data
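+    #
+    # Worked example (assuming cacheTree is false and zipLevel > 0):
+    #   makeNames("/proj/en/index.xml", {"printable" => "yes"}) returns
+    #   [@cacheDir,
+    #    "#{@cacheDir}/.#proj#en#index.xml.Data",
+    #    "#{@cacheDir}/.#proj#en#index.xml+printable+yes.Data.gz",
+    #    "#{@cacheDir}/.#proj#en#index.xml+printable+yes.Meta"]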
+ if @cacheTree then
+ # Use a path and a file
+ dir = "#{@cacheDir}#{File.dirname(obj)}"
+ base = f = File.basename(obj)
+ else
+ # Convert full path into a single filename
+ dir = @cacheDir
+ base = f = ".#{obj.gsub(/\//,'#')}"
+ end
+
+ f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
+ # Remove funky chars and squeeze duplicates into single chars
+ f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")
+
+    # Return the cache subdir, the base Data name without params (used for hard links),
+    # and the full Data and Meta filenames
+    [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
+ end
+end
+
+end