Diffstat (limited to 'lib/gorg/cache.rb')
-rw-r--r--  lib/gorg/cache.rb | 493
1 files changed, 493 insertions, 0 deletions
diff --git a/lib/gorg/cache.rb b/lib/gorg/cache.rb
new file mode 100644
index 0000000..543b6a2
--- /dev/null
+++ b/lib/gorg/cache.rb
@@ -0,0 +1,493 @@
+### Copyright 2004, Xavier Neys (neysx@gentoo.org)
+# #
+# # This file is part of gorg.
+# #
+# # gorg is free software; you can redistribute it and/or modify
+# # it under the terms of the GNU General Public License as published by
+# # the Free Software Foundation; either version 2 of the License, or
+# # (at your option) any later version.
+# #
+# # gorg is distributed in the hope that it will be useful,
+# # but WITHOUT ANY WARRANTY; without even the implied warranty of
+# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# # GNU General Public License for more details.
+# #
+# # You should have received a copy of the GNU General Public License
+# # along with gorg; if not, write to the Free Software
+### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+# Cache a bit of data based on
+# . a path name as received by a webserver, e.g. /proj/en/index.xml
+# . a list of parameters as received by a webserver, e.g. printable=yes
+# . a list of files the cached data depends on
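+#
+# Illustrative usage sketch (assumed, not part of the original code; the
+# config values are made-up examples of the keys read by Cache.init below):
+#
+#   Gorg::Cache.init("cacheDir"  => "/var/cache/gorg",
+#                    "zipLevel"  => 2,      # gzip level, 0 disables compression
+#                    "cacheTTL"  => 86400,  # seconds, 0 means no expiry
+#                    "cacheTree" => true,   # mirror the document tree on disk
+#                    "maxFiles"  => 1000,   # max files per cache directory
+#                    "cacheSize" => 200,    # MB
+#                    "cacheWash" => 1000)   # wash after ~1% of store operations
+#
+#   if (hit = Gorg::Cache.hit("/proj/en/index.xml", {"printable" => "yes"}))
+#     data, fstat, extrameta = hit
+#   else
+#     data = render_document()  # hypothetical generator
+#     Gorg::Cache.store(data, "/proj/en/index.xml", {"printable" => "yes"})
+#   end
+#
+#   (Cache.hit raises Gorg::Status::NotModified when the client cache is up-to-date)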
+
+require "parsedate"
+require "fileutils"
+require "find"
+require "digest"
+require "digest/md5"
+
+module Gorg
+
+CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."
+
+module Cache
+ def Cache.init(config)
+ @@lockfile = ".cache.cleaner.lock"
+ @cacheDir = nil
+ if FileTest.directory?(config["cacheDir"])
+ if FileTest.writable?(config["cacheDir"])
+ @cacheDir = config["cacheDir"].chomp("/")
+ else
+ warn "Cache directory not writable"
+ end
+ else
+ warn "Invalid cache directory"
+ end
+
+    @zipLevel = config["zipLevel"]
+    @zip = @zipLevel > 0 ? ".gz" : ""
+    # Time-To-Live in seconds, cached items older than that will be considered too old
+    @ttl = config["cacheTTL"]
+ @cacheTree = config["cacheTree"]
+ @maxFiles = config["maxFiles"] # Max number of files in a single directory
+ @maxSize = config["cacheSize"]*1024*1024 # Now in bytes
+ @washNumber = config["cacheWash"] # Clean cache dir after a store operation whenever rand(@washNumber) < 10
+ @lastCleanup = Time.new-8e8 # Remember last time we started a cleanup so we don't pile them up
+ end
+
+ def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
+ # objPath is typically a requested path passed from a web request but it
+ # can be just any string. It is not checked against any actual files on the file system
+ #
+ # objParam is expected to be a hash or any object whose iterator yields two values
+ #
+ # 2 filenames are built with the arguments and should give
+ # the name of a metafile and a result file
+ # if the result file is older than @ttl seconds, hit fails
+ # The metafile is then checked for dependencies
+ # It contains a list of filenames along with their size and mtime separated by ;;
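+    # e.g. a dependency line, as written by Cache.store below (path and values are made up):
+    #   /var/www/doc/en/index.xml;;4321;;Thu Nov 11 11:11:11 UTC 2004;;r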
+
+    # etags and ifmodsince are used in a webserver context
+    # etags is defined if an ETag was part of an If-None-Match request field
+    # etags can be an array or a single string
+    # If the current ETag of the data file matches, no data is returned (webserver should return a 304)
+    #
+    # ifmodsince is a Time object passed on an If-Modified-Since request field
+    # If the data file has not been modified since then, no data is returned (webserver should return a 304)
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+    # Reminder: filenames are full paths, no need to prepend dirname
+ dirname, basename, filename, metaname = makeNames(objPath, objParam)
+
+ raise "Cache subdir does not exist" unless FileTest.directory?(dirname)
+
+ # Hit the cache
+ meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
+ raise "Empty/No meta file" if meta.nil? || meta.length < 1
+
+ fstat = File.stat(filename) if filename && FileTest.file?(filename)
+ raise "Empty/No data file" if fstat.nil?
+
+ # Check the timestamps of files in the metadata
+ meta = meta.split("\n")
+ raise "I did not write that meta file" unless CacheStamp == meta.shift
+ mline = meta.shift
+ while mline and mline !~ /^;;extra meta$/ do
+ f, s, d = mline.split(";;")
+ if s.to_i < 0
+ # File did not exist when cache entry was created
+ raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
+ else
+ # File did exist when cache entry was created, is it still there?
+ raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)
+
+ fst = File.stat(f)
+        raise "Size of #{f} has changed from #{s.to_i} to #{fst.size}" unless fst.size == s.to_i
+ raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
+ end
+ mline = meta.shift
+ end
+ if mline =~ /^;;extra meta$/ then
+ extrameta = meta.dup
+ else
+ extrameta = []
+ end
+
+ if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
+ raise Gorg::Status::NotModified.new(fstat)
+ end
+
+ file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
+ raise "Empty/No data file" if file.nil? || file.length < 1
+
+    # Is the data file too old?
+ raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl
+
+ # Update atime of files, ignore failures as files might have just been removed
+ begin
+ t = Time.new
+ File.utime(t, fstat.mtime, filename)
+ File.utime(t, mstat.mtime, metaname)
+ rescue
+ nil
+ end
+
+ # If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
+ # The file is left (un)compressed, it's returned as it was stored
+ [file, fstat, extrameta]
+
+ rescue Gorg::Status::NotModified
+ # Nothing changed, should return a 304
+ debug("Client cache is up-to-date")
+ raise
+ rescue
+ # cache hit fails if anything goes wrong, no exception raised
+ debug("Cache hit on #{objPath} failed: (#{$!})")
+ nil
+ end
+
+
+ def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
+ # Store data in cache so it can be retrieved based on the objPath and objParams
+    # deps should contain a list of files that the object depends on,
+    # as returned by our xsl processor, i.e. an array of [access_type, path] where
+    # access_type can be "r", "w", or "o" for respectively read, write, other.
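+    #
+    # e.g. (illustrative paths):
+    #   deps = [["r", "/var/www/doc/en/index.xml"], ["r", "/var/www/xsl/guide.xsl"]]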
+
+ # Define content-type
+ ct = setContentType(data)
+ extrameta << "Content-Type:#{ct}"
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Cache only if no remote objects (ftp:// or http://) in list of used files
+ if deps && deps.detect{|f| f[0] =~ /^o$/i }
+ debug "#{objPath} not cached because it needs remote resources"
+ return nil
+ end
+
+ dirname, basename, filename, metaname = makeNames(objPath, objParam)
+
+ FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)
+
+ # Write Meta file to a temp file (with .timestamp.randomNumber appended)
+ metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"
+
+ # Data might need to be just a link to another .Data file
+ # if we find another requested path with different params but
+ # with identical MD5 sums
+ # Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
+ # in its name that we can hard link to.
+ # e.g. A moron hits for 10 full handbooks with toto=1..10 in the URI,
+ # we'd end up with 10 identical large copies. With links we have only one
+
+ # Old versions are expected to be cleaned up by the cacheWash() routine
+ # A Dir.glob() to find the previous ones would be too expensive
+
+ # Compute MD5 digest
+ md5 = Digest::MD5.hexdigest(data)
+
+ # Compress data if required
+ if @zipLevel > 0 then
+ bodyZ = data = gzip(data, @zipLevel)
+ else
+ bodyZ = nil
+ end
+
+ # Set mtime of data file to latest mtime of all required files
+ # so that caching can work better because mtimes will be
+ # identical on all webnodes whereas creation date of data
+ # would be different on all nodes.
+ maxmtime = Time.now-8e8
+ fstat = nil
+
+ begin
+ timeout(10){
+ File.open("#{metaname_t}", "w") {|fmeta|
+ fmeta.puts(CacheStamp)
+ # Write filename;;size;;mtime for each file in deps[]
+ deps.each {|ffe|
+ ftype = ffe[0]
+ fdep = ffe[1]
+ if FileTest.file?(fdep)
+ s = File.stat(fdep)
+ fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
+ maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
+ else
+ # A required file does not exist, use size=-1 and old timestamp
+ # so that when the file comes back, the cache notices a difference
+ # and no cache miss gets triggered as long as file does not exist
+ fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
+ end
+ }
+ fmeta.puts ";;extra meta"
+ extrameta.each { |m| fmeta.puts m }
+ }
+ # Get exclusive access to the cache directory while moving files and/or creating data files
+ File.open(dirname) { |lockd|
+ while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
+ # Timeout does not occur on a blocking lock
+          # Try a non-blocking one repeatedly for a few seconds until timeout occurs or lock is granted
+ # We are in a timeout block, remember
+ sleep 0.1
+ end
+ # Remove previous Data
+ FileUtils.rm_rf(filename)
+
+ # mv temp meta file to meta file
+ FileUtils.mv(metaname_t, metaname)
+
+ # We keep a data file for the same requested path, with different params,
+ # but which ends up with same MD5 sum, i.e. identical results because of unused params
+ linkname = "#{basename}.#{md5}#{@zip}"
+ if FileTest.file?(linkname) then
+ # Data file already there, link to it
+ File.link(linkname, filename)
+ else
+ # Write data file and set its mtime to latest of all files it depends on
+ File.open("#{filename}", "w") {|fdata| fdata.write(data)}
+ # Create link
+ File.link(filename, linkname)
+ end
+ # mtime might need to be updated, or needs to be set
+        # e.g. when a dependency had changed but the result file is identical
+ # This is needed to keep Last-Modified dates consistent across web nodes
+ File.utime(Time.now, maxmtime, filename)
+ fstat = File.stat(filename)
+ }
+ }
+ ensure
+ FileUtils.rm_rf(metaname_t)
+ end
+
+ # Do we clean the cache?
+ washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10
+
+ # Return stat(datafile) even if it's just been removed by washCache
+ # because another web node might still have it or will have it.
+ # Anyway, the cached item would be regenerated on a later request
+ # and a 304 would be returned if still appropriate at the time.
+
+ # Return fstat of data file (for etag...) and zipped file
+ [fstat, bodyZ]
+
+  rescue Timeout::Error, StandardError => ex
+    if ex.is_a?(Timeout::Error) then
+ warn("Timeout in cache store operation")
+ else
+ warn("Cache store error (#{$!})")
+ end
+ # Clean up before leaving
+ FileUtils.rm_rf(filename||"")
+ FileUtils.rm_rf(metaname||"")
+ nil # return nil so that caller can act if a failed store really is a problem
+ end
+
+
+ def Cache.washCache(dirname, tmout=30, cleanTree=false)
+    # Remove cache entries that are either too old compared to TTL (in seconds)
+    # or that push the total size beyond maxSize (in MB)
+    # cleanTree tells us to recurse into subdirectories; *.Data.[md5] files
+    # that are no longer referenced, because their source has changed and
+    # generated a new *.Data.[md5] file, are removed as well
+
+    # tmout is the maximum time (in seconds) spent in here
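+    # Illustrative calls (assumed):
+    #   washCache(dirname, 10)           # quick pass on one directory, as store() does
+    #   washCache(@cacheDir, 300, true)  # full sweep of the whole cache tree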
+
+ return nil if @cacheDir.nil? # Not initialized, ignore request
+
+ # Also ignore request if dirname not equal to @cacheDir or under it
+ return nil unless dirname[0, @cacheDir.length] == @cacheDir
+
+ # Also ignore request if dirname does not exist yet
+ return nil unless FileTest.directory?(dirname)
+
+ # Also return if less than a minute has elapsed since latest cleanup
+ t0 = Time.new
+ return nil if t0 - @lastCleanup < 60
+
+ # Remember for next time
+ @lastCleanup = t0
+
+ Dir.chdir(dirname) { |d|
+ # Recreate lock file if it's been lost
+ unless File.exist?(@@lockfile)
+ File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")}
+ end
+
+ # Grab lockfile
+ File.open(@@lockfile) { |lockf|
+ if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
+ infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
+ info(infoMsg)
+ puts infoMsg if cleanTree
+
+ timeout(tmout) {
+ totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
+ if totalSize >= 0 then
+ # Size == -1 means dir was locked, throwing an exception would have been nice :)
+ infoMsg = if cleanTree then
+ "Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
+ else
+ "#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
+ end
+ info(infoMsg)
+ puts infoMsg if cleanTree
+ end
+ }
+ else
+        # Locked dir, another process is busy cleaning it up
+ debug("#{dirname} locked, skipping")
+ puts("#{dirname} locked, skipping") if cleanTree
+ end # of lock test
+ } # end of File.open(@@lockfile), close & release lock automatically
+ }
+ rescue Timeout::Error
+ info("Timeout while cleaning #{dirname}")
+ puts("Timeout while cleaning #{dirname}") if cleanTree
+  rescue StandardError => ex
+ error("Error while cleaning cache: #{ex}")
+ puts("Error while cleaning cache: #{ex}") if cleanTree
+ end
+
+
+ private
+
+ def Cache.washDir(dirname, cleanTree)
+ # Clean up cache starting from dirname and in subdirectories if cleanTree is true
+ # Return [newSize in bytes, # deleted files, # scanned directories]
+ size = nDeleted = nDirectories = 0
+
+ Dir.chdir(dirname) { |d|
+ hIno = Hash.new(0) # hash of file inodes with more than one link
+ lst = Array.new # array of file names, atime, ...
+ ttl = @ttl
+ ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!
+
+ # Get list of files sorted on their dirname+atime
+ Find.find('.') { |f|
+ begin
+ unless f =~ /^\.$|#{@@lockfile}/ # ignore "." and lockfile
+ ff = File.stat(f)
+ if ff.directory? then
+ Find.prune unless cleanTree
+ elsif ff.file? and f =~ /Meta|Data/ then
+ hIno[ff.ino] = ff.nlink if ff.nlink > 1
+ # List of files has [name, atime, size, # links, inode]
+ lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
+ end
+ end
+ rescue
+ nil # File.stat can fail because file could have been deleted, ignore error
+ end
+ }
+
+      # Compute total size; each entry adds size/nlink so hard-linked data is counted only once overall
+ size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }
+
+      # Delete old *.Data.[md5] files that are not referenced anymore
+ lst.each { |a|
+        if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(\.gz)?$/ then
+ # Data file with no more links pointing to it
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ size -= a[2]
+ a[3] = 0 # Mark as deleted
+ end
+ }
+
+ # Sort all files on atime
+ lst.sort!{ |a1, a2| a1[1] <=> a2[1] }
+
+ t0 = Time.new
+ # Clean until size < maxSize _AND_ atime more recent than TTL
+ lst.each { |a|
+ break if size < @maxSize and t0-a[1] < ttl
+ next if a[3] < 1 # Already deleted in previous step
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ # Total size -= file size IF last link to data
+ if a[3] == 1 || hIno[a[4]] <= 1 then
+ size -= a[2]
+ end
+ hIno[a[4]] -= 1 if hIno[a[4]] > 0
+ a[3] = 0 # Mark as deleted by setting nlinks to 0
+ }
+
+ # Remove deleted files from array
+ lst.reject! { |a| a[3] < 1 }
+
+
+ # Sort files per directory to enforce maxFiles
+ if cleanTree then
+ # Split the array in an array per directory
+ # and keep the files sorted on atime in each directory
+ slst = Hash.new
+ lst.length.times {
+ a = lst.shift
+ d = File.dirname(a[0])
+ if slst[d] then
+ slst[d] << a
+ else
+ slst[d] = [a]
+ end
+ }
+ else
+ # If not cleaning whole tree, we have only a single dir
+ slst = {"." => lst}
+ end
+
+ nDirectories = slst.length
+
+ slst.each { |d, lst|
+ # Remove oldest files so that we have less than @maxFiles in it
+ if lst.length >= @maxFiles then
+        # Remove enough to leave up to 90% of @maxFiles so we don't clean up only a handful of files repeatedly
+ (lst.length - 9*@maxFiles/10).times {
+ if a = lst.shift then
+ FileUtils.rm_rf(a[0])
+ nDeleted += 1
+ # Total size -= file size IF last link to data
+ if a[3] == 1 || hIno[a[4]] <= 1 then
+ size -= a[2]
+ end
+ hIno[a[4]] -= 1 if hIno[a[4]] > 0
+ end
+ }
+ end
+ }
+ } #end of chdir
+ [size, nDeleted, nDirectories]
+ end
+
+
+ def Cache.makeNames(obj, params)
+ # Build meta filename and data filename from arguments
+ #
+    # obj is broken into a path and a filename with appended params
+    # e.g. /proj/en/index.xml?printable=yes becomes /proj/en and index.xml+printable+yes
+    # or .#proj#en#index.xml+printable+yes
+    # depending on the cacheTree param value
+
+    # .Meta and .Data are appended respectively to the meta filename and data filename
+    # Base is the filename without the appended params, e.g. .#proj#en#index.xml.Data
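+    #
+    # Worked example (assuming cacheTree is false and zipLevel > 0):
+    #   makeNames("/proj/en/index.xml", {"printable" => "yes"}) returns
+    #   [@cacheDir,
+    #    "#{@cacheDir}/.#proj#en#index.xml.Data",
+    #    "#{@cacheDir}/.#proj#en#index.xml+printable+yes.Data.gz",
+    #    "#{@cacheDir}/.#proj#en#index.xml+printable+yes.Meta"]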
+ if @cacheTree then
+ # Use a path and a file
+ dir = "#{@cacheDir}#{File.dirname(obj)}"
+ base = f = File.basename(obj)
+ else
+ # Convert full path into a single filename
+ dir = @cacheDir
+ base = f = ".#{obj.gsub(/\//,'#')}"
+ end
+
+ f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
+ # Remove funky chars and squeeze duplicates into single chars
+ f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")
+
+    # Return the cache subdir, the base Data name without params (used for hard links),
+    # and the full Data and Meta filenames
+    [dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
+ end
+end
+
+end