diff options
author | Alex Legler <alex@a3li.li> | 2015-02-19 20:02:20 +0100 |
---|---|---|
committer | Alex Legler <alex@a3li.li> | 2015-02-19 20:02:20 +0100 |
commit | 5407f1f169e932063fb145bbb2a971a2188b9cd4 (patch) | |
tree | 3b1d38bcf4b14cffbd54899614c1cbda156e6861 /lib | |
download | backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.gz backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.bz2 backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.zip |
Initial version
Diffstat (limited to 'lib')
-rw-r--r-- | lib/rendering.rb | 58 | ||||
-rw-r--r-- | lib/storage.rb | 208 | ||||
-rw-r--r-- | lib/threading.rb | 68 | ||||
-rw-r--r-- | lib/utils.rb | 14 |
4 files changed, 348 insertions, 0 deletions
# lib/rendering.rb
require 'sanitize'
require 'cgi'

module Ag
  module Rendering
    # Renders the body of a parsed mail message as an HTML fragment.
    #
    # NOTE(review): `mail` is presumably a Mail::Message from the `mail` gem;
    # only the methods called below (multipart?, parts, content_type, decoded,
    # body, message_id) are relied upon -- confirm against the caller.
    class HTMLizer
      # MIME types that to_content knows how to render.
      RENDERABLE_TYPES = %w[text/plain text/html].freeze

      # Opening tag used to wrap runs of quoted ("> ...") lines.
      QUOTE_OPEN = '<div class="ag-quote">'.freeze

      # Picks the body part of +mail+ and converts it to HTML.
      #
      # Only the first part of a multipart message (or the first part of a
      # nested multipart) is considered.
      #
      # @param mail [#multipart?, #parts, #content_type, #body, #message_id]
      # @return [String, nil] HTML fragment; nil when the message is multipart
      #   but its first part is neither renderable, a nested multipart, nor
      #   part of a multipart/signed message
      # @raise [RuntimeError] when a nested multipart has no renderable first part
      def self.HTMLize(mail)
        unless mail.multipart?
          # No Content-Type, assume plain text (git-send-email)
          mime = mail.content_type.nil? ? 'text/plain' : mime_split(mail.content_type)
          return to_content(mime, mail.body.decoded)
        end

        first_part = mail.parts.first
        mime = mime_split(first_part.content_type)
        return to_content(mime, first_part.decoded) if RENDERABLE_TYPES.include?(mime)

        if first_part.multipart?
          # Nested multipart: look one level deeper, again at the first part only.
          nested = first_part.parts.first
          nested_mime = mime_split(nested.content_type)
          raise "Cannot find body: #{mail.message_id}" unless RENDERABLE_TYPES.include?(nested_mime)

          to_content(nested_mime, nested.decoded)
        elsif mime_split(mail.content_type) == 'multipart/signed'
          # Specialty: Gnus/Emacs signed emails with no explicit multipart type
          to_content('text/plain', first_part.decoded)
        end
      end

      # Converts +content+ of the given MIME type to an HTML fragment.
      #
      # Plain text is HTML-escaped and consecutive quoted lines are wrapped in
      # a single <div class="ag-quote">. HTML is passed through Sanitize with
      # the BASIC config. Anything else yields an "Unsupported Content-Type"
      # alert.
      #
      # @param content_type [String, nil] bare MIME type (no parameters)
      # @param content [String] decoded body
      # @return [String] HTML fragment
      def self.to_content(content_type, content)
        case content_type
        when 'text/plain'
          html = CGI.escapeHTML(content).lines.map do |line|
            # The text is already escaped at this point, so a quote marker
            # appears as "&gt;", never as a literal ">".
            line.start_with?('&gt;') ? "#{QUOTE_OPEN}#{line.rstrip}</div>\n" : line
          end.join

          # Merge adjacent single-line quote blocks into one block.
          html.gsub("</div>\n#{QUOTE_OPEN}", "\n")
        when 'text/html'
          '<div class="ag-html-content">' + Sanitize.clean(content, Sanitize::Config::BASIC) + '</div>'
        else
          '<div class="alert alert-danger" role="alert"><strong>Unsupported Content-Type</strong></div>'
        end
      end

      # Extracts the bare MIME type from a Content-Type header value,
      # dropping parameters such as "; charset=...". The result is trimmed and
      # lower-cased because MIME type names are case-insensitive.
      #
      # @param content_type [String, nil]
      # @return [String, nil] e.g. "text/plain"; nil for nil or empty input
      def self.mime_split(content_type)
        mime = content_type.to_s.split(';').first
        mime && mime.strip.downcase
      end
    end
  end
end
# lib/storage.rb
require 'elasticsearch'
require 'date'

module Ag
  # Elasticsearch persistence for mailing-list messages.
  #
  # All methods talk to the global client `$es` and operate on the index
  # "ml-<list>" with document type "message".
  module Storage
    module_function

    # Drops the index for +list+ if present and re-creates it with the
    # message mapping.
    #
    # @param list [String] list name; the index is "ml-<list>"
    # @return the response of the indices.create call
    def create_index(list)
      begin
        $es.indices.delete index: "ml-#{list}"
      rescue Elasticsearch::Transport::Transport::Errors::NotFound
        $stderr.puts "Index did not exist yet. Creating."
      end

      not_analyzed = { type: 'string', index: 'not_analyzed' }
      analyzed = { type: 'string' }

      $es.indices.create(
        index: "ml-#{list}",
        body: {
          mappings: {
            message: {
              properties: {
                attachments: {
                  properties: {
                    filename: not_analyzed.dup,
                    mime: not_analyzed.dup
                  }
                },
                cc: analyzed.dup,
                content: analyzed.dup,
                date: { type: 'date', format: 'dateOptionalTime' },
                from: analyzed.dup,
                from_realname: analyzed.dup,
                month: { type: 'integer' },
                parent: not_analyzed.dup,
                raw_message_id: not_analyzed.dup,
                raw_parent: analyzed.dup,
                subject: analyzed.dup,
                to: analyzed.dup
              }
            }
          }
        }
      )
    end

    # Renders +message+ to HTML and normalises it to UTF-8.
    #
    # Never raises for rendering/encoding problems: on any StandardError the
    # placeholder "Cannot parse MIME/contents." is returned and the actual
    # cause is written to $stderr.
    #
    # @param message the parsed mail (passed to HTMLizer.HTMLize)
    # @return [String] stripped HTML content or the placeholder text
    def get_content(message)
      content = "Cannot parse MIME/contents."

      begin
        raw_content = Ag::Rendering::HTMLizer.HTMLize(message)
        content = Ag::Utils.fix_encoding(raw_content || '').strip

        $stderr.puts "#{message.message_id}: Content empty" if content.empty?
      rescue => e
        # Report the real cause; this path is also hit by non-encoding
        # failures such as HTMLize's "Cannot find body".
        $stderr.puts "#{message.message_id}: Cannot render content (#{e.class}: #{e.message})"
      end

      content
    end

    # Looks up the document id of the message whose raw_message_id equals
    # +parent_message_id+.
    #
    # @param list [String] list name
    # @param parent_message_id [String, nil]
    # @return [String, nil] the `_id` of the first hit, or nil when no id was
    #   given or nothing matched
    def get_parent_message(list, parent_message_id = nil)
      return nil if parent_message_id.nil?

      result = $es.search(
        index: "ml-#{list}",
        body: {
          query: {
            filtered: {
              filter: {
                term: { raw_message_id: parent_message_id }
              }
            }
          },
          fields: ['_id']
        }
      )

      hits = result['hits']
      return nil if hits['total'] == 0

      hits['hits'].first['_id']
    end

    # Indexes +message+ into the index of +list+.
    #
    # The document id is taken from the X-Archives-Hash header. The `parent`
    # field is not written here; only `raw_parent` (the parent's Message-Id)
    # is stored, and fix_threading resolves it later.
    #
    # @param list [String] list name
    # @param message the parsed mail
    # @return the response of the index call
    def store(list, message)
      content = get_content(message)

      identifier = message['X-Archives-Hash'].value
      raw_parent = Ag::Threading.get_parent_message_id(message)

      from = Ag::Utils.fix_encoding(message[:from].formatted.first)
      # Drop the "<address>" part, keeping the display name only.
      from_realname = from.gsub(/<(.*)>/, '').strip

      to = message[:to] ? Ag::Utils.fix_encoding(message[:to].formatted.join(',')) : ''
      cc = message[:cc] ? Ag::Utils.fix_encoding(message[:cc].formatted.join(',')) : ''

      subject = Ag::Utils.fix_encoding(message.subject)

      attachments =
        if message.has_attachments?
          message.attachments.map do |attachment|
            { filename: attachment.filename, mime: attachment.mime_type }
          end
        else
          []
        end

      date = message.date

      $es.index(
        index: "ml-#{list}",
        type: 'message',
        id: identifier,
        body: {
          raw_message_id: message.message_id,
          subject: subject,
          to: to,
          cc: cc,
          from: from,
          from_realname: from_realname,
          date: date,
          month: format('%i%02i', date.year, date.month).to_i, # this is a sortable number!
          content: content,
          attachments: attachments,
          raw_parent: raw_parent
        }
      )
    end

    # Runs one threading pass: for every message that has a `raw_parent` but
    # no `parent` yet, tries to resolve the parent's document id and stores it
    # in `parent`.
    #
    # At most 100000 candidates are fetched per pass.
    #
    # @param list [String] list name
    # @return [Integer] total number of candidates the query matched (counted
    #   before this pass applied its updates)
    def fix_threading(list)
      result = $es.search(
        index: "ml-#{list}",
        size: 100000,
        body: {
          size: 100000,
          query: {
            filtered: {
              filter: {
                and: [
                  { missing: { field: 'parent' } },
                  { exists: { field: 'raw_parent' } }
                ]
              }
            }
          }
        }
      )

      result['hits']['hits'].each do |hit|
        parent_id = get_parent_message(list, hit['_source']['raw_parent'])
        # Parent not (yet) in the index: leave the message for a later pass.
        next if parent_id.nil?

        $es.update(
          index: "ml-#{list}",
          type: 'message',
          id: hit['_id'],
          body: {
            doc: {
              parent: parent_id
            }
          }
        )
      end

      result['hits']['total']
    end
  end
end

# lib/threading.rb
module Ag
  # Helpers for reconstructing reply threads.
  module Threading
    module_function

    # Figures out the Message-Id of the parent message,
    # or returns nil if we assume this message is not a reply.
    #
    # In-Reply-To and References are both consulted; for Array values the
    # last entry is used. When the two disagree, In-Reply-To wins.
    #
    # @param mail [#in_reply_to, #references, #message_id]
    # @return [String, nil] parent Message-Id, or nil when neither header
    #   yields a usable value
    def get_parent_message_id(mail)
      in_reply_to = mail.in_reply_to
      references = mail.references

      # No headers -> no parent message
      return nil if in_reply_to.nil? && references.nil?

      irt_value =
        case in_reply_to
        when Array
          in_reply_to.last
        when String
          # Gnus/Emacs specialty du jour
          # => "<1075186049.4264.1.camel@TesterTop.tester.ca> (Olivier Crête's message of \"Tue, 27 Jan 2004 07:47:29 +0100\")"
          if in_reply_to.start_with?('<')
            # Cut everything after the last '>' (if there is one), then strip
            # the surrounding angle brackets. Non-mutating on purpose: the
            # string belongs to the mail object.
            closing = in_reply_to.rindex('>')
            bracketed = closing ? in_reply_to[0..closing] : in_reply_to
            bracketed.gsub(/\A<|>\z/, '')
          else
            in_reply_to
          end
        when nil
          nil
        else
          $stderr.puts "In-Reply-To is a weird type: #{mail.message_id}" if $options.debug
          nil
        end

      ref_value =
        case references
        when Array
          references.last
        when String
          references
        when nil
          nil
        else
          $stderr.puts "References is a weird type: #{mail.message_id}" if $options.debug
          nil
        end

      # Headers were present but unusable (unsupported type or empty list).
      if irt_value.nil? && ref_value.nil?
        $stderr.puts "Couldn't find a parent id for Message-Id: #{mail.message_id}" if $options.debug
        return nil
      end

      return ref_value.to_s if irt_value.nil?

      if !ref_value.nil? && irt_value != ref_value
        $stderr.puts "In-Reply-To and References disagree: #{mail.message_id}" if $options.debug
      end

      # If in doubt, let In-Reply-To win
      irt_value.to_s
    end

    # Repeats Ag::Storage.fix_threading for +list+ until two consecutive
    # passes report the same total, i.e. until a pass no longer changes the
    # number of unresolved messages.
    #
    # @param list [String] list name
    # @return [nil]
    def calc(list)
      number_of_root_threads = -1

      loop do
        new_num = Ag::Storage.fix_threading(list)
        break if new_num == number_of_root_threads

        number_of_root_threads = new_num
      end
    end
  end
end

# lib/utils.rb
require 'charlock_holmes'

module Ag
  # Small shared helpers.
  module Utils
    module_function

    # Detects the encoding of +str+ with CharlockHolmes and converts it to
    # UTF-8.
    #
    # Never raises for detection/conversion failures: a fixed notice is
    # returned instead, and the error is printed when $options.debug is set.
    #
    # @param str [String, nil]
    # @return [String] UTF-8 text; '' for nil or empty input; the notice
    #   text when detection or conversion fails
    def fix_encoding(str)
      # Nothing to detect or convert; avoids reporting an "encoding" failure
      # for messages that simply have no value here.
      return '' if str.nil? || str.empty?

      detection = CharlockHolmes::EncodingDetector.detect(str)
      CharlockHolmes::Converter.convert(str, detection[:encoding], 'UTF-8')
    rescue => e
      $stderr.puts e.message if $options.debug
      'Encoding could not be reliably detected. Message contents not available.'
    end
  end
end
\ No newline at end of file |