aboutsummaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorAlex Legler <alex@a3li.li>2015-02-19 20:02:20 +0100
committerAlex Legler <alex@a3li.li>2015-02-19 20:02:20 +0100
commit5407f1f169e932063fb145bbb2a971a2188b9cd4 (patch)
tree3b1d38bcf4b14cffbd54899614c1cbda156e6861 /lib
downloadbackend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.gz
backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.bz2
backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.zip
Initial version
Diffstat (limited to 'lib')
-rw-r--r--lib/rendering.rb58
-rw-r--r--lib/storage.rb208
-rw-r--r--lib/threading.rb68
-rw-r--r--lib/utils.rb14
4 files changed, 348 insertions, 0 deletions
diff --git a/lib/rendering.rb b/lib/rendering.rb
new file mode 100644
index 0000000..3e77414
--- /dev/null
+++ b/lib/rendering.rb
@@ -0,0 +1,58 @@
+require 'sanitize'
+require 'cgi'
+
+module Ag::Rendering
+ class HTMLizer
+ # Picks the part of `mail` that holds the body and hands it to to_content.
+ #
+ # For multipart mail only the first part is inspected, plus the first part
+ # of that part when it is itself multipart. Raises a RuntimeError when a
+ # nested multipart does not start with a text/plain or text/html part, and
+ # returns nil when none of the branches below applies.
+ def self.HTMLize(mail)
+   unless mail.multipart?
+     # No Content-Type, assume plain text (git-send-email)
+     body_type = mail.content_type.nil? ? 'text/plain' : mime_split(mail.content_type)
+     return to_content(body_type, mail.body.decoded)
+   end
+
+   first_part = mail.parts.first
+   first_type = mime_split(first_part.content_type)
+   return to_content(first_type, first_part.decoded) if %w[text/plain text/html].include?(first_type)
+
+   if first_part.multipart?
+     # Nested multipart: look one level deeper
+     nested_part = first_part.parts.first
+     nested_type = mime_split(nested_part.content_type)
+     unless %w[text/plain text/html].include?(nested_type)
+       raise "Cannot find body: #{mail.message_id}"
+     end
+
+     to_content(nested_type, nested_part.decoded)
+   elsif mime_split(mail.content_type) == 'multipart/signed'
+     # Specialty: Gnus/Emacs signed emails with no explicit multipart type
+     to_content('text/plain', first_part.decoded)
+   end
+ end
+
+ def self.to_content(content_type, content)
+ if content_type == 'text/plain'
+ escaped_content = CGI::escapeHTML(content)
+ escaped_content.lines.map do |line|
+ if line.start_with? '&gt;'
+ "<div class=\"ag-quote\">#{line.rstrip}</div>\n"
+ else
+ line
+ end
+ end.join.gsub("</div>\n<div class=\"ag-quote\">", "\n")
+ elsif content_type == 'text/html'
+ '<div class="ag-html-content">' + Sanitize.clean(content, Sanitize::Config::BASIC) + '</div>'
+ else
+ '<div class="alert alert-danger" role="alert"><strong>Unsupported Content-Type</strong></div>'
+ end
+ end
+
+ def self.mime_split(content_type)
+ (content_type || '').split(';').first
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/storage.rb b/lib/storage.rb
new file mode 100644
index 0000000..9045a0b
--- /dev/null
+++ b/lib/storage.rb
@@ -0,0 +1,208 @@
+require 'elasticsearch'
+require 'date'
+
+module Ag::Storage
+ module_function
+ def create_index(list)
+ begin
+ $es.indices.delete index: 'ml-' + list
+ rescue Elasticsearch::Transport::Transport::Errors::NotFound => e
+ $stderr.puts "Index did not exist yet. Creating."
+ end
+
+ $es.indices.create(
+ index: 'ml-' + list,
+ body: {
+ mappings: {
+ message: {
+ properties: {
+ attachments: {
+ properties: {
+ filename: {
+ type: 'string',
+ index: 'not_analyzed'
+ },
+ mime: {
+ type: 'string',
+ index: 'not_analyzed'
+ }
+ }
+ },
+ cc: {
+ type: 'string'
+ },
+ content: {
+ type: 'string'
+ },
+ date: {
+ type: 'date',
+ format: 'dateOptionalTime'
+ },
+ from: {
+ type: 'string'
+ },
+ from_realname: {
+ type: 'string'
+ },
+ month: {
+ type: 'integer'
+ },
+ parent: {
+ type: 'string',
+ index: 'not_analyzed'
+ },
+ raw_message_id: {
+ type: 'string',
+ index: 'not_analyzed'
+ },
+ raw_parent: {
+ type: 'string'
+ },
+ subject: {
+ type: 'string'
+ },
+ to: {
+ type: 'string'
+ }
+ }
+ }
+ }
+ })
+ end
+
+ def get_content(message)
+ content = "Cannot parse MIME/contents."
+ begin
+ raw_content = Ag::Rendering::HTMLizer.HTMLize(message)
+ content = Ag::Utils.fix_encoding(raw_content || '').strip
+
+ if content == ''
+ $stderr.puts "#{message.message_id}: Content empty"
+ end
+ rescue
+ $stderr.puts "#{message.message_id}: Invalid encoding"
+ end
+
+ content
+ end
+
+ def get_parent_message(list, parent_message_id = nil)
+ return nil if parent_message_id == nil
+
+ result = $es.search(
+ index: 'ml-' + list,
+ body: {
+ query: {
+ filtered: {
+ filter: {
+ term: { raw_message_id: parent_message_id }
+ }
+ }
+ },
+ fields: ['_id']
+ }
+ )
+
+ return nil if result['hits']['total'] == 0
+
+ result['hits']['hits'].first['_id']
+ end
+
+ def store(list, message)
+ content = get_content(message)
+
+ identifier = message['X-Archives-Hash'].value
+ raw_parent = Ag::Threading.get_parent_message_id(message)
+
+ from = Ag::Utils.fix_encoding(message[:from].formatted.first)
+ from_realname = from.gsub(/<(.*)>/, '').strip
+
+ to = ''
+ if message[:to]
+ to = Ag::Utils.fix_encoding(message[:to].formatted.join(','))
+ end
+
+ cc = ''
+ if message[:cc]
+ cc = Ag::Utils.fix_encoding(message[:cc].formatted.join(','))
+ end
+
+ subject = Ag::Utils.fix_encoding(message.subject)
+
+ attachments = []
+ if message.has_attachments?
+ message.attachments.each do |attachment|
+ attachments << {
+ filename: attachment.filename,
+ mime: attachment.mime_type
+ }
+ end
+ end
+
+ $es.index(
+ index: 'ml-' + list,
+ type: 'message',
+ id: identifier,
+ body: {
+ raw_message_id: message.message_id,
+ subject: subject,
+ to: to,
+ cc: cc,
+ from: from,
+ from_realname: from_realname,
+ date: message.date,
+ month: ("%i%02i" % [message.date.year, message.date.month]).to_i, # this is a sortable number!
+ content: content,
+ attachments: attachments,
+ raw_parent: raw_parent
+ }
+ )
+ end
+
+ def fix_threading(list)
+ result = $es.search(
+ index: 'ml-' + list,
+ size: 100000,
+ body: {
+ size: 100000,
+ query: {
+ filtered: {
+ filter: {
+ and: [
+ {
+ missing: {
+ field: 'parent'
+ }
+ },
+ {
+ exists: {
+ field: 'raw_parent'
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ )
+
+ result['hits']['hits'].each do |hit|
+ msg = get_parent_message(list, hit['_source']['raw_parent'])
+
+ unless msg == nil
+ $es.update(
+ index: 'ml-' + list,
+ type: 'message',
+ id: hit['_id'],
+ body: {
+ doc: {
+ parent: msg
+ }
+ }
+ )
+ end
+ end
+
+ result['hits']['total']
+ end
+end \ No newline at end of file
diff --git a/lib/threading.rb b/lib/threading.rb
new file mode 100644
index 0000000..8988f23
--- /dev/null
+++ b/lib/threading.rb
@@ -0,0 +1,68 @@
+module Ag
+ module Threading
+ module_function
+ # Figures out the Message-Id of the parent message,
+ # or returns nil if we asusme this message is not a reply
+ def get_parent_message_id(mail)
+ # No headers -> no parent message
+ if mail.in_reply_to == nil and mail.references == nil
+ return nil
+ else
+ irt_value = nil
+
+ if mail.in_reply_to.is_a? Array
+ irt_value = mail.in_reply_to.last
+ elsif mail.in_reply_to.is_a? String
+ irt_value = mail.in_reply_to
+
+ # Gnus/Emacs specialty du jour
+ # => "<1075186049.4264.1.camel@TesterTop.tester.ca> (Olivier CrĂȘte's message of \"Tue, 27 Jan 2004 07:47:29 +0100\")"
+ if irt_value.start_with? '<'
+ irt_value = irt_value[0..irt_value.rindex('>')] unless irt_value.end_with? '>'
+ irt_value.gsub!(/(^<|>$)/, '')
+ end
+ elsif mail.in_reply_to == nil
+ # nothing to do
+ else
+ $stderr.puts "In-Reply-To is a weird type: #{mail.message_id}" if $options.debug
+ end
+
+ ref_value = nil
+ if mail.references.is_a? Array
+ ref_value = mail.references.last
+ elsif mail.references.is_a? String
+ ref_value = mail.references
+ elsif mail.references == nil
+ # nothing to do
+ else
+ $stderr.puts "References is a weird type: #{mail.message_id}" if $options.debug
+ end
+
+ if irt_value == ref_value
+ return irt_value.to_s
+ elsif irt_value == nil
+ return ref_value.to_s
+ elsif ref_value == nil
+ return irt_value.to_s
+ else
+ $stderr.puts "In-Reply-To and References disagree: #{mail.message_id}" if $options.debug
+ # If in doubt, let In-Reply-To win
+ return irt_value.to_s
+ end
+ end
+
+ $stderr.puts "Couldn't find a parent id for Message-Id: #{mail.message_id}" if $options.debug
+ nil
+ end
+
+ def calc(list)
+ number_of_root_threads = -1
+ loop do
+ new_num = Ag::Storage.fix_threading(list)
+
+ break if new_num == number_of_root_threads
+ number_of_root_threads = new_num
+ end
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/utils.rb b/lib/utils.rb
new file mode 100644
index 0000000..109a6a5
--- /dev/null
+++ b/lib/utils.rb
@@ -0,0 +1,14 @@
+require 'charlock_holmes'
+
+module Ag
+ module Utils
+ module_function
+ def fix_encoding(str)
+ detection = CharlockHolmes::EncodingDetector.detect(str)
+ CharlockHolmes::Converter.convert(str, detection[:encoding], 'UTF-8')
+ rescue => e
+ $stderr.puts e.message if $options.debug
+ 'Encoding could not be reliably detected. Message contents not available.'
+ end
+ end
+end \ No newline at end of file