Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance improvements for people parsing email headers #505

Merged
merged 6 commits into from
May 14, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions lib/mail/core_extensions/string.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
# encoding: utf-8
class String #:nodoc:

# Pattern matching the line endings that to_crlf has to rewrite.
CRLF_REGEX =
  if RUBY_VERSION >= '1.9'
    # On 1.9+ a lookbehind/lookahead pattern skips "\r\n" pairs that are
    # already correct, so the common case returns the string unchanged
    # (reported as roughly 20% faster). Built with Regexp.new from a string
    # rather than written as a literal.
    Regexp.new("(?<!\r)\n|\r(?!\n)")
  else
    /\n|\r\n|\r/
  end

# Returns the result of to_str with its line endings rewritten as "\r\n".
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The block-form gsub is presumably the removed line and the CRLF_REGEX call
# its replacement — confirm against the repository.
def to_crlf
to_str.gsub(/\n|\r\n|\r/) { "\r\n" }
to_str.gsub(CRLF_REGEX, "\r\n")
end

# Returns the result of to_str with "\r\n" and lone "\r" rewritten as "\n".
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The block-form gsub is presumably the removed line and the string-form call
# (which no longer matches a bare "\n") its replacement — confirm against the
# repository.
def to_lf
to_str.gsub(/\n|\r\n|\r/) { "\n" }
to_str.gsub(/\r\n|\r/, "\n")
end

unless String.instance_methods(false).map {|m| m.to_sym}.include?(:blank?)
Expand Down
64 changes: 47 additions & 17 deletions lib/mail/field.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ module Mail
#
class Field

include Patterns
include Utilities
include Comparable

STRUCTURED_FIELDS = %w[ bcc cc content-description content-disposition
Expand Down Expand Up @@ -67,6 +67,10 @@ class Field
"content-location" => ContentLocationField,
}

# Maps each FIELDS_MAP key to the CAPITALIZED_FIELD constant of the field
# class registered under that key.
FIELD_NAME_MAP = Hash[
  FIELDS_MAP.map { |field, field_klass| [field, field_klass::CAPITALIZED_FIELD] }
]

# Generic Field Exception
class FieldError < StandardError
end
Expand Down Expand Up @@ -110,47 +114,59 @@ class SyntaxError < FieldError #:nodoc:
# Builds a field from one of three call forms, marked by the inline comments
# below: a raw "field-name: field data" string, a bare field name, or a
# name/value pair.
#
# name    - the field name, or a whole raw "name: value" line.
# value   - the field value (default nil). When name contains ":", a
#           non-blank value is used as the charset instead.
# charset - the charset to use (default 'utf-8').
#
# In the instance-variable branches nothing is parsed up front: a raw line is
# kept in @raw_value with @value left nil, and @name is finally replaced by
# its FIELD_NAME_MAP entry (looked up by downcased name) when one exists.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The create_field calls and `return self` are presumably the removed lines
# and the @-assignments their replacements — confirm against the repository.
def initialize(name, value = nil, charset = 'utf-8')
case
when name =~ /:/ # Field.new("field-name: field data")
charset = value unless value.blank?
name, value = split(name)
create_field(name, value, charset)
@charset = value.blank? ? charset : value
@name = name[FIELD_PREFIX]
@raw_value = name
@value = nil
when name !~ /:/ && value.blank? # Field.new("field-name")
create_field(name, nil, charset)
@name = name
@value = nil
@raw_value = nil
@charset = charset
else # Field.new("field-name", "value")
create_field(name, value, charset)
@name = name
@value = value
@raw_value = nil
@charset = charset
end
return self
@name = FIELD_NAME_MAP[@name.to_s.downcase] || @name
end

# Stores the given object in @field, replacing whatever was held before.
def field=(value)
@field = value
end

# Returns the field object held in @field.
#
# In the lines that use @raw_value, the value is split out of @raw_value the
# first time it is needed, and the field object is built with create_field
# and memoized in @field.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The bare `@field` is presumably the removed line and the two lines after it
# the replacement — confirm against the repository.
def field
@field
_, @value = split(@raw_value) if @raw_value && !@value
@field ||= create_field(@name, @value, @charset)
end

# Returns the field name.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# `field.name` is presumably the removed line and `@name` (which avoids
# touching the field object) its replacement — confirm against the repository.
def name
field.name
@name
end

# Returns the value reported by the underlying field object.
def value
field.value
end

# Rebuilds the underlying field object from the current name and the new
# value val, via create_field.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The bare create_field call is presumably the removed line and the
# `@field = ...` assignment its replacement — confirm against the repository.
def value=(val)
create_field(name, val, charset)
@field = create_field(name, val, @charset)
end

# Returns the string form of the underlying field object.
def to_s
field.to_s
end

# Rebuilds the underlying field object from the given name and value, via
# create_field.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The bare create_field call is presumably the removed line and the
# `@field = ...` assignment its replacement — confirm against the repository.
def update(name, value)
create_field(name, value, charset)
@field = create_field(name, value, @charset)
end

# Compares other's name with this field's name using match_to_s (defined
# outside this excerpt) and returns its result. Also reachable as ==.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The `field.name` form is presumably the removed line and the `self.name`
# form its replacement — confirm against the repository.
def same( other )
match_to_s(other.name, field.name)
match_to_s(other.name, self.name)
end

# Returns true when val, converted to a string, equals this field's name
# under a case-insensitive comparison.
def responsible_for?(val)
  own_name = name.to_s
  requested = val.to_s
  own_name.casecmp(requested) == 0
end

alias_method :==, :same
Expand Down Expand Up @@ -182,18 +198,32 @@ def method_missing(name, *args, &block)

# Splits a raw "name: value" header line into a two-element array of
# [name, value], using the capture groups of FIELD_SPLIT and stripping
# surrounding whitespace from each part.
#
# If anything in the body raises a StandardError (for example because the
# line does not match FIELD_SPLIT), a warning is written to STDERR and the
# method returns the result of that puts call instead of an array.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The array line without the trailing `.to_s` is presumably the removed line
# and the one with it the replacement — confirm against the repository.
def split(raw_field)
match_data = raw_field.mb_chars.match(FIELD_SPLIT)
[match_data[1].to_s.mb_chars.strip, match_data[2].to_s.mb_chars.strip]
[match_data[1].to_s.mb_chars.strip, match_data[2].to_s.mb_chars.strip.to_s]
rescue
STDERR.puts "WARNING: Could not parse (and so ignoring) '#{raw_field}'"
end

# 2.2.3. Long Header Fields
#
# "Unfolding" turns the folded, multiple-line representation of a header
# field back into its single-line representation; the specification
# describes it as removing any CRLF that is immediately followed by WSP.
#
# This implementation replaces every run of carriage returns, line feeds,
# spaces and tabs in string with one space and returns the new string.
def unfold(string)
  whitespace_run = /[\r\n \t]+/m
  string.gsub(whitespace_run, ' ')
end

# Builds the field object for name/value/charset using new_field (defined
# outside this excerpt). String values are passed through unfold first.
#
# When new_field raises Mail::Field::ParseError, a Mail::UnstructuredField is
# built instead and the triple [name, value, exception] is appended to its
# errors list, so the parse failure is recorded rather than propagated.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The lines that assign through `self.field` are presumably the removed
# version (store the field) and the lines using the bare call / local `field`
# the replacement (return the field) — confirm against the repository.
def create_field(name, value, charset)
value = unfold(value) if value.is_a?(String)

begin
self.field = new_field(name, value, charset)
new_field(name, value, charset)
rescue Mail::Field::ParseError => e
self.field = Mail::UnstructuredField.new(name, value)
self.field.errors << [name, value, e]
self.field
field = Mail::UnstructuredField.new(name, value)
field.errors << [name, value, e]
field
end
end

Expand Down
36 changes: 18 additions & 18 deletions lib/mail/field_list.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,26 @@ class FieldList < Array

include Enumerable

# Insert the field in sorted order.
#
# Heavily based on bisect.insort from Python, which is:
# Copyright (C) 2001-2013 Python Software Foundation.
# Licensed under <http://docs.python.org/license.html>
# From <http://hg.python.org/cpython/file/2.7/Lib/bisect.py>
#
# The lo/hi loop is a binary search: the upper bound moves down whenever
# new_field sorts before the middle element, otherwise the lower bound moves
# past it, so new_field ends up after any existing elements it does not sort
# before. The element is then placed with insert(lo, new_field).
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The rindex / each_with_index / insert_idx lines are presumably the removed
# linear scan and the lo/hi/mid lines its replacement — confirm against the
# repository.
def <<( new_field )
current_entry = self.rindex(new_field)
if current_entry
self.insert((current_entry + 1), new_field)
else
insert_idx = -1
self.each_with_index do |item, idx|
case item <=> new_field
when -1
next
when 0
next
when 1
insert_idx = idx
break
end
lo = 0
hi = size

while lo < hi
mid = (lo + hi) / 2
if new_field < self[mid]
hi = mid
else
lo = mid + 1
end
insert(insert_idx, new_field)
end

insert(lo, new_field)
end

end
end
end
23 changes: 2 additions & 21 deletions lib/mail/header.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def self.maximum_amount=(value)
# these cases, please make a patch and send it in, or at the least, send
# me the example so we can fix it.
def initialize(header_text = nil, charset = nil)
@errors = []
@charset = charset
self.raw_source = header_text.to_crlf.lstrip
split_header if header_text
Expand Down Expand Up @@ -91,7 +90,6 @@ def fields=(unfolded_fields)
unfolded_fields[0..(self.class.maximum_amount-1)].each do |field|

field = Field.new(field, nil, charset)
field.errors.each { |error| self.errors << error }
if limited_field?(field.name) && (selected = select_field_for(field.name)) && selected.any?
selected.first.update(field.name, field.value)
else
Expand All @@ -102,7 +100,7 @@ def fields=(unfolded_fields)
end

# Returns the errors recorded for this header.
#
# In the @fields form, each field's errors list is collected and flattened by
# one level, so the result follows the order of @fields.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The bare `@errors` is presumably the removed line and the @fields
# expression its replacement — confirm against the repository.
def errors
@errors
@fields.map(&:errors).flatten(1)
end

# 3.6. Field definitions
Expand Down Expand Up @@ -246,27 +244,10 @@ def raw_source=(val)
@raw_source = val
end

# 2.2.3. Long Header Fields
#
# "Unfolding" turns the folded, multiple-line representation of a header
# field back into its single-line representation; the specification
# describes it as removing any CRLF that is immediately followed by WSP.
#
# Works in two passes: each CRLF followed by one or more WSP becomes a single
# space, then every remaining run of WSP is squeezed to a single space.
def unfold(string)
  without_folds = string.gsub(/#{CRLF}#{WSP}+/, ' ')
  without_folds.gsub(/#{WSP}+/, ' ')
end

# Returns the raw source with its folds removed. The unfolded text is
# computed once and cached in @unfolded_header for later calls.
def unfolded_header
  return @unfolded_header if @unfolded_header

  @unfolded_header = unfold(raw_source)
end

# Splits an unfolded and line break cleaned header into individual field
# strings.
#
# The resulting array is assigned through fields=. In the HEADER_SPLIT form
# the raw source is cut only at a CRLF that is not followed by WSP, so folded
# continuation lines stay attached to the field they belong to.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The `unfolded_header.split(CRLF)` line is presumably the removed line and
# the `raw_source.split(HEADER_SPLIT)` line its replacement — confirm against
# the repository.
def split_header
self.fields = unfolded_header.split(CRLF)
self.fields = raw_source.split(HEADER_SPLIT)
end

def select_field_for(name)
Expand Down
4 changes: 3 additions & 1 deletion lib/mail/patterns.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ module Patterns
# CRLF followed by zero or more WSP.
FWS = /#{CRLF}#{WSP}*/
TEXT = /[#{text}]/ # + obs-text
# One or more characters from the field_name character set.
FIELD_NAME = /[#{field_name}]+/
# NOTE(review): FIELD_BODY appears twice in this excerpt, which looks like a
# diff rendered without +/- markers; this form is presumably the removed one
# — confirm against the repository.
FIELD_BODY = /.+/
# A field name anchored at the very start of the string (capture group 1).
FIELD_PREFIX = /\A(#{FIELD_NAME})/
# With the /m flag "." also matches newlines, so a body may span lines.
FIELD_BODY = /.+/m
FIELD_LINE = /^[#{field_name}]+:\s*.+$/
# Captures the field name (group 1) and the optional field body (group 2),
# allowing whitespace around the colon.
FIELD_SPLIT = /^(#{FIELD_NAME})\s*:\s*(#{FIELD_BODY})?$/
HEADER_LINE = /^([#{field_name}]+:\s*.+)$/
# A CRLF that is not followed by WSP.
HEADER_SPLIT = /#{CRLF}(?!#{WSP})/

# Any single character outside / inside the qp_safe character set.
QP_UNSAFE = /[^#{qp_safe}]/
QP_SAFE = /[#{qp_safe}]/
Expand Down
8 changes: 4 additions & 4 deletions spec/mail/header_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -461,10 +461,10 @@
header = Mail::Header.new("Content-Transfer-Encoding: vl@d\r\nReply-To: a b b")
header.errors.should_not be_blank
header.errors.size.should eq 2
header.errors[0][0].should eq 'Content-Transfer-Encoding'
header.errors[0][1].should eq 'vl@d'
header.errors[1][0].should eq 'Reply-To'
header.errors[1][1].should eq 'a b b'
header.errors[0][0].should eq 'Reply-To'
header.errors[0][1].should eq 'a b b'
header.errors[1][0].should eq 'Content-Transfer-Encoding'
header.errors[1][1].should eq 'vl@d'
end
end

Expand Down
10 changes: 5 additions & 5 deletions spec/mail/message_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def basic_email

# Expects the "Could not parse" warning to be written to STDERR when the
# fixture e-mail containing a malformed header line is read.
#
# NOTE(review): this excerpt looks like a diff rendered without +/- markers.
# The Mail.read line without `.to_s` is presumably the removed line; the
# `.to_s` variant presumably forces the header fields to be parsed so that
# the warning is actually emitted — confirm against the repository.
# NOTE(review): the "[email protected]" text inside the expected message
# looks like an e-mail obfuscation artifact of the page this was captured
# from; verify the real address in the repository.
it "should raise a warning (and keep parsing) on having an incorrectly formatted header" do
STDERR.should_receive(:puts).with("WARNING: Could not parse (and so ignoring) 'quite Delivered-To: [email protected]'")
Mail.read(fixture('emails', 'plain_emails', 'raw_email_incorrect_header.eml'))
Mail.read(fixture('emails', 'plain_emails', 'raw_email_incorrect_header.eml')).to_s
end

it "should read in an email message and basically parse it" do
Expand Down Expand Up @@ -1537,10 +1537,10 @@ def self.delivering_email(mail)
mail = Mail.new("Content-Transfer-Encoding: vl@d\r\nReply-To: a b b\r\n")
mail.errors.should_not be_blank
mail.errors.size.should eq 2
mail.errors[0][0].should eq 'Content-Transfer-Encoding'
mail.errors[0][1].should eq 'vl@d'
mail.errors[1][0].should eq 'Reply-To'
mail.errors[1][1].should eq 'a b b'
mail.errors[0][0].should eq 'Reply-To'
mail.errors[0][1].should eq 'a b b'
mail.errors[1][0].should eq 'Content-Transfer-Encoding'
mail.errors[1][1].should eq 'vl@d'
end
end

Expand Down