Commit 4f91b83

Use Base.hash and add profiling script

pfitzseb committed Nov 17, 2021
1 parent bed7f32 commit 4f91b83
Showing 3 changed files with 61 additions and 16 deletions.
2 changes: 2 additions & 0 deletions src/Tokenize.jl
@@ -12,6 +12,8 @@ import .Tokens: untokenize

export tokenize, untokenize, Tokens

+# disable precompilation when profiling runtime performance, as
+# it can lead to wrong traces
include("_precompile.jl")
_precompile_()

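The new comment advises disabling precompilation while profiling. As a minimal sketch (not part of this commit), the two lines above could be made toggleable behind a hypothetical TOKENIZE_NO_PRECOMPILE environment variable instead of being commented out by hand:

# Hypothetical opt-out guard; the variable name is illustrative.
if !haskey(ENV, "TOKENIZE_NO_PRECOMPILE")
    include("_precompile.jl")
    _precompile_()
end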
21 changes: 5 additions & 16 deletions src/lexer.jl
@@ -714,7 +714,7 @@ function lex_digit(l::Lexer, kind)
            kind = Tokens.HEX_INT
            isfloat = false
            readchar(l)
-            !(ishex(ppc) || ppc =='.') && return emit_error(l, Tokens.INVALID_NUMERIC_CONSTANT)
+            !(ishex(ppc) || ppc == '.') && return emit_error(l, Tokens.INVALID_NUMERIC_CONSTANT)
            accept_number(l, ishex)
            if accept(l, '.')
                accept_number(l, ishex)
@@ -1025,10 +1025,10 @@ function lex_identifier(l::Lexer{IO_t,T}, c) where {IO_t,T}
    if T == Token
        readon(l)
    end
-    h = simple_hash(c, 0)
+    h = simple_hash(c, UInt64(0))
    while true
        pc, ppc = dpeekchar(l)
-        if !is_identifier_char(pc) || (pc == '!' && ppc == '=')
+        if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
            break
        end
        c = readchar(l)
@@ -1038,22 +1038,11 @@ function lex_identifier(l::Lexer{IO_t,T}, c) where {IO_t,T}
    return emit(l, get(kw_hash, h, IDENTIFIER))
end

-# This creates a hash using 5 bits per lowercase ASCII char.
-# It checks that its input is between 'a' and 'z' (because only those chars
-# are valid in keywords), and returns a sentinel value for invalid inputs
-# or when the hash is about to overflow.
-function simple_hash(c, h)
-    h == UInt64(0xff) && return h
-    # only 'a' - 'z' actually need to be hashed
-    'a' <= c <= 'z' || return UInt64(0xff)
-    # catch possible overflow by checking the 10 high bits
-    (h & (UInt64(0x3ff) << (64 - 10))) > 0 && return UInt64(0xff)
-    UInt64(h) << 5 + UInt8(c - 'a' + 1)
-end
+@inline simple_hash(c::Char, h::UInt64) = hash(c, h)

function simple_hash(str)
    ind = 1
-    h = 0
+    h = UInt64(0)
    while ind <= length(str)
        h = simple_hash(str[ind], h)
        ind = nextind(str, ind)
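For context on the hash change: lex_identifier folds each identifier character through simple_hash, which now delegates to Base.hash, then looks the folded value up in the precomputed kw_hash table. A minimal self-contained sketch of that scheme, using placeholder Symbol kinds and illustrative names (fold_hash, kw_table) rather than the package's actual Tokens kinds:

# Fold characters into a running hash, as the new simple_hash does.
fold_hash(c::Char, h::UInt64) = hash(c, h)
fold_hash(str::AbstractString) = foldl((h, c) -> fold_hash(c, h), str; init = UInt64(0))

# Hypothetical keyword table; the real kw_hash maps hashes to Tokens kinds.
kw_pairs = ["begin" => :BEGIN, "end" => :END, "function" => :FUNCTION]
kw_table = Dict(fold_hash(s) => k for (s, k) in kw_pairs)

# Lookup mirrors lex_identifier: unknown hashes fall back to the identifier kind.
get(kw_table, fold_hash("function"), :IDENTIFIER)  # => :FUNCTION
get(kw_table, fold_hash("funktion"), :IDENTIFIER)  # => :IDENTIFIER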
54 changes: 54 additions & 0 deletions test/profile.jl
@@ -0,0 +1,54 @@
using Tokenize

nt = @timed @eval(collect(Tokenize.tokenize("foo + bar")))
println("First run took $(nt.time) seconds with $(nt.bytes/1e6) MB allocated")

srcdir = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "..")

allfiles = []
for (root, dirs, files) in walkdir(srcdir, follow_symlinks = true)
    for file in files
        splitext(file)[2] == ".jl" || continue
        push!(allfiles, joinpath(root, file))
    end
end

# warmup
let time_taken = 0.0, allocated = 0.0
    for file in allfiles
        content = IOBuffer(read(file, String))
        nt = @timed collect(Tokenize.tokenize(content, Tokens.RawToken))
        time_taken += nt.time
        allocated += nt.bytes
    end
end

# actual run
let time_taken = 0.0, allocated = 0.0
    for file in allfiles
        content = IOBuffer(read(file, String))
        nt = @timed collect(Tokenize.tokenize(content, Tokens.RawToken))
        time_taken += nt.time
        allocated += nt.bytes
    end
    println("Tokenized $(length(allfiles)) files in $(time_taken) seconds with $(allocated/1e6) MB allocated")
end

isempty(ARGS) && exit(0)

using PProf, Profile

# warm up profiler
let content = read(first(allfiles), String)
    @profile collect(Tokenize.tokenize(content, Tokens.RawToken))
end

Profile.clear()
for file in allfiles
    content = read(file, String)
    @profile collect(Tokenize.tokenize(content, Tokens.RawToken))
end
pprof()

println("Press any key to exit...")
readline()

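To reproduce the numbers, the script can presumably be run as julia --project test/profile.jl for the timing passes alone; passing any argument (for example julia --project test/profile.jl pprof) additionally records a profile over all collected files and serves it through PProf.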