Skip to content

Commit

Permalink
Use ASCIIString instead of Latin1String (closes #4).
Browse files Browse the repository at this point in the history
It still remains to implement the optimizations this enables, and
to make sure that all strings are output such that, when input
again via the REPL, they are equal to the original string.

Also fixes the UTF-8 test "suite" (fixes #9).
  • Loading branch information
StefanKarpinski committed May 2, 2011
1 parent 23eaf3a commit 83bd7b1
Show file tree
Hide file tree
Showing 17 changed files with 62 additions and 43 deletions.
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@ pcre_h.j:
test: debug
./julia tests.j

testall: test
test-utf8:
./julia test_utf8.j

testall: test test-utf8

SLOCCOUNT = sloccount \
--addlang makefile \
--personcost 100000 \
Expand All @@ -51,4 +53,4 @@ clean:
cleanall: clean
$(MAKE) -C src cleanother

.PHONY: default debug release julia-debug julia-release test testall sloccount clean cleanall
.PHONY: default debug release julia-debug julia-release test test-* testall sloccount clean cleanall
28 changes: 28 additions & 0 deletions ascii.j
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
## from src/boot.j
# type ASCIIString <: String; data::Array{Uint8,1}; end

# Iteration step for ASCIIString: convert the element of s.data at index i to a
# Char and return it paired with the following index, i+1.
next(s::ASCIIString, i::Index) = (char(s.data[i]), i+1)

## overload methods for efficiency ##

# The length of the string is the length of its underlying data array.
length(s::ASCIIString) = length(s.data)
# Compare two ASCII strings by applying lexcmp to their data arrays.
cmp(a::ASCIIString, b::ASCIIString) = lexcmp(a.data, b.data)
# Character position and index are the same number for this type, so both
# conversions return their argument unchanged.
ind2chr(s::ASCIIString, i::Int) = i
chr2ind(s::ASCIIString, i::Int) = i
# Chars that are not below 0x80 raise "char not found" without touching the
# data; anything else is handed to memchr together with the data array.
# NOTE(review): memchr is defined elsewhere -- presumably it returns the index
# of the first matching element; confirm what it does when there is no match.
strchr(s::ASCIIString, c::Char) = c < 0x80 ? memchr(s.data, c) : error("char not found")
# nextind returns i itself, while prevind returns i-1.
# NOTE(review): the asymmetry presumably mirrors the contract of the generic
# String versions of nextind/prevind -- confirm against string.j.
nextind(s::ASCIIString, i::Int) = i
prevind(s::ASCIIString, i::Int) = i-1
# Concatenation of two or more ASCII strings: the result of strdatacat on all
# arguments is wrapped in a new ASCIIString.
# NOTE(review): strdatacat is defined elsewhere -- presumably it joins the
# .data arrays of its arguments; confirm.
strcat(s::ASCIIString, t::ASCIIString, x::ASCIIString...) = ASCIIString(strdatacat(s, t, x...))

## outputting ASCII strings ##

# Printing and writing an ASCIIString both delegate to the same operation on
# its underlying data array; write forwards the given io argument unchanged.
print(s::ASCIIString) = print(s.data)
write(io, s::ASCIIString) = write(io, s.data)

## transcoding to ASCII ##

# An ASCIIString is already ASCII: return it as-is.
ascii(s::ASCIIString) = s
# Convert any other String to an ASCIIString. Each element of chars(s) is
# mapped through f, which converts values below 0x80 with uint8 and raises an
# error naming the offending code point (in hex) for anything else; the mapped
# result is passed to the ASCIIString constructor.
function ascii(s::String)
f = c -> (c < 0x80) ? uint8(c) : error("invalid ASCII code point: U+$(hex(c))")
ASCIIString(map(f, chars(s)))
end
2 changes: 1 addition & 1 deletion expr.j
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## symbols ##

symbol(s::Latin1String) = symbol(s.data)
symbol(s::ASCIIString) = symbol(s.data)
symbol(s::UTF8String) = symbol(s.data)
symbol(a::Array{Uint8,1}) =
ccall(:jl_symbol_n, Any, (Ptr{Uint8}, Int32), a, int32(length(a)))::Symbol
Expand Down
10 changes: 4 additions & 6 deletions latin1.j
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
## from boot.j:
# type Latin1String <: String
# data::Array{Uint8,1}
# end
type Latin1String <: String
data::Array{Uint8,1}
end

next(s::Latin1String, i::Index) = (char(s.data[i]), i+1)

Expand All @@ -27,7 +26,6 @@ write(io, s::Latin1String) = write(io, s.data)

latin1(s::Latin1String) = s
function latin1(s::String)
f = c -> (c <= 0xff) ? uint8(c) :
error("invalid Latin-1 code point: U+$(hex(c))")
f = c -> (c <= 0xff) ? uint8(c) : error("invalid Latin-1 code point: U+$(hex(c))")
Latin1String(map(f, chars(s)))
end
2 changes: 1 addition & 1 deletion multi.j
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ function identify_socket(otherid, fd, sock)
@assert i < PGRP.myid
PGRP.workers[i] = Worker(locs[i].host, locs[i].port, fd, sock)
PGRP.workers[i].id = i
#write(stdout_stream, latin1("$(PGRP.myid) heard from $i\n"))
#write(stdout_stream, "$(PGRP.myid) heard from $i\n")
()
end

Expand Down
2 changes: 1 addition & 1 deletion src/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jl_type_t *jl_array_uint8_type;
jl_type_t *jl_array_any_type;
jl_struct_type_t *jl_weakref_type;
jl_tag_type_t *jl_string_type;
jl_struct_type_t *jl_latin1_string_type;
jl_struct_type_t *jl_ascii_string_type;
jl_struct_type_t *jl_utf8_string_type;
jl_struct_type_t *jl_expr_type;
jl_bits_type_t *jl_intrinsic_type;
Expand Down
4 changes: 2 additions & 2 deletions src/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ jl_value_t *jl_pchar_to_string(char *str, size_t len)
{
jl_array_t *a = jl_pchar_to_array(str, len);
JL_GC_PUSH(&a);
jl_struct_type_t* string_type = u8_isvalid(a->data, len) < 2 ?
jl_latin1_string_type : jl_utf8_string_type;
jl_struct_type_t* string_type = u8_isvalid(a->data, len) == 1 ? // ASCII
jl_ascii_string_type : jl_utf8_string_type;
jl_value_t *s = jl_apply((jl_function_t*)string_type, (jl_value_t**)&a, 1);
JL_GC_POP();
return s;
Expand Down
11 changes: 3 additions & 8 deletions src/boot.j
Original file line number Diff line number Diff line change
Expand Up @@ -117,15 +117,10 @@ isequal(w, v::WeakRef) = isequal(w, v.value)

abstract String

type Latin1String <: String
data::Array{Uint8,1}
end

type UTF8String <: String
data::Array{Uint8,1}
end
type ASCIIString <: String; data::Array{Uint8,1}; end
type UTF8String <: String; data::Array{Uint8,1}; end

typealias ByteString Union(Latin1String,UTF8String)
typealias ByteString Union(ASCIIString,UTF8String)

abstract Exception

Expand Down
4 changes: 2 additions & 2 deletions src/dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ void jl_save_system_image(char *fname, char *startscriptname)
jl_serialize_value(&f, jl_float64_type);
jl_serialize_value(&f, jl_weakref_type);
jl_serialize_value(&f, jl_string_type);
jl_serialize_value(&f, jl_latin1_string_type);
jl_serialize_value(&f, jl_ascii_string_type);
jl_serialize_value(&f, jl_utf8_string_type);
jl_serialize_value(&f, jl_errorexception_type);
jl_serialize_value(&f, jl_typeerror_type);
Expand Down Expand Up @@ -852,7 +852,7 @@ void jl_restore_system_image(char *fname)
jl_weakref_type->env = NULL;
jl_weakref_type->linfo = NULL;
jl_string_type = (jl_tag_type_t*)jl_deserialize_value(&f);
jl_latin1_string_type = (jl_struct_type_t*)jl_deserialize_value(&f);
jl_ascii_string_type = (jl_struct_type_t*)jl_deserialize_value(&f);
jl_utf8_string_type = (jl_struct_type_t*)jl_deserialize_value(&f);
jl_errorexception_type = (jl_struct_type_t*)jl_deserialize_value(&f);
jl_typeerror_type = (jl_struct_type_t*)jl_deserialize_value(&f);
Expand Down
2 changes: 1 addition & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ void jl_get_builtin_hooks()
jl_weakref_type->env = NULL;
jl_weakref_type->linfo = NULL;
jl_string_type = (jl_tag_type_t*)global("String");
jl_latin1_string_type = (jl_struct_type_t*)global("Latin1String");
jl_ascii_string_type = (jl_struct_type_t*)global("ASCIIString");
jl_utf8_string_type = (jl_struct_type_t*)global("UTF8String");
jl_errorexception_type = (jl_struct_type_t*)global("ErrorException");
jl_typeerror_type = (jl_struct_type_t*)global("TypeError");
Expand Down
6 changes: 3 additions & 3 deletions src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ extern jl_struct_type_t *jl_array_type;
extern jl_typename_t *jl_array_typename;
extern jl_struct_type_t *jl_weakref_type;
extern jl_tag_type_t *jl_string_type;
extern jl_struct_type_t *jl_latin1_string_type;
extern jl_struct_type_t *jl_ascii_string_type;
extern jl_struct_type_t *jl_utf8_string_type;
extern jl_struct_type_t *jl_errorexception_type;
extern jl_struct_type_t *jl_typeerror_type;
Expand Down Expand Up @@ -401,9 +401,9 @@ void *allocb_permanent(size_t sz);
#define jl_is_task(v) jl_typeis(v,jl_task_type)
#define jl_is_func(v) (jl_is_func_type(jl_typeof(v)) || jl_is_struct_type(v))
#define jl_is_function(v) jl_is_func(v)
#define jl_is_latin1_string(v) jl_typeis(v,jl_latin1_string_type)
#define jl_is_ascii_string(v) jl_typeis(v,jl_ascii_string_type)
#define jl_is_utf8_string(v) jl_typeis(v,jl_utf8_string_type)
#define jl_is_byte_string(v) (jl_is_latin1_string(v) || jl_is_utf8_string(v))
#define jl_is_byte_string(v) (jl_is_ascii_string(v) || jl_is_utf8_string(v))
#define jl_is_string(v) jl_subtype(v,(jl_value_t*)jl_string_type,1)
#define jl_is_cpointer(v) jl_is_cpointer_type(jl_typeof(v))
#define jl_is_pointer(v) jl_is_cpointer_type(jl_typeof(v))
Expand Down
2 changes: 1 addition & 1 deletion start.j
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ set_current_output_stream(stdout_stream)
stdin_stream = fdio(ccall(:jl_stdin, Int32, ()))
stderr_stream = fdio(ccall(:jl_stderr, Int32, ()))
load("string.j")
load("latin1.j")
load("ascii.j")
load("utf8.j")
load("show.j")
load("regex.j")
Expand Down
6 changes: 4 additions & 2 deletions string.j
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,9 @@ function print_escaped(s::String, q::Bool, xmax::Char)
if q; print('"'); end
end

print_escaped(s::Latin1String, q) = print_escaped(s, q, '\xff')
# TODO: make sure ASCII, Latin-1 and UTF-8 strings all get
# printed so that when input back they are equivalent.

print_escaped(s::String, q) = print_escaped(s, q, '\x7f')
print_escaped(s::String) = print_escaped(s, false)
print_quoted (s::String) = print_escaped(s, true)
Expand Down Expand Up @@ -708,7 +710,7 @@ function uint2str(n::Int, b::Int)
ccall(:uint2str, Ptr{Uint8},
(Ptr{Uint8}, Ulong, Uint64, Uint32),
data, ulong(sz), uint64(n), uint32(b))
Latin1String(data[1:(sz-1)]) # cut out terminating NUL
ASCIIString(data[1:(sz-1)]) # cut out terminating NUL
end

uint2str(n::Int, b::Int, len::Int) = lpad(uint2str(n,b),len,'0')
Expand Down
2 changes: 1 addition & 1 deletion sysimg.j
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ load("io.j")
ccall(:jl_set_memio_func, Void, ())
set_current_output_stream(make_stdout_stream()) # for error reporting
load("string.j")
load("latin1.j")
load("ascii.j")
load("utf8.j")
load("show.j")

Expand Down
6 changes: 1 addition & 5 deletions table.j
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,7 @@ function hash(a::Array)
h
end

# TODO: should we distinguish a UTF8String and
# a Latin1String containing the same exact data?

hash(s::Union(UTF8String,Latin1String)) =
ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data))
hash(s::ByteString) = ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data))

# hash table

Expand Down
6 changes: 3 additions & 3 deletions test_utf8.j
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
utf32 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]);
utf8 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]);
@assert utf32 == utf8
str1 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]);
str2 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]);
@assert str1 == str2

str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε"
str2 = CharString(
Expand Down
6 changes: 2 additions & 4 deletions utf8.j
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
## from boot.j:
# type UTF8String <: String
# data::Array{Uint8,1}
# end
## from src/boot.j:
# type UTF8String <: String; data::Array{Uint8,1}; end

## basic UTF-8 decoding & iteration ##

Expand Down

0 comments on commit 83bd7b1

Please sign in to comment.