Skip to content

Commit

Permalink
Add binary:split/3 and string:find/2,3
Browse files Browse the repository at this point in the history
Also update `string:split/2,3` to work with unicode binaries and fix specs.

Signed-off-by: Paul Guyot <[email protected]>
  • Loading branch information
pguyot committed Sep 29, 2024
1 parent 4b37211 commit c946931
Show file tree
Hide file tree
Showing 10 changed files with 266 additions and 38 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ also non string parameters (e.g. `Enum.join([1, 2], ",")`
- Support for `code:ensure_loaded/1`
- Support for `io_lib:latin1_char_list/1`
- Add support to Elixir for `Keyword.split/2`
- Support for `binary:split/3` and `string:find/2,3`

### Changed

Expand Down
18 changes: 17 additions & 1 deletion libs/estdlib/src/binary.erl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
%%-----------------------------------------------------------------------------
-module(binary).

-export([at/2, part/3, split/2]).
-export([at/2, part/3, split/2, split/3]).

%%-----------------------------------------------------------------------------
%% @param Binary binary to get a byte from
Expand All @@ -51,6 +51,7 @@ part(_Binary, _Pos, _Len) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @equiv split(Binary, Pattern, [])
%% @param Binary binary to split
%% @param Pattern pattern to perform the split
%% @return a list composed of one or two binaries
Expand All @@ -62,3 +63,18 @@ part(_Binary, _Pos, _Len) ->
-spec split(Binary :: binary(), Pattern :: binary()) -> [binary()].
split(_Binary, _Pattern) ->
erlang:nif_error(undefined).

%%-----------------------------------------------------------------------------
%% @param Binary binary to split
%% @param Pattern pattern to perform the split
%% @param Options options for the split
%% @return a list composed of one or two binaries
%% @doc Split a binary according to pattern.
%% If pattern is not found, returns a singleton list with the passed binary.
%% Unlike Erlang/OTP, pattern must be a binary.
%% Only implemented option is `global'
%% @end
%%-----------------------------------------------------------------------------
-spec split(Binary :: binary(), Pattern :: binary(), Option :: [global]) -> [binary()].
split(_Binary, _Pattern, _Option) ->
erlang:nif_error(undefined).
102 changes: 93 additions & 9 deletions libs/estdlib/src/string.erl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
%%-----------------------------------------------------------------------------
-module(string).

-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2]).
-export([to_upper/1, to_lower/1, split/2, split/3, trim/1, trim/2, find/2, find/3]).

%%-----------------------------------------------------------------------------
%% @param Input a string or character to convert
Expand Down Expand Up @@ -76,7 +76,7 @@ lower_char(C) when is_integer(C) ->
%% @returns chardata
%% @end
%%-----------------------------------------------------------------------------
-spec split(String :: string(), Pattern :: string()) -> string() | char().
-spec split(String :: unicode:chardata(), Pattern :: unicode:chardata()) -> [unicode:chardata()].
split(String, Pattern) ->
split(String, Pattern, leading).

Expand All @@ -98,25 +98,50 @@ split(String, Pattern) ->
%% [<<"ab">>,<<"bc">>,<<>>,<<"cd">>]'''
%% @end
%%-----------------------------------------------------------------------------
-spec split(String :: string(), Pattern :: string() | char(), Where :: atom()) -> string() | char().
split(String, Pattern, Where) ->
split(String, Pattern, Where, [], []).
-spec split(
String :: unicode:chardata(), Pattern :: unicode:chardata(), Where :: leading | trailing | all
) -> [unicode:chardata()].
split(String, Pattern, Where) when is_binary(String) andalso is_list(Pattern) ->
split_binary(String, unicode:characters_to_binary(Pattern), Where);
split(String, Pattern, Where) when is_binary(String) andalso is_binary(Pattern) ->
split_binary(String, Pattern, Where);
split(String, Pattern, Where) when is_list(String) andalso is_binary(Pattern) ->
split_list(String, unicode:characters_to_list(Pattern), Where);
split(String, Pattern, Where) when is_list(String) andalso is_list(Pattern) ->
split_list(String, Pattern, Where).

%% @private
split([], _Pattern, _Where, Token, Accum) ->
split_binary(String, Pattern, leading) ->
binary:split(String, Pattern);
split_binary(String, Pattern, all) ->
binary:split(String, Pattern, [global]);
split_binary(String, Pattern, trailing) ->
case find_binary(String, Pattern, trailing) of
nomatch ->
[String];
Rest ->
[binary:part(String, 0, byte_size(String) - byte_size(Rest) - byte_size(Pattern)), Rest]
end.

%% @private
split_list(String, Pattern, Where) ->
split_list(String, Pattern, Where, [], []).

%% @private
split_list([], _Pattern, _Where, Token, Accum) ->
lists:reverse([lists:reverse(Token) | Accum]);
split(String, Pattern, Where, Token, Accum) ->
split_list(String, Pattern, Where, Token, Accum) ->
case prefix_match(String, Pattern) of
{ok, Rest} ->
case Where of
leading ->
[lists:reverse(Token), Rest];
all ->
split(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
split_list(Rest, Pattern, Where, [], [lists:reverse(Token) | Accum])
end;
no ->
[Char | Rest] = String,
split(Rest, Pattern, Where, [Char | Token], Accum)
split_list(Rest, Pattern, Where, [Char | Token], Accum)
end.

%% @private
Expand Down Expand Up @@ -167,3 +192,62 @@ triml([$\s | R]) ->
triml(R);
triml(R) ->
R.

%%-----------------------------------------------------------------------------
%% @equiv find(String, SearchPattern, leading)
%% @param String string to search in
%% @param SearchPattern pattern to search
%% @returns remainder of String starting from first occurrence of SearchPattern
%% or `nomatch' if SearchPattern cannot be found in String
%% @end
%%-----------------------------------------------------------------------------
-spec find(String :: unicode:chardata(), SearchPattern :: unicode:chardata()) ->
unicode:chardata() | nomatch.
find(String, SearchPattern) ->
find(String, SearchPattern, leading).

%%-----------------------------------------------------------------------------
%% @param String string to search in
%% @param SearchPattern pattern to search
%% @param Direction direction to search, `leading' or `trailing'
%% @returns remainder of String starting from first or last occurrence of
%% SearchPattern or `nomatch' if SearchPattern cannot be found in String
%% @end
%%-----------------------------------------------------------------------------
-spec find(
String :: unicode:chardata(),
SearchPattern :: unicode:chardata(),
Direction :: leading | trailing
) -> unicode:chardata() | nomatch.
find(String, "", _Direction) ->
String;
find(String, <<>>, _Direction) ->
String;
find(String, SearchPattern, Direction) when is_binary(String) andalso is_list(SearchPattern) ->
find_binary(String, unicode:characters_to_binary(SearchPattern), Direction);
find(String, SearchPattern, Direction) when is_binary(String) andalso is_binary(SearchPattern) ->
find_binary(String, SearchPattern, Direction);
find(String, SearchPattern, Direction) when is_list(String) andalso is_binary(SearchPattern) ->
find_list(String, unicode:characters_to_list(SearchPattern), Direction);
find(String, SearchPattern, Direction) when is_list(String) andalso is_list(SearchPattern) ->
find_list(String, SearchPattern, Direction).

%% @private
find_binary(<<_C, Rest/binary>> = String, SearchPattern, leading) when
byte_size(String) >= byte_size(SearchPattern)
->
case binary:part(String, 0, byte_size(SearchPattern)) =:= SearchPattern of
true -> String;
false -> find_binary(Rest, SearchPattern, leading)
end;
find_binary(_Sring, _SearchPattern, leading) ->
nomatch.

%% @private
find_list([_C | Rest] = String, SearchPattern, leading) ->
case prefix_match(String, SearchPattern) of
{ok, _Rest} -> String;
no -> find_list(Rest, SearchPattern, leading)
end;
find_list([], _SearchPattern, leading) ->
nomatch.
4 changes: 4 additions & 0 deletions src/libAtomVM/defaultatoms.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ static const char *const cast_atom = "\x5" "$cast";

static const char *const unicode_atom = "\x7" "unicode";

static const char *const global_atom = "\x6" "global";

void defaultatoms_init(GlobalContext *glb)
{
int ok = 1;
Expand Down Expand Up @@ -304,6 +306,8 @@ void defaultatoms_init(GlobalContext *glb)

ok &= globalcontext_insert_atom(glb, unicode_atom) == UNICODE_ATOM_INDEX;

ok &= globalcontext_insert_atom(glb, global_atom) == GLOBAL_ATOM_INDEX;

if (!ok) {
AVM_ABORT();
}
Expand Down
6 changes: 5 additions & 1 deletion src/libAtomVM/defaultatoms.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,9 @@ extern "C" {

#define UNICODE_ATOM_INDEX 110

#define PLATFORM_ATOMS_BASE_INDEX 111
#define GLOBAL_ATOM_INDEX 111

#define PLATFORM_ATOMS_BASE_INDEX 112

#define FALSE_ATOM TERM_FROM_ATOM_INDEX(FALSE_ATOM_INDEX)
#define TRUE_ATOM TERM_FROM_ATOM_INDEX(TRUE_ATOM_INDEX)
Expand Down Expand Up @@ -313,6 +315,8 @@ extern "C" {

#define UNICODE_ATOM TERM_FROM_ATOM_INDEX(UNICODE_ATOM_INDEX)

#define GLOBAL_ATOM TERM_FROM_ATOM_INDEX(GLOBAL_ATOM_INDEX)

void defaultatoms_init(GlobalContext *glb);

void platform_defaultatoms_init(GlobalContext *glb);
Expand Down
104 changes: 77 additions & 27 deletions src/libAtomVM/nifs.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ static term nif_binary_at_2(Context *ctx, int argc, term argv[]);
static term nif_binary_first_1(Context *ctx, int argc, term argv[]);
static term nif_binary_last_1(Context *ctx, int argc, term argv[]);
static term nif_binary_part_3(Context *ctx, int argc, term argv[]);
static term nif_binary_split_2(Context *ctx, int argc, term argv[]);
static term nif_binary_split(Context *ctx, int argc, term argv[]);
static term nif_calendar_system_time_to_universal_time_2(Context *ctx, int argc, term argv[]);
static term nif_erlang_delete_element_2(Context *ctx, int argc, term argv[]);
static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[]);
Expand Down Expand Up @@ -232,7 +232,7 @@ static const struct Nif binary_part_nif =
static const struct Nif binary_split_nif =
{
.base.type = NIFFunctionType,
.nif_ptr = nif_binary_split_2
.nif_ptr = nif_binary_split
};

static const struct Nif make_ref_nif =
Expand Down Expand Up @@ -3007,16 +3007,33 @@ static term nif_binary_part_3(Context *ctx, int argc, term argv[])
return term_maybe_create_sub_binary(bin_term, pos, len, &ctx->heap, ctx->global);
}

static term nif_binary_split_2(Context *ctx, int argc, term argv[])
static term nif_binary_split(Context *ctx, int argc, term argv[])
{
UNUSED(argc);

term bin_term = argv[0];
term pattern_term = argv[1];

VALIDATE_VALUE(bin_term, term_is_binary);
VALIDATE_VALUE(pattern_term, term_is_binary);

bool global = false;
if (argc == 3) {
term options = argv[2];
if (UNLIKELY(!term_is_list(options))) {
RAISE_ERROR(BADARG_ATOM);
}
if (term_is_nonempty_list(options)) {
term head = term_get_list_head(options);
term tail = term_get_list_tail(options);
if (UNLIKELY(head != GLOBAL_ATOM)) {
RAISE_ERROR(BADARG_ATOM);
}
if (UNLIKELY(!term_is_nil(tail))) {
RAISE_ERROR(BADARG_ATOM);
}
global = true;
}
}

int bin_size = term_binary_size(bin_term);
int pattern_size = term_binary_size(pattern_term);

Expand All @@ -3027,38 +3044,71 @@ static term nif_binary_split_2(Context *ctx, int argc, term argv[])
const char *bin_data = term_binary_data(bin_term);
const char *pattern_data = term_binary_data(pattern_term);

const char *found = (const char *) memmem(bin_data, bin_size, pattern_data, pattern_size);
// Count segments first to allocate memory once.
size_t num_segments = 1;
const char *temp_bin_data = bin_data;
int temp_bin_size = bin_size;
do {
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);
if (!found) break;
num_segments++;
int next_search_offset = found - temp_bin_data + pattern_size;
temp_bin_data += next_search_offset;
temp_bin_size -= next_search_offset;
} while (global && temp_bin_size >= pattern_size);

term result_list = term_nil();

if (num_segments == 1) {
// not found
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
}

int offset = found - bin_data;
return term_list_prepend(argv[0], result_list, &ctx->heap);
}

if (found) {
int tok_size = offset;
size_t tok_size_in_terms = term_sub_binary_heap_size(bin_term, tok_size);
// binary:split/2,3 always return sub binaries, except when copied binaries are as small as sub-binaries.
if (UNLIKELY(memory_ensure_free_with_roots(ctx, LIST_SIZE(num_segments, TERM_BOXED_SUB_BINARY_SIZE), 2, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
}

int rest_size = bin_size - offset - pattern_size;
size_t rest_size_in_terms = term_sub_binary_heap_size(bin_term, rest_size);
// Allocate list first
for (size_t index_segments = 0; index_segments < num_segments; index_segments++) {
result_list = term_list_prepend(term_nil(), result_list, &ctx->heap);
}

// + 4 which is the result cons
if (UNLIKELY(memory_ensure_free_with_roots(ctx, tok_size_in_terms + rest_size_in_terms + 4, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
}
// Reset pointers after allocation
bin_data = term_binary_data(argv[0]);
pattern_data = term_binary_data(argv[1]);

term list_cursor = result_list;
temp_bin_data = bin_data;
temp_bin_size = bin_size;
term *list_ptr = term_get_list_ptr(list_cursor);
do {
const char *found = (const char *) memmem(temp_bin_data, temp_bin_size, pattern_data, pattern_size);

bin_term = argv[0];
term tok = term_maybe_create_sub_binary(bin_term, 0, tok_size, &ctx->heap, ctx->global);
term rest = term_maybe_create_sub_binary(bin_term, offset + pattern_size, rest_size, &ctx->heap, ctx->global);
if (found) {
term tok = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, found - temp_bin_data, &ctx->heap, ctx->global);
list_ptr[LIST_HEAD_INDEX] = tok;

term result_list = term_list_prepend(rest, term_nil(), &ctx->heap);
result_list = term_list_prepend(tok, result_list, &ctx->heap);
list_cursor = list_ptr[LIST_TAIL_INDEX];
list_ptr = term_get_list_ptr(list_cursor);

return result_list;
int next_search_offset = found - temp_bin_data + pattern_size;
temp_bin_data += next_search_offset;
temp_bin_size -= next_search_offset;
}

} else {
if (UNLIKELY(memory_ensure_free_with_roots(ctx, 2, 1, argv, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
RAISE_ERROR(OUT_OF_MEMORY_ATOM);
if (!found || !global) {
term rest = term_maybe_create_sub_binary(argv[0], temp_bin_data - bin_data, temp_bin_size, &ctx->heap, ctx->global);
list_ptr[LIST_HEAD_INDEX] = rest;
break;
}
} while (!term_is_nil(list_cursor));

return term_list_prepend(argv[0], term_nil(), &ctx->heap);
}
return result_list;
}

static term nif_erlang_throw(Context *ctx, int argc, term argv[])
Expand Down
1 change: 1 addition & 0 deletions src/libAtomVM/nifs.gperf
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ binary:first/1, &binary_first_nif
binary:last/1, &binary_last_nif
binary:part/3, &binary_part_nif
binary:split/2, &binary_split_nif
binary:split/3, &binary_split_nif
calendar:system_time_to_universal_time/2, &system_time_to_universal_time_nif
erlang:atom_to_binary/1, &atom_to_binary_nif
erlang:atom_to_binary/2, &atom_to_binary_nif
Expand Down
1 change: 1 addition & 0 deletions tests/libs/estdlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ include(BuildErlang)

set(ERLANG_MODULES
test_apply
test_binary
test_calendar
test_gen_event
test_gen_server
Expand Down
Loading

0 comments on commit c946931

Please sign in to comment.