-
-
Notifications
You must be signed in to change notification settings - Fork 2.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Type Independent String Functions #891
Changes from 4 commits
40f016f
54b5c00
ae1b7e4
3f20233
39d92e8
ec3194a
714b89c
4793904
23d424c
1f47341
48028ce
ebca31a
66a0559
932eb6f
1648192
1a75cdf
15ccce7
6868c71
a307291
9e4206c
2ace38b
f570f18
dfdfde2
0999ea8
fb24ebc
8b9db75
2f027b7
3639e7a
dc3cfda
dd349fb
5b8012b
f3bf4f6
b0ea58a
6fcbf6e
2d2477a
b2cbcb5
ded5e5d
8453786
c6ff5e4
a2269f9
6e61be7
6294dd4
4dc263e
17dc853
b8d1995
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
zig-cache/ | ||
build/ | ||
build-*/ | ||
.vscode/ | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,7 @@ pub const rand = @import("rand/index.zig"); | |
pub const sort = @import("sort.zig"); | ||
pub const unicode = @import("unicode.zig"); | ||
pub const zig = @import("zig/index.zig"); | ||
pub const stringUtils = @import("string_utils.zig"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The decision to put these things in something called "memory" rather than "string" is intentional. A "string" would be something like String utilities would be a welcome addition to the zig standard library, but they should operate on unicode points, not encoded bytes. Utilities that operate on encoded bytes should be clear about that, using parameter names such as "bytes" and explaining the difference between encoded strings and actual strings in the docs. |
||
|
||
test "std" { | ||
// run tests from these | ||
|
@@ -62,4 +63,5 @@ test "std" { | |
_ = @import("sort.zig"); | ||
_ = @import("unicode.zig"); | ||
_ = @import("zig/index.zig"); | ||
_ = @import("string_utils.zig"); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
const std = @import("index.zig"); | ||
const unicode = std.unicode; | ||
const mem = std.mem; | ||
const math = std.math; | ||
const Set = std.BufSet; | ||
const assert = std.debug.assert; | ||
const allocator = std.debug.debug_allocator; | ||
|
||
// Handles a series of string utilities that are focused around handling and manipulating strings | ||
|
||
// Computes the 32-bit FNV-1a hash of the bytes in `k`.
// Each byte is xor-ed into the running value, which is then multiplied by the
// FNV prime; the multiplication deliberately wraps on overflow (`*%`).
pub fn hash_str(k: []const u8) u32 {
    const offset_basis: u32 = 2166136261;
    const prime: u32 = 16777619;

    var digest: u32 = offset_basis;
    for (k) |byte| {
        digest ^= byte;
        digest *%= prime;
    }
    return digest;
}
|
||
// Hashes the encoded bytes backing a Utf8View.
// This simply forwards the view's `bytes` field to hash_str.
pub fn hash_unicode(k: unicode.Utf8View) u32 {
    const encoded = k.bytes;
    return hash_str(encoded);
}
|
||
// Searches the window a[start..end] for the byte sequence `target`.
//
// Returns the index of a match RELATIVE TO `start` (i.e. an index into
// a[start..end], not into `a`), or null when `target` does not occur.
// When `highest` is false the first (lowest) match is returned; when it is
// true the last (highest) match is returned, overlapping matches included.
// An empty `target` matches at every position, so it yields 0 (lowest) or
// the window length (highest).
//
// `start` and `end` must form a valid slice range of `a`.
pub fn find_str(a: []const u8, target: []const u8, start: usize, end: usize, highest: bool) ?usize {
    const array = a[start..end];
    var index: ?usize = null;
    var i: usize = 0;

    // Stop as soon as the remaining bytes are too few to hold `target`.
    // Every candidate position is checked: skipping ahead after a partial
    // match would miss matches such as "aab" inside "aaab".
    while (i + target.len <= array.len) : (i += 1) {
        if (mem.eql(u8, array[i .. i + target.len], target)) {
            index = i;
            if (!highest) return index;
        }
    }

    return index;
}
|
||
// Selects which end(s) of a byte slice `strip` and `strip_whitespace` trim.
pub const Side = enum {
    // Trim only the leading (left) end.
    LEFT,
    // Trim only the trailing (right) end.
    RIGHT,
    // Trim both ends.
    BOTH,
};
|
||
// Trims space, tab, newline and carriage-return bytes from the requested
// side(s) of `a` by delegating to `strip`.
// The byte set is a hard-coded placeholder until proper locale-aware
// whitespace handling exists.
pub fn strip_whitespace(a: []const u8, sides: Side) []const u8 {
    const whitespace: []const u8 = " \t\n\r";
    return strip(a, whitespace, sides);
}
|
||
// Walks from `start` in steps of `change` (backwards when `decrement` is
// true) for as long as the byte at the current position is one of
// `characters`, and returns the position where the walk stopped.
//
// The walk is bounds-checked:
//  * moving forward it stops once the index reaches or passes a.len, so the
//    result may be >= a.len when every visited byte was strippable;
//  * moving backward it stops when another step would go below zero, so the
//    result may still point at a strippable byte in that case.
// A `change` of 0 can never make progress, so `start` is returned unchanged.
fn impl_strip_side(a: []const u8, characters: []const u8, start: usize, change: usize, decrement: bool) usize {
    if (change == 0) return start;

    var index = start;
    while (index < a.len) {
        var strippable = false;
        for (characters) |candidate| {
            if (candidate == a[index]) {
                strippable = true;
                break;
            }
        }
        if (!strippable) break;

        if (decrement) {
            // Stepping further would underflow the unsigned index.
            if (index < change) break;
            index -= change;
        } else {
            index += change;
        }
    }
    return index;
}
|
||
// Splits `a` on the byte `sep`, writing the pieces into the caller-provided
// slice `*out` and then shrinking `*out` to the number of pieces written.
//
// The pieces are sub-slices of `a`; nothing is copied or allocated.
// Adjacent separators produce empty pieces. At most out.len pieces are
// produced: once only one slot is left, the unsplit remainder of `a`
// (separators included) is stored in it. If `*out` has no slots at all,
// nothing is written and `*out` stays empty.
pub fn split(a: []const u8, sep: u8, out: &[][]const u8) void {
    const capacity = out.len;
    if (capacity == 0) return;

    var count: usize = 0;
    var piece_start: usize = 0;

    for (a) |char, i| {
        // Always keep the final slot free for the remainder, so the write
        // after the loop can never land past the end of `*out`.
        if (count + 1 == capacity) break;

        if (char == sep) {
            (*out)[count] = a[piece_start..i];
            piece_start = i + 1;
            count += 1;
        }
    }

    (*out)[count] = a[piece_start..];
    *out = (*out)[0 .. count + 1];
}
|
||
// Returns the sub-slice of `a` left after removing, from the requested
// side(s), every leading/trailing byte that appears in `characters`.
//
// Note: `characters` is a SET of bytes, not a string to match. Passing
// "abc" does not strip the sequence abc; it strips any run of a, b and c.
//
// Empty input and input made up entirely of strippable bytes both yield an
// empty slice. The result aliases `a`; nothing is copied.
pub fn strip(a: []const u8, characters: []const u8, sides: Side) []const u8 {
    var start: usize = 0;
    // Exclusive upper bound, so an empty result is representable without
    // unsigned underflow.
    var end: usize = a.len;

    if (sides == Side.LEFT or sides == Side.BOTH) {
        while (start < end and strip_set_has(characters, a[start])) {
            start += 1;
        }
    }

    if (sides == Side.RIGHT or sides == Side.BOTH) {
        while (end > start and strip_set_has(characters, a[end - 1])) {
            end -= 1;
        }
    }

    return a[start..end];
}

// Reports whether `byte` is one of the bytes in `characters`.
fn strip_set_has(characters: []const u8, byte: u8) bool {
    for (characters) |candidate| {
        if (candidate == byte) return true;
    }
    return false;
}
|
||
// Reports whether the bytes of `a` begin with the bytes of `target`.
// An empty `target` is a prefix of everything; a `target` longer than `a`
// never matches.
pub fn starts_with(a: []const u8, target: []const u8) bool {
    if (target.len > a.len) return false;
    return mem.eql(u8, a[0..target.len], target);
}
|
||
// Reports whether the bytes of `a` end with the bytes of `target`.
// An empty `target` is a suffix of everything (including an empty `a`);
// a `target` longer than `a` never matches.
pub fn ends_with(a: []const u8, target: []const u8) bool {
    if (a.len < target.len) return false;
    // Compare the whole tail in one go so that every byte of `target` is
    // checked and no unsigned index arithmetic can underflow.
    return mem.eql(u8, a[a.len - target.len ..], target);
}
|
||
// Reports whether `a` and `b` hold exactly the same bytes.
// This is a byte-wise comparison via mem.eql.
pub fn str_eql(a: []const u8, b: []const u8) bool {
    const same = mem.eql(u8, a, b);
    return same;
}
|
||
// Reports whether `byte` is one of the ASCII decimal digits '0' through '9'.
pub fn is_num(byte: u8) bool {
    return switch (byte) {
        '0'...'9' => true,
        else => false,
    };
}
|
||
// Converts an ASCII lowercase letter to its uppercase form.
// WARNING: operates on a single encoded byte, ASCII only. Any byte that
// is_ascii_lower rejects is returned unchanged.
pub fn to_upper(byte: u8) u8 {
    if (!is_ascii_lower(byte)) return byte;
    // Lowercase and uppercase ASCII letters are 32 apart.
    return byte - 32;
}
|
||
// Converts an ASCII uppercase letter to its lowercase form.
// WARNING: operates on a single encoded byte, ASCII only. Any byte that
// is_ascii_upper rejects is returned unchanged.
pub fn to_lower(byte: u8) u8 {
    if (!is_ascii_upper(byte)) return byte;
    // Uppercase and lowercase ASCII letters are 32 apart.
    return byte + 32;
}
|
||
// Reports whether `byte` is an ASCII letter, i.e. accepted by either
// is_ascii_lower or is_ascii_upper.
pub fn is_ascii_letter(byte: u8) bool {
    if (is_ascii_lower(byte)) return true;
    return is_ascii_upper(byte);
}
|
||
// Reports whether `byte` is an ASCII lowercase letter, 'a' through 'z'.
pub fn is_ascii_lower(byte: u8) bool {
    return switch (byte) {
        'a'...'z' => true,
        else => false,
    };
}
|
||
pub fn is_ascii_upper(byte: u8) bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm fairly confident that we should not have asciiUpper and asciiLower in the standard library for these reasons:
Do you have an actual use case for ascii upper/lower right now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the randomizer I use a few ascii functions(They take There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, that's fair. Maybe it's not so bad to have them. We should definitely put a bunch of noticeable warning signs near them and avoid the use of the word "string". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They could live in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm ok with that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. note that "ascii" is different from those ascii functions you linked are correct, because they check for specific ranges of values, but something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One could also just check if it was within 127? That avoids the bit masking operations (which are more expensive and remove clarity)? But yes it probably does have to be checked, though I would rather verify data through the locale viewer (which would check just once) rather than an indeterminate amount of times :). |
||
return byte >= 'A' and byte <= 'Z'; | ||
} | ||
|
||
test "String_Utils" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recommend splitting this into some smaller tests like:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes of course, I just wanted to get a basic testing suite up first :). |
||
assert(is_ascii_letter('C')); | ||
assert(is_ascii_letter('e')); | ||
assert(!is_ascii_letter('2')); | ||
assert(!is_ascii_upper('a')); | ||
assert(is_ascii_upper('B')); | ||
assert(!is_ascii_upper('5')); | ||
assert(is_ascii_lower('a')); | ||
assert(!is_ascii_lower('K')); | ||
assert(!is_ascii_lower('-')); | ||
|
||
assert(is_num('0')); | ||
assert(!is_num('a')); | ||
|
||
assert(str_eql("HOPE", "HOPE")); | ||
assert(!str_eql("Piece", "Peace")); | ||
|
||
assert(ends_with("Hopie", "pie")); | ||
assert(!ends_with("Cat", "ta")); | ||
|
||
assert(starts_with("bat", "ba")); | ||
assert(!starts_with("late", "ma")); | ||
|
||
assert(?? find_str("boo", "o", 0, 3, true) == 2); | ||
assert(?? find_str("nookies", "ook", 0, 7, false) == 1); | ||
assert(find_str("answer to the universe", "42", 0, 22, false) == null); | ||
|
||
assert(str_eql(strip_whitespace(" a ", Side.BOTH), "a")); | ||
assert(str_eql(strip_whitespace(" a ", Side.LEFT), "a ")); | ||
assert(str_eql(strip_whitespace(" a ", Side.RIGHT), " a")); | ||
|
||
assert(str_eql(strip("mississippi", "ipz", Side.BOTH), "mississ")); | ||
|
||
var splits : [3][]const u8 = undefined; | ||
split("Cat,Bat,Mat", ',', &splits[0..]); | ||
var expected = [][3]u8 { "Cat", "Bat", "Mat" }; | ||
|
||
assert(expected.len == splits.len); | ||
for (expected) |str, i| { | ||
assert(str_eql(splits[i], str)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See #888
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair enough I'll remove it :), I shouldn't be calling 'git add .' anyways.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you should be able to use
git add .
. Everything to ignore falls into 2 categories: system-wide ignore, or project-specific ignore. If both are correct then git add .
works just fine.