johnkerl · johnkerl · Feb 5, 2022 · Feb 5, 2022 · Feb 5, 2022
diff --git a/docs/src/manpage.md b/docs/src/manpage.md
@@ -2170,11 +2170,11 @@ FUNCTIONS FOR FILTER/PUT
         (class=math #args=1) e**x - 1.
 
    flatten
-        (class=collections #args=2,3) Flattens multi-level maps to single-level ones. Useful for nested JSON-like structures for non-JSON file formats like CSV.
+        (class=collections #args=2,3) Flattens multi-level maps to single-level ones. Useful for nested JSON-like structures for non-JSON file formats like CSV. With two arguments, the first argument is a map (maybe $*) and the second argument is the flatten separator. With three arguments, the first argument is a prefix, the second is the flatten separator, and the third argument is a map; flatten($*, ".") is the same as flatten("", ".", $*). See "Flatten/unflatten: converting between JSON and tabular formats" at https://miller.readthedocs.io for more information.
        Examples:
+       flatten({"a":[1,2],"b":3}, ".") is {"a.1": 1, "a.2": 2, "b": 3}.
        flatten("a", ".", {"b": { "c": 4 }}) is {"a.b.c" : 4}.
        flatten("", ".", {"a": { "b": 3 }}) is {"a.b" : 3}.
-       Two-argument version: flatten($*, ".") is the same as flatten("", ".", $*).
 
    float
         (class=conversion #args=1) Convert int/float/bool/string to float.
@@ -2222,7 +2222,7 @@ FUNCTIONS FOR FILTER/PUT
        gmt2sec("2001-02-03T04:05:06Z") = 981173106
 
    gsub
-        (class=string #args=3) '$name=gsub($name, "old", "new")' (replace all).
+        (class=string #args=3) '$name = gsub($name, "old", "new")': replace all, with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to gsub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
        Examples:
        gsub("ababab", "ab", "XY") gives "XYXYXY"
        gsub("abc.def", ".", "X") gives "XXXXXXX"
@@ -2414,10 +2414,16 @@ FUNCTIONS FOR FILTER/PUT
        Map example: reduce({"a":1, "b":3, "c": 5}, func(acck,accv,ek,ev) {return {"sum_of_squares": accv + ev**2}}) returns {"sum_of_squares": 35}.
 
    regextract
-        (class=string #args=2) '$name=regextract($name, "[A-Z]{3}[0-9]{2}")'
+        (class=string #args=2) Extracts a substring (the first, if there are multiple matches), matching a regular expression, from the input. Does not use capture groups; see also the =~ operator which does.
+       Examples:
+       regextract("index ab09 file", "[a-z][a-z][0-9][0-9]") gives "ab09"
+       regextract("index a999 file", "[a-z][a-z][0-9][0-9]") gives (absent), which will result in an assignment not happening.
 
    regextract_or_else
-        (class=string #args=3) '$name=regextract_or_else($name, "[A-Z]{3}[0-9]{2}", "default")'
+        (class=string #args=3) Like regextract but the third argument is the return value in case the input string (first argument) doesn't match the pattern (second argument).
+       Examples:
+       regextract_or_else("index ab09 file", "[a-z][a-z][0-9][0-9]", "nonesuch") gives "ab09"
+       regextract_or_else("index a999 file", "[a-z][a-z][0-9][0-9]", "nonesuch") gives "nonesuch"
 
    round
         (class=math #args=1) Round to nearest integer.
@@ -2529,7 +2535,7 @@ FUNCTIONS FOR FILTER/PUT
        ssub("abc.def", ".", "X") gives "abcXdef"
 
    strftime
-        (class=time #args=2) Formats seconds since the epoch as timestamp. Format strings are as in the C library (please see "man strftime" on your system), with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also strftime_local.
+        (class=time #args=2) Formats seconds since the epoch as timestamp. Format strings are mostly as in the C library (see "man strftime" on your system), with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also strftime_local. See also "DSL datetime/timezone functions" at https://miller.readthedocs.io for more information on the differences from the C library.
        Examples:
        strftime(1440768801.7,"%Y-%m-%dT%H:%M:%SZ")  = "2015-08-28T13:33:21Z"
        strftime(1440768801.7,"%Y-%m-%dT%H:%M:%3SZ") = "2015-08-28T13:33:21.700Z"
@@ -2567,7 +2573,7 @@ FUNCTIONS FOR FILTER/PUT
        strptime_local("2015-08-28 13:33:21",     "%Y-%m-%d %H:%M:%S", "Asia/Istanbul") = 1440758001
 
    sub
-        (class=string #args=3) '$name=sub($name, "old", "new")' (replace once).
+        (class=string #args=3) '$name = sub($name, "old", "new")': replace once (first match, if there are multiple matches), with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to sub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
        Examples:
        sub("ababab", "ab", "XY") gives "XYabab"
        sub("abc.def", ".", "X") gives "Xbc.def"
@@ -2612,7 +2618,7 @@ FUNCTIONS FOR FILTER/PUT
         (class=typing #args=1) Convert argument to type of argument (e.g. "str"). For debug.
 
    unflatten
-        (class=collections #args=2) Reverses flatten. Useful for nested JSON-like structures for non-JSON file formats like CSV. See also arrayify.
+        (class=collections #args=2) Reverses flatten. Useful for nested JSON-like structures for non-JSON file formats like CSV. The first argument is a map, and the second argument is the flatten separator. See also arrayify. See "Flatten/unflatten: converting between JSON and tabular formats" at https://miller.readthedocs.io for more information.
        Example:
        unflatten({"a.b.c" : 4}, ".") is {"a": {"b": { "c": 4 }}}.
 
@@ -2720,7 +2726,10 @@ FUNCTIONS FOR FILTER/PUT
         (class=boolean #args=2) String/numeric equality. Mixing number and string results in string compare.
 
    =~
-        (class=boolean #args=2) String (left-hand side) matches regex (right-hand side), e.g. '$name =~ "^a.*b$"'.
+        (class=boolean #args=2) String (left-hand side) matches regex (right-hand side), e.g. '$name =~ "^a.*b$"'. Capture groups \1 through \9 are matched from (...) in the right-hand side, and can be used within subsequent DSL statements. See also "Regular expressions" at https://miller.readthedocs.io.
+       Examples:
+       With if-statement: if ($url =~ "http.*com") { ... }
+       Without if-statement: given $line = "index ab09 file", and $line =~ "([a-z][a-z])([0-9][0-9])", then $label = "[\1:\2]", $label is "[ab:09]"
 
    &gt;
         (class=boolean #args=2) String/numeric greater-than. Mixing number and string results in string compare.

diff --git a/docs/src/manpage.txt b/docs/src/manpage.txt
@@ -2149,11 +2149,11 @@ FUNCTIONS FOR FILTER/PUT
         (class=math #args=1) e**x - 1.
 
    flatten
-        (class=collections #args=2,3) Flattens multi-level maps to single-level ones. Useful for nested JSON-like structures for non-JSON file formats like CSV.
+        (class=collections #args=2,3) Flattens multi-level maps to single-level ones. Useful for nested JSON-like structures for non-JSON file formats like CSV. With two arguments, the first argument is a map (maybe $*) and the second argument is the flatten separator. With three arguments, the first argument is a prefix, the second is the flatten separator, and the third argument is a map; flatten($*, ".") is the same as flatten("", ".", $*). See "Flatten/unflatten: converting between JSON and tabular formats" at https://miller.readthedocs.io for more information.
        Examples:
+       flatten({"a":[1,2],"b":3}, ".") is {"a.1": 1, "a.2": 2, "b": 3}.
        flatten("a", ".", {"b": { "c": 4 }}) is {"a.b.c" : 4}.
        flatten("", ".", {"a": { "b": 3 }}) is {"a.b" : 3}.
-       Two-argument version: flatten($*, ".") is the same as flatten("", ".", $*).
 
    float
         (class=conversion #args=1) Convert int/float/bool/string to float.
@@ -2201,7 +2201,7 @@ FUNCTIONS FOR FILTER/PUT
        gmt2sec("2001-02-03T04:05:06Z") = 981173106
 
    gsub
-        (class=string #args=3) '$name=gsub($name, "old", "new")' (replace all).
+        (class=string #args=3) '$name = gsub($name, "old", "new")': replace all, with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to gsub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
        Examples:
        gsub("ababab", "ab", "XY") gives "XYXYXY"
        gsub("abc.def", ".", "X") gives "XXXXXXX"
@@ -2393,10 +2393,16 @@ FUNCTIONS FOR FILTER/PUT
        Map example: reduce({"a":1, "b":3, "c": 5}, func(acck,accv,ek,ev) {return {"sum_of_squares": accv + ev**2}}) returns {"sum_of_squares": 35}.
 
    regextract
-        (class=string #args=2) '$name=regextract($name, "[A-Z]{3}[0-9]{2}")'
+        (class=string #args=2) Extracts a substring (the first, if there are multiple matches), matching a regular expression, from the input. Does not use capture groups; see also the =~ operator which does.
+       Examples:
+       regextract("index ab09 file", "[a-z][a-z][0-9][0-9]") gives "ab09"
+       regextract("index a999 file", "[a-z][a-z][0-9][0-9]") gives (absent), which will result in an assignment not happening.
 
    regextract_or_else
-        (class=string #args=3) '$name=regextract_or_else($name, "[A-Z]{3}[0-9]{2}", "default")'
+        (class=string #args=3) Like regextract but the third argument is the return value in case the input string (first argument) doesn't match the pattern (second argument).
+       Examples:
+       regextract_or_else("index ab09 file", "[a-z][a-z][0-9][0-9]", "nonesuch") gives "ab09"
+       regextract_or_else("index a999 file", "[a-z][a-z][0-9][0-9]", "nonesuch") gives "nonesuch"
 
    round
         (class=math #args=1) Round to nearest integer.
@@ -2508,7 +2514,7 @@ FUNCTIONS FOR FILTER/PUT
        ssub("abc.def", ".", "X") gives "abcXdef"
 
    strftime
-        (class=time #args=2) Formats seconds since the epoch as timestamp. Format strings are as in the C library (please see "man strftime" on your system), with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also strftime_local.
+        (class=time #args=2) Formats seconds since the epoch as timestamp. Format strings are mostly as in the C library (see "man strftime" on your system), with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also strftime_local. See also "DSL datetime/timezone functions" at https://miller.readthedocs.io for more information on the differences from the C library.
        Examples:
        strftime(1440768801.7,"%Y-%m-%dT%H:%M:%SZ")  = "2015-08-28T13:33:21Z"
        strftime(1440768801.7,"%Y-%m-%dT%H:%M:%3SZ") = "2015-08-28T13:33:21.700Z"
@@ -2546,7 +2552,7 @@ FUNCTIONS FOR FILTER/PUT
        strptime_local("2015-08-28 13:33:21",     "%Y-%m-%d %H:%M:%S", "Asia/Istanbul") = 1440758001
 
    sub
-        (class=string #args=3) '$name=sub($name, "old", "new")' (replace once).
+        (class=string #args=3) '$name = sub($name, "old", "new")': replace once (first match, if there are multiple matches), with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to sub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
        Examples:
        sub("ababab", "ab", "XY") gives "XYabab"
        sub("abc.def", ".", "X") gives "Xbc.def"
@@ -2591,7 +2597,7 @@ FUNCTIONS FOR FILTER/PUT
         (class=typing #args=1) Convert argument to type of argument (e.g. "str"). For debug.
 
    unflatten
-        (class=collections #args=2) Reverses flatten. Useful for nested JSON-like structures for non-JSON file formats like CSV. See also arrayify.
+        (class=collections #args=2) Reverses flatten. Useful for nested JSON-like structures for non-JSON file formats like CSV. The first argument is a map, and the second argument is the flatten separator. See also arrayify. See "Flatten/unflatten: converting between JSON and tabular formats" at https://miller.readthedocs.io for more information.
        Example:
        unflatten({"a.b.c" : 4}, ".") is {"a": {"b": { "c": 4 }}}.
 
@@ -2699,7 +2705,10 @@ FUNCTIONS FOR FILTER/PUT
         (class=boolean #args=2) String/numeric equality. Mixing number and string results in string compare.
 
    =~
-        (class=boolean #args=2) String (left-hand side) matches regex (right-hand side), e.g. '$name =~ "^a.*b$"'.
+        (class=boolean #args=2) String (left-hand side) matches regex (right-hand side), e.g. '$name =~ "^a.*b$"'. Capture groups \1 through \9 are matched from (...) in the right-hand side, and can be used within subsequent DSL statements. See also "Regular expressions" at https://miller.readthedocs.io.
+       Examples:
+       With if-statement: if ($url =~ "http.*com") { ... }
+       Without if-statement: given $line = "index ab09 file", and $line =~ "([a-z][a-z])([0-9][0-9])", then $label = "[\1:\2]", $label is "[ab:09]"
 
    >
         (class=boolean #args=2) String/numeric greater-than. Mixing number and string results in string compare.

diff --git a/docs/src/reference-dsl-builtin-functions.md b/docs/src/reference-dsl-builtin-functions.md
@@ -304,7 +304,10 @@ pow  (class=arithmetic #args=2) Exponentiation. Same as **, but as a function.
 <a id=regmatch />
 ### =~
 <pre class="pre-non-highlight-non-pair">
-=~  (class=boolean #args=2) String (left-hand side) matches regex (right-hand side), e.g. '$name =~ "^a.*b$"'.
+=~  (class=boolean #args=2) String (left-hand side) matches regex (right-hand side), e.g. '$name =~ "^a.*b$"'. Capture groups \1 through \9 are matched from (...) in the right-hand side, and can be used within subsequent DSL statements. See also "Regular expressions" at https://miller.readthedocs.io.
+Examples:
+With if-statement: if ($url =~ "http.*com") { ... }
+Without if-statement: given $line = "index ab09 file", and $line =~ "([a-z][a-z])([0-9][0-9])", then $label = "[\1:\2]", $label is "[ab:09]"
 </pre>
 
 
@@ -389,11 +392,11 @@ depth  (class=collections #args=1) Prints maximum depth of map/array. Scalars ha
 
 ### flatten
 <pre class="pre-non-highlight-non-pair">
-flatten  (class=collections #args=2,3) Flattens multi-level maps to single-level ones. Useful for nested JSON-like structures for non-JSON file formats like CSV.
+flatten  (class=collections #args=2,3) Flattens multi-level maps to single-level ones. Useful for nested JSON-like structures for non-JSON file formats like CSV. With two arguments, the first argument is a map (maybe $*) and the second argument is the flatten separator. With three arguments, the first argument is a prefix, the second is the flatten separator, and the third argument is a map; flatten($*, ".") is the same as flatten("", ".", $*). See "Flatten/unflatten: converting between JSON and tabular formats" at https://miller.readthedocs.io for more information.
 Examples:
+flatten({"a":[1,2],"b":3}, ".") is {"a.1": 1, "a.2": 2, "b": 3}.
 flatten("a", ".", {"b": { "c": 4 }}) is {"a.b.c" : 4}.
 flatten("", ".", {"a": { "b": 3 }}) is {"a.b" : 3}.
-Two-argument version: flatten($*, ".") is the same as flatten("", ".", $*).
 </pre>
 
 
@@ -465,7 +468,7 @@ mapsum  (class=collections #args=variadic) With 0 args, returns empty map. With
 
 ### unflatten
 <pre class="pre-non-highlight-non-pair">
-unflatten  (class=collections #args=2) Reverses flatten. Useful for nested JSON-like structures for non-JSON file formats like CSV. See also arrayify.
+unflatten  (class=collections #args=2) Reverses flatten. Useful for nested JSON-like structures for non-JSON file formats like CSV. The first argument is a map, and the second argument is the flatten separator. See also arrayify. See "Flatten/unflatten: converting between JSON and tabular formats" at https://miller.readthedocs.io for more information.
 Example:
 unflatten({"a.b.c" : 4}, ".") is {"a": {"b": { "c": 4 }}}.
 </pre>
@@ -939,7 +942,7 @@ format("{}:{}:{}", 1,2,3,4) gives "1:2:3".
 
 ### gsub
 <pre class="pre-non-highlight-non-pair">
-gsub  (class=string #args=3) '$name=gsub($name, "old", "new")' (replace all).
+gsub  (class=string #args=3) '$name = gsub($name, "old", "new")': replace all, with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to gsub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
 Examples:
 gsub("ababab", "ab", "XY") gives "XYXYXY"
 gsub("abc.def", ".", "X") gives "XXXXXXX"
@@ -957,13 +960,19 @@ lstrip  (class=string #args=1) Strip leading whitespace from string.
 
 ### regextract
 <pre class="pre-non-highlight-non-pair">
-regextract  (class=string #args=2) '$name=regextract($name, "[A-Z]{3}[0-9]{2}")'
+regextract  (class=string #args=2) Extracts a substring (the first, if there are multiple matches), matching a regular expression, from the input. Does not use capture groups; see also the =~ operator which does.
+Examples:
+regextract("index ab09 file", "[a-z][a-z][0-9][0-9]") gives "ab09"
+regextract("index a999 file", "[a-z][a-z][0-9][0-9]") gives (absent), which will result in an assignment not happening.
 </pre>
 
 
 ### regextract_or_else
 <pre class="pre-non-highlight-non-pair">
-regextract_or_else  (class=string #args=3) '$name=regextract_or_else($name, "[A-Z]{3}[0-9]{2}", "default")'
+regextract_or_else  (class=string #args=3) Like regextract but the third argument is the return value in case the input string (first argument) doesn't match the pattern (second argument).
+Examples:
+regextract_or_else("index ab09 file", "[a-z][a-z][0-9][0-9]", "nonesuch") gives "ab09"
+regextract_or_else("index a999 file", "[a-z][a-z][0-9][0-9]", "nonesuch") gives "nonesuch"
 </pre>
 
 
@@ -995,7 +1004,7 @@ strlen  (class=string #args=1) String length.
 
 ### sub
 <pre class="pre-non-highlight-non-pair">
-sub  (class=string #args=3) '$name=sub($name, "old", "new")' (replace once).
+sub  (class=string #args=3) '$name = sub($name, "old", "new")': replace once (first match, if there are multiple matches), with support for regular expressions. Capture groups \1 through \9 in the new part are matched from (...) in the old part, and must be used within the same call to sub -- they don't persist for subsequent DSL statements. See also =~ and regextract. See also "Regular expressions" at https://miller.readthedocs.io.
 Examples:
 sub("ababab", "ab", "XY") gives "XYabab"
 sub("abc.def", ".", "X") gives "Xbc.def"
@@ -1219,7 +1228,7 @@ sec2localtime(1234567890.123456, 6, "Asia/Istanbul") = "2009-02-14 01:31:30.1234
 
 ### strftime
 <pre class="pre-non-highlight-non-pair">
-strftime  (class=time #args=2) Formats seconds since the epoch as timestamp. Format strings are as in the C library (please see "man strftime" on your system), with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also strftime_local.
+strftime  (class=time #args=2) Formats seconds since the epoch as timestamp. Format strings are mostly as in the C library (see "man strftime" on your system), with the Miller-specific addition of "%1S" through "%9S" which format the seconds with 1 through 9 decimal places, respectively. ("%S" uses no decimal places.) See also strftime_local. See also "DSL datetime/timezone functions" at https://miller.readthedocs.io for more information on the differences from the C library.
 Examples:
 strftime(1440768801.7,"%Y-%m-%dT%H:%M:%SZ")  = "2015-08-28T13:33:21Z"
 strftime(1440768801.7,"%Y-%m-%dT%H:%M:%3SZ") = "2015-08-28T13:33:21.700Z"