Added codec: tokenize

dhondta · dhondta · commit 281ca1bacbbc · 2022-09-07T20:36:19.000+02:00
diff --git a/codext/common/dummy.py b/codext/common/dummy.py
@@ -22,7 +22,7 @@ def code(input, errors="strict"):
 # important note:                                              ^
 #                                           using "{2}" here instead will break the codec
 #  this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will
-#   faill to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo
+#   fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo
 
 
 def substitute(token, replacement):
@@ -45,3 +45,13 @@ def code(input, errors="strict"):
 strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i))
 add("strip-spaces", strip_spaces, strip_spaces, guess=None)
 
+def tokenize(n):  # codec factory; n is presumably the encoding name, e.g. "tokenize-2" (to confirm against add())
+    tlen = int(n[8:].lstrip("-_"))  # token length N: skip the first 8 chars (len("tokenize")) and any leading "-"/"_"
+    def code(input, errors="strict"):  # returns (tokenized text, input length); "errors" is not used
+        l = len(input)
+        if tlen > l:  # a token longer than the input is rejected (so is empty input whenever N >= 1)
+            raise LookupError("unknown encoding: %s" % n)
+        return " ".join(input[i:i+tlen] for i in range(0, l, tlen)), l  # last token may be shorter than N
+    return code
+add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None)  # N: integer >= 1, no leading 0
+
diff --git a/docs/manipulations.md b/docs/manipulations.md
@@ -43,11 +43,12 @@ These transformation functions are simple string transformations.
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
-`replace` | text <-> text with single-char replaced |  | 
+`replace` | text <-> text with multi-chars replaced |  | parametrized with a _string_ and its _replacement_
 `reverse` | text <-> reversed text |  | 
 `reverse-words` | text <-> reversed words |  | same as `reverse` but not on the whole text, only on the words (text split by whitespace)
 `strip-spaces` | text <-> all whitespaces stripped |  | 
 `substitute` | text <-> text with token substituted |  | 
+`tokenize` | text <-> text split in space-separated tokens of length N |  | parametrized with _N_, which must not exceed the text length
 
 As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)).
 
@@ -58,6 +59,13 @@ $ echo -en "test string" | codext encode reverse-words | codext encode reverse r
 string_test
 ```
 
+Another example:
+
+```sh
+$ echo -en "3132333435" | codext encode tokenize-2
+31 32 33 34 35
+```
+
 Or using encodings chaining:
 
 ```sh
diff --git a/tests/test_manual.py b/tests/test_manual.py
@@ -100,6 +100,8 @@ def test_codec_dummy_str_manips(self):
         self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR)
         self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that"))
         self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR)
+        self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is  i s  a  te st")
+        self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200")
     
     def test_codec_hash_functions(self):
         STR = b"This is a test string!"