|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Working with arrays of strings in Blosc2\n", |
| 8 | + "\n", |
| 9 | +    "Blosc2 supports arrays whose elements are strings, either byte strings (``np.bytes_``, equivalent to ``np.dtype('S0')``) or Unicode strings (``np.str_``, equivalent to ``np.dtype('U0')``), with the typesize determined by the longest element in the array. That is:"
| 10 | + ] |
| 11 | + }, |
| 12 | + { |
| 13 | + "cell_type": "code", |
| 14 | + "execution_count": 1, |
| 15 | + "metadata": {}, |
| 16 | + "outputs": [ |
| 17 | + { |
| 18 | + "name": "stdout", |
| 19 | + "output_type": "stream", |
| 20 | + "text": [ |
| 21 | + "Bytes array - dtype: |S3, typesize: 3\n", |
| 22 | + "Unicode array - dtype: <U3, typesize: 12\n" |
| 23 | + ] |
| 24 | + } |
| 25 | + ], |
| 26 | + "source": [ |
| 27 | + "import numpy as np\n", |
| 28 | + "\n", |
| 29 | + "arr = np.array([b\"a23\", b\"89u\"])\n", |
| 30 | + "print(f\"Bytes array - dtype: {arr.dtype}, typesize: {arr.dtype.itemsize}\")\n", |
| 31 | + "arr = np.array([\"a23\", \"89u\"])\n", |
| 32 | + "print(f\"Unicode array - dtype: {arr.dtype}, typesize: {arr.dtype.itemsize}\")" |
| 33 | + ] |
| 34 | + }, |
| 35 | + { |
| 36 | + "cell_type": "markdown", |
| 37 | + "metadata": {}, |
| 38 | + "source": [ |
| 39 | + "\n", |
| 40 | +    "since each Unicode character is encoded in 4 bytes. This carries over to the ``blosc2.NDArray`` object. Indeed, such arrays, particularly those of Unicode type, are highly compressible, since almost all of the bits encoding each item are 0 (i.e. ``\\x00``), as can be seen by viewing the second array above as an array of bytestrings:"
| 41 | + ] |
| 42 | + }, |
| 43 | + { |
| 44 | + "cell_type": "code", |
| 45 | + "execution_count": 2, |
| 46 | + "metadata": {}, |
| 47 | + "outputs": [ |
| 48 | + { |
| 49 | + "data": { |
| 50 | + "text/plain": [ |
| 51 | + "array([b'a\\x00\\x00\\x002\\x00\\x00\\x003', b'8\\x00\\x00\\x009\\x00\\x00\\x00u'],\n", |
| 52 | + " dtype='|S12')" |
| 53 | + ] |
| 54 | + }, |
| 55 | + "execution_count": 2, |
| 56 | + "metadata": {}, |
| 57 | + "output_type": "execute_result" |
| 58 | + } |
| 59 | + ], |
| 60 | + "source": [ |
| 61 | + "arr.view(\"S12\")" |
| 62 | + ] |
| 63 | + }, |
| 64 | + { |
| 65 | + "cell_type": "markdown", |
| 66 | + "metadata": {}, |
| 67 | + "source": [ |
| 68 | +    "(Note that the trailing ``\\x00`` bytes of the last character are suppressed in the display.) Consequently, Blosc2 can save a lot of space (in memory or on disk) when working with arrays of strings, provided the structure of the (Unicode) strings is exploited correctly. Specifically, the fundamental building block of the array should be the byte, not the element: in this way, the shuffle filter groups the $N$ elements of $m$ characters (4 bytes each) into $4$ streams of $Nm$ bytes, so that the corresponding bytes of all characters are grouped together. For the array above, this transforms the layout from (2 elements of $3 \\times 4 = 12$ bytes)\n",
| 69 | + "```\n", |
| 70 | + "|a\\x00\\x00\\x002\\x00\\x00\\x003\\x00\\x00\\x00|8\\x00\\x00\\x009\\x00\\x00\\x00u\\x00\\x00\\x00|\n", |
| 71 | + "```\n", |
| 72 | + "to (4 streams of $2 \\times 3 = 6$ bytes)\n", |
| 73 | + "```\n", |
| 74 | + "|a2389u|\\x00\\x00\\x00\\x00\\x00\\x00|\\x00\\x00\\x00\\x00\\x00\\x00|\\x00\\x00\\x00\\x00\\x00\\x00|\n", |
| 75 | + "```\n", |
| 76 | +    "In the example above, 3 of the 4 bytes of each character are 0, so grouping these zeros together makes it far more likely that chunks are composed entirely, or almost entirely, of zeros, which compress readily.\n",
| 77 | +    "If one were instead to shuffle at the level of whole elements, one would end up with $4m$ streams of $N$ bytes, i.e. ($4 \\times 3 = 12$ streams of $2$ bytes)\n",
| 78 | + "```\n", |
| 79 | + "|a8|\\x00\\x00|\\x00\\x00|\\x00\\x00|29|\\x00\\x00|\\x00\\x00|\\x00\\x00|3u|\\x00\\x00|\\x00\\x00|\\x00\\x00|\n", |
| 80 | + "```\n", |
| 81 | +    "which is not very compressible, since the informative bytes are fragmented by small groups of 0 bytes. This heuristic is implemented as the default for Blosc2 string compression, so you can simply compress your arrays of strings and reap the benefits without worrying about such intricacies. Check it out below:"
| 82 | + ] |
| 83 | + }, |
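| 84 | +   {
| 84 | +    "cell_type": "markdown",
| 84 | +    "metadata": {},
| 84 | +    "source": [
| 84 | +     "The byte-grouping described above can be sketched in plain NumPy. This is only an illustration of the idea, not Blosc2's actual implementation: viewing the Unicode array as raw bytes and transposing with a 4-byte typesize reproduces the 4 streams shown above (assuming a little-endian platform):"
| 84 | +    ]
| 84 | +   },
| 84 | +   {
| 84 | +    "cell_type": "code",
| 84 | +    "execution_count": null,
| 84 | +    "metadata": {},
| 84 | +    "outputs": [],
| 84 | +    "source": [
| 84 | +     "import numpy as np\n",
| 84 | +     "\n",
| 84 | +     "arr = np.array([\"a23\", \"89u\"])  # 2 elements x 3 chars x 4 bytes/char\n",
| 84 | +     "raw = arr.view(np.uint8)  # the 24 raw bytes (little-endian UCS-4)\n",
| 84 | +     "streams = raw.reshape(-1, 4).T  # shuffle with typesize 4: one row per byte position\n",
| 84 | +     "print(streams[0].tobytes())  # low bytes carry the ASCII codes\n",
| 84 | +     "print(streams[1:].any())  # the other 3 streams are all zeros"
| 84 | +    ]
| 84 | +   },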
| 84 | + { |
| 85 | + "cell_type": "code", |
| 86 | + "execution_count": 3, |
| 87 | + "metadata": {}, |
| 88 | + "outputs": [ |
| 89 | + { |
| 90 | + "name": "stdout", |
| 91 | + "output_type": "stream", |
| 92 | + "text": [ |
| 93 | + "cratio forcing non-string defaults: 2417x\n", |
| 94 | + "cratio allowing blosc2 to optimise: 3902x\n" |
| 95 | + ] |
| 96 | + } |
| 97 | + ], |
| 98 | + "source": [ |
| 99 | + "import blosc2\n", |
| 100 | + "\n", |
| 101 | + "N = int(1e5)\n", |
| 102 | + "nparr = np.repeat(np.array([\"josé\", \"pepe\", \"francisco\"]), N)\n", |
| 103 | + "cparams = blosc2.cparams_dflts\n", |
| 104 | + "arr1 = blosc2.asarray(nparr, cparams=cparams)\n", |
| 105 | + "print(f\"cratio forcing non-string defaults: {round(arr1.cratio)}x\")\n", |
| 106 | + "arr1 = blosc2.asarray(nparr)\n", |
| 107 | + "print(f\"cratio allowing blosc2 to optimise: {round(arr1.cratio)}x\")" |
| 108 | + ] |
| 109 | + }, |
| 110 | + { |
| 111 | + "cell_type": "markdown", |
| 112 | + "metadata": {}, |
| 113 | + "source": [ |
| 114 | + "## Operating on arrays of strings\n", |
| 115 | + "\n", |
| 116 | +    "Blosc2 has two tightly integrated sides, compression and computation, and arrays of strings benefit from both. We have implemented a subset of useful functions for strings:\n",
| 117 | + "- comparison operations ``<, <=, ==, !=, >=, >``\n", |
| 118 | + "- 2-argument functions ``contains, startswith, endswith``\n", |
| 119 | + "- 1-argument functions ``lower, upper``\n", |
| 120 | + "\n", |
| 121 | +    "Where possible, these are computed by the ``miniexpr`` backend: a highly optimised, fully compiled, multithreaded library that embodies Blosc2's goal of fully vertically integrated decompression/computation/recompression, with optimal exploitation of the cache hierarchy and fast compiled C code for as much of the pipeline as possible. Where this is not possible, a more robust path is used that still avoids memory overload for large arrays.\n",
| 122 | + "\n", |
| 123 | + "The arguments may be scalars or arrays (``blosc2.NDArray`` or other types) of strings or bytes. " |
| 124 | + ] |
| 125 | + }, |
| 126 | + { |
| 127 | + "cell_type": "code", |
| 128 | + "execution_count": 4, |
| 129 | + "metadata": {}, |
| 130 | + "outputs": [], |
| 131 | + "source": [ |
| 132 | + "for t in (\"bytes\", \"string\"):\n", |
| 133 | + " if t == \"bytes\":\n", |
| 134 | + " a1 = np.array([b\"abc\", b\"def\", b\"atErr\", b\"oot\", b\"zu\", b\"ab c\"])\n", |
| 135 | + " a2 = a2_blosc = b\"a\"\n", |
| 136 | + " else:\n", |
| 137 | + " a1 = np.array([\"abc\", \"def\", \"atErr\", \"oot\", \"zu\", \"ab c\"])\n", |
| 138 | + " a2 = a2_blosc = \"a\"\n", |
| 139 | + " a1_blosc = blosc2.asarray(a1)\n", |
| 140 | + " for func, npfunc in zip(\n", |
| 141 | + " (blosc2.startswith, blosc2.endswith, blosc2.contains),\n", |
| 142 | + " (np.char.startswith, np.char.endswith, lambda *args: np.char.find(*args) != -1),\n", |
| 143 | + " strict=True,\n", |
| 144 | + " ):\n", |
| 145 | + " expr_lazy = func(a1_blosc, a2_blosc)\n", |
| 146 | +    "        res_np = npfunc(a1, a2)  # NumPy reference result\n",
| 147 | +    "        assert expr_lazy.shape == res_np.shape\n",
| 148 | +    "        assert expr_lazy.dtype == blosc2.bool_\n",
| 149 | +    "        np.testing.assert_array_equal(expr_lazy[:], res_np)\n",
| 150 | + "\n", |
| 151 | + " np.testing.assert_array_equal((a1_blosc < a2_blosc)[:], a1 < a2)\n", |
| 152 | + " np.testing.assert_array_equal((a1_blosc <= a2_blosc)[:], a1 <= a2)\n", |
| 153 | + " np.testing.assert_array_equal((a1_blosc == a2_blosc)[:], a1 == a2)\n", |
| 154 | + " np.testing.assert_array_equal((a1_blosc != a2_blosc)[:], a1 != a2)\n", |
| 155 | + " np.testing.assert_array_equal((a1_blosc >= a2_blosc)[:], a1 >= a2)\n", |
| 156 | + " np.testing.assert_array_equal((a1_blosc > a2_blosc)[:], a1 > a2)\n", |
| 157 | + "\n", |
| 158 | + " for func, npfunc in zip((blosc2.lower, blosc2.upper), (np.char.lower, np.char.upper), strict=True):\n", |
| 159 | + " expr_lazy = func(a1_blosc)\n", |
| 160 | +    "        res_np = npfunc(a1)  # NumPy reference result\n",
| 161 | +    "        assert expr_lazy.shape == res_np.shape\n",
| 162 | +    "        np.testing.assert_array_equal(expr_lazy[:], res_np)"
| 163 | + ] |
| 164 | +   }
| 170 | + ], |
| 171 | + "metadata": { |
| 172 | + "kernelspec": { |
| 173 | + "display_name": "blosc2env", |
| 174 | + "language": "python", |
| 175 | + "name": "python3" |
| 176 | + }, |
| 177 | + "language_info": { |
| 178 | + "codemirror_mode": { |
| 179 | + "name": "ipython", |
| 180 | + "version": 3 |
| 181 | + }, |
| 182 | + "file_extension": ".py", |
| 183 | + "mimetype": "text/x-python", |
| 184 | + "name": "python", |
| 185 | + "nbconvert_exporter": "python", |
| 186 | + "pygments_lexer": "ipython3", |
| 187 | + "version": "3.13.7" |
| 188 | + } |
| 189 | + }, |
| 190 | + "nbformat": 4, |
| 191 | + "nbformat_minor": 2 |
| 192 | +} |