added confusion matrix (to be tested with v24!), tiny text changes

DanielaSchacherer · DanielaSchacherer · commit 91b06ed1edbd · 2026-02-20T13:56:04.000+01:00
diff --git a/notebooks/collections_demos/bonemarrowwsi_pediatricleukemia.ipynb b/notebooks/collections_demos/bonemarrowwsi_pediatricleukemia.ipynb
@@ -101,16 +101,19 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
+      "execution_count": null,
       "metadata": {
         "id": "dgtRNVatzl2s"
       },
       "outputs": [],
       "source": [
         "import os\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "import seaborn as sns\n",
         "import highdicom as hd\n",
         "from idc_index import IDCClient\n",
-        "import pandas as pd\n",
         "from google.cloud import storage\n",
         "from pathlib import Path\n",
         "from typing import List, Union"
@@ -11390,13 +11393,13 @@
       },
       "source": [
         "## How to use the `BoneMarrowWSI-PediatricLeukemia` annotations\n",
-        "The `BoneMarrowWSI-PediatricLeukemia` collection stands out due to the extensive amount of information contained in its annotations. More than 40000 cells are annotated with bounding boxes suitable for training **cell detection models**, 28000 of those additionally received expert-generated class labels for **cell type classification** tasks. Particularly noteworthy is the uncertainty information embedded in the consensus labelling process, giving insight into which cell types are particularly challenging to determine or easy to confuse with others.\n",
-        "In the cell below, we catch some of those cases:"
+        "The `BoneMarrowWSI-PediatricLeukemia` collection stands out due to the large amount of information contained in its annotations. More than 40000 cells are annotated with bounding boxes suitable for training **cell detection models**; 28000 of those additionally received expert-generated class labels for **cell type classification** tasks. Particularly noteworthy is the uncertainty information embedded in the consensus labelling process, which gives insight into which cell types are particularly challenging to determine or easy to confuse with others.\n",
+        "In the code cells below, we identify some of those cases and display them in a confusion matrix:"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": 33,
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -11629,11 +11632,50 @@
         }
       ],
       "source": [
-        "grouped_cell_labels = sorted_cell_labels.groupby('cell_id').agg({'cell_label': list, 'cell_label_code_scheme': list,\n",
+        "# Note: this cell may run for 2-3 minutes\n",
+        "labeled_cells = get_cell_annotations(subset='labeled', ann_to_process=500)\n",
+        "grouped_cell_labels = labeled_cells.groupby('cell_id').agg({'cell_label': list, 'cell_label_code_scheme': list,\n",
         "                                                              'reference_SOPInstanceUID': 'first',\n",
         "                                                              'cell_coordinates': 'first'})\n",
         "uncertain = grouped_cell_labels['cell_label'].apply(lambda x: len(set(x)) > 1)\n",
-        "display(grouped_cell_labels[uncertain])"
+        "uncertain_cells = grouped_cell_labels[uncertain]\n",
+        "display(uncertain_cells)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Collect every unordered pair of distinct labels that were assigned to the same cell.\n",
+        "# Sorting the distinct labels first means (a, b) and (b, a) are always stored as the same tuple.\n",
+        "label_pairs = []\n",
+        "for labels in uncertain_cells['cell_label']:\n",
+        "    unique_labels = sorted(set(labels))\n",
+        "    label_pairs.extend((a, b) for pos, a in enumerate(unique_labels) for b in unique_labels[pos + 1:])\n",
+        "\n",
+        "# Labels that take part in at least one disagreement, each mapped to its row/column index\n",
+        "all_confused_labels = sorted({label for pair in label_pairs for label in pair})\n",
+        "label_to_idx = {label: idx for idx, label in enumerate(all_confused_labels)}\n",
+        "\n",
+        "# Symmetric count matrix: entry (i, j) is the number of cells that received both label i and label j\n",
+        "conf_matrix = np.zeros((len(all_confused_labels), len(all_confused_labels)), dtype=int)\n",
+        "for l1, l2 in label_pairs:\n",
+        "    i, j = label_to_idx[l1], label_to_idx[l2]\n",
+        "    conf_matrix[i, j] += 1\n",
+        "    conf_matrix[j, i] += 1\n",
+        "\n",
+        "# Plot only when there is something to show: a heatmap of an empty matrix cannot be drawn\n",
+        "if not label_pairs:\n",
+        "    print('No cells with conflicting labels were found, so there is no confusion matrix to plot.')\n",
+        "else:\n",
+        "    fig, ax = plt.subplots(figsize=(10, 8))\n",
+        "    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Reds', xticklabels=all_confused_labels, yticklabels=all_confused_labels, ax=ax)\n",
+        "    ax.set_title('Confusion Matrix of Uncertain Cell Labels')\n",
+        "    ax.set_xlabel('Cell Label')\n",
+        "    ax.set_ylabel('Cell Label')\n",
+        "    plt.show()"
       ]
     },
     {