Skip to content

Commit 1809ce6

Browse files
committed
Improve only-non-data: filter BIDS content errors, add real-file tests
Instead of passing --ignoreNiftiHeaders to bids-validator-deno (which would suppress header checks even for real files), run the validator in full and filter out content-dependent BIDS error codes (NIFTI_HEADER_UNREADABLE, EMPTY_FILE) only for broken-symlink files. This way real files still get full BIDS validation. Empirical testing confirms bids-validator-deno reports NIFTI_HEADER_UNREADABLE (error) for broken symlinks while still validating path layout, sidecar JSON, and metadata correctly. Also improves tests: - All test dandisets now include at least one real NWB file alongside the broken symlinks - New test_validate_broken_symlink_real_file_still_validated verifies that pynwb/nwbinspector results exist for the real file and are absent for the broken symlink under both skip and only-non-data - CLI tests verify "sub-003" (real file) appears in output https://claude.ai/code/session_01CLi49c7QcJx11b7UfshbvE
1 parent fcbd44f commit 1809ce6

5 files changed

Lines changed: 141 additions & 53 deletions

File tree

dandi/cli/tests/test_cmd_validate.py

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -902,16 +902,42 @@ def test_validate_auto_companion_structured_stdout(
902902

903903

904904
def _make_dandiset_with_broken_symlinks(tmp_path: Path) -> Path:
905-
"""Create a minimal dandiset with broken NWB symlinks (simulating datalad)."""
905+
"""Create a dandiset with broken NWB symlinks *and* one real NWB file.
906+
907+
Layout::
908+
909+
dandiset.yaml
910+
sub-001/sub-001.nwb -> broken symlink (datalad annex)
911+
sub-002/sub-002.nwb -> broken symlink (datalad annex)
912+
sub-003/sub-003.nwb -> real NWB file (content present)
913+
914+
This lets tests assert that real files are still fully validated
915+
while broken symlinks are handled according to the policy.
916+
"""
917+
from datetime import datetime, timezone
918+
906919
from ...consts import dandiset_metadata_file
920+
from ...pynwb_utils import make_nwb_file
907921

908922
(tmp_path / dandiset_metadata_file).write_text(
909923
"identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
910924
)
925+
# Two broken symlinks
911926
for name in ("sub-001/sub-001.nwb", "sub-002/sub-002.nwb"):
912927
p = tmp_path / name
913928
p.parent.mkdir(parents=True, exist_ok=True)
914-
p.symlink_to("/nonexistent/annex/target.nwb")
929+
p.symlink_to(
930+
".git/annex/objects/XX/YY/SHA256E-s123--abc.nwb/SHA256E-s123--abc.nwb"
931+
)
932+
# One real NWB file
933+
sub3 = tmp_path / "sub-003"
934+
sub3.mkdir()
935+
make_nwb_file(
936+
sub3 / "sub-003.nwb",
937+
session_description="test session",
938+
identifier="test-nwb-003",
939+
session_start_time=datetime(2017, 4, 3, 11, tzinfo=timezone.utc),
940+
)
915941
return tmp_path
916942

917943

@@ -925,29 +951,33 @@ def test_validate_missing_file_content_error_default(tmp_path: Path) -> None:
925951
assert "broken symlink" in r.output.lower()
926952
# No Python tracebacks in output
927953
assert "Traceback" not in r.output
954+
# The real NWB file (sub-003) must still be validated
955+
assert "sub-003" in r.output
928956

929957

930958
@pytest.mark.ai_generated
931959
def test_validate_missing_file_content_skip(tmp_path: Path) -> None:
932960
"""--missing-file-content=skip skips broken symlinks with a WARNING."""
933961
ds = _make_dandiset_with_broken_symlinks(tmp_path)
934962
r = CliRunner().invoke(validate, ["--missing-file-content", "skip", str(ds)])
935-
# Only warnings, no errors -> exit 0
936-
assert r.exit_code == 0
937963
assert "FILE_CONTENT_MISSING_SKIPPED" in r.output
938964
assert "skipped" in r.output.lower()
965+
# The real NWB file (sub-003) must still be validated
966+
assert "sub-003" in r.output
939967

940968

941969
@pytest.mark.ai_generated
942970
def test_validate_missing_file_content_only_non_data(tmp_path: Path) -> None:
943-
"""--missing-file-content=only-non-data validates path layout only."""
971+
"""--missing-file-content=only-non-data validates path layout only for broken."""
944972
ds = _make_dandiset_with_broken_symlinks(tmp_path)
945973
r = CliRunner().invoke(
946974
validate, ["--missing-file-content", "only-non-data", str(ds)]
947975
)
948976
assert "FILE_CONTENT_MISSING_PARTIAL" in r.output
949977
# No pynwb tracebacks
950978
assert "Traceback" not in r.output
979+
# The real NWB file (sub-003) must still be validated
980+
assert "sub-003" in r.output
951981

952982

953983
@pytest.mark.ai_generated

dandi/files/bases.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
from dandi.validate._types import (
3333
ORIGIN_INTERNAL_DANDI,
3434
ORIGIN_VALIDATION_DANDI,
35-
ORIGIN_VALIDATION_DANDI_LAYOUT,
3635
MissingFileContent,
3736
Origin,
3837
OriginType,
@@ -545,22 +544,9 @@ def get_validation_errors(
545544
errors: list[ValidationResult] = []
546545

547546
if missing_file_content == MissingFileContent.only_non_data:
548-
# Skip content-dependent validators; emit a warning
549-
errors.append(
550-
ValidationResult(
551-
id="DANDI.FILE_CONTENT_MISSING_PARTIAL",
552-
origin=ORIGIN_VALIDATION_DANDI_LAYOUT,
553-
severity=Severity.WARNING,
554-
scope=Scope.FILE,
555-
path=self.filepath,
556-
dandiset_path=self.dandiset_path,
557-
message=(
558-
"File content is not available; "
559-
"skipping content-dependent validators "
560-
"(pynwb, nwbinspector). Only path layout is validated."
561-
),
562-
)
563-
)
547+
# Skip content-dependent validators (pynwb, nwbinspector).
548+
# The warning for the user is emitted centrally in _core.validate().
549+
pass
564550
else:
565551
# Avoid heavy import by importing within function:
566552
from nwbinspector import Importance, inspect_nwbfile, load_config

dandi/files/bids.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -112,16 +112,13 @@ def _get_metadata(self) -> None:
112112
result.metadata
113113
)
114114

115-
def _validate(self, ignore_nifti_headers: bool = False) -> None:
115+
def _validate(self) -> None:
116116
with self._lock:
117117
if self._dataset_errors is None:
118118

119119
# Obtain BIDS validation results of the entire dataset through the
120120
# deno-compiled BIDS validator
121-
v_results = bids_validate(
122-
self.bids_root,
123-
ignore_nifti_headers=ignore_nifti_headers,
124-
)
121+
v_results = bids_validate(self.bids_root)
125122

126123
# Validation results from the deno BIDS validator with an additional
127124
# hint, represented as a `ValidationResult` object, following
@@ -172,13 +169,9 @@ def _validate(self, ignore_nifti_headers: bool = False) -> None:
172169
bids_version = self._dataset_errors[0].origin.standard_version
173170
self._bids_version = bids_version
174171

175-
def get_asset_errors(
176-
self,
177-
asset: BIDSAsset,
178-
ignore_nifti_headers: bool = False,
179-
) -> list[ValidationResult]:
172+
def get_asset_errors(self, asset: BIDSAsset) -> list[ValidationResult]:
180173
""":meta private:"""
181-
self._validate(ignore_nifti_headers=ignore_nifti_headers)
174+
self._validate()
182175
assert self._asset_errors is not None
183176
return self._asset_errors[asset.bids_path].copy()
184177

@@ -197,11 +190,7 @@ def get_validation_errors(
197190
"""
198191
Return all validation results for the containing dataset per the BIDS standard
199192
"""
200-
self._validate(
201-
ignore_nifti_headers=(
202-
missing_file_content == MissingFileContent.only_non_data
203-
),
204-
)
193+
self._validate()
205194
assert self._dataset_errors is not None
206195
return self._dataset_errors.copy()
207196

@@ -252,12 +241,7 @@ def get_validation_errors(
252241
devel_debug: bool = False,
253242
missing_file_content: MissingFileContent | None = None,
254243
) -> list[ValidationResult]:
255-
return self.bids_dataset_description.get_asset_errors(
256-
self,
257-
ignore_nifti_headers=(
258-
missing_file_content == MissingFileContent.only_non_data
259-
),
260-
)
244+
return self.bids_dataset_description.get_asset_errors(self)
261245

262246
def get_metadata(
263247
self,

dandi/validate/_core.py

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,16 @@ def _is_broken_symlink(filepath: Path) -> bool:
154154
return filepath.is_symlink() and not filepath.exists()
155155

156156

157+
# BIDS error codes that require reading file content (headers, pixel data).
158+
# When ``only-non-data`` is active these are suppressed for broken-symlink files.
159+
_BIDS_CONTENT_DEPENDENT_CODES = frozenset(
160+
{
161+
"BIDS.NIFTI_HEADER_UNREADABLE",
162+
"BIDS.EMPTY_FILE",
163+
}
164+
)
165+
166+
157167
def validate(
158168
*paths: str | Path,
159169
schema_version: str | None = None,
@@ -218,13 +228,23 @@ def validate(
218228
):
219229
continue
220230
# only-non-data: fall through but pass the flag to validators
231+
232+
is_broken = _is_broken_symlink(df.filepath)
221233
for r in df.get_validation_errors(
222234
schema_version=schema_version,
223235
devel_debug=devel_debug,
224-
missing_file_content=(
225-
missing_file_content if _is_broken_symlink(df.filepath) else None
226-
),
236+
missing_file_content=(missing_file_content if is_broken else None),
227237
):
238+
# For broken-symlink files under only-non-data, suppress
239+
# BIDS errors that require reading file content (e.g.
240+
# NIFTI_HEADER_UNREADABLE). The validator ran in full so
241+
# real files still get those checks.
242+
if (
243+
is_broken
244+
and missing_file_content == MissingFileContent.only_non_data
245+
and r.id in _BIDS_CONTENT_DEPENDENT_CODES
246+
):
247+
continue
228248
r_id = id(r)
229249
if r_id not in df_result_ids:
230250
df_results.append(r)
@@ -275,6 +295,19 @@ def _handle_missing_content(
275295
),
276296
)
277297
else:
278-
# only-non-data: warning will be emitted by individual validators
279-
# that skip content-dependent checks
280-
return None
298+
# only-non-data: emit a warning per file so the user knows that
299+
# content-dependent checks were skipped for this file.
300+
return ValidationResult(
301+
id="DANDI.FILE_CONTENT_MISSING_PARTIAL",
302+
origin=ORIGIN_VALIDATION_DANDI_LAYOUT,
303+
severity=Severity.WARNING,
304+
scope=Scope.FILE,
305+
path=filepath,
306+
dandiset_path=df.dandiset_path,
307+
message=(
308+
f"File content is not available (broken symlink: "
309+
f"{filepath} -> {os.readlink(filepath)}); "
310+
f"content-dependent checks skipped, "
311+
f"path layout still validated."
312+
),
313+
)

dandi/validate/tests/test_core.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,16 +201,39 @@ def test_validate_bids_errors(
201201
assert err_location == expected_err_location
202202

203203

204-
def _make_dandiset_with_broken_symlink(tmp_path: Path) -> Path:
205-
"""Create a minimal dandiset with a broken NWB symlink (simulating datalad)."""
204+
def _make_dandiset_with_broken_symlink(
205+
tmp_path: Path, *, include_real_nwb: bool = False
206+
) -> Path:
207+
"""Create a minimal dandiset with a broken NWB symlink (simulating datalad).
208+
209+
When *include_real_nwb* is True a second subject directory contains a real
210+
(though minimal) NWB file so tests can verify that normal validation still
211+
runs on files with content present alongside broken symlinks.
212+
"""
206213
(tmp_path / dandiset_metadata_file).write_text(
207214
"identifier: '000027'\nname: Test\ndescription: Test dandiset\n"
208215
)
209216
sub_dir = tmp_path / "sub-001"
210217
sub_dir.mkdir()
211218
nwb_link = sub_dir / "sub-001.nwb"
212219
# Symlink to a non-existent target (simulating datalad annex)
213-
nwb_link.symlink_to("/nonexistent/annex/target.nwb")
220+
nwb_link.symlink_to(
221+
".git/annex/objects/XX/YY/SHA256E-s123--abc.nwb/SHA256E-s123--abc.nwb"
222+
)
223+
224+
if include_real_nwb:
225+
from datetime import datetime, timezone
226+
227+
from ...pynwb_utils import make_nwb_file
228+
229+
sub2 = tmp_path / "sub-002"
230+
sub2.mkdir()
231+
make_nwb_file(
232+
sub2 / "sub-002.nwb",
233+
session_description="test session",
234+
identifier="test-nwb-001",
235+
session_start_time=datetime(2017, 4, 3, 11, tzinfo=timezone.utc),
236+
)
214237
return tmp_path
215238

216239

@@ -255,3 +278,35 @@ def test_validate_broken_symlink_only_non_data(tmp_path: Path) -> None:
255278
# No pynwb/nwbinspector errors
256279
pynwb_errs = [r for r in results if r.origin.validator in (Validator.pynwb,)]
257280
assert len(pynwb_errs) == 0
281+
282+
283+
@pytest.mark.ai_generated
284+
def test_validate_broken_symlink_real_file_still_validated(tmp_path: Path) -> None:
285+
"""When a real NWB file coexists with broken symlinks, normal validation
286+
still runs on the real file under all policies."""
287+
ds = _make_dandiset_with_broken_symlink(tmp_path, include_real_nwb=True)
288+
289+
for policy in (MissingFileContent.skip, MissingFileContent.only_non_data):
290+
results = list(validate(ds, missing_file_content=policy))
291+
292+
# The real file (sub-002.nwb) must have been validated by pynwb or
293+
# nwbinspector — look for any result referencing it.
294+
real_file = tmp_path / "sub-002" / "sub-002.nwb"
295+
real_results = [
296+
r for r in results if r.path is not None and r.path == real_file
297+
]
298+
assert len(real_results) > 0, (
299+
f"policy={policy.value}: expected validation results for the real "
300+
f"NWB file at {real_file}"
301+
)
302+
303+
# The broken symlink file must NOT have pynwb/nwbinspector results.
304+
broken_file = tmp_path / "sub-001" / "sub-001.nwb"
305+
broken_pynwb = [
306+
r
307+
for r in results
308+
if r.path == broken_file and r.origin.validator == Validator.pynwb
309+
]
310+
assert (
311+
len(broken_pynwb) == 0
312+
), f"policy={policy.value}: pynwb should not run on the broken symlink"

0 commit comments

Comments
 (0)