Skip to content

Commit 7792041

Browse files
finewebedu preprocess: fix path
1 parent 8ffafb9 commit 7792041

1 file changed

Lines changed: 6 additions & 14 deletions

File tree

dataset/dataset_setup.py

Lines changed: 6 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -782,29 +782,21 @@ def download_finewebedu(
782782
):
783783
"""Download FineWebEdu-10B."""
784784

785-
if not skip_download:
786-
data_dir = os.path.join(data_dir, 'fineweb_edu_10B')
787-
tmp_dir = tmp_dir if tmp_dir is not None else '/tmp'
788-
cache_dir = (
789-
os.path.join(tmp_dir, 'lm')
790-
if tmp_dir is not None
791-
else os.path.expanduser('~/.cache/huggingface/datasets')
792-
)
793-
794-
_maybe_mkdir(data_dir)
795-
_maybe_mkdir(tmp_dir)
796-
_maybe_mkdir(cache_dir)
785+
data_dir = os.path.join(data_dir, 'fineweb_edu_10B')
786+
_maybe_mkdir(data_dir)
787+
_maybe_mkdir(tmp_dir)
797788

789+
if not skip_download:
798790
os.environ['TMPDIR'] = tmp_dir
799791

800792
ds = hf_datasets.load_dataset(
801793
'HuggingFaceFW/fineweb-edu',
802794
name='sample-10BT',
803795
split='train',
804-
cache_dir=cache_dir,
796+
cache_dir=tmp_dir,
805797
)
806798
ds.save_to_disk(os.path.join(tmp_dir, 'fwedu_10B_raw'))
807-
else:
799+
elif not skip_tokenization:
808800
ds = hf_datasets.load_from_disk(os.path.join(tmp_dir, 'fwedu_10B_raw'))
809801

810802
if not skip_tokenization:

0 commit comments

Comments (0)