File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -782,29 +782,21 @@ def download_finewebedu(
782782):
783783 """Download FineWebEdu-10B."""
784784
785- if not skip_download :
786- data_dir = os .path .join (data_dir , 'fineweb_edu_10B' )
787- tmp_dir = tmp_dir if tmp_dir is not None else '/tmp'
788- cache_dir = (
789- os .path .join (tmp_dir , 'lm' )
790- if tmp_dir is not None
791- else os .path .expanduser ('~/.cache/huggingface/datasets' )
792- )
793-
794- _maybe_mkdir (data_dir )
795- _maybe_mkdir (tmp_dir )
796- _maybe_mkdir (cache_dir )
785+ data_dir = os .path .join (data_dir , 'fineweb_edu_10B' )
786+ _maybe_mkdir (data_dir )
787+ _maybe_mkdir (tmp_dir )
797788
789+ if not skip_download :
798790 os .environ ['TMPDIR' ] = tmp_dir
799791
800792 ds = hf_datasets .load_dataset (
801793 'HuggingFaceFW/fineweb-edu' ,
802794 name = 'sample-10BT' ,
803795 split = 'train' ,
804- cache_dir = cache_dir ,
796+ cache_dir = tmp_dir ,
805797 )
806798 ds .save_to_disk (os .path .join (tmp_dir , 'fwedu_10B_raw' ))
807- else :
799+ elif not skip_tokenization :
808800 ds = hf_datasets .load_from_disk (os .path .join (tmp_dir , 'fwedu_10B_raw' ))
809801
810802 if not skip_tokenization :
You can’t perform that action at this time.
0 commit comments