@@ -53,9 +53,7 @@ To enable GPU, click "Edit > Notebook settings" and select GPU. If enabled, this
5353These installation commands will take time to run. Begin them now.
5454
5555``` python
56- ! pip install - U accelerate
57- ! pip install - U transformers
58- ! pip install seqeval
56+ ! pip install transformers datasets evaluate seqeval
5957```
6058
6159
@@ -174,7 +172,7 @@ Now, let's take a look at the example data from the dataset used in the example.
174172``` python
175173from datasets import load_dataset, load_metric
176174
177- ds = load_dataset(" conll2003" )
175+ ds = load_dataset(" conll2003" , trust_remote_code = True )
178176print (ds)
179177```
180178
@@ -339,7 +337,7 @@ Finally, let's make our own tweaks to the HuggingFace colab notebook. We'll start
339337
340338``` python
341339import datasets
342- from datasets import load_dataset, load_metric, Features
340+ from datasets import load_dataset, Features
343341```
344342
345343The HuggingFace example uses the [CONLL 2003 dataset](https://www.aclweb.org/anthology/W03-0419.pdf).
@@ -364,7 +362,7 @@ Now that we have a modified huggingface script, let's load our data.
364362
365363
366364``` python
367- ds = load_dataset(" /content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py" )
365+ ds = load_dataset(" /content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py" , trust_remote_code = True )
368366```
369367
370368 /usr/local/lib/python3.10/dist-packages/datasets/load.py:926: FutureWarning: The repository for mit_restaurants contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at /content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py
@@ -601,13 +599,16 @@ model_name = model_checkpoint.split("/")[-1]
601599args = TrainingArguments(
602600 # f"{model_name}-finetuned-{task}",
603601 f " { model_name} -carpentries-restaurant-ner " ,
604- evaluation_strategy = " epoch" ,
605602 learning_rate = 2e-5 ,
606603 per_device_train_batch_size = batch_size,
607604 per_device_eval_batch_size = batch_size,
608605 num_train_epochs = 3 ,
609606 weight_decay = 0.01 ,
610- # push_to_hub=True, #You can have your model automatically pushed to HF if you uncomment this and log in.
607+ report_to = " none" ,
608+ eval_strategy = " epoch" ,
609+ save_strategy = " epoch" ,
610+ load_best_model_at_end = True ,
611+ push_to_hub = False , # Set this to True (after logging in to HF) to have your model automatically pushed to the Hub.
611612)
612613```
613614
@@ -628,32 +629,10 @@ The last thing we want to define is the metric by which we evaluate how our mode
628629
629630
630631``` python
631- metric = load_metric(" seqeval" )
632- labels = [label_list[i] for i in example[f " { task} _tags " ]]
633- metric.compute(predictions = [labels], references = [labels])
634- ```
635-
636- <ipython-input-25-d0b6118e6d86>:1: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
637- metric = load_metric("seqeval")
638- /usr/local/lib/python3.10/dist-packages/datasets/load.py:756: FutureWarning: The repository for seqeval contains custom code which must be executed to correctly load the metric. You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.18.0/metrics/seqeval/seqeval.py
639- You can avoid this message in future by passing the argument `trust_remote_code=True`.
640- Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
641- warnings.warn(
642-
643-
644-
645- Downloading builder script: 0%| | 0.00/2.47k [00:00<?, ?B/s]
632+ import evaluate
646633
647-
648-
649-
650-
651- {'Hours': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
652- 'Restaurant_Name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
653- 'overall_precision': 1.0,
654- 'overall_recall': 1.0,
655- 'overall_f1': 1.0,
656- 'overall_accuracy': 1.0}
634+ seqeval = evaluate.load(" seqeval" )
635+ ```
657636
658637
659638
@@ -808,72 +787,42 @@ Now let's see how our model did. We'll run a more detailed evaluation step from
808787
809788
810789``` python
811- trainer.evaluate()
812-
813- predictions, labels, _ = trainer.predict(tokenized_datasets[" validation" ])
814- predictions = np.argmax(predictions, axis = 2 )
790+ from evaluate import evaluator
815791
816- # Remove ignored index (special tokens)
817- true_predictions = [
818- [label_list[p] for (p, l) in zip (prediction, label) if l != - 100 ]
819- for prediction, label in zip (predictions, labels)
820- ]
821- true_labels = [
822- [label_list[l] for (p, l) in zip (prediction, label) if l != - 100 ]
823- for prediction, label in zip (predictions, labels)
824- ]
792+ task_evaluator = evaluator(" ner" )
793+ data= load_dataset(" /content/drive/MyDrive/Colab Notebooks/text-analysis/code/mit_restaurants.py" , split = " test" , trust_remote_code = True )
825794
826- results = metric.compute(predictions = true_predictions, references = true_labels)
827- results
795+ eval_results = task_evaluator.compute(
796+ model_or_pipeline = " /content/drive/MyDrive/Colab Notebooks/text-analysis/ft-model" ,
797+ data = data,
798+ )
828799```
829800
801+ ``` python
802+ for r in eval_results:
803+ print (r, eval_results[r])
804+ ```
830805
806+ ```
807+ Amenity {'precision': 0.6354515050167224, 'recall': 0.7011070110701108, 'f1': 0.6666666666666667, 'number': 271}
808+ Cuisine {'precision': 0.8378378378378378, 'recall': 0.8641114982578397, 'f1': 0.8507718696397942, 'number': 287}
809+ Dish {'precision': 0.6935483870967742, 'recall': 0.6991869918699187, 'f1': 0.6963562753036437, 'number': 123}
810+ Hours {'precision': 0.5675675675675675, 'recall': 0.7078651685393258, 'f1': 0.6299999999999999, 'number': 89}
811+ Location {'precision': 0.8277777777777777, 'recall': 0.8713450292397661, 'f1': 0.849002849002849, 'number': 342}
812+ Price {'precision': 0.7875, 'recall': 0.863013698630137, 'f1': 0.8235294117647058, 'number': 73}
813+ Rating {'precision': 0.7311827956989247, 'recall': 0.8395061728395061, 'f1': 0.7816091954022988, 'number': 81}
814+ Restaurant_Name {'precision': 0.8323699421965318, 'recall': 0.8323699421965318, 'f1': 0.8323699421965318, 'number': 173}
815+ overall_precision 0.7552083333333334
816+ overall_recall 0.8061153578874218
817+ overall_f1 0.7798319327731092
818+ overall_accuracy 0.9171441163508154
819+ total_time_in_seconds 4.749443094000526
820+ samples_per_second 148.64900705765186
821+ latency_in_seconds 0.006727256507082897
822+ ```
831823
832824
833-
834-
835-
836-
837- {'Amenity': {'precision': 0.6298701298701299,
838- 'recall': 0.6689655172413793,
839- 'f1': 0.6488294314381271,
840- 'number': 290},
841- 'Cuisine': {'precision': 0.8291814946619217,
842- 'recall': 0.8175438596491228,
843- 'f1': 0.8233215547703181,
844- 'number': 285},
845- 'Dish': {'precision': 0.8,
846- 'recall': 0.8715953307392996,
847- 'f1': 0.8342644320297952,
848- 'number': 257},
849- 'Hours': {'precision': 0.7132352941176471,
850- 'recall': 0.776,
851- 'f1': 0.7432950191570882,
852- 'number': 125},
853- 'Location': {'precision': 0.8140900195694716,
854- 'recall': 0.8253968253968254,
855- 'f1': 0.8197044334975369,
856- 'number': 504},
857- 'Price': {'precision': 0.7723577235772358,
858- 'recall': 0.8482142857142857,
859- 'f1': 0.8085106382978723,
860- 'number': 112},
861- 'Rating': {'precision': 0.6896551724137931,
862- 'recall': 0.8130081300813008,
863- 'f1': 0.746268656716418,
864- 'number': 123},
865- 'Restaurant_Name': {'precision': 0.8666666666666667,
866- 'recall': 0.8802083333333334,
867- 'f1': 0.8733850129198966,
868- 'number': 384},
869- 'overall_precision': 0.7805887764489421,
870- 'overall_recall': 0.8158653846153846,
871- 'overall_f1': 0.7978373295721672,
872- 'overall_accuracy': 0.9095345345345346}
873-
874-
875-
876- Whether a F1 score of .795 is 'good enough' depends on the performance of other models, how difficult the task is, and so on. It may be good enough for our needs, or we may want to collect more data, train on a bigger model, or adjust our parameters. For the purposes of the workshop, we will say that this is fine.
825+ Whether an F1 score of about .78 is 'good enough' depends on the performance of other models, how difficult the task is, and so on. It may be good enough for our needs, or we may want to collect more data, train on a bigger model, or adjust our parameters. For the purposes of the workshop, we will say that this is fine.
877826
878827## Using our Model
879828
0 commit comments