paperswithcode
diff --git a/MANIFEST.in b/MANIFEST.in
Lines changed: 2 additions & 0 deletions
diff --git a/Makefile b/Makefile
Lines changed: 0 additions & 83 deletions
diff --git a/axcell/config.py b/axcell/config.py
Lines changed: 1 addition & 1 deletion
diff --git a/extract_tables.py b/axcell/data/extract_tables.py
extract_tables.py renamed to axcell/data/extract_tables.py
Lines changed: 1 addition & 1 deletion
diff --git a/axcell/data/json.py b/axcell/data/json.py
Lines changed: 51 additions & 2 deletions
diff --git a/axcell/data/paper_collection.py b/axcell/data/paper_collection.py
Lines changed: 22 additions & 9 deletions
diff --git a/axcell/helpers/datasets.py b/axcell/helpers/datasets.py
Lines changed: 11 additions & 0 deletions
diff --git a/axcell/helpers/jupyter.py b/axcell/helpers/jupyter.py
Lines changed: 1 addition & 1 deletion
diff --git a/axcell/helpers/latex_converter.py b/axcell/helpers/latex_converter.py
Lines changed: 3 additions & 3 deletions
diff --git a/axcell/helpers/paper_extractor.py b/axcell/helpers/paper_extractor.py
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,2 @@
+include axcell/scripts/*
+include axcell/scripts/patches/*
@@ -15,7 +15,7 @@
 goldtags_dump = data / "dumps" / "goldtags-2019.10.15_2227.json.gz"
 
 
-elastic = dict(hosts=['localhost'], timeout=20)
+elastic = dict(hosts=['127.0.0.1'], timeout=20)
 grobid = dict(host='grobid')
 
 arxiv = data/'arxiv'
 
@@ -348,7 +348,7 @@ def remove_footnotes(soup):
 
 
 def extract_tables(html):
-    soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
+    soup = BeautifulSoup(html, "lxml")
     set_ids_by_labels(soup)
     fix_span_tables(soup)
     fix_th(soup)
 
@@ -71,13 +71,62 @@ def cut(s, length=20):
         vals = pprint.pformat({to_snake_case(k): cut(str(self[k]))  for k in self.keys()})
         return f"NodeWrap({vals})"
 
+
+def _annotations_to_gql(annotations):
+    nodes = []
+    for a in annotations:
+        tables = []
+        for t in a['tables']:
+            tags = []
+            if t['leaderboard']:
+                tags.append('leaderboard')
+            if t['ablation']:
+                tags.append('ablation')
+            if not tags:
+                tags = ['irrelevant']
+
+            records = {}
+            for r in t['records']:
+                d = dict(r)
+                del d['row']
+                del d['column']
+                records[f'{r["row"]}.{r["column"]}'] = d
+            table = {
+                'node': {
+                    'name': f'table_{t["index"] + 1:02}.csv',
+                    'datasetText': t['dataset_text'],
+                    'notes': '',
+                    'goldTags': ' '.join(tags),
+                    'matrixGoldTags': t['segmentation'],
+                    'cellsSotaRecords': json.dumps(records),
+                    'parser': 'latexml'
+                }
+            }
+            tables.append(table)
+        node = {
+            'arxivId': a['arxiv_id'],
+            'goldTags': a['fold'],
+            'tableSet': {'edges': tables}
+        }
+        nodes.append({'node': node})
+    return {
+        'data': {
+            'allPapers': {
+                'edges': nodes
+            }
+        }
+    }
+
+
 def load_gql_dump(data_or_file, compressed=True):
-    if isinstance(data_or_file, dict):
+    if isinstance(data_or_file, dict) or isinstance(data_or_file, list):
         papers_data = data_or_file
     else:
         open_fn = gzip.open if compressed else open
         with open_fn(data_or_file, "rt") as f:
-                papers_data = json.load(f)
+            papers_data = json.load(f)
+    if "data" not in papers_data:
+        papers_data = _annotations_to_gql(papers_data)
     data = papers_data["data"]
     return {k:wrap_dict(v) for k,v in data.items()}
 
 
@@ -11,7 +11,7 @@
 from ..helpers.jupyter import display_table
 import string
 import random
-from extract_tables import extract_tables
+from axcell.data.extract_tables import extract_tables
 
 
 class Paper:
@@ -75,23 +75,32 @@ def _load_tables(path, annotations, jobs, migrate):
     return {f.parent.name: tbls for f, tbls in zip(files, tables)}
 
 
+def _gql_dump_to_annotations(dump):
+    annotations = {remove_arxiv_version(a.arxiv_id): a for a in dump}
+    annotations.update({a.arxiv_id: a for a in dump})
+    return annotations
+
 def _load_annotated_papers(data_or_path):
-    if isinstance(data_or_path, dict):
+    if isinstance(data_or_path, dict) or isinstance(data_or_path, list):
         compressed = False
     else:
         compressed = data_or_path.suffix == ".gz"
     dump = load_gql_dump(data_or_path, compressed=compressed)["allPapers"]
-    annotations = {remove_arxiv_version(a.arxiv_id): a for a in dump}
-    annotations.update({a.arxiv_id: a for a in dump})
-    return annotations
+    return _gql_dump_to_annotations(dump)
 
 
 class PaperCollection(UserList):
     def __init__(self, data=None):
         super().__init__(data)
 
     @classmethod
-    def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
+    def from_files(cls, path, annotations=None, load_texts=True, load_tables=True, jobs=-1):
+        return cls._from_files(path, annotations=annotations, annotations_path=None,
+                               load_texts=load_texts, load_tables=load_tables, load_annotations=False,
+                               jobs=jobs)
+
+    @classmethod
+    def _from_files(cls, path, annotations=None, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
         path = Path(path)
         if annotations_path is None:
             annotations_path = path / "structure-annotations.json"
@@ -102,7 +111,10 @@ def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=Tr
         else:
             texts = {}
 
-        annotations = {}
+        if annotations is None:
+            annotations = {}
+        else:
+            annotations = _load_annotated_papers(annotations)
         if load_tables:
             if load_annotations:
                 annotations = _load_annotated_papers(annotations_path)
@@ -131,8 +143,9 @@ def get_by_id(self, paper_id, ignore_version=True):
     def cells_gold_tags_legend(cls):
         tags = [
             ("Tag", "description"),
-            ("model-best", "model that has results that author most likely would like to have exposed"),
-            ("model-paper", "an example of a generic model, (like LSTM)"),
+            ("model-best", "the best performing model introduced in the paper"),
+            ("model-paper", "model introduced in the paper"),
+            ("model-ensemble", "ensemble of models introduced in the paper"),
             ("model-competing", "model from another paper used for comparison"),
             ("dataset-task", "Task"),
             ("dataset", "Dataset"),
 
@@ -0,0 +1,11 @@
+#  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pandas as pd
+
+
+def read_arxiv_papers(path):
+    return pd.read_csv(path)
+
+
+def read_tables_annotations(path):
+    return pd.read_json(path)
@@ -47,4 +47,4 @@ def table_to_html(table, structure=None, layout=None, predictions=None, tooltips
 
 def display_table(table, structure=None, layout=None):
     html = table_to_html(table, structure, layout)
-    display_html("\n".join(html))
+    display_html(html)
@@ -21,13 +21,13 @@ def rw_bind(path): return dict(bind=path, mode='rw')
 
 
 class LatexConverter:
-    def __init__(self, base_path):
+    def __init__(self):
         # pull arxivvanity/engrafo image
         self.client = docker.from_env()
-        self.base_path = Path(base_path)
+        self._scripts_path = Path(__file__).resolve().parent.parent / 'scripts'
 
     def latex2html(self, source_dir, output_dir, use_named_volumes=False):
-        base = self.base_path
+        base = self._scripts_path
         source_dir = Path(source_dir)
         output_dir = Path(output_dir)
         scriptname = "/files/latex2html.sh"
 
@@ -0,0 +1,56 @@
+#  Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from pathlib import Path
+from axcell.helpers import LatexConverter, Unpack
+from axcell.errors import UnpackError, LatexConversionError
+from axcell.data.elastic import Paper as PaperText
+import axcell.data.extract_tables as table_extraction
+
+import re
+import warnings
+
+arxiv_re = re.compile(r"^(?P<arxiv_id>\d{4}\.\d+(v\d+)?)(\..*)?$")
+
+
+class PaperExtractor:
+    def __init__(self, root):
+        self.root = Path(root)
+        self.unpack = Unpack()
+        self.latex = LatexConverter()
+
+    def __call__(self, source):
+        source = Path(source)
+
+        m = arxiv_re.match(source.name)
+        if not m:
+            warnings.warn(f'Unable to infer arxiv_id from "{source.name}" filename')
+            arxiv_id = source.name
+        else:
+            arxiv_id = m.group('arxiv_id')
+
+        subpath = source.relative_to(self.root / 'sources').parent / arxiv_id
+        unpack_path = self.root / 'unpacked_sources' / subpath
+        try:
+            self.unpack(source, unpack_path)
+        except UnpackError as e:
+            if e.message.startswith('The paper has been withdrawn'):
+                return 'withdrawn'
+            return 'no-tex'
+        html_path = self.root / 'htmls' / subpath / 'index.html'
+        try:
+            html = self.latex.to_html(unpack_path)
+            html_path.parent.mkdir(parents=True, exist_ok=True)
+            html_path.write_text(html, 'utf-8')
+        except LatexConversionError:
+            return 'processing-error'
+
+        text_path = self.root / 'papers' / subpath / 'text.json'
+        doc = PaperText.from_html(html, arxiv_id)
+        text_path.parent.mkdir(parents=True, exist_ok=True)
+        text_path.write_text(doc.to_json(), 'utf-8')
+
+        tables_path = self.root / 'papers' / subpath
+        tables = table_extraction.extract_tables(html)
+        table_extraction.save_tables(tables, tables_path)
+
+        return 'success'
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+include axcell/scripts/*`
	`2`	`+include axcell/scripts/patches/*`