cms-dev
diff --git a/cms/server/contest/submission/file_retrieval.py b/cms/server/contest/submission/file_retrieval.py
Lines changed: 52 additions & 29 deletions
diff --git a/cms/server/contest/submission/workflow.py b/cms/server/contest/submission/workflow.py
Lines changed: 35 additions & 6 deletions
diff --git a/cmscommon/archive.py b/cmscommon/archive.py
Lines changed: 81 additions & 0 deletions
@@ -33,14 +33,14 @@
 
 """
 
-import os.path
+import io
+import pathlib
 import typing
 
-from patoolib.util import PatoolError
 if typing.TYPE_CHECKING:
     from tornado.httputil import HTTPFile
 
-from cmscommon.archive import Archive
+from cmscommon.archive import open_archive
 
 
 # Represents a file received through HTTP from an HTML form.
@@ -57,19 +57,30 @@ class ReceivedFile(typing.NamedTuple):
 class InvalidArchive(Exception):
     """Raised when the archive submitted by the user cannot be opened."""
 
-    pass
+    def __init__(self, too_big: bool = False, too_many_files: bool = False):
+        """
+        too_big: Whether the InvalidArchive was raised because the files in it
+            exceeded the size limit after decompression.
+        too_many_files: Whether the InvalidArchive was raised because the
+            archive contained more than the maximum number of files.
+        """
+        self.too_big = too_big
+        self.too_many_files = too_many_files
+        super().__init__()
 
 
-def extract_files_from_archive(data: bytes) -> list[ReceivedFile]:
+def extract_files_from_archive(
+    data: bytes, max_size: int | None = None, max_files: int | None = None
+) -> list[ReceivedFile]:
     """Return the files contained in the given archive.
 
-    Given the binary data of an archive in any of the formats supported
-    by patool, extract its contents and return them in our format. The
-    archive's contents must be a valid directory structure (i.e., its
-    contents cannot have conflicting/duplicated paths) but the structure
-    will be ignored and the files will be returned with their basename.
+    Given the binary data of an archive in a supported format, extract its
+    contents and return them in our format. The directory structure of the
+    archive is ignored; files will be returned with their basename.
 
     data: the raw contents of the archive.
+    max_size: maximum total decompressed size of all files (None: no limit).
+    max_files: maximum number of files in the archive (None: no limit).
 
     return: the files contained in the archive, with
         their filename filled in but their codename set to None.
@@ -78,31 +89,35 @@ def extract_files_from_archive(data: bytes) -> list[ReceivedFile]:
         archive, its contents are invalid, or other issues.
 
     """
-    archive = Archive.from_raw_data(data)
-
-    if archive is None:
-        raise InvalidArchive()
-
-    result = list()
 
+    result: list[ReceivedFile] = []
+    total_size = 0
     try:
-        archive.unpack()
-        for name in archive.namelist():
-            with archive.read(name) as f:
-                result.append(
-                    ReceivedFile(None, os.path.basename(name), f.read()))
-
-    except (PatoolError, OSError):
+        archive = open_archive(io.BytesIO(data))
+        for (filepath, size, handle) in archive.iter_regular_files():
+            total_size += size
+            if max_size is not None and total_size > max_size:
+                raise InvalidArchive(too_big=True)
+            if max_files is not None and len(result) + 1 > max_files:
+                raise InvalidArchive(too_many_files=True)
+            filedata = archive.get_file_bytes(handle)
+            # Archive member paths are always /-separated, so PurePosixPath
+            # can be used to extract the basename.
+            filename = pathlib.PurePosixPath(filepath).name
+            result.append(ReceivedFile(None, filename, filedata))
+    except InvalidArchive:
+        raise
+    # The archive readers may raise all kinds of exceptions when fed invalid data, so catch them all here.
+    except Exception:
         raise InvalidArchive()
 
-    finally:
-        archive.cleanup()
-
     return result
 
 
 def extract_files_from_tornado(
     tornado_files: dict[str, list["HTTPFile"]],
+    max_size: int | None = None,
+    max_files: int | None = None,
 ) -> list[ReceivedFile]:
     """Transform some files as received by Tornado into our format.
 
@@ -112,16 +127,24 @@ def extract_files_from_tornado(
     it and return its contents instead.
 
     tornado_files: a bunch of files, in Tornado's format.
+    max_size: limit on the total size of the decompressed files
+        (protects against zip bombs).
+    max_files: maximum number of files to allow in the archive.
 
     return: the same bunch of files, in our format
         (except if it was an archive: then it's the archive's contents).
 
     raise (InvalidArchive): if there are issues extracting the archive.
 
     """
-    if len(tornado_files) == 1 and "submission" in tornado_files \
-            and len(tornado_files["submission"]) == 1:
-        return extract_files_from_archive(tornado_files["submission"][0].body)
+    if (
+        len(tornado_files) == 1
+        and "submission" in tornado_files
+        and len(tornado_files["submission"]) == 1
+    ):
+        return extract_files_from_archive(
+            tornado_files["submission"][0].body, max_size, max_files
+        )
 
     result = list()
     for codename, files in tornado_files.items():
 
@@ -155,12 +155,36 @@ def accept_submission(
 
     required_codenames = set(task.submission_format)
 
+    # To protect against zip bombs, we raise an error as soon as the archive's
+    # contents prove too big, before extracting everything. The largest
+    # "reasonable" archive is one that provides every submission file, each of
+    # the maximum allowed size. Since we don't yet know which files from the
+    # archive are used and which are extraneous, this size limit applies to
+    # the entire archive in total.
+    archive_size_limit = config.max_submission_length * len(required_codenames)
+    # Honest users never need to submit more than required_codenames files, but
+    # we are a bit lenient to allow .DS_Store or other hidden files that might
+    # accidentally end up in an archive.
+    archive_max_files = 2 * len(required_codenames)
     try:
-        received_files = extract_files_from_tornado(tornado_files)
-    except InvalidArchive:
-        raise UnacceptableSubmission(
-            N_("Invalid archive format!"),
-            N_("The submitted archive could not be opened."))
+        received_files = extract_files_from_tornado(
+            tornado_files, archive_size_limit, archive_max_files
+        )
+    except InvalidArchive as e:
+        if e.too_big:
+            raise UnacceptableSubmission(
+                N_("Submission too big!"),
+                N_("Each source file must be at most %d bytes long."),
+                config.max_submission_length)
+        if e.too_many_files:
+            raise UnacceptableSubmission(
+                N_("Submission too big!"),
+                N_("The submission should contain at most %d files."),
+                len(required_codenames))
+        else:
+            raise UnacceptableSubmission(
+                N_("Invalid archive format!"),
+                N_("The submitted archive could not be opened."))
 
     try:
         files, language = match_files_and_language(
@@ -342,8 +366,13 @@ def accept_user_test(
     required_codenames.update(task_type.get_user_managers())
     required_codenames.add("input")
 
+    # See accept_submission() for these variables.
+    archive_size_limit = config.max_submission_length * len(required_codenames)
+    archive_max_files = 2 * len(required_codenames)
     try:
-        received_files = extract_files_from_tornado(tornado_files)
+        received_files = extract_files_from_tornado(
+            tornado_files, archive_size_limit, archive_max_files
+        )
     except InvalidArchive:
         raise UnacceptableUserTest(
             N_("Invalid archive format!"),
 
@@ -21,10 +21,14 @@
 
 """
 
+from abc import ABCMeta, abstractmethod
+from collections.abc import Generator, Iterable
 import os
 import shutil
+import tarfile
 import tempfile
 import typing
+import zipfile
 
 import patoolib
 from patoolib.util import PatoolError
@@ -225,3 +229,80 @@ def write(self, file_path: str, file_object):
                 "You should write the file directly, in the "
                 "folder returned by unpack(), and then "
                 "call the repack() method.")
+
+class ArchiveBase(metaclass=ABCMeta):
+    """Base class for archive reader implementations."""
+
+    @abstractmethod
+    def iter_regular_files(self) -> Iterable[tuple[str, int, object]]:
+        """Yields tuples of (filepath, decompressed size, handle),
+        regular files only.
+
+        filepath will always be /-separated.
+        handle can be passed to open_file.
+        """
+        pass
+
+    @abstractmethod
+    def open_file(self, handle: object) -> typing.IO[bytes]:
+        """Open a member of the archive for reading."""
+        pass
+
+    def get_file_bytes(self, handle: object) -> bytes:
+        """Read an archive member into bytes."""
+        with self.open_file(handle) as f:
+            return f.read()
+
+
+class ArchiveZipfile(ArchiveBase):
+    """Archive reader using `zipfile`, see ArchiveBase for method descriptions."""
+
+    def __init__(self, inner: zipfile.ZipFile):
+        self.inner = inner
+
+    def iter_regular_files(self) -> list[tuple[str, int, zipfile.ZipInfo]]:
+        return [
+            (x.filename, x.file_size, x)
+            for x in self.inner.infolist()
+            if not x.is_dir()
+        ]
+
+    def open_file(self, handle: object) -> typing.IO[bytes]:
+        assert isinstance(handle, zipfile.ZipInfo)
+        return self.inner.open(handle, "r")
+
+
+class ArchiveTarfile(ArchiveBase):
+    """Archive reader using `tarfile`, see ArchiveBase for method descriptions."""
+
+    def __init__(self, inner: tarfile.TarFile):
+        self.inner = inner
+
+    def iter_regular_files(self) -> Generator[tuple[str, int, tarfile.TarInfo]]:
+        while (member := self.inner.next()) is not None:
+            if member.isfile():
+                yield (member.path, member.size, member)
+
+    def open_file(self, handle: object) -> typing.IO[bytes]:
+        assert isinstance(handle, tarfile.TarInfo)
+        fobj = self.inner.extractfile(handle)
+        if fobj is None:
+            raise ValueError("not a regular file")
+        return fobj
+
+
+def open_archive(input: typing.IO[bytes]) -> ArchiveBase:
+    """Open an archive for reading.
+
+    input: the archive, opened for reading in binary mode.
+    """
+    # Order is not entirely arbitrary here: is_zipfile is a very lenient check
+    # that will also return True when the file is an uncompressed tar file that
+    # happens to contain a zip file inside it. So check is_tarfile first (which
+    # only reads the very start of the file).
+    if tarfile.is_tarfile(input):
+        return ArchiveTarfile(tarfile.open(fileobj=input))
+    elif zipfile.is_zipfile(input):
+        return ArchiveZipfile(zipfile.ZipFile(input))
+    else:
+        raise ValueError("not a known archive format")