Skip to content

Commit 2578000

Browse files
authored
Remove use of patoolib in CWS (#1437)
use python builtin archive handling libraries instead of patoolib when handling user-provided files. This is much safer, since patoolib can run any of hundreds of archiving utilities, some of which might have various bugs that can lead to arbitrary file overwrites. The new system never touches the filesystem and thus avoids all vulnerabilities that could stem from that. I have also implemented protection against zip bombs: archives are now rejected as soon as it's clear that they are too large, without ever decompressing the entire archive. The downside is, of course, that this greatly restricts the set of allowed file formats (new system supports only zip and tar.gz/xz/bz2); this was deemed to be not too big of a deal.
1 parent bb3baaa commit 2578000

5 files changed

Lines changed: 249 additions & 63 deletions

File tree

cms/server/contest/submission/file_retrieval.py

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@
3333
3434
"""
3535

36-
import os.path
36+
import io
37+
import pathlib
3738
import typing
3839

39-
from patoolib.util import PatoolError
4040
if typing.TYPE_CHECKING:
4141
from tornado.httputil import HTTPFile
4242

43-
from cmscommon.archive import Archive
43+
from cmscommon.archive import open_archive
4444

4545

4646
# Represents a file received through HTTP from an HTML form.
@@ -57,19 +57,30 @@ class ReceivedFile(typing.NamedTuple):
5757
class InvalidArchive(Exception):
5858
"""Raised when the archive submitted by the user cannot be opened."""
5959

60-
pass
60+
def __init__(self, too_big: bool = False, too_many_files: bool = False):
61+
"""
62+
too_big: Whether the InvalidArchive was raised because the files in it
63+
exceeded the size limit after decompression.
64+
too_many_files: Whether the InvalidArchive was raised because the
65+
archive contained more than the maximum number of files.
66+
"""
67+
self.too_big = too_big
68+
self.too_many_files = too_many_files
69+
super().__init__()
6170

6271

63-
def extract_files_from_archive(data: bytes) -> list[ReceivedFile]:
72+
def extract_files_from_archive(
73+
data: bytes, max_size: int | None = None, max_files: int | None = None
74+
) -> list[ReceivedFile]:
6475
"""Return the files contained in the given archive.
6576
66-
Given the binary data of an archive in any of the formats supported
67-
by patool, extract its contents and return them in our format. The
68-
archive's contents must be a valid directory structure (i.e., its
69-
contents cannot have conflicting/duplicated paths) but the structure
70-
will be ignored and the files will be returned with their basename.
77+
Given the binary data of an archive in a supported format, extract its
78+
contents and return them in our format. The directory structure of the
79+
archive is ignored; files will be returned with their basename.
7180
7281
data: the raw contents of the archive.
82+
max_size: maximum decompressed size of the archive.
83+
max_files: maximum number of files to allow in the archive.
7384
7485
return: the files contained in the archive, with
7586
their filename filled in but their codename set to None.
@@ -78,31 +89,35 @@ def extract_files_from_archive(data: bytes) -> list[ReceivedFile]:
7889
archive, its contents are invalid, or other issues.
7990
8091
"""
81-
archive = Archive.from_raw_data(data)
82-
83-
if archive is None:
84-
raise InvalidArchive()
85-
86-
result = list()
8792

93+
result: list[ReceivedFile] = []
94+
total_size = 0
8895
try:
89-
archive.unpack()
90-
for name in archive.namelist():
91-
with archive.read(name) as f:
92-
result.append(
93-
ReceivedFile(None, os.path.basename(name), f.read()))
94-
95-
except (PatoolError, OSError):
96+
archive = open_archive(io.BytesIO(data))
97+
for (filepath, size, handle) in archive.iter_regular_files():
98+
total_size += size
99+
if max_size is not None and total_size > max_size:
100+
raise InvalidArchive(too_big=True)
101+
if max_files is not None and len(result) + 1 > max_files:
102+
raise InvalidArchive(too_many_files=True)
103+
filedata = archive.get_file_bytes(handle)
104+
# archive file paths are always /-separated, so we can use
105+
# PosixPath to extract the basename.
106+
filename = pathlib.PurePosixPath(filepath).name
107+
result.append(ReceivedFile(None, filename, filedata))
108+
except InvalidArchive:
109+
raise
110+
# the Archive class might raise all kinds of exceptions when fed invalid data. Catch them all here.
111+
except Exception:
96112
raise InvalidArchive()
97113

98-
finally:
99-
archive.cleanup()
100-
101114
return result
102115

103116

104117
def extract_files_from_tornado(
105118
tornado_files: dict[str, list["HTTPFile"]],
119+
max_size: int | None = None,
120+
max_files: int | None = None,
106121
) -> list[ReceivedFile]:
107122
"""Transform some files as received by Tornado into our format.
108123
@@ -112,16 +127,24 @@ def extract_files_from_tornado(
112127
it and return its contents instead.
113128
114129
tornado_files: a bunch of files, in Tornado's format.
130+
max_size: limit on total size of decompressed files
131+
(protects against zip bombs).
132+
max_files: maximum number of files to allow in the archive
115133
116134
return: the same bunch of files, in our format
117135
(except if it was an archive: then it's the archive's contents).
118136
119137
raise (InvalidArchive): if there are issues extracting the archive.
120138
121139
"""
122-
if len(tornado_files) == 1 and "submission" in tornado_files \
123-
and len(tornado_files["submission"]) == 1:
124-
return extract_files_from_archive(tornado_files["submission"][0].body)
140+
if (
141+
len(tornado_files) == 1
142+
and "submission" in tornado_files
143+
and len(tornado_files["submission"]) == 1
144+
):
145+
return extract_files_from_archive(
146+
tornado_files["submission"][0].body, max_size, max_files
147+
)
125148

126149
result = list()
127150
for codename, files in tornado_files.items():

cms/server/contest/submission/workflow.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,12 +155,36 @@ def accept_submission(
155155

156156
required_codenames = set(task.submission_format)
157157

158+
# To protect against zip bombs, we raise an error if the archive's contents
159+
# are too big even before extracting everything. The largest "reasonable"
160+
# archive size is with every submission file provided, and every file being
161+
# the largest allowed. Since we don't yet know which files from the archive
162+
# are used and which are extraneous, this size limit applies to the entire
163+
# archive in total.
164+
archive_size_limit = config.max_submission_length * len(required_codenames)
165+
# Honest users never need to submit more than required_codenames files, but
166+
# we are a bit lenient to allow .DS_Store or other hidden files that might
167+
# accidentally end up in an archive.
168+
archive_max_files = 2 * len(required_codenames)
158169
try:
159-
received_files = extract_files_from_tornado(tornado_files)
160-
except InvalidArchive:
161-
raise UnacceptableSubmission(
162-
N_("Invalid archive format!"),
163-
N_("The submitted archive could not be opened."))
170+
received_files = extract_files_from_tornado(
171+
tornado_files, archive_size_limit, archive_max_files
172+
)
173+
except InvalidArchive as e:
174+
if e.too_big:
175+
raise UnacceptableSubmission(
176+
N_("Submission too big!"),
177+
N_("Each source file must be at most %d bytes long."),
178+
config.max_submission_length)
179+
if e.too_many_files:
180+
raise UnacceptableSubmission(
181+
N_("Submission too big!"),
182+
N_("The submission should contain at most %d files."),
183+
len(required_codenames))
184+
else:
185+
raise UnacceptableSubmission(
186+
N_("Invalid archive format!"),
187+
N_("The submitted archive could not be opened."))
164188

165189
try:
166190
files, language = match_files_and_language(
@@ -342,8 +366,13 @@ def accept_user_test(
342366
required_codenames.update(task_type.get_user_managers())
343367
required_codenames.add("input")
344368

369+
# See accept_submission() for these variables.
370+
archive_size_limit = config.max_submission_length * len(required_codenames)
371+
archive_max_files = 2 * len(required_codenames)
345372
try:
346-
received_files = extract_files_from_tornado(tornado_files)
373+
received_files = extract_files_from_tornado(
374+
tornado_files, archive_size_limit, archive_max_files
375+
)
347376
except InvalidArchive:
348377
raise UnacceptableUserTest(
349378
N_("Invalid archive format!"),

cmscommon/archive.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121
2222
"""
2323

24+
from abc import ABCMeta, abstractmethod
25+
from collections.abc import Generator, Iterable
2426
import os
2527
import shutil
28+
import tarfile
2629
import tempfile
2730
import typing
31+
import zipfile
2832

2933
import patoolib
3034
from patoolib.util import PatoolError
@@ -225,3 +229,80 @@ def write(self, file_path: str, file_object):
225229
"You should write the file directly, in the "
226230
"folder returned by unpack(), and then "
227231
"call the repack() method.")
232+
233+
class ArchiveBase(metaclass=ABCMeta):
234+
"""Base class for archive reader implementations."""
235+
236+
@abstractmethod
237+
def iter_regular_files(self) -> Iterable[tuple[str, int, object]]:
238+
"""Yields tuples of (filepath, decompressed size, handle),
239+
regular files only.
240+
241+
filepath will always be /-separated.
242+
handle can be passed to open_file.
243+
"""
244+
pass
245+
246+
@abstractmethod
247+
def open_file(self, handle: object) -> typing.IO[bytes]:
248+
"""Open a member of the archive for reading."""
249+
pass
250+
251+
def get_file_bytes(self, handle: object) -> bytes:
252+
"""Read an archive member into bytes."""
253+
with self.open_file(handle) as f:
254+
return f.read()
255+
256+
257+
class ArchiveZipfile(ArchiveBase):
258+
"""Archive reader using `zipfile`, see ArchiveBase for method descriptions."""
259+
260+
def __init__(self, inner: zipfile.ZipFile):
261+
self.inner = inner
262+
263+
def iter_regular_files(self) -> list[tuple[str, int, zipfile.ZipInfo]]:
264+
return [
265+
(x.filename, x.file_size, x)
266+
for x in self.inner.infolist()
267+
if not x.is_dir()
268+
]
269+
270+
def open_file(self, handle: object) -> typing.IO[bytes]:
271+
assert isinstance(handle, zipfile.ZipInfo)
272+
return self.inner.open(handle, "r")
273+
274+
275+
class ArchiveTarfile(ArchiveBase):
276+
"""Archive reader using `tarfile`, see ArchiveBase for method descriptions."""
277+
278+
def __init__(self, inner: tarfile.TarFile):
279+
self.inner = inner
280+
281+
def iter_regular_files(self) -> Generator[tuple[str, int, tarfile.TarInfo]]:
282+
while (member := self.inner.next()) is not None:
283+
if member.isfile():
284+
yield (member.path, member.size, member)
285+
286+
def open_file(self, handle: object) -> typing.IO[bytes]:
287+
assert isinstance(handle, tarfile.TarInfo)
288+
fobj = self.inner.extractfile(handle)
289+
if fobj is None:
290+
raise ValueError("not a regular file")
291+
return fobj
292+
293+
294+
def open_archive(input: typing.IO[bytes]) -> ArchiveBase:
295+
"""Open an archive for reading.
296+
297+
input: the archive, opened for reading in binary mode.
298+
"""
299+
# Order is not entirely arbitrary here: is_zipfile is a very lenient check
300+
# that will also return True when the file is an uncompressed tar file that
301+
# happens to contain a zip file inside it. So check is_tarfile first (which
302+
# only reads the very start of the file).
303+
if tarfile.is_tarfile(input):
304+
return ArchiveTarfile(tarfile.open(fileobj=input))
305+
elif zipfile.is_zipfile(input):
306+
return ArchiveZipfile(zipfile.ZipFile(input))
307+
else:
308+
raise ValueError("not a known archive format")

0 commit comments

Comments
 (0)