Skip to content

Commit 3745cee

Browse files
authored
feat: is_safe_url to mitigate SSRF in URL requests (#2131)
Signed-off-by: tdruez <tdruez@aboutcode.org>
1 parent 51f25af commit 3745cee

9 files changed

Lines changed: 164 additions & 25 deletions

File tree

scanpipe/models.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2003,16 +2003,20 @@ def size(self):
20032003

20042004
def fetch(self):
20052005
"""Fetch the file from this instance ``download_url`` field."""
2006-
from scanpipe.pipes.fetch import fetch_url
2006+
from scanpipe.pipes import fetch
20072007

20082008
if self.exists():
20092009
logger.info("The input source file already exists.")
20102010
return
20112011

20122012
if not self.download_url:
2013-
raise Exception("No `download_url` value to be fetched.")
2013+
raise ValueError("No `download_url` value to be fetched.")
20142014

2015-
downloaded = fetch_url(url=self.download_url)
2015+
is_safe_and_available = fetch.check_url(self.download_url)
2016+
if not is_safe_and_available:
2017+
raise ValidationError(f"Could not fetch: {self.download_url}")
2018+
2019+
downloaded = fetch.fetch_url(url=self.download_url)
20162020
destination = self.project.move_input_from(downloaded.path)
20172021

20182022
# Force a commit to the database to ensure the file on disk is not rendered

scanpipe/pipes/fetch.py

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23+
import ipaddress
2324
import json
2425
import logging
2526
import os
2627
import re
28+
import socket
2729
import tempfile
2830
from collections import namedtuple
2931
from pathlib import Path
@@ -302,7 +304,7 @@ def fetch_docker_image(docker_url, to=None):
302304

303305

304306
def fetch_git_repo(url, to=None):
305-
"""Fetch provided git ``url`` as a clone and return a ``Download`` object."""
307+
"""Fetch provided git `url` as a clone and return a `Download` object."""
306308
download_directory = to or tempfile.mkdtemp()
307309
url = url.rstrip("/")
308310
filename = url.split("/")[-1]
@@ -324,6 +326,7 @@ def fetch_git_repo(url, to=None):
324326

325327

326328
def fetch_package_url(url):
329+
"""Fetch a package from the provided `url` and return a `Download` object."""
327330
# Ensure the provided Package URL is valid, or raise a ValueError.
328331
purl = PackageURL.from_string(url)
329332

@@ -371,7 +374,7 @@ def get_fetcher(url):
371374

372375

373376
def fetch_url(url):
374-
"""Fetch provided `url` and returns the result as a `Download` object."""
377+
"""Fetch provided `url` and return the result as a `Download` object."""
375378
fetcher = get_fetcher(url)
376379
logger.info(f'Fetching "{url}" using {fetcher.__name__}')
377380
downloaded = fetcher(url)
@@ -404,19 +407,55 @@ def fetch_urls(urls):
404407
return downloads, errors
405408

406409

407-
def check_urls_availability(urls):
408-
"""Check the accessibility of a list of URLs."""
409-
errors = []
410+
def is_safe_url(url):
411+
"""
412+
Check that a URL does not point to a private or internal network address.
413+
Mitigates SSRF by ensuring the target host resolves only to public IPs.
414+
"""
415+
parsed = urlparse(url)
416+
417+
# Only allow http and https schemes
418+
if parsed.scheme not in ("http", "https"):
419+
return False
420+
421+
# Reject URLs with no hostname
422+
if not parsed.hostname:
423+
return False
424+
425+
# Resolve the hostname to catch internal addresses hidden behind DNS
426+
try:
427+
resolved_ip = socket.gethostbyname(parsed.hostname)
428+
except socket.gaierror:
429+
return False
430+
431+
# Reject private, loopback, link-local, and reserved addresses
432+
ip = ipaddress.ip_address(resolved_ip)
433+
unsafe = (
434+
ip.is_private
435+
or ip.is_loopback
436+
or ip.is_link_local
437+
or ip.is_reserved
438+
or ip.is_multicast
439+
)
440+
return not unsafe
410441

411-
for url in urls:
412-
if not url.startswith("http"):
413-
continue
414442

415-
request_session = get_request_session(url)
416-
try:
417-
response = request_session.head(url, timeout=HTTP_REQUEST_TIMEOUT)
418-
response.raise_for_status()
419-
except requests.exceptions.RequestException:
420-
errors.append(url)
443+
def check_url(url):
444+
"""Check that a URL is safe and accessible."""
445+
if not is_safe_url(url):
446+
return False
447+
448+
request_session = get_request_session(url)
449+
try:
450+
response = request_session.head(url, timeout=HTTP_REQUEST_TIMEOUT)
451+
response.raise_for_status()
452+
except requests.exceptions.RequestException:
453+
return False
421454

455+
return True
456+
457+
458+
def check_urls_availability(urls):
459+
"""Check the safety and accessibility of a list of URLs."""
460+
errors = [url for url in urls if not check_url(url)]
422461
return errors

scanpipe/pipes/matchcode.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def is_available():
9393

9494

9595
def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT):
96-
"""Wrap the HTTP request calls on the API."""
96+
"""Send a GET request to `url` with optional `payload` and return the response."""
9797
if not url:
9898
return
9999

@@ -113,6 +113,7 @@ def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT):
113113

114114

115115
def request_post(url, data=None, headers=None, files=None, timeout=DEFAULT_TIMEOUT):
116+
"""Send a POST request with `data` to `url` and return the response."""
116117
try:
117118
response = session.post(
118119
url, data=data, timeout=timeout, headers=headers, files=files

scanpipe/pipes/purldb.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def check_service_availability(*args):
123123

124124

125125
def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT, raise_on_error=False):
126-
"""Wrap the HTTP request calls on the API."""
126+
"""Send a GET request to `url` with optional `payload` and return the response."""
127127
if not url:
128128
return
129129

@@ -147,6 +147,7 @@ def request_get(url, payload=None, timeout=DEFAULT_TIMEOUT, raise_on_error=False
147147

148148

149149
def request_post(url, data=None, headers=None, files=None, timeout=DEFAULT_TIMEOUT):
150+
"""Send a POST request with `data` to `url` and return the response."""
150151
try:
151152
response = session.post(
152153
url, data=data, timeout=timeout, headers=headers, files=files
@@ -413,6 +414,7 @@ def get_packages_for_purl(package_url):
413414

414415

415416
def collect_data_for_purl(package_url, raise_on_error=False):
417+
"""Fetch and return PurlDB entries for the provided `package_url`."""
416418
collect_api_url = f"{PURLDB_API_URL}collect/"
417419
payload = {
418420
"purl": str(package_url),

scanpipe/pipes/vulnerablecode.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def request_get(
9696
payload=None,
9797
timeout=None,
9898
):
99-
"""Wrap the HTTP request calls on the API."""
99+
"""Send a GET request to `url` with optional `payload` and return the response."""
100100
if not url:
101101
return
102102

@@ -118,6 +118,7 @@ def request_post(
118118
data,
119119
timeout=None,
120120
):
121+
"""Send a POST request with `data` as JSON to `url` and return the response."""
121122
try:
122123
response = session.post(url, json=data, timeout=timeout)
123124
response.raise_for_status()
@@ -216,8 +217,8 @@ def fetch_vulnerabilities(
216217
packages, chunk_size=1000, logger=logger.info, ignore_set=None
217218
):
218219
"""
219-
Fetch and store vulnerabilities for each provided ``packages``.
220-
The PURLs are used for the lookups in batch of ``chunk_size`` per request.
220+
Fetch and store vulnerabilities for each provided `packages`.
221+
The PURLs are used for the lookups in batch of `chunk_size` per request.
221222
"""
222223
vulnerabilities_by_purl = {}
223224

scanpipe/tests/pipes/test_fetch.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

23+
import socket
2324
from pathlib import Path
2425
from unittest import mock
2526

2627
from django.test import TestCase
2728
from django.test import override_settings
2829

30+
import requests
2931
from requests import auth as request_auth
3032

3133
from scanpipe.pipes import fetch
@@ -265,3 +267,83 @@ def test_scanpipe_pipes_fetch_git_repo(self, mock_clone_from):
265267
self.assertEqual("", download.size)
266268
self.assertEqual("", download.sha1)
267269
self.assertEqual("", download.md5)
270+
271+
@mock.patch("scanpipe.pipes.fetch.socket.gethostbyname")
272+
def test_scanpipe_pipes_fetch_is_safe_url(self, mock_gethostbyname):
273+
# Valid public URLs
274+
mock_gethostbyname.return_value = "93.184.216.34" # example.com
275+
self.assertTrue(fetch.is_safe_url("https://example.com/file.zip"))
276+
self.assertTrue(fetch.is_safe_url("http://example.com/file.zip"))
277+
278+
# Invalid schemes
279+
self.assertFalse(fetch.is_safe_url("ftp://example.com/file.zip"))
280+
self.assertFalse(fetch.is_safe_url("docker://example.com/image"))
281+
self.assertFalse(fetch.is_safe_url(""))
282+
283+
# No hostname
284+
self.assertFalse(fetch.is_safe_url("https://"))
285+
286+
# Unresolvable hostname
287+
mock_gethostbyname.side_effect = socket.gaierror
288+
self.assertFalse(fetch.is_safe_url("https://thisdomaindoesnotexist.invalid/"))
289+
mock_gethostbyname.side_effect = None
290+
291+
# Private ranges
292+
mock_gethostbyname.return_value = "192.168.1.1"
293+
self.assertFalse(fetch.is_safe_url("http://192.168.1.1/"))
294+
mock_gethostbyname.return_value = "10.0.0.1"
295+
self.assertFalse(fetch.is_safe_url("http://10.0.0.1/"))
296+
mock_gethostbyname.return_value = "172.16.0.1"
297+
self.assertFalse(fetch.is_safe_url("http://172.16.0.1/"))
298+
299+
# Loopback
300+
mock_gethostbyname.return_value = "127.0.0.1"
301+
self.assertFalse(fetch.is_safe_url("http://127.0.0.1/"))
302+
mock_gethostbyname.return_value = "127.0.0.1"
303+
self.assertFalse(fetch.is_safe_url("http://localhost/"))
304+
305+
# Link-local
306+
mock_gethostbyname.return_value = "169.254.169.254"
307+
self.assertFalse(fetch.is_safe_url("http://169.254.169.254/"))
308+
309+
# Multicast
310+
mock_gethostbyname.return_value = "224.0.0.1"
311+
self.assertFalse(fetch.is_safe_url("http://224.0.0.1/"))
312+
313+
@mock.patch("scanpipe.pipes.fetch.socket.gethostbyname")
314+
@mock.patch("requests.sessions.Session.head")
315+
def test_scanpipe_pipes_fetch_check_url(self, mock_head, mock_gethostbyname):
316+
url = "https://example.com/file.zip"
317+
318+
# Safe and accessible URL
319+
mock_gethostbyname.return_value = "93.184.216.34"
320+
mock_head.return_value = make_mock_response(url=url)
321+
self.assertTrue(fetch.check_url(url))
322+
323+
# Unsafe URL
324+
mock_gethostbyname.return_value = "127.0.0.1"
325+
self.assertFalse(fetch.check_url("http://localhost/"))
326+
327+
# Safe URL but request fails
328+
mock_gethostbyname.return_value = "93.184.216.34"
329+
mock_head.side_effect = requests.exceptions.RequestException
330+
self.assertFalse(fetch.check_url(url))
331+
332+
@mock.patch("scanpipe.pipes.fetch.socket.gethostbyname")
333+
@mock.patch("requests.sessions.Session.head")
334+
def test_scanpipe_pipes_fetch_check_urls_availability(
335+
self, mock_head, mock_gethostbyname
336+
):
337+
urls = [
338+
"https://example.com/file.zip",
339+
"https://example.com/archive.tar.gz",
340+
]
341+
342+
# All URLs safe and accessible
343+
mock_gethostbyname.return_value = "93.184.216.34"
344+
mock_head.return_value = make_mock_response(url="mocked_url")
345+
self.assertEqual([], fetch.check_urls_availability(urls))
346+
347+
# All URLs fail
348+
mock_head.side_effect = requests.exceptions.RequestException
349+
self.assertEqual(urls, fetch.check_urls_availability(urls))

scanpipe/tests/test_commands.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1016,8 +1016,12 @@ def test_scanpipe_management_command_run(self):
10161016
self.assertEqual("do_nothing", runs[1]["pipeline_name"])
10171017
self.assertEqual(["Group1", "Group2"], runs[1]["selected_groups"])
10181018

1019+
@mock.patch("scanpipe.pipes.fetch.check_url")
10191020
@mock.patch("requests.sessions.Session.get")
1020-
def test_scanpipe_management_command_run_multiple_inputs(self, mock_get):
1021+
def test_scanpipe_management_command_run_multiple_inputs(
1022+
self, mock_get, mock_check_url
1023+
):
1024+
mock_check_url.return_value = True
10211025
source_download_url = "https://example.com/z-source.zip#from"
10221026
bin_download_url = "https://example.com/z-bin.zip#to"
10231027
mock_get.side_effect = [

scanpipe/tests/test_models.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1546,10 +1546,12 @@ def test_scanpipe_input_source_model_delete_file(self):
15461546
input_source.delete_file()
15471547
self.assertFalse(input_source.exists())
15481548

1549+
@mock.patch("scanpipe.pipes.fetch.check_url")
15491550
@mock.patch("requests.sessions.Session.get")
1550-
def test_scanpipe_input_source_model_fetch(self, mock_get):
1551+
def test_scanpipe_input_source_model_fetch(self, mock_get, mock_check_url):
15511552
download_url = "https://download.url/file.zip"
15521553
mock_get.return_value = make_mock_response(url=download_url)
1554+
mock_check_url.return_value = True
15531555

15541556
input_source = self.project1.add_input_source(download_url=download_url)
15551557
destination = input_source.fetch()

scanpipe/tests/test_pipelines.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,12 @@ def test_scanpipe_pipeline_class_download_inputs_attribute(self):
197197
pipeline.execute()
198198
self.assertNotIn("Step [download_missing_inputs]", run.log)
199199

200+
@mock.patch("scanpipe.pipes.fetch.check_url")
200201
@mock.patch("requests.sessions.Session.get")
201-
def test_scanpipe_pipeline_class_download_missing_inputs(self, mock_get):
202+
def test_scanpipe_pipeline_class_download_missing_inputs(
203+
self, mock_get, mock_check_url
204+
):
205+
mock_check_url.return_value = True
202206
project1 = make_project()
203207
run = project1.add_pipeline("do_nothing")
204208
pipeline = run.make_pipeline_instance()

0 commit comments

Comments
 (0)