99
1010import logging
1111import os .path
12- from dataclasses import dataclass
1312from itertools import zip_longest
1413
1514import saneyaml
15+ from aboutcode .pipeline import LoopProgress
1616
1717from fedcode .activitypub import Activity
1818from fedcode .activitypub import UpdateActivity
1919from fedcode .models import Note
2020from fedcode .models import Package
2121from fedcode .models import Repository
22- from fedcode .models import Service
2322from fedcode .models import Vulnerability
23+ from fedcode .pipelines import FederatedCodePipeline
2424from fedcode .pipes import utils
2525
26- logger = logging .getLogger (__name__ )
2726
class SyncVulnerableCode(FederatedCodePipeline):
    """
    Pipeline syncing VulnerableCode data from FederatedCode git repositories.

    Runs two steps in order: collect the known repositories, then pull each
    one and import its vulnerability data.
    """

    pipeline_id = "sync_vulnerablecode"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in the order they are executed."""
        return (cls.get_git_repos, cls.sync_vulnerablecode_repositories)

    def get_git_repos(self):
        """Store the queryset of all ``Repository`` objects on the pipeline."""
        self.git_repos = Repository.objects.all()

    def sync_vulnerablecode_repositories(self):
        """Pull each repository from its ``origin`` remote, then sync it."""
        total = self.git_repos.count()
        self.log(f"Syncing vulnerability from {total:,d} repositories")

        tracker = LoopProgress(total_iterations=total, logger=self.log)
        repositories = self.git_repos.iterator(chunk_size=2000)
        for repo_record in tracker.iter(repositories):
            # Refresh the local clone before looking at its commits.
            origin = repo_record.git_repo_obj.remotes.origin
            origin.pull()
            sync_vulnerabilities(repository=repo_record, logger=self.log)
def sync_vulnerabilities(repository, logger):
    """
    Import the YAML changes committed to ``repository`` since its last import.

    The diff between ``repository.last_imported_commit`` and the current HEAD
    of ``repository.git_repo_obj`` is walked entry by entry. When nothing was
    imported before, HEAD is diffed against the empty tree instead.

    Entries whose ``a_path`` does not end with ".yaml", or starts with ".",
    are skipped. Files whose name starts with "VCID" are passed to
    ``vul_handler``; every other file is passed to ``pkg_handler`` together
    with ``repository.admin``.

    Once every entry has been handled, ``repository.last_imported_commit`` is
    set to the HEAD commit hash and the repository is saved.

    ``logger`` is a callable taking a message and an optional ``level``
    keyword. Returns None.
    """
    repo = repository.git_repo_obj
    latest_commit_hash = repo.head.commit.hexsha

    # Bail out before resolving commits or computing a diff when HEAD has
    # already been imported.
    if latest_commit_hash == repository.last_imported_commit:
        logger("Nothing to import!", level=logging.ERROR)
        return

    latest_commit = repo.commit(latest_commit_hash)
    if repository.last_imported_commit:
        last_imported_commit = repo.commit(repository.last_imported_commit)
        diffs = last_imported_commit.diff(latest_commit)
    else:
        # Nothing imported yet: diff HEAD against git's empty tree object so
        # that every file in the repository shows up in the diff.
        diffs = latest_commit.diff("4b825dc642cb6eb9a060e54bf8d69288fbee4904", R=True)

    diff_count = len(diffs)

    logger(f"Syncing {diff_count:,d} vulnerability scan from {repository.url}")
    progress = LoopProgress(total_iterations=diff_count, logger=logger)
    for diff in progress.iter(diffs):
        if not diff.a_path.endswith(".yaml"):
            continue

        if diff.a_path.startswith("."):
            continue

        yaml_data_a_blob = saneyaml.load(diff.a_blob.data_stream.read()) if diff.a_blob else None
        yaml_data_b_blob = saneyaml.load(diff.b_blob.data_stream.read()) if diff.b_blob else None

        a_name = os.path.basename(diff.a_path)
        b_name = os.path.basename(diff.b_path)
        if a_name.startswith("VCID") or b_name.startswith("VCID"):
            vul_handler(
                diff.change_type,
                repository,
                yaml_data_a_blob,
                yaml_data_b_blob,
                logger,
            )
            continue

        # Keep iterating after a package file: stopping here would drop every
        # remaining diff, because the commit pointer is advanced to HEAD below.
        pkg_handler(
            diff.change_type,
            repository.admin,
            yaml_data_a_blob,
            yaml_data_b_blob,
        )

    repository.last_imported_commit = latest_commit_hash
    repository.save()
    logger("The Importer run successfully")
86107
87108
88- def vul_handler (change_type , repo_obj , yaml_data_a_blob , yaml_data_b_blob , a_path , b_path ):
109+ def vul_handler (change_type , repo_obj , yaml_data_a_blob , yaml_data_b_blob , logger ):
89110 if change_type == "A" : # A for added paths
90111 Vulnerability .objects .get_or_create (
91112 id = yaml_data_b_blob .get ("vulnerability_id" ),
@@ -108,7 +129,7 @@ def vul_handler(change_type, repo_obj, yaml_data_a_blob, yaml_data_b_blob, a_pat
108129 )
109130 vul .delete ()
110131 else :
111- logger . error (f"Invalid Vulnerability File" )
132+ logger (f"Invalid Vulnerability File" , level = logging . ERROR )
112133
113134
114135def pkg_handler (change_type , default_service , yaml_data_a_blob , yaml_data_b_blob ):
0 commit comments