Skip to content

Commit 62a0b95

Browse files
committed
inital commit
0 parents  commit 62a0b95

39 files changed

Lines changed: 626930 additions & 0 deletions

DOSS.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pandas as pd
2+
import numpy as np
3+
import os
4+
from copy import deepcopy
5+
6+
7+
# Directory that holds one "<idx>_annotation.csv" file per dataset
DIR_ANNOTATIONS = "annotations/"

# Table of similarities between terms (first CSV column is used as the index)
sims = pd.read_csv("all_similarities.csv", index_col=0)

# Dataset indices (1 to 16 inclusive) and the annotation table of each dataset
ds_idxs = range(1, 17)
annotations = {
    idx: pd.read_csv(DIR_ANNOTATIONS + f"{idx}_annotation.csv", index_col=0)
    for idx in ds_idxs
}
16+
17+
# calculate DOSS between two datasets
18+
def _annotated_term_ids(annotation):
    """Return the term ids of *annotation* as ints, skipping unannotated rows.

    Features without a matching term carry the marker ``None`` in the
    annotation files. Older pandas releases keep that marker as the string
    "None" when reading the CSV, while newer releases parse it as NaN, so
    both representations are skipped here.
    """
    term_ids = []
    for el in annotation["term_id"]:
        if pd.isna(el) or el == "None":
            continue
        term_ids.append(int(el))
    return term_ids


def calc_DOSS(i_annotation, j_annotation, similarities=None):
    """Calculate the DOSS of dataset *i* with respect to dataset *j*.

    For every annotated term of dataset *i*, the highest similarity to any
    annotated term of dataset *j* is taken; the result is the mean of these
    per-term maxima. The measure is therefore not symmetric in its arguments.

    Args:
        i_annotation: Annotation table of dataset *i*; must have a
            ``term_id`` column.
        j_annotation: Annotation table of dataset *j*; must have a
            ``term_id`` column.
        similarities: Term-similarity table whose rows are looked up by
            integer term id and whose columns are looked up by the term id
            as a string. Defaults to the module-level ``sims`` table.

    Returns:
        The mean of the row-wise maxima of the selected similarity sub-matrix.

    Raises:
        KeyError: If a term id is missing from the similarity table.
    """
    if similarities is None:
        similarities = sims

    idxs = _annotated_term_ids(i_annotation)
    cols = [str(term_id) for term_id in _annotated_term_ids(j_annotation)]

    sim_matrix = similarities.loc[idxs, cols]
    # Vectorised row-wise maximum instead of applying the builtin max per row.
    max_vector = sim_matrix.max(axis=1)
    return np.mean(max_vector)
24+
25+
# Fill the square DOSS matrix: cell (ds_i, ds_j) holds the DOSS of
# dataset ds_i with respect to dataset ds_j
DOSS_matrix = pd.DataFrame(index=ds_idxs, columns=ds_idxs)
for ds_i in ds_idxs:
    i_annotation = annotations[ds_i]
    for ds_j in ds_idxs:
        DOSS_matrix.loc[ds_i, ds_j] = calc_DOSS(i_annotation, annotations[ds_j])

# Write the finished matrix to disk
DOSS_matrix.to_csv("DOSS_matrix.csv")

DOSS_matrix.csv

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
2+
1,1.0,0.110221947579648,0.5599052414909874,0.19713639448438344,0.44929950652162076,0.4602077880626424,0.5599052414909874,0.3305949706491196,0.2264173668045877,0.20080048458029445,0.13083899633588356,0.39895164741180195,0.18266081220759983,0.09842951366525914,0.42681595895625696,0.2042883191100372
3+
2,0.13302234777293048,1.0,0.1288969212370587,0.16002251645798377,0.4923965941120048,0.1082030337584364,0.1288969212370587,0.11967405047313528,0.15368326956396378,0.18263341448903575,0.06355396525676633,0.12889363696272865,0.12880284600323844,0.07660827833644887,0.5952546649975261,0.1363946710343064
4+
3,0.3945160800661892,0.09848492704250322,1.0,0.14969800151606513,0.1569518442952902,0.3290759946811734,1.0,0.22524522782829987,0.17524596334742973,0.15615611605505408,0.04010283904956182,0.18648661944619277,0.13481406342305655,0.08129360680144763,0.32291677290901194,0.1606973084114635
5+
4,0.09783656290814645,0.100179540992158,0.09129453879109392,1.0,0.10523869562249676,0.10118868772596361,0.09129453879109392,0.0903645935159826,0.1151991511357782,0.1206469464414704,0.07088212086809649,0.06523888109477297,0.08637694662213279,0.05353624230116096,0.12998171243463968,0.10811964046179942
6+
5,0.27258448914085315,0.21639207097142793,0.12987765507303575,0.1169029005335696,1.0,0.1136583465261072,0.12987765507303575,0.16321412036746963,0.23945497401039792,0.2294017066434088,0.26037311199767554,0.2270049737296446,0.18469359596058701,0.043668732945464674,0.5833487671292285,0.13847593881477752
7+
6,0.6094755886517453,0.13197466237963518,0.5838012595349032,0.27586719420715844,0.17816389836628443,1.0,0.5838012595349032,0.33909138342516454,0.22331902652771357,0.2337156768293509,0.08530256804252101,0.2477749076036266,0.21793652178674383,0.12465928058290379,0.4409056033802781,0.2318593539243223
8+
7,0.3945160800661892,0.0984849270425032,1.0,0.14969800151606513,0.15695184429529016,0.3290759946811733,1.0,0.2252452278282998,0.17524596334742973,0.1561561160550541,0.04010283904956182,0.18648661944619274,0.13481406342305652,0.08129360680144763,0.32291677290901194,0.1606973084114635
9+
8,0.3893875352173101,0.12963155991878975,0.41457167539580664,0.25417466008862283,0.2709683536469047,0.37778242301359977,0.41457167539580664,1.0,0.3131153241123292,0.2484637611307363,0.09559590649342196,0.19943913739967953,0.23338193395189927,0.1262480170143209,0.370594183149902,0.24003029530858389
10+
9,0.3044280348876145,0.19275514256003828,0.31931810686524553,0.24040135065686463,0.48175662579833417,0.22604914593746595,0.31931810686524553,0.31931810686524553,1.0,0.5470497043571146,0.14251658713758691,0.20358956912284054,0.6495998161753612,0.09724676211429621,0.5589670389099892,0.30215807179471615
11+
10,0.17548766188320195,0.1286355517626427,0.18155898146450777,0.1630533856953713,0.25848398588954086,0.15369156101104928,0.18155898146450777,0.16215958294254032,0.3137267014952254,1.0,0.07240390221684252,0.12160403559010555,0.2949104560704989,0.07693160462649994,0.36430955732939874,0.2041233981953159
12+
11,0.15919873169796492,0.04251904058053078,0.05926915366027751,0.12274918761213438,0.357748235401689,0.09535625625761768,0.05926915366027751,0.09667444681812751,0.08582760725833086,0.09045056329071607,1.0,0.10052256040264461,0.08141783331680631,0.03171049134336872,0.25651270495179385,0.09104573637093034
13+
12,0.6573845265194238,0.17176392364514054,0.4303458855343393,0.21256362306314552,0.567422092121421,0.3336472999655013,0.4303458855343393,0.2438724971953053,0.21087719381692027,0.21517728071028785,0.09765190535311705,1.0,0.19681313140621592,0.15774346085997595,0.3721245240224595,0.21590159225363734
14+
13,0.3192723470202331,0.1803593792521558,0.2920559150043798,0.25522936143734953,0.4996499217662407,0.26088159019242796,0.2920559150043798,0.2920559150043798,0.827157812560693,0.6633194106211688,0.169425625553802,0.23208141933850782,1.0,0.11625692826190322,0.602410800127564,0.3479098031919501
15+
14,0.14439910819792612,0.13404816464528058,0.1465331265781724,0.14388587892116378,0.1453530157770675,0.143081927609903,0.1465331265781724,0.14215077274396337,0.14256673418486432,0.14681959267296726,0.03414332395439167,0.13236084502611856,0.15126102431304309,1.0,0.15419934495103543,0.17422072444256986
16+
15,0.1611456745396416,0.24005627114638048,0.14518763997551287,0.12705858105641296,0.44013483163361716,0.13507067762600308,0.14518763997551287,0.12436623054796332,0.1993328398765459,0.21870898822115944,0.16850169291821476,0.137482523776315,0.16667861664751943,0.03474235273777884,1.0,0.12528558406102008
17+
16,0.1838459124274196,0.11000003730051083,0.17437849246048295,0.13949760886537044,0.18069127720377726,0.14096534314457695,0.17437849246048295,0.1530479811500955,0.19707338819163495,0.21523521021910652,0.10463918317607891,0.1408871956099892,0.17530815140814837,0.06971951393900133,0.20826371590747522,1.0

README.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# SeFNet: Bridging Tabular Datasets with Semantic Feature Nets
2+
3+
This repository contains code and resources that can be used to reproduce the results presented in the article "SeFNet: Bridging Tabular Datasets with Semantic Feature Nets".
4+
5+
## Reproducing results
6+
### 0. Annotating datasets <br>
7+
Annotating the features of datasets is a tedious process, so the annotations we created manually are available in the `annotations` directory. Every annotation file is in .csv format and consists of two columns: `column_name` (the original feature name) and `term_id` (the SNOMED-CT term id, or `None` when the feature has no matching term).
8+
9+
### 1. Calculating similarity between terms <br>
10+
The similarity of terms is calculated by a Java project built with Maven. The necessary dependency information and Java configuration are contained in the file pom.xml. Key functionalities used, such as computing the semantic similarity between terms, are implemented in the [slib-sml](https://github.com/sharispe/slib) library.
11+
12+
To reproduce our results, you first have to get access to the [SNOMED-CT ontology](https://www.snomed.org/get-snomed). After downloading the ontology, place its folder in the root directory of the repository. In our research we used the US version released on March 1, 2023.
13+
14+
Once the ontology files are in place, all that is needed is to execute AllTermsSimilarity.java.
15+
16+
### 2. Calculating DOSS matrix <br>
17+
Before the DOSS matrix can be calculated, Python and the necessary packages (`numpy` and `pandas`) must be installed. We used Python 3.9 and the package versions specified in requirements.txt.
18+
```
19+
pip install -r requirements.txt
20+
```
21+
Now all that is required is to execute the script:
22+
```
23+
python DOSS.py
24+
```
25+
26+
## Repository structure
27+
```
28+
├── annotations - directory containing datasets annotations
29+
├── calculate-term-similarities
30+
│ ├── src/main/java
31+
│ │ ├── AllTermsSimilarity.java - calculate semantic similarity between all annotated terms (term_similarities.csv)
32+
│ │ ├── Dataset2DatasetSimilarity.java - calculate semantic similarity between terms in two datasets
33+
│ │ ├── SingleTermSimilarity.java - calculate semantic similarity between two terms
34+
│ ├── pom.xml - maven project configuration
35+
├── datasets - directory containing datasets which could be shared
36+
├── DOSS.py - python script which creates DOSS_matrix.csv
37+
├── DOSS_matrix.csv
38+
├── README.md
39+
├── annotations.csv - annotations of all used terms
40+
├── requirements.txt - required Python packages
41+
└── term_similarities.csv - semantic similarity between all annotated terms calculated in AllTermsSimilarity.java
42+
```
43+
44+
## Citation
45+
```
46+
TBC
47+
```

annotations.csv

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
,term_id
2+
0,10200004
3+
1,103579009
4+
2,104133003
5+
3,104142005
6+
4,104589004
7+
5,104847001
8+
6,104866001
9+
7,104976002
10+
8,105011006
11+
9,108252007
12+
10,108480007
13+
11,109071007
14+
12,11455007
15+
13,1153637007
16+
14,116566001
17+
15,118600007
18+
16,121868005
19+
17,1240581000000104
20+
18,128272009
21+
19,128462008
22+
20,12856003
23+
21,14089001
24+
22,14304000
25+
23,160592001
26+
24,161712005
27+
25,165581004
28+
26,167305006
29+
27,16983000
30+
28,174255007
31+
29,182838006
32+
30,184107009
33+
31,185952002
34+
32,19943007
35+
33,224168007
36+
34,224209007
37+
35,224362002
38+
36,22569008
39+
37,226448008
40+
38,226452008
41+
39,228698009
42+
40,230690007
43+
41,23573002
44+
42,238131007
45+
43,246211005
46+
44,249572006
47+
45,250546000
48+
46,250564007
49+
47,250637003
50+
48,250668004
51+
49,251070002
52+
50,251071003
53+
51,251073000
54+
52,251075007
55+
53,25173007
56+
54,25469001
57+
55,263495000
58+
56,263605001
59+
57,26523005
60+
58,266918002
61+
59,26758005
62+
60,27051004
63+
61,271062006
64+
62,27113001
65+
63,271226002
66+
64,271285000
67+
65,271649006
68+
66,271650006
69+
67,271737000
70+
68,271921002
71+
69,272670002
72+
70,274075007
73+
71,27913002
74+
72,281395000
75+
73,281396004
76+
74,281397008
77+
75,282195009
78+
76,28317006
79+
77,289908002
80+
78,29857009
81+
79,300328001
82+
80,300995000
83+
81,301851003
84+
82,302019003
85+
83,304383000
86+
84,305342007
87+
85,309632008
88+
86,309904001
89+
87,312468003
90+
88,312469006
91+
89,312471006
92+
90,312472004
93+
91,313408009
94+
92,31542002
95+
93,33747003
96+
94,34486009
97+
95,34608000
98+
96,359846004
99+
97,359986008
100+
98,36048009
101+
99,363478007
102+
100,363800008
103+
101,363803005
104+
102,364075005
105+
103,365581002
106+
104,367391008
107+
105,370992007
108+
106,3716002
109+
107,37254006
110+
108,372567009
111+
109,372701006
112+
110,373216001
113+
111,373481003
114+
112,373713005
115+
113,373864002
116+
114,38082009
117+
115,38341003
118+
116,384759009
119+
117,384978002
120+
118,386725007
121+
119,386964000
122+
120,386965004
123+
121,386966003
124+
122,386967007
125+
123,386991003
126+
124,387070004
127+
125,387143009
128+
126,387186009
129+
127,387210001
130+
128,387713003
131+
129,389026000
132+
130,38929004
133+
131,39104002
134+
132,394733009
135+
133,394960005
136+
134,395828009
137+
135,395869000
138+
136,396451008
139+
137,39748002
140+
138,397669002
141+
139,405161002
142+
140,409073007
143+
141,409120009
144+
142,40930008
145+
143,413307004
146+
144,413320001
147+
145,414545008
148+
146,416800000
149+
147,417005
150+
148,418416001
151+
149,418835009
152+
150,419620001
153+
151,42351005
154+
152,423902002
155+
153,42525009
156+
154,427081008
157+
155,428630002
158+
156,429622005
159+
157,431314004
160+
158,43396009
161+
159,438173002
162+
160,442790008
163+
161,443527007
164+
162,446325007
165+
163,45007003
166+
164,457441000124102
167+
165,45896001
168+
166,461541000124102
169+
167,49436004
170+
168,50711007
171+
169,5113004
172+
170,53741008
173+
171,54706004
174+
172,55235003
175+
173,56265001
176+
174,56882008
177+
175,60153001
178+
176,60170009
179+
177,60621009
180+
178,61167004
181+
179,61928009
182+
180,62479008
183+
181,66493003
184+
182,66590003
185+
183,66842004
186+
184,67487000
187+
185,67776007
188+
186,67866001
189+
187,6797001
190+
188,68130003
191+
189,68615006
192+
190,68677007
193+
191,687005
194+
192,690051000119100
195+
193,70901006
196+
194,71388002
197+
195,714797009
198+
196,71960002
199+
197,73211009
200+
198,735146006
201+
199,75672003
202+
200,767002
203+
201,76752008
204+
202,77068002
205+
203,77176002
206+
204,776106002
207+
205,776110004
208+
206,776113002
209+
207,80515008
210+
208,82799009
211+
209,8319008
212+
210,84114007
213+
211,84229001
214+
212,85097005
215+
213,85899009
216+
214,86290005
217+
215,88810008

annotations/10_annotation.csv

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
,column_name,term_id
2+
0,class,419620001
3+
1,age,397669002
4+
2,sex,263495000
5+
3,steroid,116566001
6+
4,antivirals,372701006
7+
5,fatigue,84229001
8+
6,malaise,367391008
9+
7,anorexia,56882008
10+
8,liver_big,80515008
11+
9,liver_firm,300328001
12+
10,spleen_palpable,249572006
13+
11,spiders,None
14+
12,ascites,389026000
15+
13,varices,12856003
16+
14,bilirubin,359986008
17+
15,alk_phosphate,104866001
18+
16,sgot,45896001
19+
17,albumin,26758005
20+
18,protime,396451008
21+
19,histology,714797009

annotations/11_annotation.csv

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
,column_name,term_id
2+
0,vm1,364075005
3+
1,vm3,251071003
4+
2,vm4,251073000
5+
3,vm5,251075007
6+
4,vm13,82799009
7+
5,vm20,104847001
8+
6,vm28,457441000124102
9+
7,vm62,27913002
10+
8,vm136,394960005
11+
9,vm146,None
12+
10,vm172,165581004
13+
11,vm174,22569008
14+
12,vm176,55235003
15+
13,vm41,26523005
16+
14,vm42,108480007
17+
15,vm43,442790008
18+
16,vm44,66493003
19+
17,vm87,373216001

annotations/12_annotation.csv

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
,column_name,term_id
2+
0,Pregnancies,246211005
3+
1,Glucose,36048009
4+
2,BloodPressure,271650006
5+
3,SkinThickness,301851003
6+
4,Insulin,271226002
7+
5,BMI,60621009
8+
6,DiabetesPedigreeFunction,None
9+
7,Age,397669002
10+
8,Outcome,73211009

annotations/13_annotation.csv

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
,column_name,term_id
2+
0,Age,397669002
3+
1,Gender,263495000
4+
2,Total_Bilirubin,359986008
5+
3,Direct_Bilirubin,39748002
6+
4,Alkaline_Phosphotase,88810008
7+
5,Alamine_Aminotransferase,34608000
8+
6,Aspartate_Aminotransferase,45896001
9+
7,Total_Protiens,304383000
10+
8,Albumin,26758005
11+
9,Albumin_and_Globulin_Ratio,687005
12+
10,target,10200004

annotations/14_annotation.csv

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
,column_name,term_id
2+
0,age,397669002
3+
1,menopause,161712005
4+
2,tumor-size,263605001
5+
3,inv-nodes,443527007
6+
4,node-caps,68677007
7+
5,deg-malig,None
8+
6,breast,76752008
9+
7,breast-quad,272670002
10+
8,irradiat,461541000124102
11+
9,Class,25173007

0 commit comments

Comments
 (0)