-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataloader.py
More file actions
118 lines (57 loc) · 2.52 KB
/
dataloader.py
File metadata and controls
118 lines (57 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import random
# # WOS Dataset
# ## Note: WOS results in the paper are only reported for depth=2.
# In[ ]:
# Data must be downloaded from https://data.mendeley.com/datasets/9rw3vkcfy4/6
# We utilize Meta-data/Data.xlsx
# Path of the WOS metadata spreadsheet, relative to the working directory.
data_path_wos = "Meta-data/Data.xlsx"
# Read the spreadsheet, then discard every row that contains a missing value.
df_wos = pd.read_excel(data_path_wos)
df_wos = df_wos.dropna()
# In[ ]:
# Give every row a weight equal to the number of rows sharing its 'area'
# value, so rows from larger areas are more likely to be drawn.
area_groups = df_wos.groupby('area')['area']
freg = area_groups.transform('count')
# Normalise the per-row weights so that they sum to 1.
prob = freg.div(freg.sum())
# Draw 3000 distinct rows with those weights (no seed is set, so the sample
# differs from run to run).
df_sampled = df_wos.sample(
    n=3000,
    replace=False,
    weights=prob.tolist(),
)
# In[ ]:
# Notebook cell output: number of sampled rows per area.
df_sampled.area.value_counts()
# In[ ]:
import matplotlib.pyplot as plt

# Bar chart of the number of sampled rows per area. The counts are computed
# once and reused for both the bar positions (0..n-1) and the bar heights;
# bars appear in the order value_counts() returns them.
area_counts = df_sampled.area.value_counts()
plt.bar(list(range(len(area_counts))), area_counts.tolist())
# In[ ]:
# Split the WOS domains into "seen" and "unseen" label sets: two distinct
# domains are picked at random as unseen (no seed is set, so the choice
# differs from run to run); all remaining domains are "seen".
labels = list(set(df_wos.Domain))
unseen_list = random.sample(labels, 2)
seen_list = [value for value in labels if value not in unseen_list]
# NOTE(review): `test` is not defined anywhere in this file, so this cell
# raises NameError unless `test` is provided beforehand -- presumably it is
# a held-out split of the WOS data; TODO confirm where it should come from.
# Boolean mask of the test rows whose Domain is one of the unseen domains.
# `axis` is passed by keyword because DataFrame.any() only accepts keyword
# arguments from pandas 2.0 on; the positional form `.any(1)` raises TypeError.
unseen_mask = pd.DataFrame(test.Domain.tolist()).isin(unseen_list).any(axis=1).values
test_unseen = test[unseen_mask]
# NOTE(review): the Amazon Beauty section further down writes to the same
# 'test.csv', so running the whole script overwrites this file.
test_unseen.to_csv('test.csv')
# # Amazon Beauty Dataset
# ## Note: Based on the number of nodes, the categories may have to be modified. The code for "depth=2" is commented out and "depth=3" is utilized below.
# In[ ]:
# this dataset must be downloaded from https://amazon-reviews-2023.github.io/ under the category of All_Beauty
# Directory holding the Amazon Beauty files, relative to the working directory.
data_path = 'datasets/Beauty'
# The metadata file is read as JSON Lines (one JSON record per line).
beauty_meta_file = f'{data_path}/beauty_meta'
df_beauty = pd.read_json(beauty_meta_file, lines=True)
# In[ ]:
# Keep only the two columns used below. The explicit copy makes df_beauty an
# independent frame, so the .loc assignment that follows does not trigger
# pandas' SettingWithCopyWarning.
df_beauty = df_beauty[['description', 'categories']].copy()
# Replace every 'categories' entry by its first element.
# NOTE(review): presumably each entry is a list of category paths and [0]
# selects the first path -- confirm against the downloaded data.
df_beauty.loc[:, 'categories'] = df_beauty.categories.map(lambda x: x[0])
# depth=2 alternative (disabled):
# two_categories = [path[1:3] for path in df_beauty.categories]
# depth=3: keep elements 1..3 of every category path (element 0 is skipped).
three_categories = [path[1:4] for path in df_beauty.categories]
# Notebook cell output: the distinct last elements of the truncated paths.
{path[-1] for path in three_categories}
# In[ ]:
# depth=2 alternative (disabled):
# df_beauty['category_limit'] = two_categories
# Attach the truncated (depth=3) category paths as a new column.
df_beauty['category_limit'] = three_categories
# Drop rows with any missing value; the copy keeps the column assignment
# below on an independent frame rather than on a slice of the old one.
df_beauty = df_beauty.dropna().copy()
# Number of levels present in each truncated path.
df_beauty['len'] = df_beauty['category_limit'].str.len()
# depth=2 alternative (disabled):
# df_beauty = df_beauty.loc[df_beauty['len'] == 2]
# Keep only rows whose truncated path has exactly three levels. The explicit
# copy avoids SettingWithCopyWarning on the .loc assignments that follow.
df_beauty = df_beauty.loc[df_beauty['len'] == 3].copy()
# Split the three levels of each path into separate columns.
df_beauty.loc[:, 'category_parent'] = df_beauty.category_limit.map(lambda x: x[0])
df_beauty.loc[:, 'category_pchild'] = df_beauty.category_limit.map(lambda x: x[1])
df_beauty.loc[:, 'category_child'] = df_beauty.category_limit.map(lambda x: x[2])
# In[ ]:
# Notebook cell output: the prepared frame.
df_beauty
# In[ ]:
# Split the Beauty parent categories into "seen" and "unseen" label sets: two
# distinct parents are picked at random as unseen (no seed is set, so the
# choice differs from run to run); all remaining parents are "seen".
labels = list(set(df_beauty.category_parent))
unseen_list = random.sample(labels, 2)
seen_list = [value for value in labels if value not in unseen_list]
# NOTE(review): `test` is not defined anywhere in this file, so this cell
# raises NameError unless `test` is provided beforehand -- presumably it is
# a held-out split of df_beauty; TODO confirm where it should come from.
# Boolean mask of the test rows whose parent category is an unseen one.
# `axis` is passed by keyword because DataFrame.any() only accepts keyword
# arguments from pandas 2.0 on; the positional form `.any(1)` raises TypeError.
unseen_mask = pd.DataFrame(test.category_parent.tolist()).isin(unseen_list).any(axis=1).values
test_unseen = test[unseen_mask]
# NOTE(review): the WOS section earlier in this file writes to the same
# 'test.csv'; running the whole script leaves only this Beauty split on disk.
test_unseen.to_csv('test.csv')