A-Simple-and-Effective-Framework-for-Strict-Zero-Shot-Hierarchical-Classification/dataloader.py at main · RohanVB/A-Simple-and-Effective-Framework-for-Strict-Zero-Shot-Hierarchical-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import pandas as pd
import random


# # WOS Dataset

# ## Note: WOS results in the paper are only reported for depth=2.

# In[ ]:


# The WOS data must be downloaded from
# https://data.mendeley.com/datasets/9rw3vkcfy4/6; only Meta-data/Data.xlsx
# is used here.

data_path_wos = "Meta-data/Data.xlsx"
# Read the spreadsheet, then discard every row that has a missing value.
df_wos = pd.read_excel(data_path_wos)
df_wos = df_wos.dropna()


# In[ ]:


# Weight each row by the size of its 'area' group, normalise the weights to
# sum to 1, and draw 3000 rows without replacement using those weights.
area_groups = df_wos.groupby('area')['area']
freg = area_groups.transform('count')
prob = freg.div(freg.sum())
df_sampled = df_wos.sample(
    n=3000,
    weights=prob.tolist(),
    replace=False,
)


# In[ ]:


# Per-area row counts of the sample, computed once and reused for the plot
# (the original recomputed value_counts() for the display and twice more for
# the bar chart).
area_counts = df_sampled.area.value_counts()
area_counts  # notebook display only; no effect when run as a script


# In[ ]:


import matplotlib.pyplot as plt

# One bar per area, positioned 0..n-1 in value_counts order.
plt.bar(range(len(area_counts)), area_counts.tolist())


# In[ ]:


# Hold out two randomly chosen WOS domains as the "unseen" label set.
# Sorting gives random.sample a stable population order (iteration order of a
# set of strings can differ between interpreter runs), so seeding `random`
# beforehand makes the split reproducible.
labels = sorted(set(df_wos.Domain))

unseen_list = random.sample(labels, 2)
seen_list = [value for value in labels if value not in unseen_list]

# NOTE(review): the original indexed a frame named `test`, which is never
# defined in this file (NameError when run as a script). `df_sampled` is
# presumably the intended evaluation frame -- confirm.
# Series.isin replaces DataFrame(...).isin(...).any(1): passing the axis
# positionally to DataFrame.any is rejected by pandas >= 2.0.
test_unseen = df_sampled[df_sampled.Domain.isin(unseen_list)]
test_unseen.to_csv('test.csv')


# # Amazon Beauty Dataset

# ## Note: Based on the number of nodes, the categories may have to be modified. The code for "depth=2" is commented out and "depth=3" is utilized below.

# In[ ]:


# The All_Beauty category metadata must be downloaded from
# https://amazon-reviews-2023.github.io/ before running this section.

data_path = 'datasets/Beauty'

# The metadata file is read as line-delimited JSON (one record per line).
beauty_meta_path = f"{data_path}/beauty_meta"
df_beauty = pd.read_json(beauty_meta_path, lines=True)


# In[ ]:


# Keep only the two columns used below. The explicit copy detaches the result
# from the loaded frame, so the column assignment that follows does not
# trigger pandas' SettingWithCopyWarning.
df_beauty = df_beauty[['description', 'categories']].copy()
# Replace each 'categories' value with its first element.
df_beauty['categories'] = df_beauty['categories'].map(lambda entry: entry[0])

# two_categories = [l[1:3] for l in df_beauty.categories]
# Elements 1..3 of every category entry (the depth=3 setting).
three_categories = [path[1:4] for path in df_beauty.categories]
# Distinct last elements of those slices (notebook display only). Empty slices
# are skipped so an entry with fewer than two elements cannot raise
# IndexError on path[-1].
{path[-1] for path in three_categories if path}


# In[ ]:


# df_beauty['category_limit'] = two_categories
df_beauty['category_limit'] = three_categories

# Drop rows with missing values, then record how many elements each
# truncated category entry holds.
df_beauty = df_beauty.dropna()
df_beauty['len'] = df_beauty['category_limit'].str.len()

# df_beauty = df_beauty.loc[df_beauty['len'] == 2]
# Keep only rows whose entry has exactly three elements. The copy detaches the
# filtered frame so adding columns below does not raise
# SettingWithCopyWarning.
df_beauty = df_beauty.loc[df_beauty['len'] == 3].copy()

# Spread the three-element entry into one column per position.
for level_column, level_index in (
    ('category_parent', 0),
    ('category_pchild', 1),
    ('category_child', 2),
):
    # Bind the index as a default argument so each lambda keeps its own value.
    df_beauty[level_column] = df_beauty.category_limit.map(
        lambda path, i=level_index: path[i]
    )


# In[ ]:


# Bare expression kept from the notebook export: it displays the processed
# frame in a notebook cell and has no effect when the file runs as a script.
df_beauty


# In[ ]:


# Hold out two randomly chosen top-level categories as the "unseen" label set.
# Sorting gives random.sample a stable population order (iteration order of a
# set of strings can differ between interpreter runs), so seeding `random`
# beforehand makes the split reproducible.
labels = sorted(set(df_beauty.category_parent))

unseen_list = random.sample(labels, 2)
seen_list = [value for value in labels if value not in unseen_list]

# NOTE(review): the original indexed a frame named `test`, which is never
# defined in this file (NameError when run as a script). `df_beauty` is
# presumably the intended evaluation frame -- confirm.
# Series.isin replaces DataFrame(...).isin(...).any(1): passing the axis
# positionally to DataFrame.any is rejected by pandas >= 2.0.
test_unseen = df_beauty[df_beauty.category_parent.isin(unseen_list)]

# NOTE(review): the WOS section of this file writes to the same 'test.csv',
# so running both sections leaves only this output on disk -- confirm that is
# intended.
test_unseen.to_csv('test.csv')