Update app.py
app.py
CHANGED
@@ -999,29 +999,45 @@ target_datasets = {
 
 def get_korea_datasets():
     """Search for Korea-related datasets"""
-        "full": "True",
-        "limit": 1000
-    }
+    search_terms = ['korea', 'korean', 'kor']  # broadened search terms
+    all_korea_datasets = []
+
+    for term in search_terms:
+        params = {
+            "search": term,
+            "full": "True",
+            "limit": 10000  # widened search window
+        }
+
+        try:
+            response = requests.get(
+                "https://huggingface.co/api/datasets",
+                headers={'Authorization': f'Bearer {HF_TOKEN}'},
+                params=params
+            )
+
+            if response.status_code == 200:
+                datasets = response.json()
+                all_korea_datasets.extend(datasets)
+                print(f"Found {len(datasets)} datasets for search term '{term}'")
+            else:
+                print(f"Failed to fetch datasets for term '{term}': {response.status_code}")
+        except Exception as e:
+            print(f"Error fetching datasets for term '{term}': {str(e)}")
+
+    # Remove duplicates
+    seen_ids = set()
+    unique_datasets = []
+    for dataset in all_korea_datasets:
+        dataset_id = dataset.get('id', '')
+        if dataset_id and dataset_id not in seen_ids:
+            seen_ids.add(dataset_id)
+            unique_datasets.append(dataset)
+
+    print(f"Total unique Korea-related datasets found: {len(unique_datasets)}")
+    return unique_datasets
 
-def get_all_datasets(limit=3000):
+def get_all_datasets(limit=10000):  # raised default limit
     """Fetch all datasets plus the Korea-related ones"""
     all_datasets = []
     page_size = 1000
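The union of searches above can also go through the official client instead of raw requests. A minimal sketch, assuming a recent huggingface_hub release where list_datasets accepts search, limit, and token; the three terms and the 10000 cap come from the diff, while the function name is illustrative:

from huggingface_hub import list_datasets  # pip install huggingface_hub

def korea_dataset_ids(token=None):
    """Order-preserving union of dataset ids across the three search terms."""
    seen, ids = set(), []
    for term in ('korea', 'korean', 'kor'):
        for info in list_datasets(search=term, limit=10000, token=token):
            if info.id not in seen:  # same de-duplication as in the diff
                seen.add(info.id)
                ids.append(info.id)
    return ids

The client generally raises on HTTP errors rather than handing back a status code, so the manual status checks above would become a try/except around the loop.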
@@ -1033,27 +1049,37 @@ def get_all_datasets(limit=3000):
             'offset': offset
         }
 
+        try:
+            response = requests.get(
+                "https://huggingface.co/api/datasets",
+                headers={'Authorization': f'Bearer {HF_TOKEN}'},
+                params=params
+            )
+
+            if response.status_code == 200:
+                datasets = response.json()
+                all_datasets.extend(datasets)
+                print(f"Fetched datasets {offset+1} to {offset+len(datasets)}")
+            else:
+                print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
+                break
+        except Exception as e:
+            print(f"Error fetching datasets at offset {offset}: {str(e)}")
             break
 
     # Add the Korea search results
     korea_datasets = get_korea_datasets()
     existing_ids = {dataset.get('id', '') for dataset in all_datasets}
 
+    added_count = 0
     for korea_dataset in korea_datasets:
         if korea_dataset.get('id', '') not in existing_ids:
             all_datasets.append(korea_dataset)
             existing_ids.add(korea_dataset.get('id', ''))
+            added_count += 1
+
+    print(f"Added {added_count} additional Korea-related datasets")
+    print(f"Total datasets: {len(all_datasets)}")
 
     return all_datasets[:limit]
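The fetch loop above is plain offset/limit pagination with a fixed page size of 1000. A standalone sketch of the same pattern against a hypothetical JSON list endpoint; url, token, and the early exit on an empty page are illustrative, not part of the commit:

import requests

def paged(url, token=None, page_size=1000, max_items=10000):
    """Yield items from an offset/limit JSON endpoint until a page comes back empty."""
    headers = {'Authorization': f'Bearer {token}'} if token else {}
    for offset in range(0, max_items, page_size):
        response = requests.get(url, headers=headers,
                                params={'limit': page_size, 'offset': offset})
        response.raise_for_status()  # surface HTTP errors instead of printing them
        batch = response.json()
        if not batch:  # empty page: collection exhausted
            return
        yield from batch

Stopping on an empty page saves the requests the loop would otherwise issue past the end of the collection.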
@@ -1115,7 +1141,7 @@ def get_datasets_data(progress=gr.Progress()):
         else:
             filtered_datasets.append({
                 'id': dataset_id,
-                'global_rank': 'Not in top
+                'global_rank': 'Not in top 10000',
                 'downloads': 0,
                 'likes': 0,
                 'title': 'No Title',
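Note that global_rank now holds either an integer rank or the string sentinel 'Not in top 10000', so any downstream sort or comparison has to special-case the string. A small illustrative example with hypothetical rows:

# Hypothetical rows mixing numeric ranks with the string sentinel above.
rows = [
    {'id': 'a/ko-wiki', 'global_rank': 42},
    {'id': 'b/ko-news', 'global_rank': 'Not in top 10000'},
]
# Sort numeric ranks first and push sentinel rows to the end.
rows.sort(key=lambda r: (isinstance(r['global_rank'], str),
                         r['global_rank'] if isinstance(r['global_rank'], int) else 0))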
@@ -1154,9 +1180,9 @@ def get_datasets_data(progress=gr.Progress()):
         xaxis_title="Dataset ID",
         yaxis_title="Global Rank",
         yaxis=dict(
-            ticktext=[f"#{i}" for i in range(1,
-            tickvals=[
-            range=[0,
+            ticktext=[f"#{i}" for i in range(1, 10001, 100)],
+            tickvals=[10001 - i for i in range(1, 10001, 100)],
+            range=[0, 10000]
         ),
         height=800,
         showlegend=False,
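The tick arithmetic works because the chart plots 10001 - rank rather than the rank itself, so rank #1 lands at the top of a 0 to 10000 axis while the tick labels translate positions back into rank numbers. A minimal sketch of the same inversion with hypothetical data; only the yaxis settings mirror the diff:

import plotly.graph_objects as go

ranks = {'dataset-a': 1, 'dataset-b': 512, 'dataset-c': 9800}  # hypothetical
# Plot the inverted rank so smaller (better) ranks get taller bars.
fig = go.Figure(go.Bar(x=list(ranks), y=[10001 - r for r in ranks.values()]))
fig.update_layout(
    yaxis=dict(
        ticktext=[f"#{i}" for i in range(1, 10001, 100)],
        tickvals=[10001 - i for i in range(1, 10001, 100)],
        range=[0, 10000],
    )
)
fig.show()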