-
Notifications
You must be signed in to change notification settings - Fork 0
/
abs_score.py
111 lines (98 loc) · 4.8 KB
/
abs_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
import pycountry
import numpy as np
# convert country names to ISO 3166-1 alpha-2 or alpha-3 codes -- might not be needed anymore
def convert_country_name_to_iso(country_name, code_type="alpha-2"):
try:
country = pycountry.countries.search_fuzzy(country_name)[0]
if code_type == "alpha-2":
return country.alpha_2
elif code_type == "alpha-3":
return country.alpha_3
else:
return None
except LookupError:
return None
# Define the scope and credentials for accessing the Google Spreadsheet
scope = [
"https://spreadsheets.google.com/feeds",
"https://www.googleapis.com/auth/drive",
]
credentials = ServiceAccountCredentials.from_json_keyfile_name(
"credentials.json", scope
)
# Authenticate and open the Google Spreadsheet - needs to be a 'real' gspreadsheet!
gc = gspread.authorize(credentials)
spreadsheet = gc.open_by_key("google spreadsheet key")
# Load the "ABSLevels" tab as a pandas DataFrame
abs_levels_sheet = spreadsheet.worksheet("ABSLevels")
abs_levels_data = abs_levels_sheet.get_all_records()
abs_levels_df = pd.DataFrame(abs_levels_data)
# replace empty strings by nan
abs_levels_df = abs_levels_df.replace("", np.nan)
# Load the "MissingFoods20230508" tab as a pandas DataFrame
missing_foods_sheet = spreadsheet.worksheet("Curation20230517")
missing_foods_data = missing_foods_sheet.get_all_records()
missing_foods_df = pd.DataFrame(missing_foods_data)
# replace empty strings by nan
missing_foods_df = missing_foods_df.replace("", np.nan)
# Load the result tab
results_sheet = spreadsheet.worksheet("resultsABS")
# Iterate over the rows in the "MissingFoods20230508" DataFrame
for index, row in missing_foods_df.iterrows():
specimen_location = row["Specimen Origin Location"]
origin_variety = row["Country of origin/Variety"]
origin_species = row["Country of origin/ Species level"]
abs_score_specimen_location = 4
abs_score_origin_variety = 4
abs_score_origin_species = 4
# Check if the specimen location is not empty
if pd.notna(specimen_location):
if specimen_location != "unknown":
# Find the corresponding ABS score in the "ABSLevels" DataFrame based on the country code
abs_score = abs_levels_df.loc[
abs_levels_df["Country code"] == specimen_location, "ABS score"
].values
if pd.notna(abs_score):
abs_score_specimen_location = abs_score
# if both country of originin (variety and species) are empty, keep the ABS of the specimen location - can be simplify if unknown == 4
if not pd.notna(origin_variety) and not pd.notna(origin_species):
abs_score_origin_variety = abs_score_specimen_location
abs_score_origin_species = abs_score_specimen_location
else:
# Look at Country of Origin/diversity Variety
if pd.notna(origin_variety):
country_list = origin_variety.split(",")
# Find the corresponding ABS score in the "ABSLevels" DataFrame based on the country code list and keep the highest
abs_score_origin_variety = abs_levels_df.loc[
abs_levels_df["Country name"].isin(country_list), "ABS score"
].max()
else:
# if country of originin variety is empty, keep the abs of the specimen location - remove if we want to keep 4
abs_score_origin_variety = abs_score_specimen_location
# Look at Country of Origin/diversity Species
if pd.notna(origin_species):
country_list = origin_species.split(",")
# Find the corresponding ABS score in the "ABSLevels" DataFrame based on the country code list and keep the highest
abs_score_origin_species = abs_levels_df.loc[
abs_levels_df["Country name"].isin(country_list), "ABS score"
].max()
else:
# if country of originin species is empty, keep the abs of the specimen location - remove if we want to keep 4
abs_score_origin_species = abs_score_specimen_location
# Update the value in the "MissingFoods20230508" DataFrame under "Specimen ABS Status" by keeping the highest score between the different ABS scores
missing_foods_df.at[index, "Specimen ABS Status"] = max(
abs_score_specimen_location, abs_score_origin_variety, abs_score_origin_species
)
# replace nan for json export
missing_foods_df = missing_foods_df.replace(np.nan, "")
# Clear the existing "MissingFoods20230508" tab and write the updated data
results_sheet.clear()
results_sheet.update(
[missing_foods_df.columns.values.tolist()] + missing_foods_df.values.tolist()
)
# Print the updated "MissingFoods20230508" DataFrame
print("Updated MissingFoods20230508:")
# print(missing_foods_df)