From e89ac3db2e54178dfedd5341d8808c3baa3a6a6a Mon Sep 17 00:00:00 2001 From: fozy81 Date: Sat, 17 Jun 2023 17:25:16 +0100 Subject: [PATCH 1/2] fix: remove duplicate word plural forms --- ODSCategories.json | 19 +------------------ merge_data.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/ODSCategories.json b/ODSCategories.json index 3eefe34..89e06b3 100644 --- a/ODSCategories.json +++ b/ODSCategories.json @@ -17,7 +17,6 @@ "memorial", "military", "museum", - "museums", "poems", "theatre" ], @@ -76,7 +75,6 @@ "shop", "shopping", "shopping centres", - "shops", "simd", "social security", "tenders", @@ -106,7 +104,6 @@ "council policies", "council tax", "councillors", - "councils", "data protection", "data zone lookup", "disability living allowance", @@ -134,21 +131,17 @@ "pupil", "school", "school meals", - "schools", "scqf", "sqa", "student" ], "Elections / Politics": [ "community council", - "community councils", "council area", "councillor", - "councillors", "democracy", "elecorate", "election", - "elections", "electoral", "electorate", "local authority", @@ -239,7 +232,6 @@ "health board", "hospital", "hospital admissions", - "hospitals", "human services", "implant", "induced abortion", @@ -304,20 +296,17 @@ ], "Law and Licensing": [ "court", - "courts", "crime", "law", "licence", "licenses", "licensing", "permit", - "permits", "police", "policing", "regulation", - "regulations", "tribunal", - "tribunals", "Fixed Penalty Notices" + "Fixed Penalty Notices" ], "Parks / Recreation": [ "features of interest", @@ -390,8 +379,6 @@ "parks", "societies", "sport", - "sport", - "sports", "sports activities", "sports clubs", "sports facilities", "Community Centres" @@ -402,7 +389,6 @@ "cafes", "historic buildings", "hotel", - "hotels", "public toilets", "restaurants", "tourism", @@ -420,7 +406,6 @@ "commuting", "core paths", "cycle", - "cycles", "cycling", "electric vehicle", "electric vehicle charging points", @@ -440,9 +425,7 
@@ "road closures", "road safety", "road works", - "roads", "salt bin", - "salt bins", "scotrail", "speed limits", "station", diff --git a/merge_data.py b/merge_data.py index f81e435..382f33f 100644 --- a/merge_data.py +++ b/merge_data.py @@ -229,7 +229,23 @@ def tidy_categories(categories_string): data["OriginalTags"] = data["OriginalTags"].apply(tidy_categories) data["ManualTags"] = data["ManualTags"].apply(tidy_categories) - ### Creating dataset categories for ODS + ### Creating dataset categories for ODS + def remove_trailing_s(string): + """Remove trailing 's' from all words in string to remove requirement to search for plural categories in + + Args: + string: String to remove trailing 's' from + + Returns: + string: the resulting string, with trailing 's' removed from all words. + """ + s = [] + words = string.split() + for word in words: + s.append(re.sub('s$', "", word)) + sentence = ' '.join(s) + return sentence + def find_keyword(str_tofind, str_findin): """Finds if single word or phrase exists in string @@ -240,6 +256,8 @@ def find_keyword(str_tofind, str_findin): Returns: boolean: True if match is found """ + str_findin = remove_trailing_s(str_findin) + str_tofind = remove_trailing_s(str_tofind) if re.search(r"\b" + re.escape(str_tofind) + r"\b", str_findin, re.I): return True return False From 9385b6394ce30f157237a1022e53946d9b27bf05 Mon Sep 17 00:00:00 2001 From: fozy81 Date: Sat, 17 Jun 2023 19:15:06 +0100 Subject: [PATCH 2/2] fix: remove plural duplicates - remove trailing 's' or ' S' (case insensitive) --- merge_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merge_data.py b/merge_data.py index 382f33f..3a5d922 100644 --- a/merge_data.py +++ b/merge_data.py @@ -242,7 +242,7 @@ def remove_trailing_s(string): s = [] words = string.split() for word in words: - s.append(re.sub('s$', "", word)) + s.append(re.sub('[Ss]$', "", word)) sentence = ' '.join(s) return sentence