Skip to content

Commit

Permalink
Merge pull request #240 from fozy81/main
Browse files Browse the repository at this point in the history
fix: remove duplicate word plural forms
  • Loading branch information
JackGilmore authored Jul 28, 2023
2 parents 0d8a1a1 + 9385b63 commit 4a5bfdb
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 19 deletions.
19 changes: 1 addition & 18 deletions ODSCategories.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
"memorial",
"military",
"museum",
"museums",
"poems",
"theatre"
],
Expand Down Expand Up @@ -76,7 +75,6 @@
"shop",
"shopping",
"shopping centres",
"shops",
"simd",
"social security",
"tenders",
Expand Down Expand Up @@ -106,7 +104,6 @@
"council policies",
"council tax",
"councillors",
"councils",
"data protection",
"data zone lookup",
"disability living allowance",
Expand Down Expand Up @@ -134,21 +131,17 @@
"pupil",
"school",
"school meals",
"schools",
"scqf",
"sqa",
"student"
],
"Elections / Politics": [
"community council",
"community councils",
"council area",
"councillor",
"councillors",
"democracy",
"elecorate",
"election",
"elections",
"electoral",
"electorate",
"local authority",
Expand Down Expand Up @@ -239,7 +232,6 @@
"health board",
"hospital",
"hospital admissions",
"hospitals",
"human services",
"implant",
"induced abortion",
Expand Down Expand Up @@ -304,20 +296,17 @@
],
"Law and Licensing": [
"court",
"courts",
"crime",
"law",
"licence",
"licenses",
"licensing",
"permit",
"permits",
"police",
"policing",
"regulation",
"regulations",
"tribunal",
"tribunals", "Fixed Penalty Notices"
"Fixed Penalty Notices"
],
"Parks / Recreation": [
"features of interest",
Expand Down Expand Up @@ -390,8 +379,6 @@
"parks",
"societies",
"sport",
"sport",
"sports",
"sports activities",
"sports clubs",
"sports facilities", "Community Centres"
Expand All @@ -402,7 +389,6 @@
"cafes",
"historic buildings",
"hotel",
"hotels",
"public toilets",
"restaurants",
"tourism",
Expand All @@ -420,7 +406,6 @@
"commuting",
"core paths",
"cycle",
"cycles",
"cycling",
"electric vehicle",
"electric vehicle charging points",
Expand All @@ -440,9 +425,7 @@
"road closures",
"road safety",
"road works",
"roads",
"salt bin",
"salt bins",
"scotrail",
"speed limits",
"station",
Expand Down
20 changes: 19 additions & 1 deletion merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,23 @@ def tidy_categories(categories_string):
data["OriginalTags"] = data["OriginalTags"].apply(tidy_categories)
data["ManualTags"] = data["ManualTags"].apply(tidy_categories)

### Creating dataset categories for ODS
### Creating dataset categories for ODS
def remove_trailing_s(string):
    """Remove the trailing 's' from every word in a string.

    Stripping the plural 's' from both a keyword and the text being searched
    removes the requirement to list plural forms of categories separately.

    A word ends at any regex word boundary, not only at whitespace, so a
    plural followed by punctuation (e.g. "schools;hospitals" or "schools,")
    is reduced as well.

    Args:
        string: String to remove trailing 's' from.
    Returns:
        string: the resulting string, with trailing 's' (upper or lower case)
            removed from all words and the whitespace-separated tokens
            re-joined with single spaces.
    """
    # split()/join() keeps the original whitespace normalisation; the
    # pattern drops an 's' that ends a word even when punctuation follows it.
    return " ".join(re.sub(r"[Ss]\b", "", word) for word in string.split())

def find_keyword(str_tofind, str_findin):
    """Report whether a single word or phrase occurs in a string.

    Both arguments are first passed through remove_trailing_s, so a keyword
    and the text are compared with trailing 's' characters stripped. The
    search is case-insensitive and only matches on word boundaries.

    Args:
        str_tofind: Word or phrase to look for.
        str_findin: Text to search in.
    Returns:
        boolean: True if match is found
    """
    haystack = remove_trailing_s(str_findin)
    needle = remove_trailing_s(str_tofind)
    # Escape the keyword so any regex metacharacters in it are matched literally.
    pattern = r"\b" + re.escape(needle) + r"\b"
    return re.search(pattern, haystack, re.I) is not None
Expand Down

0 comments on commit 4a5bfdb

Please sign in to comment.