Skip to content

Commit

Permalink
Do not mistake "océan indien" as IN + escape ’ character as '
Browse files Browse the repository at this point in the history
  • Loading branch information
Roxane committed Feb 6, 2024
1 parent c5054aa commit 35ea160
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)

- Support for Python 3.12
- Detection for DE and FR subdivisions using NUTS codes
- Escape the typographic apostrophe ’ as a plain ' when normalizing strings

### Fixed

- Do not mistake "océan indien" as IN

### Dependencies

Expand Down
6 changes: 4 additions & 2 deletions geoconvert/data/countries.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
"south sudan": "SS", # en
"sudsudan": "SS", # de
"sudan del sur": "SS", # es
# Make sure we never mistake "océan Indien" for IN
"territoire britannique de l'ocean indien": "IO", # fr
"ocean indien": None, # fr
"indien": "IN", # de
# Make sure we never mistake some countries or subdivisions for IS (Iceland)
# We would mistake them because Iceland spells Island in German.
"prince edward island": "CA", # en
Expand Down Expand Up @@ -222,7 +226,6 @@
"norvege": "NO",
"nouvelle caledonie": "NC",
"nouvelle zelande": "NZ",
"ocean indien": "IO",
"oman": "OM",
"ouganda": "UG",
"ouzbekistan": "UZ",
Expand Down Expand Up @@ -651,7 +654,6 @@
"guyana": "GY",
"haiti": "HT",
"honduras": "HN",
"indien": "IN",
"indonesien": "ID",
"irak": "IQ",
"iran": "IR",
Expand Down
4 changes: 4 additions & 0 deletions geoconvert/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,14 @@ def safe_string(text):
'washington dc'
>>> safe_string('How aRE yOU?')
'how are you'
>>> safe_string("l’ocean")
"l'ocean"
"""
text = remove_accents(text)
# Replace "-" and ":" with a whitespace
text = re.sub(r"[-:]", " ", text)
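# Replace typographic apostrophe variants with the plain ASCII apostrophe (')
# so they survive the character filter below instead of being stripped.
# NOTE(review): the doctest above uses U+2019 (’); confirm the character
# class on the next line actually contains that code point.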
text = re.sub(r"[ʼ]", "'", text)
# Only keep word or space characters as well as "_", and "'".
text = re.sub(r"[^\w\s']", "", text)
# Always remove multiple whitespaces at the very last minute
Expand Down
5 changes: 5 additions & 0 deletions tests/test_countries.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ class TestCountries:
("sudan del sur", {}, "SS"), # es
("Soudan du Sud", {}, "SS"), # fr
("sudao do sul", {}, "SS"), # pt
# No confusion between the Indian Ocean, the British Indian Ocean Territory,
# and the German name for India ("Indien")
("indien", {}, "IN"), # de
("Territoire britannique de l’océan Indien", {}, "IO"), # fr
("Océan Indien", {}, None), # fr
# No confusion with Iceland, which spells "Island" in German ("IS")
("Cayman islands", {}, "KY"), # en
("Christmas island", {}, "CX"), # en
Expand Down

0 comments on commit 35ea160

Please sign in to comment.