Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add initial scripts to cleanup data #1

Merged
merged 11 commits into from
May 9, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.vscode
.vscode/
data/


# Byte-compiled / optimized / DLL files
Expand Down
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,16 @@ pip install -r requirements.txt

## Running the scripts

TBD - but probably something like this:
The basic format is `python cleanup.py <command> --input <path/to/input.xlsx>`

For example:
```
python cleanup /path/to/file1.xlsx /path/to/file2.xlsx
python cleanup.py all-covid-calls --input "Data from 4.2.20 Fake Data.xlsx"
python cleanup.py --debug keep-calm-with-covid --input "Data from 4.2.20 Fake Data.xlsx" --sheetname "Uncleaned data type 2 VIA LINK"
```

If you want to see the basic usage, you can run `python cleanup.py`, and for a specific command you can use the `--help` flag:

```
python cleanup.py all-covid-calls --help
```
84 changes: 84 additions & 0 deletions cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import logging

# Configure the root logger: INFO level, timestamped "[LEVEL] message" lines,
# emitted through a single StreamHandler. This call runs before the remaining
# imports below are executed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler()],
)
import os
import sys

import click
import pandas as pd

from cleanup_all_covid_calls import cleanup as cleanup_all_covid_calls
from cleanup_keep_calm_with_covid import (
CONVERTERS,
cleanup as cleanup_keep_calm_with_covid,
)
from utils import write_output_file


@click.group()
@click.option("--debug/--no-debug", default=False)
@click.pass_context
def cleanup(ctx, debug):
    # Root command group. It only records the --debug/--no-debug flag on the
    # click context so each subcommand can read it from ctx.obj["DEBUG"].
    # (Kept as comments rather than a docstring so the --help text is unchanged.)
    shared_state = ctx.ensure_object(dict)
    shared_state["DEBUG"] = debug


@cleanup.command()
@click.pass_context
@click.option(
    "--input",
    "infile",
    required=True,
    help="Path to the input spreadsheet (.xlsx file)",
)
@click.option("--sheetname", default=None, help="Name of the sheet to use")
@click.option(
    "--output",
    default="data/all_covid_calls_cleaned.xlsx",
    help="Path to the output spreadsheet (cleaned .xlsx file)",
)
def all_covid_calls(ctx, infile, sheetname, output):
    # Subcommand: read the input workbook, run the All COVID Calls cleanup and
    # write the cleaned result to `output`.
    debug_enabled = ctx.obj["DEBUG"]
    if debug_enabled:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)
        logging.debug("Running in debug mode")

    logging.debug(f"Reading input file '{infile}'")
    # NOTE(review): with the default --sheetname (None) read_excel is asked for
    # every sheet, and cleanup_all_covid_calls looks sheets up by name. Passing
    # --sheetname would hand it a single sheet instead -- confirm that case is
    # meant to be supported.
    workbook = pd.read_excel(infile, sheet_name=sheetname)

    logging.info("Cleaning data for All COVID Calls Dashboard")
    cleaned = cleanup_all_covid_calls(workbook)

    logging.info(f"Writing data for All COVID Calls Dashboard to '{output}'")
    write_output_file(cleaned, output)


@cleanup.command()
@click.pass_context
@click.option(
    "--input",
    "infile",
    required=True,
    help="Path to the input spreadsheet (.xlsx file)",
)
@click.option("--sheetname", required=True, help="Name of the sheet to use")
@click.option(
    "--output",
    default="data/keep_calm_with_covid_cleaned.xlsx",
    help="Path to the output spreadsheet (cleaned .xlsx file)",
)
def keep_calm_with_covid(ctx, infile, sheetname, output):
    # Subcommand: read one sheet of the input workbook, run the Keep Calm with
    # COVID cleanup on it and write the cleaned result to `output`.
    #
    #   ctx       -- click context; ctx.obj["DEBUG"] is set by the `cleanup` group
    #   infile    -- path given via --input
    #   sheetname -- sheet to read (required)
    #   output    -- destination path for the cleaned spreadsheet
    if ctx.obj["DEBUG"]:
        logging.getLogger().setLevel(logging.DEBUG)
        logging.debug("Running in debug mode")
    logging.debug(f"Reading input file '{infile}'")
    # CONVERTERS forces the needs columns to be read as str.
    df = pd.read_excel(infile, sheet_name=sheetname, converters=CONVERTERS)
    logging.info("Cleaning data for Keep Calm with COVID Dashboard")
    # The cleanup function returns the cleaned frame; it must be captured,
    # otherwise the raw input sheet would be written out unchanged.
    df = cleanup_keep_calm_with_covid(df)
    logging.info(f"Writing data for Keep Calm with COVID Dashboard to '{output}'")
    write_output_file(df, output)


# Script entry point: invoke the click group with an empty dict as the shared
# context object (the group stores the DEBUG flag in it).
if __name__ == "__main__":
    cleanup(obj={})
130 changes: 130 additions & 0 deletions cleanup_all_covid_calls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
import numpy as np
from datetime import datetime
from utils import (
explode_needs,
get_lat,
get_lng,
replacements,
write_output_file,
)

# Turn off pandas' chained-assignment check for the whole process; this is a
# global pandas option, so it also applies to any module that imports this one.
pd.options.mode.chained_assignment = None


def cleanup(dfs):
    """Build the cleaned dataset for the All COVID Calls dashboard.

    Args:
        dfs: Mapping of sheet name to DataFrame. It must contain the sheets
            "Uncleaned data type 1 VIA LINK" and
            "Uncleaned Data from 232-Help", each with the required columns
            listed below.

    Returns:
        A DataFrame combining both sources, with an age-group column, a
        "Data From" column, Latitude/Longitude columns, and the needs merged
        into "Concerns/Needs - Concerns/Needs" and passed through
        ``explode_needs``.

    Raises:
        KeyError: If a required sheet or column is missing.
    """
    cn = "Concerns/Needs - Concerns/Needs"
    basic_needs = "Needs - Basic Needs Requested"
    dob_col = "Client Information - Date of Birth"
    age_col = "Client Information - Age Group"
    program_col = "Contact Source - Program "  # ending space is needed

    # step 1
    # select required columns from VIA LINK's Disaster Form
    # pretty sure the disaster form is "Uncleaned data type 1 VIA LINK"
    VIA_LINK_REQUIRED_COLUMNS_DISASTER = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        age_col,
        "Client Information - Call Type",
        "Client Information - Identifies as",
        cn,
        program_col,
        basic_needs,
    ]
    # .copy() so the column edits below act on an independent frame rather
    # than on a slice of the caller's sheet.
    vialink1_df = dfs["Uncleaned data type 1 VIA LINK"][
        VIA_LINK_REQUIRED_COLUMNS_DISASTER
    ].copy()

    # step 2
    # select required columns from 232-Help's Disaster Form
    TWO32_HELP_REQUIRED_COLUMNS = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        dob_col,
        "Client Information - Call Type",
        "Call Outcome - What concerns/needs were identified?",
        "Client Information - Identifies as",
        basic_needs,
    ]
    two32_help_df = dfs["Uncleaned Data from 232-Help"][
        TWO32_HELP_REQUIRED_COLUMNS
    ].copy()

    # step 3
    # Create age ranges from date of birth
    # use ranges 0-5, 6-12, 13-17, 18-24, 25-40, 41-59, 60+.
    now = datetime.now()
    bins = [0, 5, 12, 17, 24, 40, 59, 150]
    # One label per bin interval; the labels must match the bin edges above.
    labels = ["0-5", "6-12", "13-17", "18-24", "25-40", "41-59", "60+"]
    # Unparseable dates become NaT and therefore end up without an age group.
    dob = pd.to_datetime(two32_help_df[dob_col], errors="coerce")
    # NOTE(review): the year-unit astype works with the pinned pandas 1.0.3;
    # confirm it is still accepted before upgrading pandas.
    years_old = (now - dob).astype("timedelta64[Y]")
    two32_help_df[age_col] = pd.cut(
        years_old, bins=bins, labels=labels, include_lowest=True
    )
    # remove original Date of Birth column
    two32_help_df = two32_help_df.drop(columns=[dob_col])

    # step 4
    # add "Data From" column
    vialink1_df["Data From"] = "VIA LINK"
    two32_help_df["Data From"] = "232-HELP"

    # step 5
    # add data to master spreadsheet
    # first merge "Call Outcome - What concerns/needs were identified" from
    # 232-HELP into "Concerns/Needs - Concerns/Needs"
    two32_help_df = two32_help_df.rename(
        columns={"Call Outcome - What concerns/needs were identified?": cn}
    )

    # new steps
    # cleanup invalid values: this specific datetime in the program column is
    # treated as a bad cell and blanked out
    vialink1_df[program_col] = vialink1_df[program_col].replace(
        to_replace=datetime(2001, 2, 1, 0, 0), value=np.nan
    )

    # then combine data
    master_df = pd.concat(
        [vialink1_df, two32_help_df], join="outer", ignore_index=True
    )

    # step 6
    # add lat/lon columns
    master_df["Latitude"] = master_df["PostalCode"].apply(get_lat)
    master_df["Longitude"] = master_df["PostalCode"].apply(get_lng)

    # step 7
    # first put the values from "Needs - Basic Needs Requested" into
    # "Concerns/Needs - Concerns/Needs" (non-null values joined with "; ")
    master_df["all_needs"] = master_df[[cn, basic_needs]].apply(
        lambda row: "; ".join(row[row.notnull()]), axis=1
    )
    master_df = master_df.drop(columns=[cn, basic_needs])
    master_df = master_df.rename(columns={"all_needs": cn})
    master_df = explode_needs(master_df, cn)

    # step 8
    # cleanup Concerns/Needs
    master_df[cn] = master_df[cn].str.strip()
    master_df = master_df[
        ~master_df[cn].isin(["Hangup / Wrong Number", "Hangup / Wrong #"])
    ]
    master_df = master_df.replace(to_replace=replacements, value=None)

    return master_df


# Ad-hoc entry point: clean the sample workbook and write the result under data/.
if __name__ == "__main__":
    file = "Data from 4.2.20 Fake Data.xlsx"

    # read all sheets, returns a dict of dataframes
    dfs = pd.read_excel(file, sheet_name=None)

    df = cleanup(dfs)
    write_output_file(df, "data/all_covid_calls_cleaned.xlsx")
105 changes: 105 additions & 0 deletions cleanup_keep_calm_with_covid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import pandas as pd
import numpy as np
from datetime import datetime
from utils import explode_needs, get_lat, get_lng, replacements

# Per-column converters handed to pd.read_excel: every "Concerns/Needs - ..."
# column is passed through str. Several column names end with a space; those
# trailing spaces are part of the key and must be kept.
CONVERTERS = {
    column: str
    for column in (
        "Concerns/Needs - Disaster Services ",
        "Concerns/Needs - Domestic Abuse/IPV",
        "Concerns/Needs - Early Childhood Education ",
        "Concerns/Needs - Education/ Employment ",
        "Concerns/Needs - Environmental Quality & Prtcn ",
        "Concerns/Needs - Health Care ",
        "Concerns/Needs - Interpersonal",
        "Concerns/Needs - Mental Health",
        "Concerns/Needs - Mental Health Concerns",
        "Concerns/Needs - Organizational Development",
        "Concerns/Needs - Other ",
        "Concerns/Needs - Other Community Services",
        "Concerns/Needs - Protective Service/Abuse",
        "Concerns/Needs - Public Asst & Social Insurance",
        "Concerns/Needs - Relationship Concerns / Issues ",
        "Concerns/Needs - Self-Harm",
        "Concerns/Needs - Sexuality",
    )
}


def cleanup(df):
    """Build the cleaned dataset for the Keeping Calm with COVID dashboard.

    Args:
        df: DataFrame of the VIA LINK calls sheet. It must contain the call
            columns listed below plus every column named in ``CONVERTERS``.

    Returns:
        A new DataFrame restricted to "LA Spirit Crisis Line" calls, with the
        individual needs columns merged into
        "Concerns/Needs - Concerns/Needs" and passed through
        ``explode_needs``, plus "Data From", "Latitude" and "Longitude"
        columns. The input frame is not modified, so callers must use the
        return value.

    Raises:
        KeyError: If a required column is missing.
    """
    # step 1
    # select only the required columns
    # The needs columns are exactly the columns CONVERTERS reads as str, so
    # the list is derived from it instead of being repeated here.
    needs_columns = list(CONVERTERS)
    VIA_LINK_REQUIRED_COLUMNS_CALLS = [
        "CallReportNum",
        "ReportVersion",
        "CallDateAndTimeStart",
        "CityName",
        "CountyName",
        "StateProvince",
        "PostalCode",
        "Call Information - Program",
        "Demographics - Age",
        "Demographics - Gender",
    ] + needs_columns
    df = df[VIA_LINK_REQUIRED_COLUMNS_CALLS]

    # step 2
    # remove calls not from LA Spirit line
    # .copy() so the column added in step 3 is written to an independent frame
    # instead of a filtered slice of the caller's data.
    df = df[df["Call Information - Program"] == "LA Spirit Crisis Line"].copy()

    # step 3
    # combine all needs column into 1 column (non-null values joined with "; ")
    all_needs = "Concerns/Needs - Concerns/Needs"
    df[all_needs] = df[needs_columns].apply(
        lambda row: "; ".join(row[row.notnull()]), axis=1
    )
    df = explode_needs(df, all_needs)

    # step 4
    # add "Data From" column
    df["Data From"] = "VIA LINK"

    # step 5
    # cleanup Concerns/Needs Data
    df[all_needs] = df[all_needs].str.strip()
    df = df[~df[all_needs].isin(["Wrong #", "hangup"])]
    df = df.replace(to_replace=replacements, value=None)

    # step 6
    # drop all the original needs columns
    df = df.drop(columns=needs_columns)

    # step 7
    # add the Lat/Lng columns
    df["Latitude"] = df["PostalCode"].apply(get_lat)
    df["Longitude"] = df["PostalCode"].apply(get_lng)

    return df


# Ad-hoc entry point: clean the VIA LINK type 2 sheet of the sample workbook
# and save the result under data/.
if __name__ == "__main__":
    source_path = "Data from 4.2.20 Fake Data.xlsx"
    raw_calls = pd.read_excel(
        source_path,
        sheet_name="Uncleaned data type 2 VIA LINK",
        converters=CONVERTERS,
    )
    cleaned_calls = cleanup(raw_calls)
    cleaned_calls.to_excel(
        "data/keep_calm_with_covid_cleaned.xlsx",
        sheet_name="codefornola cleaned",
    )
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
click==7.1.1
openpyxl==3.0.3
pandas==1.0.3
uszipcode==0.2.4
Expand Down
Loading