Issue22 (#23) and Issue18
* Issue #18 - start of updating the datasource for 2020 timeseries pits and some todos in the file

* new sources

* issue #18 working towards modified 2020 timeseries pits upload script

* path logic

* make sure not to use gap-filled density at this point

* Issue #18 - file for 2021 timeseries pits

* Issue #18 no perimeter depth files for 2021 TS pits

* having issues creating the test database

* Modify create script for sqlalchemy>2.0 (see the sketch after this list)

* Switch to 2020 V1 pits - there are some data format and header issues in the V2 data

* Use db_session function

* Slight tweaks to 2021 timeseries script

* Script to delete pits

* start using insitupy for metadata handling

* working through handling metadata

* 2020 V2 data, allow split header line logic. ALSO - use the non-gap-filled density because the gap-filled density files break the logic as they don't show the profile at all

* get rid of spaces in flags

* Script for 2021 pits is working

* start working on SWE files for pits

* move towards row based SRID and timezone ability

* bulk swe property upload script working

* Remove Python 3.7 compatibility

* fixing reqs in build

* bump insitupy

* Fixing tests and build. SMP profile depths were not inverted

* Seems to be a version issue: the ETag comparison still works locally but not in GitHub. Try just using the content length

* update hash

* Issue #22 - start working on AK pits

* some progress on the alaska data

* We don't need to manage empty files as long as headers are standard

* Script for SWE summary of Alaska pits working

* update db name for 2023 pits script
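
For context on the sqlalchemy>2.0 change noted above, here is a minimal sketch of the 2.0-style pattern, not the repository's actual create script; the connection URL, driver, and database name below are assumptions.

from sqlalchemy import create_engine, text

# Hypothetical connection URL; the real scripts read credentials.json
engine = create_engine(
    "postgresql+psycopg2://user:password@localhost/postgres",
    # CREATE DATABASE cannot run inside a transaction block
    isolation_level="AUTOCOMMIT",
)
# SQLAlchemy 2.0 removed implicit engine.execute(); statements go through
# an explicit connection and the text() construct
with engine.connect() as conn:
    conn.execute(text("CREATE DATABASE snowex"))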
micah-prime authored Aug 13, 2024
1 parent d5236ed commit 5faf656
Showing 23 changed files with 833 additions and 251 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-       python-version: [3.7, 3.8, 3.9]
+       python-version: [3.8, 3.9, "3.10"]

    services:
2 changes: 2 additions & 0 deletions .gitignore
@@ -23,3 +23,5 @@ scripts/upload/test*.txt
.idea/*
scripts/download/data/*
venv/
+
+credentials.json
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -6,3 +6,4 @@ pandoc==1.0.2
sphinxcontrib-apidoc==0.3.0
ipython==7.31.1
MarkupSafe<2.1.0
+jupyterlab==2.2.10
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,9 +1,11 @@
wheel>0.34.0, <0.35.0
-snowexsql>=0.3.0, <0.4.0
+snowexsql>=0.4.1, <0.5.0
snowmicropyn
-matplotlib>=3.2.2, <3.3.0
+matplotlib>=3.2.2
moto==3.1.11
coloredlogs>=14.0
progressbar2>=3.51.3
rasterio>=1.1.5
boto3>=1.23.7,<1.24
+timezonefinder>=6.0,<7.0
+insitupy==0.1.2
1 change: 0 additions & 1 deletion requirements_dev.txt
@@ -8,5 +8,4 @@ coverage==4.5.4
twine==1.14.0
pytest==6.2.3
pytest-runner==5.1
-jupyterlab==2.2.10
moto==3.1.11
2 changes: 2 additions & 0 deletions scripts/download/nsidc_sources.txt
@@ -6,3 +6,5 @@ https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_SD.001/
https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_GM_CSU_GPR.001/2020.02.06/SNEX20_GM_CSU_GPR_1GHz_v01.csv
https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_UNM_GPR.001/2020.01.28/SNEX20_UNM_GPR.csv
https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_SD_TLI.001/2019.09.29/SNEX20_SD_TLI_clean.csv
+https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_TS_SP.002/
+https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX21_TS_SP.001/
68 changes: 68 additions & 0 deletions scripts/remove_data/remove_pits.py
@@ -0,0 +1,68 @@
"""
File to remove all snowpits from the database
"""
import argparse
from snowexsql.api import db_session
from snowexsql.data import LayerData
from snowexsql.db import get_db


def main():
parser = argparse.ArgumentParser(
description='Script to create our databases using the python library')
parser.add_argument('--db', dest='db', default='snowex',
help='Name of the database locally to add tables to')
parser.add_argument('--dry_run', dest='dry_run', action='store_true',
help='Try a dry run or not')
parser.add_argument('--credentials', dest='credentials',
default='./credentials.json',
help='Past to a json containing')
args = parser.parse_args()

credentials = args.credentials
db_name = f'localhost/{args.db}'
dry_run = args.dry_run

# All measurement 'types' associate with pits
types_pit = [
'sample_signal', 'grain_size', 'density', 'reflectance',
'permittivity', 'lwc_vol', 'manual_wetness',
'equivalent_diameter', 'specific_surface_area', 'grain_type',
'temperature', 'hand_hardness'
]
# Start a session
engine, session = get_db(db_name, credentials=credentials)
print(f"Connected to {db_name}")
try:
q = session.query(LayerData).filter(
LayerData.pit_id is not None # Filter to results with pit id
).filter(
LayerData.type.in_(types_pit) # Filter to correct type
)
result = q.count()
# Rough count of pits
estimated_number = int(result / float(len(types_pit)) / 10.0)
print(f"Found {result} records")
print(f"This is roughly {estimated_number} pits")
if dry_run:
print("THIS IS A DRYRUN, not deleting")
else:
if result > 0:
print("Deleting pits from the database")
# Delete
q.delete()
session.commit()
else:
print("No results, nothing to delete")
session.close()
except Exception as e:
print("Errored out, rolling back")
print(e)
session.rollback()
raise e

print("Done")


if __name__ == '__main__':
main()
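
As a usage note, the script above is meant to be run with --dry_run first so the record counts can be checked before anything is deleted. A typical invocation, assuming a local snowex database and a credentials.json in the working directory, would be:

python remove_pits.py --db snowex --credentials ./credentials.json --dry_run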
114 changes: 114 additions & 0 deletions scripts/upload/add_alaska_pits_2023.py
@@ -0,0 +1,114 @@
"""
Script to upload the Snowex Time Series pits
"""

import glob
import re
from os.path import abspath, join
from pathlib import Path

from snowex_db.batch import UploadProfileBatch, UploadSiteDetailsBatch
from snowex_db.upload import PointDataCSV
from snowex_db import db_session


tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
'US/Alaska': ["AK"]
}


def main():
"""
Add 2020 timeseries pits
"""
db_name = 'localhost/snowex'
# Preliminary data
doi = "None"
debug = True
timezone = "US/Alaska"

# Point to the downloaded data from
data_dir = abspath('../download/data/SNEX23_preliminary/Data/pits')
error_msg = []

# Files to ignore
ignore_files = [
"SnowEx23_SnowPits_AKIOP_Summary_Environment_v01.csv",
"SnowEx23_SnowPits_AKIOP_Summary_SWE_v01.csv"
]

# Get all the date folders
unique_folders = Path(
data_dir
).expanduser().absolute().glob("ALASKA*/*20*SNOW_PIT")
for udf in unique_folders:
# get all the csvs in the folder
dt_folder_files = list(udf.glob("*.csv"))
site_ids = []
# Get the unique site ids for this date folder
compiled = re.compile(
r'SnowEx23_SnowPits_AKIOP_([a-zA-Z0-9]*)_\d{8}.*_v01\.csv'
)
for file_path in dt_folder_files:
file_name = file_path.name
if file_name in ignore_files:
print(f"Skipping {file_name}")
continue
match = compiled.match(file_name)
if match:
code = match.group(1)
site_ids.append(code)
else:
raise RuntimeError(f"No site ID found for {file_name}")

# Get the unique site ids
site_ids = list(set(site_ids))

for site_id in site_ids:
# Grab all the csvs in the pits folder
filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv'))

# Grab all the site details files
sites = glob.glob(join(
str(udf), f'*_{site_id}_*siteDetails*.csv'
))

# Use no-gap-filled density
density_files = glob.glob(join(
str(udf), f'*_{site_id}_*_gapFilled_density*.csv'
))

# Remove the site details from the total file list to get only the
profiles = list(
set(filenames) - set(sites) -
set(density_files) # remove non-gap-filled denisty
)

# Submit all profiles associated with pit at a time
b = UploadProfileBatch(
filenames=profiles, debug=debug, doi=doi,
in_timezone=timezone,
db_name=db_name,
allow_split_lines=True, # Logic for split header lines
header_sep=":"
)
b.push()
error_msg += b.errors

# Upload the site details
sd = UploadSiteDetailsBatch(
filenames=sites, debug=debug, doi=doi,
in_timezone=timezone,
db_name=db_name
)
sd.push()
error_msg += sd.errors

for f, m in error_msg:
print(f)
return len(error_msg)


if __name__ == '__main__':
main()
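
As a quick sanity check of the file-naming convention the regex above encodes, it can be exercised on a hypothetical name; the site code and date below are illustrative, not taken from the dataset.

import re

compiled = re.compile(
    r'SnowEx23_SnowPits_AKIOP_([a-zA-Z0-9]*)_\d{8}.*_v01\.csv'
)
# Hypothetical file name following the AKIOP naming pattern
match = compiled.match("SnowEx23_SnowPits_AKIOP_FLCF_20230311_density_v01.csv")
print(match.group(1))  # prints the extracted site ID: FLCF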
77 changes: 77 additions & 0 deletions scripts/upload/add_pits_bulk_properties.py
@@ -0,0 +1,77 @@
"""
Script to upload the Snowex Time Series pits
"""

import glob
import re
from os.path import abspath, join
from pathlib import Path

import pandas as pd

from snowex_db.upload import PointDataCSV
from snowex_db import db_session


def main():
"""
Add bulk SWE, Depth, Density for 2020 and 2021 timeseires pits
"""
db_name = 'localhost/snowex'
debug = True

# Point to the downloaded data from
data_dir = abspath('../download/data/SNOWEX/')
error_msg = []

path_details = [
{
"DOI": "https://doi.org/10.5067/KZ43HVLZV6G4",
"path": "SNEX20_TS_SP.002/2019.10.24/SNEX20_TS_SP_Summary_SWE_v02.csv"
},
{
"DOI": "https://doi.org/10.5067/QIANJYJGRWOV",
"path": "SNEX21_TS_SP.001/2020.11.16/SNEX21_TS_SP_Summary_SWE_v01.csv"
},
# Preliminary data from 2023 Alask pits
{
"DOI": None,
"path": "../SNEX23_preliminary/Data/SnowEx23_SnowPits_AKIOP_Summary_SWE_v01.csv"
}
]
for info in path_details:
doi = info["DOI"]
file_path = join(data_dir, info["path"])
# Read csv and dump new one without the extra header lines
df = pd.read_csv(
file_path,
skiprows=list(range(32)) + [33]
)
new_name = file_path.replace(".csv", "_modified.csv")
# Filter to columns we want (density, swe, etc)
columns = [
'Location', 'Site', 'PitID', 'Date/Local Standard Time', 'UTM Zone',
'Easting (m)', 'Northing (m)', 'Latitude (deg)', 'Longitude (deg)',
'Density Mean (kg/m^3)',
'SWE (mm)', 'HS (cm)', "Snow Void (cm)", 'Flag'
]
df_columns = df.columns.values
filtered_columns = [c for c in columns if c in df_columns]
df = df.loc[:, filtered_columns]
df.to_csv(new_name, index=False)

# Submit SWE file data as point data
with db_session(
db_name, credentials='credentials.json'
) as (session, engine):
pcsv = PointDataCSV(
new_name, doi=doi, debug=debug,
depth_is_metadata=False,
row_based_crs=True,
row_based_timezone=True
)
pcsv.submit(session)


if __name__ == '__main__':
main()
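
The skiprows expression above encodes an assumption about the layout of the NSIDC summary files: a 32-line metadata block precedes the column header, and one more non-data row (presumably units) immediately follows it. A minimal, generic sketch of that idea, with a placeholder file name and block size:

import pandas as pd

n_metadata_lines = 32  # assumed size of the metadata block
df = pd.read_csv(
    "Summary_SWE.csv",  # hypothetical file name
    # Skip the metadata block; row n_metadata_lines becomes the header and
    # the row right after it (n_metadata_lines + 1) is dropped
    skiprows=list(range(n_metadata_lines)) + [n_metadata_lines + 1],
)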
80 changes: 0 additions & 80 deletions scripts/upload/add_time_series_pits.py

This file was deleted.

