Issue22 (#23) and Issue18
* Issue #18 - start of updating the datasource for 2020 timeseries pits and some todos in the file

* new sources

* issue #18 working towards modified 2020 timeseries pits upload script

* path logic

* make sure not to use gap-filled density at this point

* Issue #18 - file for 2021 timeseries pits

* Issue #18 no perimeter depth files for 2021 TS pits

* having issues creating the test database

* Modify create script for sqlalchemy>2.0 (see the sketch after this list)

* Switch to 2020 V1 pits - there are some data format and header issues in the V2 data

* Use db_session function

* Slight tweaks to 2021 timeseries script

* Script to delete pits

* start using insitupy for metadata handling

* working through handling metadata

* 2020 V2 data, allow split header line logic. ALSO - use the non-gap-filled density because the gap-filled density files break the logic as they don't show the profile at all

* get rid of spaces in flags

* Script for 2021 pits is working

* start working on SWE files for pits

* move towards row based SRID and timezone ability

* bulk swe property upload script working

* Remove Python 3.7 compatibility

* fixing reqs in build

* bump insitupy

* Fixing tests and build. SMP profile depths were not inverted

* Seems to be a version issue: the ETag comparison still works locally but not in GitHub. Try just using the content length

* update hash

* Issue #22 - start working on AK pits

* some progress on the alaska data

* We don't need to manage empty files as long as headers are standard

* Script for SWE summary of Alaska pits working

* update db name for 2023 pits script
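
For context on the sqlalchemy>2.0 change noted above, here is a minimal sketch of the 2.0-style pattern, not the repository's actual create script; the connection URL, driver, and database name below are assumptions.

from sqlalchemy import create_engine, text

# Hypothetical connection URL; the real scripts read credentials.json
engine = create_engine(
    "postgresql+psycopg2://user:password@localhost/postgres",
    # CREATE DATABASE cannot run inside a transaction block
    isolation_level="AUTOCOMMIT",
)
# SQLAlchemy 2.0 removed implicit engine.execute(); statements go through
# an explicit connection and the text() construct
with engine.connect() as conn:
    conn.execute(text("CREATE DATABASE snowex"))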
micah-prime authored Aug 13, 2024
1 parent d5236ed commit 5faf656
Showing 23 changed files with 833 additions and 251 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-       python-version: [3.7, 3.8, 3.9]
+       python-version: [3.8, 3.9, "3.10"]

    services:
2 changes: 2 additions & 0 deletions .gitignore
@@ -23,3 +23,5 @@ scripts/upload/test*.txt
.idea/*
scripts/download/data/*
venv/
+
+credentials.json
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -6,3 +6,4 @@ pandoc==1.0.2
sphinxcontrib-apidoc==0.3.0
ipython==7.31.1
MarkupSafe<2.1.0
+jupyterlab==2.2.10
6 changes: 4 additions & 2 deletions requirements.txt
@@ -1,9 +1,11 @@
wheel>0.34.0, <0.35.0
-snowexsql>=0.3.0, <0.4.0
+snowexsql>=0.4.1, <0.5.0
snowmicropyn
-matplotlib>=3.2.2, <3.3.0
+matplotlib>=3.2.2
moto==3.1.11
coloredlogs>=14.0
progressbar2>=3.51.3
rasterio>=1.1.5
boto3>=1.23.7,<1.24
+timezonefinder>=6.0,<7.0
+insitupy==0.1.2
1 change: 0 additions & 1 deletion requirements_dev.txt
@@ -8,5 +8,4 @@ coverage==4.5.4
twine==1.14.0
pytest==6.2.3
pytest-runner==5.1
-jupyterlab==2.2.10
moto==3.1.11
2 changes: 2 additions & 0 deletions scripts/download/nsidc_sources.txt
@@ -6,3 +6,5 @@ https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_SD.001/
https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_GM_CSU_GPR.001/2020.02.06/SNEX20_GM_CSU_GPR_1GHz_v01.csv
https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_UNM_GPR.001/2020.01.28/SNEX20_UNM_GPR.csv
https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_SD_TLI.001/2019.09.29/SNEX20_SD_TLI_clean.csv
+https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX20_TS_SP.002/
+https://n5eil01u.ecs.nsidc.org/SNOWEX/SNEX21_TS_SP.001/
68 changes: 68 additions & 0 deletions scripts/remove_data/remove_pits.py
@@ -0,0 +1,68 @@
"""
File to remove all snowpits from the database
"""
import argparse
from snowexsql.api import db_session
from snowexsql.data import LayerData
from snowexsql.db import get_db


def main():
parser = argparse.ArgumentParser(
description='Script to create our databases using the python library')
parser.add_argument('--db', dest='db', default='snowex',
help='Name of the database locally to add tables to')
parser.add_argument('--dry_run', dest='dry_run', action='store_true',
help='Try a dry run or not')
parser.add_argument('--credentials', dest='credentials',
default='./credentials.json',
help='Past to a json containing')
args = parser.parse_args()

credentials = args.credentials
db_name = f'localhost/{args.db}'
dry_run = args.dry_run

# All measurement 'types' associate with pits
types_pit = [
'sample_signal', 'grain_size', 'density', 'reflectance',
'permittivity', 'lwc_vol', 'manual_wetness',
'equivalent_diameter', 'specific_surface_area', 'grain_type',
'temperature', 'hand_hardness'
]
# Start a session
engine, session = get_db(db_name, credentials=credentials)
print(f"Connected to {db_name}")
try:
q = session.query(LayerData).filter(
LayerData.pit_id is not None # Filter to results with pit id
).filter(
LayerData.type.in_(types_pit) # Filter to correct type
)
result = q.count()
# Rough count of pits
estimated_number = int(result / float(len(types_pit)) / 10.0)
print(f"Found {result} records")
print(f"This is roughly {estimated_number} pits")
if dry_run:
print("THIS IS A DRYRUN, not deleting")
else:
if result > 0:
print("Deleting pits from the database")
# Delete
q.delete()
session.commit()
else:
print("No results, nothing to delete")
session.close()
except Exception as e:
print("Errored out, rolling back")
print(e)
session.rollback()
raise e

print("Done")


if __name__ == '__main__':
main()
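
As a usage note, the script above is meant to be run with --dry_run first so the record counts can be checked before anything is deleted. A typical invocation, assuming a local snowex database and a credentials.json in the working directory, would be:

python remove_pits.py --db snowex --credentials ./credentials.json --dry_run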
114 changes: 114 additions & 0 deletions scripts/upload/add_alaska_pits_2023.py
@@ -0,0 +1,114 @@
"""
Script to upload the Snowex Time Series pits
"""

import glob
import re
from os.path import abspath, join
from pathlib import Path

from snowex_db.batch import UploadProfileBatch, UploadSiteDetailsBatch
from snowex_db.upload import PointDataCSV
from snowex_db import db_session


tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
'US/Alaska': ["AK"]
}


def main():
"""
Add 2020 timeseries pits
"""
db_name = 'localhost/snowex'
# Preliminary data
doi = "None"
debug = True
timezone = "US/Alaska"

# Point to the downloaded data from
data_dir = abspath('../download/data/SNEX23_preliminary/Data/pits')
error_msg = []

# Files to ignore
ignore_files = [
"SnowEx23_SnowPits_AKIOP_Summary_Environment_v01.csv",
"SnowEx23_SnowPits_AKIOP_Summary_SWE_v01.csv"
]

# Get all the date folders
unique_folders = Path(
data_dir
).expanduser().absolute().glob("ALASKA*/*20*SNOW_PIT")
for udf in unique_folders:
# get all the csvs in the folder
dt_folder_files = list(udf.glob("*.csv"))
site_ids = []
# Get the unique site ids for this date folder
compiled = re.compile(
r'SnowEx23_SnowPits_AKIOP_([a-zA-Z0-9]*)_\d{8}.*_v01\.csv'
)
for file_path in dt_folder_files:
file_name = file_path.name
if file_name in ignore_files:
print(f"Skipping {file_name}")
continue
match = compiled.match(file_name)
if match:
code = match.group(1)
site_ids.append(code)
else:
raise RuntimeError(f"No site ID found for {file_name}")

# Get the unique site ids
site_ids = list(set(site_ids))

for site_id in site_ids:
# Grab all the csvs in the pits folder
filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv'))

# Grab all the site details files
sites = glob.glob(join(
str(udf), f'*_{site_id}_*siteDetails*.csv'
))

# Use no-gap-filled density
density_files = glob.glob(join(
str(udf), f'*_{site_id}_*_gapFilled_density*.csv'
))

# Remove the site details from the total file list to get only the
profiles = list(
set(filenames) - set(sites) -
set(density_files) # remove non-gap-filled denisty
)

# Submit all profiles associated with pit at a time
b = UploadProfileBatch(
filenames=profiles, debug=debug, doi=doi,
in_timezone=timezone,
db_name=db_name,
allow_split_lines=True, # Logic for split header lines
header_sep=":"
)
b.push()
error_msg += b.errors

# Upload the site details
sd = UploadSiteDetailsBatch(
filenames=sites, debug=debug, doi=doi,
in_timezone=timezone,
db_name=db_name
)
sd.push()
error_msg += sd.errors

for f, m in error_msg:
print(f)
return len(error_msg)


if __name__ == '__main__':
main()
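
As a quick sanity check of the file-naming convention the regex above encodes, it can be exercised on a hypothetical name; the site code and date below are illustrative, not taken from the dataset.

import re

compiled = re.compile(
    r'SnowEx23_SnowPits_AKIOP_([a-zA-Z0-9]*)_\d{8}.*_v01\.csv'
)
# Hypothetical file name following the AKIOP naming pattern
match = compiled.match("SnowEx23_SnowPits_AKIOP_FLCF_20230311_density_v01.csv")
print(match.group(1))  # prints the extracted site ID: FLCF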
77 changes: 77 additions & 0 deletions scripts/upload/add_pits_bulk_properties.py
@@ -0,0 +1,77 @@
"""
Script to upload the Snowex Time Series pits
"""

import glob
import re
from os.path import abspath, join
from pathlib import Path

import pandas as pd

from snowex_db.upload import PointDataCSV
from snowex_db import db_session


def main():
"""
Add bulk SWE, Depth, Density for 2020 and 2021 timeseires pits
"""
db_name = 'localhost/snowex'
debug = True

# Point to the downloaded data from
data_dir = abspath('../download/data/SNOWEX/')
error_msg = []

path_details = [
{
"DOI": "https://doi.org/10.5067/KZ43HVLZV6G4",
"path": "SNEX20_TS_SP.002/2019.10.24/SNEX20_TS_SP_Summary_SWE_v02.csv"
},
{
"DOI": "https://doi.org/10.5067/QIANJYJGRWOV",
"path": "SNEX21_TS_SP.001/2020.11.16/SNEX21_TS_SP_Summary_SWE_v01.csv"
},
# Preliminary data from 2023 Alask pits
{
"DOI": None,
"path": "../SNEX23_preliminary/Data/SnowEx23_SnowPits_AKIOP_Summary_SWE_v01.csv"
}
]
for info in path_details:
doi = info["DOI"]
file_path = join(data_dir, info["path"])
# Read csv and dump new one without the extra header lines
df = pd.read_csv(
file_path,
skiprows=list(range(32)) + [33]
)
new_name = file_path.replace(".csv", "_modified.csv")
# Filter to columns we want (density, swe, etc)
columns = [
'Location', 'Site', 'PitID', 'Date/Local Standard Time', 'UTM Zone',
'Easting (m)', 'Northing (m)', 'Latitude (deg)', 'Longitude (deg)',
'Density Mean (kg/m^3)',
'SWE (mm)', 'HS (cm)', "Snow Void (cm)", 'Flag'
]
df_columns = df.columns.values
filtered_columns = [c for c in columns if c in df_columns]
df = df.loc[:, filtered_columns]
df.to_csv(new_name, index=False)

# Submit SWE file data as point data
with db_session(
db_name, credentials='credentials.json'
) as (session, engine):
pcsv = PointDataCSV(
new_name, doi=doi, debug=debug,
depth_is_metadata=False,
row_based_crs=True,
row_based_timezone=True
)
pcsv.submit(session)


if __name__ == '__main__':
main()
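
The skiprows expression above encodes an assumption about the layout of the NSIDC summary files: a 32-line metadata block precedes the column header, and one more non-data row (presumably units) immediately follows it. A minimal, generic sketch of that idea, with a placeholder file name and block size:

import pandas as pd

n_metadata_lines = 32  # assumed size of the metadata block
df = pd.read_csv(
    "Summary_SWE.csv",  # hypothetical file name
    # Skip the metadata block; row n_metadata_lines becomes the header and
    # the row right after it (n_metadata_lines + 1) is dropped
    skiprows=list(range(n_metadata_lines)) + [n_metadata_lines + 1],
)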
80 changes: 0 additions & 80 deletions scripts/upload/add_time_series_pits.py

This file was deleted.

