From 21f1a439e6d9a002415f34d040468119ee192761 Mon Sep 17 00:00:00 2001 From: Chris Whong Date: Wed, 15 Apr 2020 13:51:46 -0400 Subject: [PATCH 1/2] change back to nyc --- socrata-etl/datasets-from-ids.js | 2 +- socrata-etl/process-dataset.js | 4 ++-- socrata-etl/util/build-component.js | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/socrata-etl/datasets-from-ids.js b/socrata-etl/datasets-from-ids.js index 10fb758..80ca5fc 100644 --- a/socrata-etl/datasets-from-ids.js +++ b/socrata-etl/datasets-from-ids.js @@ -3,7 +3,7 @@ const processDataset = require('./process-dataset') // helper function to fetch a dataset's row count const getRowCount = async (id) => { - const res = await fetch(`https://data.cityofchicago.org/api/id/${id}.json?$select=count(*)%20as%20count`) + const res = await fetch(`https://data.cityofnewyork.us/api/id/${id}.json?$select=count(*)%20as%20count`) .then(d => d.json()) const [{ count }] = res diff --git a/socrata-etl/process-dataset.js b/socrata-etl/process-dataset.js index ae53f17..fcff3b9 100644 --- a/socrata-etl/process-dataset.js +++ b/socrata-etl/process-dataset.js @@ -6,7 +6,7 @@ const buildComponent = require('./util/build-component') // helper function to fetch socrata metadata const fetchMetaData = (datasetId) => { - const metadataUrl = `https://data.cityofchicago.org/api/views/${datasetId}.json` + const metadataUrl = `https://data.cityofnewyork.us/api/views/${datasetId}.json` // get the metadata json console.log('getting metadata...', metadataUrl) return fetch(metadataUrl).then(d => d.json()) @@ -25,7 +25,7 @@ const processDataset = async (id) => { const readme = buildComponent.readme(metadata) - const bodyBuffer = await fetch(`https://data.cityofchicago.org/api/views/${id}/rows.csv?accessType=DOWNLOAD`).then(d => d.arrayBuffer()) + const bodyBuffer = await fetch(`https://data.cityofnewyork.us/api/views/${id}/rows.csv?accessType=DOWNLOAD`).then(d => d.arrayBuffer()) await writer(`tmp/${id}/readme.md`, readme) 
diff --git a/socrata-etl/util/build-component.js b/socrata-etl/util/build-component.js index 332e35f..b899e38 100644 --- a/socrata-etl/util/build-component.js +++ b/socrata-etl/util/build-component.js @@ -27,13 +27,13 @@ const meta = (sourceMetadata, datasetId) => { email: undefined } ], - accessUrl: `https://data.cityofchicago.org/api/views/${datasetId}/rows.csv?accessType=DOWNLOAD + accessUrl: `https://data.cityofnewyork.us/api/views/${datasetId}/rows.csv?accessType=DOWNLOAD `, createdAt, downloadCount, rowsUpdatedAt, - // accrualPeriodicity: metadata.custom_fields.Update['Update Frequency'].trim(), - // agency: metadata.custom_fields['Dataset Information']['Agency'] + accrualPeriodicity: metadata.custom_fields.Update['Update Frequency'].trim(), + agency: metadata.custom_fields['Dataset Information']['Agency'] } } @@ -44,7 +44,7 @@ ${description} ## Import Details -This qri dataset was programmatically created from a dataset published on the Chicago Open Data Portal. [Original Dataset on data.cityofchicago.org/](https://data.cityofchicago.org/d/d/${id}) +This qri dataset was programmatically created from a dataset published on the NYC Open Data Portal. 
[Original Dataset on data.cityofnewyork.us/](https://data.cityofnewyork.us/d/d/${id}) The latest update ran on ${Date(Date.now()).toString()}` } From afb12e539564133738484b0164e020744d976265 Mon Sep 17 00:00:00 2001 From: Chris Whong Date: Fri, 17 Apr 2020 16:11:16 -0400 Subject: [PATCH 2/2] add scripts for nyc-open-data-catalog --- nyc-open-data-catalog/.gitignore | 3 + nyc-open-data-catalog/README.md | 63 +++++++++ nyc-open-data-catalog/create-qri-dataset.js | 13 ++ nyc-open-data-catalog/package.json | 11 ++ nyc-open-data-catalog/process-datasets.js | 148 ++++++++++++++++++++ nyc-open-data-catalog/yarn-error.log | 51 +++++++ nyc-open-data-catalog/yarn.lock | 18 +++ 7 files changed, 307 insertions(+) create mode 100644 nyc-open-data-catalog/.gitignore create mode 100644 nyc-open-data-catalog/README.md create mode 100644 nyc-open-data-catalog/create-qri-dataset.js create mode 100644 nyc-open-data-catalog/package.json create mode 100644 nyc-open-data-catalog/process-datasets.js create mode 100644 nyc-open-data-catalog/yarn-error.log create mode 100644 nyc-open-data-catalog/yarn.lock diff --git a/nyc-open-data-catalog/.gitignore b/nyc-open-data-catalog/.gitignore new file mode 100644 index 0000000..9bd7b7c --- /dev/null +++ b/nyc-open-data-catalog/.gitignore @@ -0,0 +1,3 @@ +/tmp +/node_modules +.DS_Store diff --git a/nyc-open-data-catalog/README.md b/nyc-open-data-catalog/README.md new file mode 100644 index 0000000..0cf33f9 --- /dev/null +++ b/nyc-open-data-catalog/README.md @@ -0,0 +1,63 @@ +# NYC Open Data Catalog + +Scripts to generate a table of metadata for all datasets on the NYC Open Data Portal. + +## Background + +The simple question to answer is "What are the most viewed/downloaded datasets published on the NYC Open Data Portal?" It's possible to sort the catalog website by most viewed, but the download count is elusive. It's available on each dataset's landing page, but there's no quick way to see them all at once. 
This script gathers the download count (and other platform metadata) for each dataset and compiles them into a qri dataset. + +## Approach + +Use `data.json` as the list of all datasets. `data.json` is a catalog feed, and contains an abbreviated set of metadata for all of the datasets. We are ignoring just about all of it, and are only interested in the dataset ids which we can use to get each dataset's detailed metadata. + +`curl https://data.cityofnewyork.us/data.json > ./tmp/nyc.json` + +Now that we know all of the dataset ids, we can call the metadata API for each one: + +`https://data.cityofnewyork.us/api/views/:id.json` + +The following fields will be added to our new dataset: + +``` +id +name +attribution +averageRating +category +createdAt +description +displayType +downloadCount +hideFromCatalog +hideFromDataJson +indexUpdatedAt +newBackend +numberOfComments +oid +provenance +publicationAmmendEnabled +publicationDate +publicationGroup +publicationStage +rowClass +rowsUpdatedAt +rowsUpdatedBy +tableId +totalTimesRated +viewCount +viewLastModified +viewType +automation +dateMadePublic +updateFrequency +agency +tags +``` + +## Scripts + +`process-datasets.js` iterates over the datasets in `data.json`, calls the metadata API, processes the response, and writes a new line to the CSV at `tmp/output.csv` + +It takes only a few minutes to fetch metadata for the 2,712 datasets listed in `data.json` + +`create-qri-dataset.js` creates a new qri dataset in the local qri store, and publishes it. 
diff --git a/nyc-open-data-catalog/create-qri-dataset.js b/nyc-open-data-catalog/create-qri-dataset.js new file mode 100644 index 0000000..455b209 --- /dev/null +++ b/nyc-open-data-catalog/create-qri-dataset.js @@ -0,0 +1,13 @@ +// creates and publishes a qri dataset from the csv in tmp/output.csv +const qri = require(`${__dirname}/../../../qri/node-qri`) + +qri.save('me/catalog-metadata', { + body: `${__dirname}/tmp/output.csv`, + file: [ + `${__dirname}/tmp/meta.json`, + `${__dirname}/tmp/readme.md`, + `${__dirname}/tmp/structure.json` + ] +}) + +qri.publish('me/catalog-metadata') diff --git a/nyc-open-data-catalog/package.json b/nyc-open-data-catalog/package.json new file mode 100644 index 0000000..7e9fd20 --- /dev/null +++ b/nyc-open-data-catalog/package.json @@ -0,0 +1,11 @@ +{ + "name": "nyc-open-data-catalog", + "version": "1.0.0", + "main": "index.js", + "license": "MIT", + "dependencies": { + "csv-string": "^3.2.0", + "moment": "^2.24.0", + "node-fetch": "^2.6.0" + } +} diff --git a/nyc-open-data-catalog/process-datasets.js b/nyc-open-data-catalog/process-datasets.js new file mode 100644 index 0000000..833ea57 --- /dev/null +++ b/nyc-open-data-catalog/process-datasets.js @@ -0,0 +1,148 @@ +const fs = require('fs') +const fetch = require('node-fetch') +const moment = require('moment') + +const toISO8601 = (unix) => { + return unix == null ? '' : moment.unix(unix).format() +} + +// from https://stackoverflow.com/questions/46637955/write-a-string-containing-commas-and-double-quotes-to-csv +const sanitizeString = (desc) => { + var itemDesc; + if (desc) { + itemDesc = desc.replace(/(\r\n|\n|\r|\s+|\t| )/gm,' '); + itemDesc = itemDesc.replace(/,/g, '\,'); + itemDesc = itemDesc.replace(/"/g, '""'); + itemDesc = itemDesc.replace(/'/g, '\''); + itemDesc = itemDesc.replace(/ +(?= )/g,''); + } else { + itemDesc = ''; + } + return `"${itemDesc}"`; +} + +const fetchMetaData = async (datasetId) => { + const metadataUrl = `https://data.cityofnewyork.us/api/views/${datasetId}.json` + // get 
the metadata json + console.log('getting metadata...', metadataUrl) + const raw = await fetch(metadataUrl).then(d => d.json()) + const { + id, + name, + attribution, + averageRating, + category, + createdAt, + description, + displayType, + downloadCount, + hideFromCatalog, + hideFromDataJson, + indexUpdatedAt, + newBackend, + numberOfComments, + oid, + provenance, + publicationAppendEnabled: publicationAmmendEnabled, + publicationDate, + publicationGroup, + publicationStage, + rowClass, + rowsUpdatedAt, + rowsUpdatedBy, + tableId, + totalTimesRated, + viewCount, + viewLastModified, + viewType, + metadata, + tags + } = raw + + // clean up the metadata + + const { custom_fields = {} } = metadata || {} + const { Update = {}, 'Dataset Information': datasetInformation } = custom_fields + const { + Automation: automation, + 'Date Made Public': dateMadePublic, + 'Update Frequency': updateFrequency + } = Update + + let agency = '' + if (datasetInformation && datasetInformation.Agency) { + agency = datasetInformation.Agency + } + + const tagsAsString = tags ? 
tags.join(';') : '' + + return { + id, + name: sanitizeString(name), + attribution: sanitizeString(attribution), + averageRating, + category, + createdAt: toISO8601(createdAt), + description: sanitizeString(description), + displayType, + downloadCount, + hideFromCatalog, + hideFromDataJson, + indexUpdatedAt: toISO8601(indexUpdatedAt), + newBackend, + numberOfComments, + oid, + provenance, + publicationAmmendEnabled, + publicationDate: toISO8601(publicationDate), + publicationGroup, + publicationStage, + rowClass, + rowsUpdatedAt: toISO8601(rowsUpdatedAt), + rowsUpdatedBy, + tableId, + totalTimesRated, + viewCount, + viewLastModified: toISO8601(viewLastModified), + viewType, + automation, + dateMadePublic: sanitizeString(dateMadePublic), + updateFrequency, + agency: sanitizeString(agency), + tags: sanitizeString(tagsAsString) + } +} + +(async () => { + const { dataset: catalog } = require('./tmp/nyc.json') + + // const subset = catalog.slice(0, 50) + const subset = catalog + + const ids = subset.map(d => d.landingPage.split('/')[4]) + + console.log(ids) + + // const ids = ['v2kq-qrx6'] + let headerWritten = false + const output = fs.createWriteStream('./tmp/output.csv') + console.log(ids.length) + for (let i = 0; i < ids.length; i++) { + const id = ids[i] + try { + const metadataRow = await fetchMetaData(id) + console.log(metadataRow.description) + + // add header on first result + if (!headerWritten) { output.write(Object.keys(metadataRow).join(',')); headerWritten = true } + + const row = Object.keys(metadataRow).map(d => metadataRow[d]).join(',') + output.write(`\n${row}`) + + + } catch(e) { + console.log('SCRIPT ERROR', e) + } + + } +})(); diff --git a/nyc-open-data-catalog/yarn-error.log b/nyc-open-data-catalog/yarn-error.log new file mode 100644 index 0000000..4f6ab50 --- /dev/null +++ b/nyc-open-data-catalog/yarn-error.log @@ -0,0 +1,51 @@ +Arguments: + /Users/chriswhong/.nvm/versions/node/v12.1.0/bin/node /Users/chriswhong/.yarn/bin/yarn.js init + +PATH: + 
/Users/chriswhong/opt/anaconda3/bin:/Users/chriswhong/opt/anaconda3/condabin:/Users/chriswhong/.yarn/bin:/Users/chriswhong/.config/yarn/global/node_modules/.bin:/Users/chriswhong/google-cloud-sdk/bin:/Users/chriswhong/.nvm/versions/node/v12.1.0/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/go/bin:/Applications/qri + +Yarn version: + 1.21.1 + +Node version: + 12.1.0 + +Platform: + darwin x64 + +Trace: + Error: canceled + at Interface. (/Users/chriswhong/.yarn/lib/cli.js:136925:13) + at Interface.emit (events.js:196:13) + at Interface._ttyWrite (readline.js:877:16) + at ReadStream.onkeypress (readline.js:189:10) + at ReadStream.emit (events.js:196:13) + at emitKeys (internal/readline.js:424:14) + at emitKeys.next () + at ReadStream.onData (readline.js:1145:36) + at ReadStream.emit (events.js:196:13) + at addChunk (_stream_readable.js:290:12) + +npm manifest: + { + "name": "nyc-open-data-catalog", + "version": "1.0.0", + "main": "index.js", + "license": "MIT", + "dependencies": { + "moment": "^2.24.0" + } + } + +yarn manifest: + No manifest + +Lockfile: + # THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. + # yarn lockfile v1 + + + moment@^2.24.0: + version "2.24.0" + resolved "https://registry.yarnpkg.com/moment/-/moment-2.24.0.tgz#0d055d53f5052aa653c9f6eb68bb5d12bf5c2b5b" + integrity sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg== diff --git a/nyc-open-data-catalog/yarn.lock b/nyc-open-data-catalog/yarn.lock new file mode 100644 index 0000000..8adc2ab --- /dev/null +++ b/nyc-open-data-catalog/yarn.lock @@ -0,0 +1,18 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. 
+# yarn lockfile v1 + + +csv-string@^3.2.0: + version "3.2.0" + resolved "https://registry.yarnpkg.com/csv-string/-/csv-string-3.2.0.tgz#d034b62dfcd10b95ff7e584401d15355805673bd" + integrity sha512-JN3iAuFJ+r7+CwF6UtP3U8ryorRkQp8NT+9VufeiRV+Xyv+Q8HPPBHGm4LAq7YihTQYmUnIeYy5CPQ8Y2GhMkg== + +moment@^2.24.0: + version "2.24.0" + resolved "https://registry.yarnpkg.com/moment/-/moment-2.24.0.tgz#0d055d53f5052aa653c9f6eb68bb5d12bf5c2b5b" + integrity sha512-bV7f+6l2QigeBBZSM/6yTNq4P2fNpSWj/0e7jQcy87A8e7o2nAfP/34/2ky5Vw4B9S446EtIhodAzkFCcR4dQg== + +node-fetch@^2.6.0: + version "2.6.0" + resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.0.tgz#e633456386d4aa55863f676a7ab0daa8fdecb0fd" + integrity sha512-8dG4H5ujfvFiqDmVu9fQ5bOHUC15JMjMY/Zumv26oOvvVJjM67KF8koCWIabKQ1GJIa9r2mMZscBq/TbdOcmNA==