From 3f0bad15ac3491e0ee15e2c1a61d1f0f79140fa3 Mon Sep 17 00:00:00 2001 From: Alex Massen-Hane <104514872+alexmassen-hane@users.noreply.github.com> Date: Thu, 7 Dec 2023 07:09:59 +0800 Subject: [PATCH] Update: Fix OpenAlex schema and add missing fields (#204) --- .../database/schema/openalex/works.json | 202 +++++++++++++++++- .../openalex/2023-04-02/expected/works.json | 4 +- .../openalex/2023-04-16/expected/works.json | 4 +- 3 files changed, 201 insertions(+), 9 deletions(-) diff --git a/academic_observatory_workflows/database/schema/openalex/works.json b/academic_observatory_workflows/database/schema/openalex/works.json index fc71810a2..f22b7d93d 100644 --- a/academic_observatory_workflows/database/schema/openalex/works.json +++ b/academic_observatory_workflows/database/schema/openalex/works.json @@ -40,6 +40,16 @@ "mode": "NULLABLE", "description": "APC converted to USD" }, + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "value_usd", + "type": "INTEGER" + }, { "name": "provenance", "type": "STRING", @@ -69,6 +79,16 @@ "mode": "NULLABLE", "description": "APC converted to USD" }, + { + "mode": "NULLABLE", + "name": "value", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "value_usd", + "type": "INTEGER" + }, { "name": "provenance", "type": "STRING", @@ -77,6 +97,16 @@ ], "description": "Object: Information about the paid APC (article processing charge) for this work. You can find the listed APC price (when we know it) for a given work using apc_list. However, authors don\u2019t always pay the listed price; often they get a discounted price from publishers. So it\u2019s useful to know the APC actually paid by authors, as distinct from the list price. This is our effort to provide this. Our best source for the actually paid price is the OpenAPC project. Where available, we use that data, and so apc_paid.provenance is openapc. Where OpenAPC data is unavailable (and unfortunately this is common) we make our best guess by assuming the author paid the APC list price, and apc_paid.provenance will be set to wherever we got the list price from." }, + { + "mode": "NULLABLE", + "name": "authors_count", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "authorships_truncated", + "type": "BOOLEAN" + }, { "name": "authorships", "type": "RECORD", @@ -114,6 +144,11 @@ "mode": "NULLABLE", "description": "A summarized description of this author's position in the work's author list. Possible values are first, middle, and last. It's not strictly necessary, because author order is already implicitly recorded by the list order of Authorship objects; however it's useful in some contexts to have this as a categorical value." }, + { + "mode": "REPEATED", + "name": "countries", + "type": "STRING" + }, { "name": "institutions", "type": "RECORD", @@ -188,12 +223,27 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "mode": "NULLABLE", + "name": "doi", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "is_accepted", + "type": "BOOLEAN" + }, { "name": "is_oa", "type": "BOOLEAN", "mode": "NULLABLE", "description": "True if this work is Open Access (OA)." }, + { + "mode": "NULLABLE", + "name": "is_published", + "type": "BOOLEAN" + }, { "name": "landing_page_url", "type": "STRING", @@ -223,6 +273,16 @@ "mode": "NULLABLE", "description": "The name of the source." }, + { + "mode": "REPEATED", + "name": "host_institution_lineage", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "host_institution_lineage_names", + "type": "STRING" + }, { "name": "host_organization", "type": "STRING", @@ -259,6 +319,11 @@ "mode": "NULLABLE", "description": "Whether this is a journal listed in the Directory of Open Access Journals (DOAJ)." }, + { + "name": "is_oa", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, { "name": "issn", "type": "STRING", @@ -275,13 +340,23 @@ "name": "publisher", "type": "STRING", "mode": "NULLABLE", - "description": "" + "description": "The publisher name." }, { "name": "publisher_id", "type": "STRING", "mode": "NULLABLE", - "description": "" + "description": "The OpenAlex ID of the publisher." + }, + { + "mode": "REPEATED", + "name": "publisher_lineage", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "publisher_lineage_names", + "type": "STRING" }, { "name": "type", @@ -378,6 +453,11 @@ ], "description": "List of dehydrated Concept objects. \nEach Concept object in the list also has one additional property" }, + { + "mode": "NULLABLE", + "name": "concepts_count", + "type": "INTEGER" + }, { "name": "corresponding_author_ids", "type": "STRING", @@ -440,6 +520,16 @@ "mode": "NULLABLE", "description": "The DOI for the work. This is the Canonical External ID for works.\nOccasionally, a work has more than one DOI--for example, there might be one DOI for a preprint version hosted on bioRxiv, and another DOI for the published version. However, this field always has just one DOI, the DOI for the published work." }, + { + "mode": "NULLABLE", + "name": "doi_registration_agency", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "fulltext_origin", + "type": "STRING" + }, { "name": "grants", "type": "RECORD", @@ -463,6 +553,11 @@ ], "description": "List of grant objects, which include the Funder and the award ID, if available. Our grants data comes from Crossref, and is currently fairly limited." }, + { + "mode": "NULLABLE", + "name": "has_fulltext", + "type": "BOOLEAN" + }, { "name": "id", "type": "STRING", @@ -474,6 +569,11 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "mode": "NULLABLE", + "name": "arxiv_id", + "type": "STRING" + }, { "name": "doi", "type": "STRING", @@ -542,6 +642,11 @@ "type": "RECORD", "mode": "REPEATED", "fields": [ + { + "mode": "NULLABLE", + "name": "doi", + "type": "STRING" + }, { "name": "is_accepted", "type": "BOOLEAN", @@ -589,6 +694,16 @@ "mode": "NULLABLE", "description": "The name of the source." }, + { + "mode": "REPEATED", + "name": "host_institution_lineage", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "host_institution_lineage_names", + "type": "STRING" + }, { "name": "host_organization", "type": "STRING", @@ -625,6 +740,11 @@ "mode": "NULLABLE", "description": "Whether this is a journal listed in the Directory of Open Access Journals (DOAJ)." }, + { + "name": "is_oa", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, { "name": "issn", "type": "STRING", @@ -647,7 +767,17 @@ "name": "publisher_id", "type": "STRING", "mode": "NULLABLE", - "description": "The OpenAlex publisher ID." + "description": "The OpenAlex ID of the publisher." + }, + { + "mode": "REPEATED", + "name": "publisher_lineage", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "publisher_lineage_names", + "type": "STRING" }, { "name": "type", @@ -741,12 +871,27 @@ "type": "RECORD", "mode": "NULLABLE", "fields": [ + { + "mode": "NULLABLE", + "name": "doi", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "is_accepted", + "type": "BOOLEAN" + }, { "name": "is_oa", "type": "BOOLEAN", "mode": "NULLABLE", "description": "True if this work is Open Access (OA)." }, + { + "mode": "NULLABLE", + "name": "is_published", + "type": "BOOLEAN" + }, { "name": "landing_page_url", "type": "STRING", @@ -776,6 +921,16 @@ "mode": "NULLABLE", "description": "The name of the source." }, + { + "mode": "REPEATED", + "name": "host_institution_lineage", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "host_institution_lineage_names", + "type": "STRING" + }, { "name": "host_organization", "type": "STRING", @@ -812,6 +967,11 @@ "mode": "NULLABLE", "description": "Whether this is a journal listed in the Directory of Open Access Journals (DOAJ)." }, + { + "name": "is_oa", + "type": "BOOLEAN", + "mode": "NULLABLE" + }, { "name": "issn", "type": "STRING", @@ -836,6 +996,16 @@ "mode": "NULLABLE", "description": "The OpenAlex ID of the publisher." }, + { + "mode": "REPEATED", + "name": "publisher_lineage", + "type": "STRING" + }, + { + "mode": "REPEATED", + "name": "publisher_lineage_names", + "type": "STRING" + }, { "name": "type", "type": "STRING", @@ -871,6 +1041,11 @@ "mode": "REPEATED", "description": "OpenAlex IDs for works that this work cites. These are citations that go from this work out to another work: This work \u279e Other works." }, + { + "mode": "NULLABLE", + "name": "referenced_works_count", + "type": "INTEGER" + }, { "name": "related_works", "type": "STRING", @@ -974,6 +1149,11 @@ "mode": "NULLABLE", "description": "Legacy type information, using Crossref's \"type\" controlled vocabulary." }, + { + "mode": "NULLABLE", + "name": "updated", + "type": "TIMESTAMP" + }, { "name": "updated_date", "type": "TIMESTAMP", @@ -994,8 +1174,20 @@ }, { "name": "keywords", - "type": "STRING", - "mode": "REPEATED" + "type": "RECORD", + "mode": "REPEATED", + "fields": [ + { + "mode": "NULLABLE", + "name": "keyword", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "score", + "type": "FLOAT" + } + ] }, { "name": "cited_by_percentile_year", diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json index 920cd2a5c..af4f61f8a 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-02/expected/works.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eee314608d43ed9cca74cb3510072de8e96e32464293ddaf22500ba5a29c1c34 -size 61269 +oid sha256:b326edc6a653647565cc2f263f879572af4ce17c9a9a04677ded9e97128b8425 +size 64498 diff --git a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json index 845bef42d..2f0c308ac 100644 --- a/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json +++ b/academic_observatory_workflows/fixtures/openalex/2023-04-16/expected/works.json @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0eb79c3a86b2a8740c91db8b528a26221a7430bd1556ea15307e02ecdf272380 -size 51213 +oid sha256:91f3d78d50a83d22bbe82dcbdcfba4d9ed67b8e485104ba035054a6d6eec398f +size 53929