Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

INF-572 : query formatting / linting and minor refactoring #150

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
{# Copyright 2020 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Richard Hosking, James Diprose #}
{#
# Copyright 2020 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Richard Hosking, James Diprose
#}
-- Collapses the templated table to one row per id: total_outputs is summed into
-- total_publications, MAX(name) is reported as group_name, and every row's
-- (time_period, total_outputs) pair is collected into the years array.
-- NOTE(review): the select list and the FROM / GROUP BY clauses appear twice below,
-- which looks like two formattings of the same query captured from a diff view;
-- confirm which copy is meant to remain.
SELECT
id AS identifier,
id AS identifier,
SUM(total_outputs) AS total_publications,
MAX(name) AS group_name,
ARRAY_AGG(STRUCT(time_period,
total_outputs)) AS years
FROM
`{{ project_id }}.{{ dataset_id }}.{{ table_id }}`
GROUP BY
id
MAX(name) AS group_name,
ARRAY_AGG(STRUCT(
time_period,
total_outputs
)) AS years
FROM `{{ project_id }}.{{ dataset_id }}.{{ table_id }}`
GROUP BY id
2,180 changes: 1,168 additions & 1,012 deletions academic_observatory_workflows/database/sql/create_aggregate.sql.jinja2

Large diffs are not rendered by default.

275 changes: 122 additions & 153 deletions academic_observatory_workflows/database/sql/create_book.sql.jinja2
Original file line number Diff line number Diff line change
@@ -1,174 +1,143 @@
{# Copyright 2020 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Richard Hosking, Tuan Chien #}

{#
# Copyright 2020 Curtin University
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Author: Richard Hosking, Tuan Chien, Julian Tonti-Filippini
#}
CREATE TEMPORARY FUNCTION normalise_isbn_string(input STRING)
RETURNS STRING
LANGUAGE js AS r"""
// True when s is nine digits followed by a digit or 'X', and that last character
// equals the check character check10() computes from the first nine.
const valid10 = (s = '') => {
  if (!/^[0-9]{9}[0-9X]$/.test(s)) {
    return false;
  }
  return s[9] == check10(s);
};

// True when s is exactly thirteen digits and the last digit equals the check
// digit check13() computes from the first twelve.
const valid13 = (s = '') => {
  if (!/^[0-9]{13}$/.test(s)) {
    return false;
  }
  return s[12] == check13(s);
};

function calc_isbn13_check_digit(isbn13) {
var mask = [1,3,1,3,1,3,1,3,1,3,1,3];

var prefix = [];
for(let i = 0; i < 12; i++) {
prefix.push(Number(isbn13[i]));
}

let check_digit = 0;
for(let i = 0; i < 12; i++) {
check_digit += mask[i]*prefix[i];
}

return (10-(check_digit % 10)) % 10;
// Computes the ISBN-10 check character for the first nine characters of s.
// Returns it as a string: '0'..'9', or 'X' when the check value is 10.
function check10(s='') {
  // Weighted sum of the nine leading digits; weights run from 10 down to 2.
  let total = 0;
  for (let pos = 0; pos < 9; pos++) {
    total += (10 - pos) * Number(s[pos]);
  }
  const checkValue = (11 - total % 11) % 11;
  if (checkValue == 10) {
    return 'X';
  }
  return String(checkValue);
}

function calc_isbn10_check_digit(isbn10) {
var mask = [10,9,8,7,6,5,4,3,2];

var prefix = [];
for(let i = 0; i < 9; i++) {
prefix.push(Number(isbn10[i]));
}

let check_digit = 0;
for(let i = 0; i < 9; i++) {
check_digit += mask[i]*prefix[i];
}

check_digit = (11-(check_digit % 11)) % 11;

if(check_digit == 10)
return 'X';

return check_digit;
// Computes the ISBN-13 check digit for the first twelve characters of s and
// returns it as a one-character string.
function check13(s='') {
  // Digits at even positions weigh 1, digits at odd positions weigh 3.
  let total = 0;
  for (let pos = 0; pos < 12; pos++) {
    const weight = pos % 2 === 0 ? 1 : 3;
    total += weight * Number(s[pos]);
  }
  const checkValue = (10 - (total % 10)) % 10;
  return String(checkValue);
}

// Reports whether isbn is a well-formed ISBN-13: a string of exactly thirteen
// ASCII digits whose last digit equals the check digit computed from the first
// twelve by calc_isbn13_check_digit().
function is_isbn13(isbn) {
    // A strict digit pattern replaces the isNaN(Number(isbn)) test. Number() also
    // accepts whitespace, signs, exponents and hex prefixes, and a blank character
    // converts to 0, so a string such as ' 123456789012' used to pass validation.
    if(typeof isbn !== 'string' || !/^[0-9]{13}$/.test(isbn)) {
        return false;
    }

    let check_digit = String(calc_isbn13_check_digit(isbn));
    return check_digit == isbn[12];
}

// Reports whether isbn is a well-formed ISBN-10: a string of nine ASCII digits
// followed by a digit or 'X', where that last character equals the check
// character computed from the first nine by calc_isbn10_check_digit().
function is_isbn10(isbn) {
    // A strict pattern replaces the isNaN(Number(isbn.slice(0,9))) test. Number()
    // also accepts whitespace, signs, exponents and hex prefixes, and a blank
    // character converts to 0, so a string such as ' 306406152' used to pass.
    if(typeof isbn !== 'string' || !/^[0-9]{9}[0-9X]$/.test(isbn)) {
        return false;
    }

    let check_digit = String(calc_isbn10_check_digit(isbn));
    return check_digit == isbn[9];
}

function convert_isbn10_to_isbn13(isbn10) {
let isbn = "978" + isbn10.slice(0, 9);
let check_digit = calc_isbn13_check_digit(isbn);
isbn += String(check_digit);
return isbn;
// Builds the ISBN-13 form of an ISBN-10: the prefix '978', the first nine
// characters of s, and the ISBN-13 check digit computed for those twelve.
function upgrade(s='') {
  const stem = '978'.concat(s.slice(0, 9));
  const checkDigit = check13(stem);
  return stem + checkDigit;
}

// Removes every character that is not a digit or the letter X (either case) and
// returns what is left in upper case.
function strip_isbn_string(isbn) {
    // The filter is case-insensitive, so a lower-case 'x' check character survives
    // it. Upper-casing the result lets that character compare equal to the 'X' the
    // check-digit calculation produces; without it such ISBN-10s never validate.
    return isbn.replace(/[^0-9X]/gi, "").toUpperCase();
}

let stripped = strip_isbn_string(input);

if(stripped.length == 13 && is_isbn13(stripped)) {
return stripped;
}

if(stripped.length == 10 && is_isbn10(stripped)) {
return convert_isbn10_to_isbn13(stripped);
}

return null;
const s = input.toUpperCase().replaceAll(/[^0-9X]/gi,'');
return valid10(s) ? upgrade(s) : valid13(s) ? s : null;
""";

with isbns as (
SELECT
normalise_isbn_string(isbn) as isbn,
{#
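-- Manual test: run this statement together with the normalise_isbn_string definition; every element of the returned array should be TRUE.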
SELECT [
IF(normalise_isbn_string('978-1-60309-517-4') = '9781603095174', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-520-4') = '9781603095204', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-511-2') = '9781603095112', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-508-2') = '9781603095082', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-515-0') = '9781603095150', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-521-1') = '9781603095211', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-522-8') = '9781603095228', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-519-8') = '9781603095198', TRUE, FALSE),
IF(normalise_isbn_string('978-1-60309-516-7') = '9781603095167', TRUE, FALSE),

normalise_isbn_string('978-1-60309-517-5') IS NULL,
normalise_isbn_string('978-1-60309-520-6') IS NULL,
normalise_isbn_string('978-1-60309- -2') IS NULL,
normalise_isbn_string('978-1- -508-2') IS NULL,
normalise_isbn_string('978- -60309-515-0') IS NULL,
normalise_isbn_string(' -1-60309-521-0') IS NULL,
normalise_isbn_string(' ') IS NULL,
normalise_isbn_string('ahkel;AJKJF:IJALK') IS NULL,
normalise_isbn_string('\t\n\"\\\r\t\n\t ') IS NULL
];
#}

-- NOTE(review): the isbns and books definitions below appear in two formattings that
-- are interleaved line by line (this looks like a captured diff view); confirm which
-- lines are meant to remain before running this statement.
WITH
-- isbns: one row per element of crossref.ISBN for each row of the doi release table.
-- isbn is that element passed through normalise_isbn_string; work_isbns is the array of
-- normalised values of the row's crossref.isbn with NULL results dropped.
isbns AS (
SELECT
normalise_isbn_string(isbn) AS isbn,
doi,
crossref.title,
crossref.type,
crossref.publisher,
crossref.published_year,
crossref.published_year_month,
(SELECT ARRAY_AGG(normalise_isbn_string(candidate_isbn) IGNORE NULLS) FROM UNNEST(crossref.isbn) as candidate_isbn) as work_isbns
--ARRAY(SELECT normalise_isbn_string(candidate_isbn) as work_isbn from UNNEST(crossref.isbn) as candidate_isbn) as work_isbns
FROM `{{ observatory.project_id }}.{{ observatory.dataset_id }}.doi{{ observatory.release_date.strftime('%Y%m%d') }}` as original, UNNEST(crossref.ISBN) as isbn),

books as (
SELECT
(
SELECT ARRAY_AGG(normalise_isbn_string(candidate_isbn) IGNORE NULLS) FROM UNNEST(crossref.isbn) AS candidate_isbn
) AS work_isbns
--ARRAY(SELECT normalise_isbn_string(candidate_isbn) AS work_isbn from UNNEST(crossref.isbn) AS candidate_isbn) AS work_isbns
FROM `{{ observatory.project_id }}.{{ observatory.dataset_id }}.doi{{ observatory.release_date.strftime('%Y%m%d') }}` AS original,
UNNEST(crossref.ISBN) AS isbn),

-- books: isbns rows of type "book" or "monograph" with a non-NULL isbn are grouped per
-- isbn into crossref_objects; rows of type "book-chapter" are grouped per isbn into
-- chapters (ordered by doi) and attached with a LEFT JOIN on isbn.
books AS (
SELECT
book.isbn,
book.crossref_objects,
book_part.chapters
FROM (
SELECT
isbn,
ARRAY_AGG(STRUCT(doi, title, type, publisher, published_year, published_year_month, work_isbns)) as crossref_objects
FROM isbns
WHERE type in ("book", "monograph") AND isbn IS NOT NULL
GROUP BY isbn) as book
left join (
SELECT
isbn,
ARRAY_AGG(STRUCT(doi, title, type) ORDER BY doi) as chapters
FROM (
SELECT
isbn,
ARRAY_AGG(STRUCT(doi, title, type, publisher, published_year, published_year_month, work_isbns)) AS crossref_objects
FROM isbns
WHERE type in ("book-chapter") AND isbn IS NOT NULL
WHERE type IN ("book", "monograph") AND isbn IS NOT NULL
GROUP BY isbn
) as book_part on book.isbn = book_part.isbn),

-- ISBN_DOI: for each row of books, the DOIs of its crossref_objects followed by the
-- DOIs of its chapters, concatenated into one dois array.
ISBN_DOI as (SELECT
ISBN,
ARRAY_CONCAT(ARRAY(SELECT doi FROM book.crossref_objects), ARRAY(SELECT doi FROM book.chapters)) as dois
FROM books as book),

-- events_matched: expands dois to one row per DOI, LEFT JOINs the Crossref Events
-- release table on doi, and concatenates the matched events / months / years arrays
-- per ISBN.
events_matched as (SELECT
ISBN,
ARRAY_CONCAT_AGG(crossref_events.events) as events,
ARRAY_CONCAT_AGG(crossref_events.months) as months,
ARRAY_CONCAT_AGG(crossref_events.years) as years,
FROM ISBN_DOI, UNNEST(dois) as doi
LEFT JOIN `{{ crossref_events.project_id }}.{{ crossref_events.dataset_id }}.{{ crossref_events.table_id }}{{ crossref_events.release_date.strftime('%Y%m%d') }}` as crossref_events ON crossref_events.doi = doi
GROUP BY ISBN),

-- events_aggregated: sums count per source (overall), per month and source (months,
-- newest month first) and per year and source (years, newest year first).
events_aggregated as (SELECT
ISBN,
ARRAY(SELECT as STRUCT source, SUM(count) as count FROM UNNEST(events) GROUP BY source) as overall,
ARRAY(SELECT as STRUCT month, source, SUM(count) as count FROM UNNEST(months) GROUP BY month, source ORDER BY month DESC) as months,
ARRAY(SELECT as STRUCT year, source, SUM(count) as count FROM UNNEST(years) GROUP BY year, source ORDER BY year DESC) as years
FROM events_matched)
) AS book
LEFT JOIN (
SELECT
isbn,
ARRAY_AGG(STRUCT(doi, title, type) ORDER BY doi) AS chapters
FROM isbns
WHERE type IN ("book-chapter") AND isbn IS NOT NULL
GROUP BY isbn
) AS book_part ON book.isbn = book_part.isbn
),

-- One row per book: the DOIs of its book-level Crossref records followed by the
-- DOIs of its chapters, concatenated into a single dois array.
ISBN_DOI AS (
    SELECT
        ISBN,
        ARRAY_CONCAT(
            ARRAY(SELECT doi FROM b.crossref_objects),
            ARRAY(SELECT doi FROM b.chapters)
        ) AS dois
    FROM books AS b
),

-- Expands each book's dois array to one row per DOI, looks the DOI up in the
-- Crossref Events release table, and concatenates the matched events, months and
-- years arrays per ISBN. The LEFT JOIN keeps DOIs that have no events row.
events_matched AS (
    SELECT
        ISBN,
        ARRAY_CONCAT_AGG(ev.events) AS events,
        ARRAY_CONCAT_AGG(ev.months) AS months,
        ARRAY_CONCAT_AGG(ev.years) AS years
    FROM ISBN_DOI
    CROSS JOIN UNNEST(dois) AS doi
    LEFT JOIN `{{ crossref_events.project_id }}.{{ crossref_events.dataset_id }}.{{ crossref_events.table_id }}{{ crossref_events.release_date.strftime('%Y%m%d') }}` AS ev
        ON ev.doi = doi
    GROUP BY ISBN
),

-- Re-aggregates the concatenated event arrays of each ISBN: counts are summed per
-- source (overall), per month and source (months, newest month first) and per year
-- and source (years, newest year first).
events_aggregated AS (
    SELECT
        ISBN,
        ARRAY(
            SELECT AS STRUCT source, SUM(count) AS count
            FROM UNNEST(events)
            GROUP BY source
        ) AS overall,
        ARRAY(
            SELECT AS STRUCT month, source, SUM(count) AS count
            FROM UNNEST(months)
            GROUP BY month, source
            ORDER BY month DESC
        ) AS months,
        ARRAY(
            SELECT AS STRUCT year, source, SUM(count) AS count
            FROM UNNEST(years)
            GROUP BY year, source
            ORDER BY year DESC
        ) AS years
    FROM events_matched
)

-- Final result: each row of books (isbn, crossref_objects, chapters) plus an events
-- struct holding the overall, months and years arrays from events_aggregated, attached
-- with a LEFT JOIN on ISBN.
-- NOTE(review): the select list and the FROM / LEFT JOIN clauses appear twice below,
-- which looks like two formattings of the same query captured from a diff view;
-- confirm which copy is meant to remain.
SELECT
book.isbn,
book.crossref_objects,
book.chapters,
STRUCT(
events.overall,
events.months,
events.years
) as events
FROM books as book
LEFT join events_aggregated as events on events.ISBN = book.ISBN
book.isbn,
book.crossref_objects,
book.chapters,
STRUCT(
events.overall,
events.months,
events.years
) AS events
FROM books AS book
LEFT JOIN events_aggregated AS events ON events.ISBN = book.ISBN
Loading