Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UKBB disease table generation updates #572

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ml4h/.idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions ml4h/.idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions ml4h/.idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions ml4h/.idea/ml4h.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ml4h/.idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions ml4h/.idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions phenotype_labels/disease/cvdidisease.osx
Git LFS file not shown
52 changes: 52 additions & 0 deletions phenotype_labels/disease/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Module declaration for this directory's Go program.
// NOTE(review): the module path is the bare name `main`, not a repository
// import path — confirm nothing needs to import this module by path.
module main

// Go version declared for this module.
go 1.21.0

// Requirements listed without an `// indirect` marker.
require (
cloud.google.com/go/bigquery v1.54.0
github.com/carbocation/pfx v0.0.0-20230108194214-fcea663adae5
google.golang.org/api v0.138.0
)

// Requirements marked `// indirect`: pinned here but, per the marker, not
// imported directly by this module's own packages.
require (
cloud.google.com/go v0.110.7 // indirect
cloud.google.com/go/compute v1.23.0 // indirect
cloud.google.com/go/compute/metadata v0.2.3 // indirect
cloud.google.com/go/iam v1.1.2 // indirect
github.com/andybalholm/brotli v1.0.5 // indirect
github.com/apache/arrow/go/v12 v12.0.1 // indirect
github.com/apache/thrift v0.19.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/flatbuffers v23.5.26+incompatible // indirect
github.com/google/go-cmp v0.5.9 // indirect
github.com/google/s2a-go v0.1.7 // indirect
github.com/google/uuid v1.3.1 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.2.5 // indirect
github.com/googleapis/gax-go/v2 v2.12.0 // indirect
github.com/klauspost/asmfmt v1.3.2 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 // indirect
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 // indirect
github.com/pierrec/lz4/v4 v4.1.18 // indirect
github.com/zeebo/xxh3 v1.0.2 // indirect
go.opencensus.io v0.24.0 // indirect
golang.org/x/crypto v0.13.0 // indirect
golang.org/x/mod v0.12.0 // indirect
golang.org/x/net v0.15.0 // indirect
golang.org/x/oauth2 v0.11.0 // indirect
golang.org/x/sync v0.3.0 // indirect
golang.org/x/sys v0.12.0 // indirect
golang.org/x/text v0.13.0 // indirect
golang.org/x/tools v0.12.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20230822172742-b8732ec3820d // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20230822172742-b8732ec3820d // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect
google.golang.org/grpc v1.57.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
)
304 changes: 304 additions & 0 deletions phenotype_labels/disease/go.sum

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions phenotype_labels/disease/main.go
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ var (
)

var materializedDB string
var useGPData bool

func main() {
defer STDOUT.Flush()
Expand All @@ -36,13 +37,15 @@ func main() {
var displayQuery bool
var override bool
var diseaseName string


flag.StringVar(&BQ.Project, "project", "", "Google Cloud project you want to use for billing purposes only")
flag.StringVar(&BQ.Database, "database", "", "BigQuery source database name (note: must be formatted as project.database, e.g., broad-ml4cvd.ukbb7089_201904)")
flag.StringVar(&tabfile, "tabfile", "", "Tabfile-formatted phenotype definition")
flag.StringVar(&materializedDB, "materialized", "broad-ml4cvd.ukbb7089_201904", "project.database storing materialized view tables")
flag.BoolVar(&displayQuery, "display-query", false, "Display the constructed query and exit?")
flag.BoolVar(&override, "override", false, "Force run, even if this tool thinks your tabfile is inadequate?")
flag.BoolVar(&useGPData, "use-gp-data", false, "Use general practitioner data? Note that materialized_gp_dates table must first be created. ")
flag.StringVar(&diseaseName, "disease", "", "If not specified, the tabfile will be parsed and become the disease name.")
flag.Parse()

Expand All @@ -62,7 +65,7 @@ func main() {
diseaseName = strings.Join(parts[0:len(parts)-1], ".")
}
}

log.Println("Processing disease", diseaseName)

missingFields, err := tabs.CheckSensibility()
Expand All @@ -80,7 +83,7 @@ func main() {
}
defer BQ.Client.Close()

query, err := BuildQuery(BQ, tabs, displayQuery)
query, err := BuildQuery(BQ, tabs, displayQuery, useGPData)
if err != nil {
log.Fatalln(diseaseName, err)
}
Expand Down
33 changes: 33 additions & 0 deletions phenotype_labels/disease/materialized_gp_dates.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Builds one row per (sample_id, FieldID, value) carrying the earliest date on
-- which a GP clinical event mapped to that ICD-10 code, combining the Read v2
-- and Read v3 code mappings.
-- NOTE(review): presumably the result of this query is what gets stored as the
-- `materialized_gp_dates` table — confirm against the consuming tool.
--
-- read_2_only: GP clinical events joined to the Read v2 -> ICD-10 map on the
-- read_2 code. Every row is tagged with the constant FieldID 41202.
-- NOTE(review): 41202 looks like it is chosen so these rows are treated like
-- primary ICD-10 hospital diagnoses downstream — confirm.
WITH read_2_only AS (
SELECT gpc.eid sample_id, 41202 FieldID, cv2.icd10_code value, gpc.event_dt vdate,
FROM `ukbb-analyses.ukbb7089_202109.gp_clinical` gpc
INNER JOIN `ukbb-analyses.ukbb7089_202109.map_read_v2_icd10` cv2 on gpc.read_2 = cv2.read_code
-- Keep only mapping rows with icd10_code_def = 1, and drop ICD-10 values that
-- contain '-', ',', '+' or a space, or that end in 'X'.
-- NOTE(review): these look like range/multi-code/placeholder mappings being
-- excluded — confirm against the mapping table documentation.
WHERE gpc.read_2 is not null and cv2.icd10_code_def=1
and cv2.icd10_code not like '%-%' and cv2.icd10_code not like '%,%'
and cv2.icd10_code not like '%+%' and cv2.icd10_code not like '% %'
and cv2.icd10_code not like '%X'
-- result_read_2: earliest event date per participant and ICD-10 code (Read v2).
), result_read_2 AS (
SELECT sample_id, FieldID, value, MIN(vdate) first_date from read_2_only
GROUP BY sample_id, FieldID, value
-- read_3_only: same shape as read_2_only, but joined through the Read v3 map on
-- the read_3 code.
), read_3_only AS (
SELECT gpc.eid sample_id, 41202 FieldID, cv3.icd10_code value, gpc.event_dt vdate,
FROM `ukbb-analyses.ukbb7089_202109.gp_clinical` gpc
INNER JOIN `ukbb-analyses.ukbb7089_202109.map_read_v3_icd10` cv3 on gpc.read_3 = cv3.read_code
-- Same character-based exclusions as the Read v2 branch, plus codes ending in
-- 'D' or 'A'.
WHERE gpc.read_3 is not null
and cv3.icd10_code not like '%-%' and cv3.icd10_code not like '%,%'
and cv3.icd10_code not like '%+%' and cv3.icd10_code not like '% %'
and cv3.icd10_code not like '%X' and cv3.icd10_code not like '%D'
and cv3.icd10_code not like '%A'
-- Accept a mapping only when it is status 'E' with refine_flag other than 'M',
-- or status 'D' with refine_flag in (C, P) and add_code_flag in (C, P, M).
-- NOTE(review): the meaning of these status/flag letters is not visible here;
-- verify against the Read v3 mapping documentation before changing them.
and ((cv3.mapping_status='E' and cv3.refine_flag != 'M') or (cv3.mapping_status='D' and cv3.refine_flag in ('C','P') and cv3.add_code_flag in ('C', 'P', 'M')))
-- result_read_3: earliest event date per participant and ICD-10 code (Read v3).
), result_read_3 AS (
SELECT sample_id, FieldID, value, MIN(vdate) first_date from read_3_only
GROUP BY sample_id, FieldID, value
)

-- Merge both branches and take the earliest date again, so a code reached via
-- both Read v2 and Read v3 yields a single row with the minimum first_date.
SELECT all_results.sample_id, all_results.FieldID, all_results.value, MIN(all_results.first_date) first_date
FROM (
SELECT * FROM result_read_2
UNION DISTINCT
SELECT * FROM result_read_3
) all_results
GROUP BY all_results.sample_id, all_results.FieldID, all_results.value
66 changes: 31 additions & 35 deletions phenotype_labels/disease/materialized_hesin_dates.sql
Original file line number Diff line number Diff line change
@@ -1,70 +1,66 @@
WITH oper4 AS (
SELECT 41200 FieldID, eid, oper4 code,
SELECT 41200 FieldID, oper.eid, oper4 code,
CASE
WHEN oper.opdate IS NOT NULL THEN oper.opdate
WHEN h.admidate IS NOT NULL THEN h.admidate
WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate
ELSE h.epistart
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.hesin` h
WHERE oper4 IS NOT NULL
FROM `broad-ml4cvd.ukbb7089_2024_01_20.hesin_oper` oper
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.hesin` h ON oper.eid=h.eid AND oper.ins_index = h.ins_index
WHERE oper4 IS NOT NULL AND oper.level=1
), diag_icd10 AS (
SELECT 41202 FieldID, eid, diag_icd10 code,
SELECT 41202 FieldID, hd.eid, diag_icd10 code,
CASE
WHEN h.admidate IS NOT NULL THEN h.admidate
WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate
ELSE h.epistart
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.hesin` h
WHERE diag_icd10 IS NOT NULL
FROM `broad-ml4cvd.ukbb7089_2024_01_20.hesin_diag` hd
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.hesin` h ON h.eid = hd.eid and h.ins_index = hd.ins_index
WHERE diag_icd10 IS NOT NULL and hd.level=1
), diag_icd9 AS (
SELECT 41203 FieldID, eid, diag_icd9 code,
SELECT 41203 FieldID, hd.eid, diag_icd9 code,
CASE
WHEN h.admidate IS NOT NULL THEN h.admidate
WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate
ELSE h.epistart
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.hesin` h
WHERE diag_icd9 IS NOT NULL
FROM `broad-ml4cvd.ukbb7089_2024_01_20.hesin_diag` hd
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.hesin` h ON h.eid = hd.eid and h.ins_index = hd.ins_index
WHERE diag_icd9 IS NOT NULL and hd.level=1
), oper4secondary AS (
SELECT 41210 FieldID, h.eid, sec.oper4 code,
SELECT 41210 FieldID, sec_oper.eid, sec_oper.oper4 code,
CASE
WHEN sec_oper.opdate IS NOT NULL THEN sec_oper.opdate
WHEN h.admidate IS NOT NULL THEN h.admidate
WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate
ELSE h.epistart
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.hesin_oper` sec
LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id
WHERE TRUE
AND sec.oper4 IS NOT NULL
FROM `broad-ml4cvd.ukbb7089_2024_01_20.hesin_oper` sec_oper
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.hesin` h ON sec_oper.eid=h.eid AND sec_oper.ins_index = h.ins_index
WHERE sec_oper.oper4 IS NOT NULL AND sec_oper.level=2
), diag_icd10_secondary AS (
SELECT 41204 FieldID, h.eid, sec.diag_icd10 code,
CASE
CASE
WHEN h.admidate IS NOT NULL THEN h.admidate
WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate
ELSE h.epistart
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.hesin_diag10` sec
LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id
WHERE TRUE
AND sec.diag_icd10 IS NOT NULL
FROM `broad-ml4cvd.ukbb7089_2024_01_20.hesin_diag` sec
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.hesin` h ON sec.eid=h.eid AND sec.ins_index = h.ins_index
WHERE sec.diag_icd10 IS NOT NULL and sec.level=2
), diag_icd9_secondary AS (
SELECT 41205 FieldID, h.eid, sec.diag_icd9 code,
CASE
WHEN h.admidate IS NOT NULL THEN h.admidate
WHEN h.admidate IS NULL AND h.opdate IS NOT NULL THEN h.opdate
ELSE h.epistart
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.hesin_diag9` sec
LEFT JOIN `broad-ml4cvd.ukbb7089_201904.hesin` h ON sec.eid=h.eid AND sec.record_id=h.record_id
WHERE TRUE
AND sec.diag_icd9 IS NOT NULL
FROM `broad-ml4cvd.ukbb7089_2024_01_20.hesin_diag` sec
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.hesin` h ON sec.eid=h.eid AND sec.ins_index = h.ins_index
WHERE sec.diag_icd9 IS NOT NULL and sec.level=2
)

SELECT
diagnostics.eid sample_id, diagnostics.FieldID, diagnostics.code value,
CASE
WHEN MIN(PARSE_DATE("%E4Y-%m-%d", vdate)) IS NULL THEN MIN(PARSE_DATE("%E4Y-%m-%d", p.value))
ELSE MIN(PARSE_DATE("%E4Y-%m-%d", vdate))
CASE
WHEN MIN(PARSE_DATE("%d/%m/%E4Y", vdate)) IS NULL THEN MIN(PARSE_DATE("%E4Y-%m-%d", p.value))
ELSE MIN(PARSE_DATE("%d/%m/%E4Y", vdate))
END first_date
FROM (
SELECT * FROM oper4
Expand All @@ -79,7 +75,7 @@ FROM (
UNION DISTINCT
SELECT * FROM diag_icd9_secondary
) diagnostics
JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` p ON p.sample_id = diagnostics.eid AND p.array_idx=0 AND p.instance=0 AND p.FieldID=53
GROUP BY diagnostics.eid, diagnostics.FieldID, diagnostics.code
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` p ON p.sample_id = diagnostics.eid AND p.array_idx=0 AND p.instance=0 AND p.FieldID=53
GROUP BY diagnostics.eid, diagnostics.FieldID, diagnostics.code, diagnostics.vdate
ORDER BY first_date ASC
;

37 changes: 27 additions & 10 deletions phenotype_labels/disease/materialized_special_dates.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ WITH dated_fields AS (
WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value)
ELSE SAFE.PARSE_DATE("%E4Y-%m-%d", d.value)
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.phenotype` p
JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0
JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx
FROM `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` p
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx
AND (
FALSE
OR (p.FieldID=42013 AND d.FieldID=42012)
Expand All @@ -16,32 +16,49 @@ WITH dated_fields AS (
OR (p.FieldID=42007 AND d.FieldID=42006)
OR (p.FieldID=42001 AND d.FieldID=42000)
)
LEFT JOIN `broad-ml4cvd.ukbb7089_201904.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value
),
dated_fields_fractional AS (
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value
),dated_death_cause_fields AS (
SELECT p.FieldID, p.sample_id eid, p.value code, cod.meaning,
CASE
WHEN SAFE.PARSE_DATE("%E4Y-%m-%d", d.value) IS NULL THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value)
WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value)
ELSE SAFE.PARSE_DATE("%E4Y-%m-%d", d.value)
END vdate
FROM `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` p
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance
AND (
FALSE
OR (p.FieldID=40001 AND d.FieldID=40000)
OR (p.FieldID=40002 AND d.FieldID=40000)
)
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value
),dated_fields_fractional AS (
SELECT p.FieldID, p.sample_id eid, p.value code, cod.meaning,
CASE
WHEN SAFE.PARSE_DATE("%Y", d.value) IS NULL THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value)
WHEN cod.meaning LIKE ('%unknown%') THEN SAFE.PARSE_DATE("%E4Y-%m-%d", denroll.value)
ELSE SAFE.PARSE_DATE("%Y", d.value)
END vdate
FROM `broad-ml4cvd.ukbb7089_201904.phenotype` p
JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0
JOIN `broad-ml4cvd.ukbb7089_201904.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx
FROM `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` p
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` denroll ON denroll.FieldID=53 AND denroll.sample_id=p.sample_id AND denroll.instance = 0 AND denroll.array_idx = 0
JOIN `broad-ml4cvd.ukbb7089_2024_01_20.phenotype` d ON d.sample_id=p.sample_id AND d.instance = p.instance AND d.array_idx = p.array_idx
AND (
FALSE
OR (p.FieldID=20004 AND d.FieldID=20010)
OR (p.FieldID=20002 AND d.FieldID=20008)
OR (p.FieldID=20001 AND d.FieldID=20006)
)
LEFT JOIN `broad-ml4cvd.ukbb7089_201904.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value
LEFT JOIN `broad-ml4cvd.ukbb7089_2024_01_20.coding` cod ON cod.coding_file_id = d.coding_file_id AND cod.coding = d.value
)

SELECT
diagnostics.eid sample_id, diagnostics.FieldID, diagnostics.code value, MIN(vdate) first_date
FROM (
SELECT * FROM dated_fields
UNION DISTINCT
SELECT * FROM dated_death_cause_fields
UNION DISTINCT
SELECT * FROM dated_fields_fractional
) diagnostics
WHERE TRUE
Expand Down
Loading
Loading