Skip to content

Commit

Permalink
Update NCIt updater to use TSV file
Browse files Browse the repository at this point in the history
  • Loading branch information
susannasiebert committed May 16, 2024
1 parent 91d0233 commit 5387c3b
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 40 deletions.
3 changes: 3 additions & 0 deletions server/Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ gem 'scenic', '~>1.5.4'
#entrez symbol downloads
gem 'net-ftp', '~>0.3.3'

#NCIt term download (the NCI Thesaurus flat file is fetched as a .zip archive and unpacked by the importer)
gem 'rubyzip', '~>2.3.2'

#higher performance json encoding
gem 'oj', '~> 3.16.3'

Expand Down
2 changes: 2 additions & 0 deletions server/Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ GEM
ffi (~> 1.12)
ruby2_keywords (0.0.5)
ruby_dig (0.0.2)
rubyzip (2.3.2)
sanitize (6.0.2)
crass (~> 1.0.2)
nokogiri (>= 1.12.0)
Expand Down Expand Up @@ -574,6 +575,7 @@ DEPENDENCIES
rack-mini-profiler (~> 2.0)
rails (~> 7.1)
rinku (~> 2.0.6)
rubyzip (~> 2.3.2)
sanitize (~> 6.0.2)
sass-rails (>= 6)
scenic (~> 1.5.4)
Expand Down
2 changes: 1 addition & 1 deletion server/app/jobs/update_nci_thesaurus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def remove_download
end

# URL the updater downloads the NCI Thesaurus from.
#
# Points at the zipped flat-file export (Thesaurus.FLAT.zip). The previous
# OBO URL was left behind as a discarded string expression and has been removed.
#
# @return [String] absolute URL of the zip archive
def latest_ncit_path
  "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus.FLAT.zip"
end
end

56 changes: 17 additions & 39 deletions server/app/lib/importer/nci_thesaurus_mirror.rb
Original file line number Diff line number Diff line change
@@ -1,48 +1,45 @@
require "csv"
require "zip"

module Importer
class NciThesaurusMirror
attr_reader :parser, :version

# Reads the first *.txt entry of the zip archive at +path+ and parses it as
# a header-less, tab-separated table.
#
# The column names are supplied here because the parsed text is treated as
# having no header row; rows are later addressed by these names
# (e.g. 'code', 'synonyms', 'semantic_type', 'concept_status').
#
# @param path [String] location of the downloaded zip archive
# @param version [String] value stored in @version; defaults to the current
#   UTC time in ISO 8601 form
# @raise [ArgumentError] if the archive contains no *.txt entry
def initialize(path, version = Time.now.utc.iso8601)
  # Block forms close the archive and the entry stream as soon as the text
  # has been read instead of leaving them open until garbage collection.
  csv_text = Zip::File.open(path) do |zip_file|
    entry = zip_file.glob('*.txt').first
    raise ArgumentError, "No .txt entry found in #{path}" if entry.nil?

    entry.get_input_stream(&:read)
  end

  @parser = CSV.parse(
    csv_text,
    col_sep: "\t",
    liberal_parsing: true,
    headers: %w[code concept_iri parents synonyms definition display_name concept_status semantic_type concept_in_subset],
  )
  @version = version
end

# Walks every parsed row and creates/updates a record for each row that
# passes valid_entry?.
#
# Iterates the parsed table directly (+parser.each+); the table built in
# initialize is a CSV result, not an OBO parser, so there is no +elements+
# collection to go through.
def import
  parser.each do |row|
    create_object_from_entry(row) if valid_entry?(row)
  end
end

# Decides whether a parsed row should be imported.
#
# A row qualifies when its 'semantic_type' column is one of
# valid_semantic_types and its 'concept_status' column is empty (nil).
# The earlier OBO-based checks ('id', 'name', 'property_value') do not apply
# to the tab-separated rows and are no longer consulted.
#
# @param entry [#[]] a parsed row addressable by column name
# @return [Boolean]
def valid_entry?(entry)
  valid_semantic_types.include?(entry['semantic_type']) && entry['concept_status'].nil?
end

# Semantic types whose entries are accepted by the importer.
#
# @return [Array<String>]
def valid_semantic_types
  [
    'Pharmacologic Substance',
    'Pharmacological Substance',
    'Clinical Drug',
    'Therapeutic or Preventive Procedure',
    'Hazardous or Poisonous Substance',
  ]
end

# Collects the quoted values of every 'property_value' string that starts
# with +NCIT:P310 "..."+.
#
# @param entry [#[]] element whose 'property_value' is a list of strings
# @return [Array<String>] captured values, in input order; empty when none match
def obsolete_concepts(entry)
  pattern = /^NCIT:P310 "(?<obsolete_concept>.+)"/
  entry['property_value'].filter_map do |value|
    found = value.match(pattern)
    found[:obsolete_concept] if found
  end
end

def create_object_from_entry(entry)
name = Therapy.capitalize_name(entry['name'])
ncit_id = entry['id'].sub('NCIT:', '')
synonyms = entry['synonyms'].split('|').map{|s| Therapy.capitalize_name(s)}
name = synonyms.shift()
ncit_id = entry['code']
therapy = ::Therapy.where(ncit_id: ncit_id).first_or_initialize
therapy.name = name
synonyms = process_synonyms(entry['synonym']).uniq
synonyms.each do |syn|
therapy_alias = ::TherapyAlias.where(name: syn).first_or_create
if !therapy.therapy_aliases.map{|a| a.name.downcase}.include?(syn.downcase) && !(syn.downcase == therapy.name.downcase)
Expand All @@ -51,25 +48,6 @@ def create_object_from_entry(entry)
end
therapy.save
end

# Normalises a synonym element into a list of extracted synonym names.
#
# A blank element yields an empty list, a String is treated as a single
# synonym, and an Array has each item extracted. Items for which
# extract_synonym returns nil are dropped.
#
# @param synonym_element [String, Array<String>, nil]
# @return [Array] extracted names without nils
def process_synonyms(synonym_element)
  extracted =
    if synonym_element.blank?
      []
    else
      case synonym_element
      when String then [extract_synonym(synonym_element)]
      when Array then synonym_element.map { |item| extract_synonym(item) }
      end
    end
  extracted.compact
end

# Pulls the quoted name out of a synonym string of the form
# +"<name>" EXACT [...]+ and passes it through Therapy.capitalize_name.
#
# @param value [String] raw synonym string
# @return [Object, nil] result of Therapy.capitalize_name for the captured
#   name, or nil when the string does not have the expected form
def extract_synonym(value)
  found = value.match(/^"(?<name>.+)" EXACT \[.*\]/)
  return nil unless found

  Therapy.capitalize_name(found[:name])
end
end
end

0 comments on commit 5387c3b

Please sign in to comment.