From cfd3bf5411e680a72db625f1e329b3a9e9a3883a Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 14 May 2019 08:33:00 -0400 Subject: [PATCH] convert tabs to spaces --- .gitignore | 3 + CHANGELOG.md | 1 + combine/settings.py | 27 +- combine/urls.py | 2 +- core/__init__.py | 2 +- core/admin.py | 6 +- core/apps.py | 51 +- core/celery.py | 2 +- core/context_processors.py | 113 +- core/forms.py | 17 +- .../commands/ensuremongocollections.py | 30 +- core/management/commands/exportstate.py | 67 +- .../commands/quickstartbootstrap.py | 113 +- .../commands/removeorphanedrecords.py | 31 +- core/management/commands/update.py | 477 +- core/mongo.py | 4 +- core/oai.py | 1018 ++- core/spark/console.py | 769 +- core/spark/es.py | 619 +- core/spark/jobs.py | 3752 +++++---- core/spark/record_validation.py | 1017 ++- core/spark/utils.py | 177 +- core/tasks.py | 2172 +++-- core/templatetags/core_template_filters.py | 55 +- core/urls.py | 376 +- core/views.py | 7150 ++++++++--------- core/xml2kvp.py | 2155 +++-- docs/conf.py | 9 +- inc/console.py | 17 +- tests/conftest.py | 26 +- tests/data/python_validation.py | 59 +- tests/data/qs_python_validation.py | 68 +- tests/test_basic.py | 699 +- tests/test_bg_tasks.py | 54 +- 34 files changed, 10372 insertions(+), 10766 deletions(-) diff --git a/.gitignore b/.gitignore index 21570401..3b8b8c12 100644 --- a/.gitignore +++ b/.gitignore @@ -116,3 +116,6 @@ combine/localsettings.py # Combine static files /static + +# PyCharm IDE +.idea \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8957e112..f2c8ea43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added ### Changed - Update XML2kvp to limit values at 32k characters [#403](https://github.com/WSULib/combine/issues/403) + - Converting tabs to spaces per PEP 8 recs ## `v0.8` diff --git a/combine/settings.py b/combine/settings.py index 0bc28cf2..9c14b1d7 100644 --- a/combine/settings.py +++ b/combine/settings.py @@ -12,15 +12,12 @@ import os - # Combine Version COMBINE_VERSION = 'v0.9' - # Build paths inside the project like this: os.path.join(BASE_DIR, ...) BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/ @@ -28,13 +25,10 @@ # SECURITY WARNING: keep the secret key used in production secret! SECRET_KEY = '$)1(piv72**m&3bmb)j!=f-h4=of6_knu=c8lj31n7k=y36oi%' - # SECURITY WARNING: don't run with debug turned on in production! 
DEBUG = True - -ALLOWED_HOSTS = ['*'] # for dev, allowing all hosts - +ALLOWED_HOSTS = ['*'] # for dev, allowing all hosts # Application definition @@ -83,7 +77,6 @@ WSGI_APPLICATION = 'combine.wsgi.application' - # Database # https://docs.djangoproject.com/en/1.11/ref/settings/#databases @@ -134,13 +127,11 @@ USE_TZ = True - # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/1.11/howto/static-files/ STATIC_URL = '/static/' STATIC_ROOT = '/opt/combine/static' - # Logging LOGGING = { 'version': 1, @@ -188,7 +179,6 @@ }, } - # shell_plus additional imports SHELL_PLUS_POST_IMPORTS = { ('inc.console', '*') @@ -201,18 +191,3 @@ from combine.localsettings import * except ImportError: pass - - - - - - - - - - - - - - - diff --git a/combine/urls.py b/combine/urls.py index be8e63a8..010e35c6 100644 --- a/combine/urls.py +++ b/combine/urls.py @@ -17,6 +17,6 @@ from django.contrib import admin urlpatterns = [ - url(r'^combine/', include('core.urls')), + url(r'^combine/', include('core.urls')), url(r'^admin/', admin.site.urls), ] diff --git a/core/__init__.py b/core/__init__.py index af1ca566..337218e8 100644 --- a/core/__init__.py +++ b/core/__init__.py @@ -5,4 +5,4 @@ # Django starts so that shared_task will use this app. from .celery import celery_app -__all__ = ('celery_app',) \ No newline at end of file +__all__ = ('celery_app',) diff --git a/core/admin.py b/core/admin.py index 0fdc0665..2aaede91 100644 --- a/core/admin.py +++ b/core/admin.py @@ -3,7 +3,9 @@ from django.contrib import admin -from .models import Organization, RecordGroup, Job, OAIEndpoint, LivySession, Transformation, ValidationScenario, RecordIdentifierTransformationScenario, DPLABulkDataDownload, FieldMapper +from .models import Organization, RecordGroup, Job, OAIEndpoint, LivySession, Transformation, ValidationScenario, \ + RecordIdentifierTransformationScenario, DPLABulkDataDownload, FieldMapper # register models -admin.site.register([Organization, RecordGroup, Job, OAIEndpoint, LivySession, Transformation, ValidationScenario, RecordIdentifierTransformationScenario, DPLABulkDataDownload, FieldMapper]) \ No newline at end of file +admin.site.register([Organization, RecordGroup, Job, OAIEndpoint, LivySession, Transformation, ValidationScenario, + RecordIdentifierTransformationScenario, DPLABulkDataDownload, FieldMapper]) diff --git a/core/apps.py b/core/apps.py index 389442d2..43fa76c1 100644 --- a/core/apps.py +++ b/core/apps.py @@ -12,41 +12,38 @@ # Get an instance of a logger logger = logging.getLogger(__name__) + # NOTE: manual configuration of core app not currently used, but leaving if needed class CoreConfig(AppConfig): + name = 'core' - name = 'core' - - def ready(self): - - ''' - ready() method fires once, when application is loaded and ready - https://docs.djangoproject.com/en/dev/ref/applications/#django.apps.AppConfig.ready - - This fires any functions defined here that are needed when Combine starts. - - Args: - (django.apps.AppConfig): instance of 'Core' application config + def ready(self): + ''' + ready() method fires once, when application is loaded and ready + https://docs.djangoproject.com/en/dev/ref/applications/#django.apps.AppConfig.ready - Returns: - None - ''' + This fires any functions defined here that are needed when Combine starts. 
- logger.debug('Core application ready method preperations firing') + Args: + (django.apps.AppConfig): instance of 'Core' application config - # create home working directory - self.create_home_working_directory() + Returns: + None + ''' + logger.debug('Core application ready method preperations firing') - def create_home_working_directory(self): + # create home working directory + self.create_home_working_directory() - ''' - Method to create directory /home/combine/data/combine if does not exist - ''' + def create_home_working_directory(self): + ''' + Method to create directory /home/combine/data/combine if does not exist + ''' - # parse home working directory - hwd = settings.BINARY_STORAGE.split('file://')[-1] + # parse home working directory + hwd = settings.BINARY_STORAGE.split('file://')[-1] - # create if not exists - if not os.path.exists(hwd): - os.makedirs(hwd) \ No newline at end of file + # create if not exists + if not os.path.exists(hwd): + os.makedirs(hwd) diff --git a/core/celery.py b/core/celery.py index b5c5407e..2dccffe3 100644 --- a/core/celery.py +++ b/core/celery.py @@ -19,4 +19,4 @@ @celery_app.task(bind=True) def debug_task(self): - print('Request: {0!r}'.format(self.request)) \ No newline at end of file + print('Request: {0!r}'.format(self.request)) diff --git a/core/context_processors.py b/core/context_processors.py index 20adf270..2b6901fa 100644 --- a/core/context_processors.py +++ b/core/context_processors.py @@ -1,4 +1,3 @@ - # general from core.celery import celery_app from django.conf import settings @@ -10,75 +9,59 @@ def combine_settings(request): + ''' + Make some settings variables available to all templates + ''' - ''' - Make some settings variables available to all templates - ''' - - # prepare combine settings - combine_settings_keys = [ - 'APP_HOST', - 'DPLA_API_KEY', - 'OAI_RESPONSE_SIZE', - 'COMBINE_OAI_IDENTIFIER', - 'COMBINE_DEPLOYMENT', - 'COMBINE_VERSION' - ] - combine_settings_dict = { k:getattr(settings,k,None) for k in combine_settings_keys } + # prepare combine settings + combine_settings_keys = [ + 'APP_HOST', + 'DPLA_API_KEY', + 'OAI_RESPONSE_SIZE', + 'COMBINE_OAI_IDENTIFIER', + 'COMBINE_DEPLOYMENT', + 'COMBINE_VERSION' + ] + combine_settings_dict = {k: getattr(settings, k, None) for k in combine_settings_keys} - # return - return combine_settings_dict + # return + return combine_settings_dict def livy_session(request): - - ''' - Make Livy session information available to all views - ''' - - # get active livy session - lv = LivySession.get_active_session() - if lv: - if type(lv) == LivySession: - # refresh single session - lv.refresh_from_livy() - elif type(lv) == QuerySet: - # multiple Combine LivySession founds, loop through - for s in lv: - s.refresh_from_livy() - else: - pass - - return { - 'LIVY_SESSION':lv - } + ''' + Make Livy session information available to all views + ''' + + # get active livy session + lv = LivySession.get_active_session() + if lv: + if type(lv) == LivySession: + # refresh single session + lv.refresh_from_livy() + elif type(lv) == QuerySet: + # multiple Combine LivySession founds, loop through + for s in lv: + s.refresh_from_livy() + else: + pass + + return { + 'LIVY_SESSION': lv + } def combine_git_info(request): - - ''' - Return state of HEAD for Combine git repo - ''' - - # one liner for branch or tag - git_head = subprocess.Popen('head_name="$(git symbolic-ref HEAD 2>/dev/null)" || head_name="$(git describe --tags)"; echo $head_name', shell=True, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE).stdout.read().decode('utf-8').rstrip("\n") - if "/" in git_head: - git_head = git_head.split('/')[-1] - - # return - return {'COMBINE_GIT_BRANCH':git_head} - - - - - - - - - - - - - - - + ''' + Return state of HEAD for Combine git repo + ''' + + # one liner for branch or tag + git_head = subprocess.Popen( + 'head_name="$(git symbolic-ref HEAD 2>/dev/null)" || head_name="$(git describe --tags)"; echo $head_name', + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout.read().decode('utf-8').rstrip("\n") + if "/" in git_head: + git_head = git_head.split('/')[-1] + + # return + return {'COMBINE_GIT_BRANCH': git_head} diff --git a/core/forms.py b/core/forms.py index cb6eca68..85187336 100644 --- a/core/forms.py +++ b/core/forms.py @@ -4,18 +4,13 @@ from core.models import Organization, RecordGroup, OAIEndpoint, Transformation - class OrganizationForm(ModelForm): - - class Meta: - model = Organization - fields = ['name', 'description'] - + class Meta: + model = Organization + fields = ['name', 'description'] class RecordGroupForm(ModelForm): - - class Meta: - model = RecordGroup - fields = ['organization', 'name', 'description'] - + class Meta: + model = RecordGroup + fields = ['organization', 'name', 'description'] diff --git a/core/management/commands/ensuremongocollections.py b/core/management/commands/ensuremongocollections.py index 7022d976..d686d40e 100644 --- a/core/management/commands/ensuremongocollections.py +++ b/core/management/commands/ensuremongocollections.py @@ -1,4 +1,3 @@ - # generic imports import logging @@ -11,23 +10,22 @@ # Get an instance of a logger logger = logging.getLogger(__name__) -class Command(BaseCommand): - - help = 'Ensure Mongo collections are created, and have proper indices' - def handle(self, *args, **options): +class Command(BaseCommand): + help = 'Ensure Mongo collections are created, and have proper indices' - # Record model - logger.debug('ensuring indices for record collection') - Record.ensure_indexes() + def handle(self, *args, **options): + # Record model + logger.debug('ensuring indices for record collection') + Record.ensure_indexes() - # RecordValidation model - logger.debug('ensuring indices for record_validation collection') - RecordValidation.ensure_indexes() + # RecordValidation model + logger.debug('ensuring indices for record_validation collection') + RecordValidation.ensure_indexes() - # IndexMappingFailure model - logger.debug('ensuring indices for index_mapping_failures collection') - IndexMappingFailure.ensure_indexes() + # IndexMappingFailure model + logger.debug('ensuring indices for index_mapping_failures collection') + IndexMappingFailure.ensure_indexes() - # return - self.stdout.write(self.style.SUCCESS('Mongo collections and indices verified and/or created')) \ No newline at end of file + # return + self.stdout.write(self.style.SUCCESS('Mongo collections and indices verified and/or created')) diff --git a/core/management/commands/exportstate.py b/core/management/commands/exportstate.py index dd56fbc0..3dbeb6a5 100644 --- a/core/management/commands/exportstate.py +++ b/core/management/commands/exportstate.py @@ -1,4 +1,3 @@ - # generic imports import datetime import logging @@ -12,39 +11,35 @@ # Get an instance of a logger logger = logging.getLogger(__name__) -class Command(BaseCommand): - - ''' - Manage command to trigger the state export of: - - all Organizations (and downstream Jobs); - - all Configuration Scenarios - ''' - - help = 'Using State Export/Import, export state of all Jobs and 
Configuration Scenarios' - - - def add_arguments(self, parser): - - # add optional organization ids to skip - parser.add_argument( - '--skip_json', - dest='skip_json', - help='JSON for objects to skip', - type=str - ) - - def handle(self, *args, **options): - - logger.debug('Exporting state of all Jobs and Configuration Scenarios') - - # init StateIO instance - sio - sio = StateIO( - name="Full State Export - %s" % datetime.datetime.now().strftime('%b. %d, %Y, %-I:%M:%S %p') - ) - - # run export - sio.export_all(skip_dict=json.loads(options['skip_json'])) - - # return - self.stdout.write(self.style.SUCCESS('Export complete.')) \ No newline at end of file +class Command(BaseCommand): + ''' + Manage command to trigger the state export of: + - all Organizations (and downstream Jobs); + - all Configuration Scenarios + ''' + + help = 'Using State Export/Import, export state of all Jobs and Configuration Scenarios' + + def add_arguments(self, parser): + # add optional organization ids to skip + parser.add_argument( + '--skip_json', + dest='skip_json', + help='JSON for objects to skip', + type=str + ) + + def handle(self, *args, **options): + logger.debug('Exporting state of all Jobs and Configuration Scenarios') + + # init StateIO instance - sio + sio = StateIO( + name="Full State Export - %s" % datetime.datetime.now().strftime('%b. %d, %Y, %-I:%M:%S %p') + ) + + # run export + sio.export_all(skip_dict=json.loads(options['skip_json'])) + + # return + self.stdout.write(self.style.SUCCESS('Export complete.')) diff --git a/core/management/commands/quickstartbootstrap.py b/core/management/commands/quickstartbootstrap.py index 8250ccc1..b6ccdd5f 100644 --- a/core/management/commands/quickstartbootstrap.py +++ b/core/management/commands/quickstartbootstrap.py @@ -3,69 +3,66 @@ # import core from core.models import * -class Command(BaseCommand): - - help = 'Bootstrap Combine with some demo Transformation and Validation scenarios' - - # def add_arguments(self, parser): - # parser.add_argument('poll_id', nargs='+', type=int) - def handle(self, *args, **options): - - ## prepare demo MODS files - # parse file - xml_tree = etree.parse('tests/data/mods_250.xml') - xml_root = xml_tree.getroot() - # get namespaces - nsmap = {} - for ns in xml_root.xpath('//namespace::*'): - if ns[0]: - nsmap[ns[0]] = ns[1] - # find mods records - mods_roots = xml_root.xpath('//mods:mods', namespaces=nsmap) - # create temp dir - payload_dir = '/tmp/combine/qs/mods' - os.makedirs(payload_dir) - # write MODS to temp dir - for mods in mods_roots: - with open(os.path.join(payload_dir, '%s.xml' % uuid.uuid4().hex), 'w') as f: - f.write(etree.tostring(mods).decode('utf-8')) +class Command(BaseCommand): + help = 'Bootstrap Combine with some demo Transformation and Validation scenarios' + # def add_arguments(self, parser): + # parser.add_argument('poll_id', nargs='+', type=int) - ## create demo XSLT transformation - with open('tests/data/mods_transform.xsl','r') as f: - xsl_string = f.read() - trans = Transformation( - name='MODS to Service Hub profile', - payload=xsl_string, - transformation_type='xslt' - ) - trans.save() + def handle(self, *args, **options): + ## prepare demo MODS files + # parse file + xml_tree = etree.parse('tests/data/mods_250.xml') + xml_root = xml_tree.getroot() + # get namespaces + nsmap = {} + for ns in xml_root.xpath('//namespace::*'): + if ns[0]: + nsmap[ns[0]] = ns[1] + # find mods records + mods_roots = xml_root.xpath('//mods:mods', namespaces=nsmap) + # create temp dir + payload_dir = 
'/tmp/combine/qs/mods' + os.makedirs(payload_dir) + # write MODS to temp dir + for mods in mods_roots: + with open(os.path.join(payload_dir, '%s.xml' % uuid.uuid4().hex), 'w') as f: + f.write(etree.tostring(mods).decode('utf-8')) - ## create demo validation scenarios - # schematron validation - with open('tests/data/qs_schematron_validation.sch','r') as f: - sch_payload = f.read() - schematron_validation_scenario = ValidationScenario( - name='DPLA minimum', - payload=sch_payload, - validation_type='sch', - default_run=True - ) - schematron_validation_scenario.save() + ## create demo XSLT transformation + with open('tests/data/mods_transform.xsl', 'r') as f: + xsl_string = f.read() + trans = Transformation( + name='MODS to Service Hub profile', + payload=xsl_string, + transformation_type='xslt' + ) + trans.save() - # python validation - with open('tests/data/qs_python_validation.py','r') as f: - py_payload = f.read() - python_validation_scenario = ValidationScenario( - name='Date checker', - payload=py_payload, - validation_type='python', - default_run=True - ) - python_validation_scenario.save() + ## create demo validation scenarios + # schematron validation + with open('tests/data/qs_schematron_validation.sch', 'r') as f: + sch_payload = f.read() + schematron_validation_scenario = ValidationScenario( + name='DPLA minimum', + payload=sch_payload, + validation_type='sch', + default_run=True + ) + schematron_validation_scenario.save() + # python validation + with open('tests/data/qs_python_validation.py', 'r') as f: + py_payload = f.read() + python_validation_scenario = ValidationScenario( + name='Date checker', + payload=py_payload, + validation_type='python', + default_run=True + ) + python_validation_scenario.save() - # return - self.stdout.write(self.style.SUCCESS('Quickstart bootstrapping complete.')) \ No newline at end of file + # return + self.stdout.write(self.style.SUCCESS('Quickstart bootstrapping complete.')) diff --git a/core/management/commands/removeorphanedrecords.py b/core/management/commands/removeorphanedrecords.py index 2c14387b..44e05763 100644 --- a/core/management/commands/removeorphanedrecords.py +++ b/core/management/commands/removeorphanedrecords.py @@ -10,24 +10,23 @@ # Get an instance of a logger logger = logging.getLogger(__name__) -class Command(BaseCommand): - - help = 'Remove orphaned Records from Mongo that have no associated Job' - def handle(self, *args, **options): +class Command(BaseCommand): + help = 'Remove orphaned Records from Mongo that have no associated Job' - # removing records with no job_id field - no_job_ids = Record.objects(mongoengine.Q(job_id__exists=False)) - logger.debug('removing %s records without job_id field' % no_job_ids.count()) - no_job_ids.delete() + def handle(self, *args, **options): + # removing records with no job_id field + no_job_ids = Record.objects(mongoengine.Q(job_id__exists=False)) + logger.debug('removing %s records without job_id field' % no_job_ids.count()) + no_job_ids.delete() - # remove records where Job no longer exists - # get list of Jobs - job_ids = list(Job.objects.values_list('id',flat=True)) + # remove records where Job no longer exists + # get list of Jobs + job_ids = list(Job.objects.values_list('id', flat=True)) - # get Records that have job_ids that do not match - delete_results = mc_handle.combine.record.delete_many({'job_id':{'$nin':job_ids}}) - logger.debug('removed %s records where Job does not exist' % delete_results.deleted_count) + # get Records that have job_ids that do not match + delete_results 
= mc_handle.combine.record.delete_many({'job_id': {'$nin': job_ids}}) + logger.debug('removed %s records where Job does not exist' % delete_results.deleted_count) - # return - self.stdout.write(self.style.SUCCESS('Orphaned Records removed from MongoDB')) + # return + self.stdout.write(self.style.SUCCESS('Orphaned Records removed from MongoDB')) diff --git a/core/management/commands/update.py b/core/management/commands/update.py index 83ff5e29..1443d176 100644 --- a/core/management/commands/update.py +++ b/core/management/commands/update.py @@ -1,4 +1,3 @@ - # generic imports import datetime import logging @@ -19,334 +18,306 @@ logger = logging.getLogger(__name__) - class Command(BaseCommand): + ''' + Manage command to update Combine. - ''' - Manage command to update Combine. - - Performs the following: - - pull from github, updates all branches - - if relase passed, checkout release/branch - - pip install requirements - - collect static django - - restart gunicorn, livy session, celery - ''' - - help = 'Update Combine' - - - # python path - PYTHON_PATH = sys.executable.rstrip('python').rstrip('/') - - - def add_arguments(self, parser): + Performs the following: + - pull from github, updates all branches + - if relase passed, checkout release/branch + - pip install requirements + - collect static django + - restart gunicorn, livy session, celery + ''' - # release - parser.add_argument( - '--release', - dest='release', - help='GitHub branch/release to update to', - type=str, - default=None - ) + help = 'Update Combine' - # update method - parser.add_argument( - '--run_update_snippet', - dest='run_update_snippet', - help='Update code snippet to run', - type=str, - default=None - ) + # python path + PYTHON_PATH = sys.executable.rstrip('python').rstrip('/') - # update method - parser.add_argument( - '--run_update_snippets_only', - action='store_true', - help='Run update snippets only during update' - ) + def add_arguments(self, parser): + # release + parser.add_argument( + '--release', + dest='release', + help='GitHub branch/release to update to', + type=str, + default=None + ) - def handle(self, *args, **options): + # update method + parser.add_argument( + '--run_update_snippet', + dest='run_update_snippet', + help='Update code snippet to run', + type=str, + default=None + ) + + # update method + parser.add_argument( + '--run_update_snippets_only', + action='store_true', + help='Run update snippets only during update' + ) - ''' - Handler for updates to Combine - ''' + def handle(self, *args, **options): - logger.debug('Updating Combine') + ''' + Handler for updates to Combine + ''' - # run update snippet if passed - if options.get('run_update_snippet'): - self.run_update_snippet(args, options) + logger.debug('Updating Combine') - # else, run update - else: - self.update(args, options) + # run update snippet if passed + if options.get('run_update_snippet'): + self.run_update_snippet(args, options) + # else, run update + else: + self.update(args, options) - def update(self, args, options): + def update(self, args, options): - ''' - Method to handle branch/tagged release update - ''' + ''' + Method to handle branch/tagged release update + ''' - # do not run at all if Combine is Docker deployed - if getattr(settings,'COMBINE_DEPLOYMENT','server') != 'docker': + # do not run at all if Combine is Docker deployed + if getattr(settings, 'COMBINE_DEPLOYMENT', 'server') != 'docker': - # if not running update snippets only - if not options.get('run_update_snippets_only', False): + # if not running update 
snippets only + if not options.get('run_update_snippets_only', False): - # git pull - os.system('git pull') + # git pull + os.system('git pull') - # checkout release if provided - if options.get('release', None) != None: - release = options['release'] - logger.debug('release/branch provided, checking out: %s' % release) + # checkout release if provided + if options.get('release', None) != None: + release = options['release'] + logger.debug('release/branch provided, checking out: %s' % release) - # git checkout - os.system('git checkout %s' % release) + # git checkout + os.system('git checkout %s' % release) - # install requirements as combine user - os.system('%s/pip install -r requirements.txt' % (self.PYTHON_PATH)) + # install requirements as combine user + os.system('%s/pip install -r requirements.txt' % (self.PYTHON_PATH)) - # collect django static - os.system('%s/python manage.py collectstatic --noinput' % (self.PYTHON_PATH)) + # collect django static + os.system('%s/python manage.py collectstatic --noinput' % (self.PYTHON_PATH)) - # restart gunicorn - self._restart_gunicorn() + # restart gunicorn + self._restart_gunicorn() - # restart livy and livy session - self._restart_livy() + # restart livy and livy session + self._restart_livy() - # restart celery background tasks - self._restart_celery() + # restart celery background tasks + self._restart_celery() - # run update code snippets - vuh = VersionUpdateHelper() - vuh.run_update_snippets() + # run update code snippets + vuh = VersionUpdateHelper() + vuh.run_update_snippets() - # return - self.stdout.write(self.style.SUCCESS('Update complete.')) + # return + self.stdout.write(self.style.SUCCESS('Update complete.')) - # docker return - else: - self.stdout.write(self.style.ERROR('Update script does not currently support Docker deployment.')) + # docker return + else: + self.stdout.write(self.style.ERROR('Update script does not currently support Docker deployment.')) + def _restart_gunicorn(self): - def _restart_gunicorn(self): + # get supervisor handle + sp = SupervisorRPCClient() + # fire action + results = sp.restart_process('gunicorn') + logger.debug(results) - # get supervisor handle - sp = SupervisorRPCClient() - # fire action - results = sp.restart_process('gunicorn') - logger.debug(results) + def _restart_livy(self): + # get supervisor handle + sp = SupervisorRPCClient() + # fire action + results = sp.restart_process('livy') + logger.debug(results) - def _restart_livy(self): + # sleep + time.sleep(10) - # get supervisor handle - sp = SupervisorRPCClient() - # fire action - results = sp.restart_process('livy') - logger.debug(results) + # get active livy sessions - restart or start + active_ls = LivySession.get_active_session() + if not active_ls: + logger.debug('active livy session not found, starting') + livy_session = LivySession() + livy_session.start_session() + else: + logger.debug('single, active session found, and restart flag passed, restarting') + new_ls = active_ls.restart_session() - # sleep - time.sleep(10) + def _restart_celery(self): - # get active livy sessions - restart or start - active_ls = LivySession.get_active_session() - if not active_ls: - logger.debug('active livy session not found, starting') - livy_session = LivySession() - livy_session.start_session() - else: - logger.debug('single, active session found, and restart flag passed, restarting') - new_ls = active_ls.restart_session() + # get supervisor handle + sp = SupervisorRPCClient() + # fire action + results = sp.restart_process('celery') + 
logger.debug(results) + def run_update_snippet(self, args, options): - def _restart_celery(self): + ''' + Method to run update snippet if passed + ''' - # get supervisor handle - sp = SupervisorRPCClient() - # fire action - results = sp.restart_process('celery') - logger.debug(results) - - - def run_update_snippet(self, args, options): - - ''' - Method to run update snippet if passed - ''' - - # init VersionUpdateHelper instance - vuh = VersionUpdateHelper() - - # get snippet - snippet = getattr(vuh, options.get('run_update_snippet'), None) - if snippet != None: - snippet() - else: - logger.debug('Update snippet "%s" could not be found' % options.get('run_update_snippet', None)) + # init VersionUpdateHelper instance + vuh = VersionUpdateHelper() + # get snippet + snippet = getattr(vuh, options.get('run_update_snippet'), None) + if snippet != None: + snippet() + else: + logger.debug('Update snippet "%s" could not be found' % options.get('run_update_snippet', None)) class VersionUpdateHelper(object): + ''' + Class to manage actions specific to version-to-version updates + ''' - ''' - Class to manage actions specific to version-to-version updates - ''' - - # python path - PYTHON_PATH = sys.executable.rstrip('python').rstrip('/') - - - def __init__(self): - - # registered, ordered list of snippets - self.registered_snippets = [ - self.v0_4__set_job_baseline_combine_version, - self.v0_4__update_transform_job_details, - self.v0_4__set_job_current_combine_version, - self.v0_7_1__fix_redis_version_mismatches - ] - - - def run_update_snippets(self): - - ''' - Method to loop through update snippets and fire - ''' - - for snippet in self.registered_snippets: - try: - snippet() - except Exception as e: - logger.debug('Could not run udpate snippet: %s' % snippet.__name__) - logger.debug(str(e)) - - - def v0_4__set_job_baseline_combine_version(self): - - ''' - Method to set combine_version as v0.1 in job_details for all lacking version - ''' - - logger.debug('v0_4__set_job_baseline_combine_version: setting Job combine_version to v0.1 if not set') - - # get Transform Jobs - jobs = Job.objects.all() - - # loop through jobs - for job in jobs: - - # check for combine_version key - if not job.job_details_dict.get('combine_version', False): - - logger.debug('stamping v0.1 combine_version to Job: %s' % (job)) - - # update job_details - job.update_job_details({'combine_version':'v0.1'}) - - - def v0_4__set_job_current_combine_version(self): + # python path + PYTHON_PATH = sys.executable.rstrip('python').rstrip('/') - ''' - Method to set combine_version as current Combine version in job_details - ''' + def __init__(self): - logger.debug('v0_4__set_job_current_combine_version: checking and setting Job combine_version to %s' % (settings.COMBINE_VERSION)) + # registered, ordered list of snippets + self.registered_snippets = [ + self.v0_4__set_job_baseline_combine_version, + self.v0_4__update_transform_job_details, + self.v0_4__set_job_current_combine_version, + self.v0_7_1__fix_redis_version_mismatches + ] - # get Transform Jobs - jobs = Job.objects.all() + def run_update_snippets(self): - # loop through jobs - for job in jobs: + ''' + Method to loop through update snippets and fire + ''' - # compare and stamp - if version.parse(job.job_details_dict['combine_version']) < version.parse(settings.COMBINE_VERSION): + for snippet in self.registered_snippets: + try: + snippet() + except Exception as e: + logger.debug('Could not run udpate snippet: %s' % snippet.__name__) + logger.debug(str(e)) - logger.debug('stamping %s 
combine_version to Job: %s' % (settings.COMBINE_VERSION, job)) + def v0_4__set_job_baseline_combine_version(self): - # update job_details - job.update_job_details({'combine_version':settings.COMBINE_VERSION}) + ''' + Method to set combine_version as v0.1 in job_details for all lacking version + ''' + logger.debug('v0_4__set_job_baseline_combine_version: setting Job combine_version to v0.1 if not set') - def v0_4__update_transform_job_details(self): + # get Transform Jobs + jobs = Job.objects.all() - ''' - Method to update job_details for Transform Jobs if from_v < v0.4 or None - ''' + # loop through jobs + for job in jobs: - logger.debug('v0_4__update_transform_job_details: updating job details for pre v0.4 Transform Jobs') + # check for combine_version key + if not job.job_details_dict.get('combine_version', False): + logger.debug('stamping v0.1 combine_version to Job: %s' % (job)) - # get Transform Jobs - trans_jobs = Job.objects.filter(job_type='TransformJob') + # update job_details + job.update_job_details({'combine_version': 'v0.1'}) - # loop through and check for single Transformation Scenario - for job in trans_jobs: + def v0_4__set_job_current_combine_version(self): - # check version - if version.parse(job.job_details_dict['combine_version']) < version.parse('v0.4'): + ''' + Method to set combine_version as current Combine version in job_details + ''' - logger.debug('Transform Job "%s" is Combine version %s, checking if needs updating' % (job, job.job_details_dict['combine_version'])) + logger.debug('v0_4__set_job_current_combine_version: checking and setting Job combine_version to %s' % ( + settings.COMBINE_VERSION)) - # check for 'transformation' key in job_details - if job.job_details_dict.get('transformation', False): + # get Transform Jobs + jobs = Job.objects.all() - # get transform details - trans_details = job.job_details_dict.get('transformation') + # loop through jobs + for job in jobs: - # check for 'id' key at this level, indicating < v0.4 - if 'id' in trans_details.keys(): + # compare and stamp + if version.parse(job.job_details_dict['combine_version']) < version.parse(settings.COMBINE_VERSION): + logger.debug('stamping %s combine_version to Job: %s' % (settings.COMBINE_VERSION, job)) - logger.debug('Transform Job "%s" requires job details updating, performing' % job) + # update job_details + job.update_job_details({'combine_version': settings.COMBINE_VERSION}) - # create dictionary - trans_dict = { - 'scenarios':[ - { - 'id':trans_details['id'], - 'name':trans_details['name'], - 'type':trans_details['type'], - 'type_human':trans_details['type'], - 'index':0 - } - ], - 'scenarios_json':'[{"index":0,"trans_id":%s}]' % trans_details['id'] - } + def v0_4__update_transform_job_details(self): - # update job_details - job.update_job_details({'transformation':trans_dict}) + ''' + Method to update job_details for Transform Jobs if from_v < v0.4 or None + ''' + logger.debug('v0_4__update_transform_job_details: updating job details for pre v0.4 Transform Jobs') - def v0_7_1__fix_redis_version_mismatches(self): + # get Transform Jobs + trans_jobs = Job.objects.filter(job_type='TransformJob') - ''' - Method to fix any redis version mismatches - ''' + # loop through and check for single Transformation Scenario + for job in trans_jobs: - if version.parse(settings.COMBINE_VERSION) == version.parse('v0.7'): + # check version + if version.parse(job.job_details_dict['combine_version']) < version.parse('v0.4'): - logger.debug('v0_7_1__fix_redis_version_mismatches: fixing redis 
versioning') + logger.debug('Transform Job "%s" is Combine version %s, checking if needs updating' % ( + job, job.job_details_dict['combine_version'])) - # ensure redis version - os.system('%s/pip uninstall redis celery -y' % (self.PYTHON_PATH)) - os.system('%s/pip install redis==3.2.1 celery==4.3.0' % (self.PYTHON_PATH)) + # check for 'transformation' key in job_details + if job.job_details_dict.get('transformation', False): - # restart celery background tasks - # get supervisor handle - sp = SupervisorRPCClient() - # fire action - results = sp.restart_process('celery') - logger.debug(results) + # get transform details + trans_details = job.job_details_dict.get('transformation') + # check for 'id' key at this level, indicating < v0.4 + if 'id' in trans_details.keys(): + logger.debug('Transform Job "%s" requires job details updating, performing' % job) + # create dictionary + trans_dict = { + 'scenarios': [ + { + 'id': trans_details['id'], + 'name': trans_details['name'], + 'type': trans_details['type'], + 'type_human': trans_details['type'], + 'index': 0 + } + ], + 'scenarios_json': '[{"index":0,"trans_id":%s}]' % trans_details['id'] + } + # update job_details + job.update_job_details({'transformation': trans_dict}) + def v0_7_1__fix_redis_version_mismatches(self): + ''' + Method to fix any redis version mismatches + ''' + if version.parse(settings.COMBINE_VERSION) == version.parse('v0.7'): + logger.debug('v0_7_1__fix_redis_version_mismatches: fixing redis versioning') + # ensure redis version + os.system('%s/pip uninstall redis celery -y' % (self.PYTHON_PATH)) + os.system('%s/pip install redis==3.2.1 celery==4.3.0' % (self.PYTHON_PATH)) + # restart celery background tasks + # get supervisor handle + sp = SupervisorRPCClient() + # fire action + results = sp.restart_process('celery') + logger.debug(results) diff --git a/core/mongo.py b/core/mongo.py index 0bdfc217..79f9b808 100644 --- a/core/mongo.py +++ b/core/mongo.py @@ -5,8 +5,10 @@ # import mongoengine and connect import mongoengine + mongoengine.connect('combine', host=settings.MONGO_HOST, port=27017) # import pymongo and establish client import pymongo -mc_handle = pymongo.MongoClient(host=settings.MONGO_HOST, port=27017) \ No newline at end of file + +mc_handle = pymongo.MongoClient(host=settings.MONGO_HOST, port=27017) diff --git a/core/oai.py b/core/oai.py index 66dac97b..b7521651 100644 --- a/core/oai.py +++ b/core/oai.py @@ -18,618 +18,592 @@ # Get an instance of a logger logger = logging.getLogger(__name__) - # attempt to load metadataPrefix map from localSettings, otherwise provide default if hasattr(settings, 'METADATA_PREFIXES'): - metadataPrefix_hash = settings.METADATA_PREFIXES + metadataPrefix_hash = settings.METADATA_PREFIXES else: - metadataPrefix_hash = { - 'mods':{ - 'schema':'http://www.loc.gov/standards/mods/v3/mods.xsd', - 'namespace':'http://www.loc.gov/mods/v3' - }, - 'oai_dc':{ - 'schema':'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', - 'namespace':'http://purl.org/dc/elements/1.1/' - }, - 'dc':{ - 'schema':'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', - 'namespace':'http://purl.org/dc/elements/1.1/' - }, - } - + metadataPrefix_hash = { + 'mods': { + 'schema': 'http://www.loc.gov/standards/mods/v3/mods.xsd', + 'namespace': 'http://www.loc.gov/mods/v3' + }, + 'oai_dc': { + 'schema': 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', + 'namespace': 'http://purl.org/dc/elements/1.1/' + }, + 'dc': { + 'schema': 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', + 'namespace': 'http://purl.org/dc/elements/1.1/' 
+ }, + } class OAIProvider(object): + ''' + Class for scaffolding and building responses to OAI queries + + NOTE: Because the OAI-PMH protocol shares verbs with reserved words in Python (e.g. "set", or "from"), + easier to keep the HTTP request args to work with as a dictionary, and maintain the original OAI-PMH vocab. + ''' + + def __init__(self, args, subset=None): + + # set subset + self.subset = subset + + # read args, route verb to verb handler + self.verb_routes = { + 'GetRecord': self._GetRecord, + 'Identify': self._Identify, + 'ListIdentifiers': self._ListIdentifiers, + 'ListMetadataFormats': self._ListMetadataFormats, + 'ListRecords': self._ListRecords, + 'ListSets': self._ListSets + } + + self.args = args.copy() + self.request_timestamp = datetime.datetime.now() + self.request_timestamp_string = self.request_timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') + self.record_nodes = [] + + # published dataframe slice parameters + self.start = 0 + self.chunk_size = settings.OAI_RESPONSE_SIZE + if 'set' in self.args.keys() and self.args['set'] != '': + self.publish_set_id = self.args['set'] + else: + self.publish_set_id = None + + # get instance of Published model + self.published = models.PublishedRecords(subset=self.subset) + + # begin scaffolding + self.scaffold() + + # generate XML root node with OAI-PMH scaffolding + def scaffold(self): + + ''' + Scaffold XML, OAI response + + Args: + None + + Returns: + None + - sets multiple attributes for response building + ''' + + # build root node, nsmap, and attributes + NSMAP = { + None: 'http://www.openarchives.org/OAI/2.0/' + } + self.root_node = etree.Element('OAI-PMH', nsmap=NSMAP) + self.root_node.set( + '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation', + 'http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd' + ) + + # set responseDate node + self.responseDate_node = etree.Element('responseDate') + self.responseDate_node.text = self.request_timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') + self.root_node.append(self.responseDate_node) + + # set request node + self.request_node = etree.Element('request') + + # set verb + try: + self.request_node.attrib['verb'] = self.args['verb'] + except: + self.args['verb'] = 'NULL' + self.request_node.attrib['verb'] = 'NULL' + + # capture set if present + if 'set' in self.args.keys(): + self.request_node.attrib['set'] = self.args['set'] + + # metadataPrefix + if 'metadataPrefix' in self.args.keys(): + self.request_node.attrib['metadataPrefix'] = self.args['metadataPrefix'] + + self.request_node.text = 'http://%s%s' % (settings.APP_HOST, reverse('oai')) + self.root_node.append(self.request_node) + + # set verb node + self.verb_node = etree.Element(self.args['verb']) + self.root_node.append(self.verb_node) + + def retrieve_records(self, include_metadata=False): + + ''' + Retrieve record(s) from DB for response + + Args: + include_metadata (bool): If False, return only identifiers, if True, include record document as well + + Returns: + None + - adds record(s) to self.record_nodes + ''' + + stime = time.time() + logger.debug("retrieving records for verb %s" % (self.args['verb'])) + + # get records + records = self.published.records + + # if set present, filter by this set + if self.publish_set_id: + logger.debug('applying publish_set_id filter: %s' % self.publish_set_id) + records = records.filter(publish_set_id=self.publish_set_id) + + # loop through rows, limited by current OAI transaction start / chunk + + # count records before slice + records_count = records.count() + + 
# get slice for iteration + records = records[self.start:(self.start + self.chunk_size)] + for record in records: + + record = OAIRecord( + args=self.args, + record_id=record.record_id, + publish_set_id=record.publish_set_id, + document=record.document, + timestamp=self.request_timestamp_string + ) + + # include full metadata in record + if include_metadata: + record.include_metadata() + + # append to record_nodes + self.record_nodes.append(record.oai_record_node) + + # add to verb node + for oai_record_node in self.record_nodes: + self.verb_node.append(oai_record_node) + + # finally, set resumption token + self.set_resumption_token(records, completeListSize=records_count) + + # report + record_nodes_num = len(self.record_nodes) + logger.debug("%s record(s) returned in %s" % (record_nodes_num, (float(time.time()) - float(stime)))) + + def set_resumption_token(self, records, completeListSize=None): + + ''' + Set resumption tokens in DB under OAITransaction model + + Args: + completeListSize (int): total number of records based on passed parameters + + Returns: + None + - sets attributes related to resumption tokens + ''' + + # set resumption token + if self.start + self.chunk_size < completeListSize: + # set token and slice parameters to DB + token = str(uuid.uuid4()) + logger.debug('setting resumption token: %s' % token) + oai_trans = models.OAITransaction( + verb=self.args['verb'], + start=self.start + self.chunk_size, + chunk_size=self.chunk_size, + publish_set_id=self.publish_set_id, + token=token, + args=json.dumps(self.args) + ) + oai_trans.save() - ''' - Class for scaffolding and building responses to OAI queries - - NOTE: Because the OAI-PMH protocol shares verbs with reserved words in Python (e.g. "set", or "from"), - easier to keep the HTTP request args to work with as a dictionary, and maintain the original OAI-PMH vocab. 
- ''' - - def __init__(self, args, subset=None): - - # set subset - self.subset = subset - - # read args, route verb to verb handler - self.verb_routes = { - 'GetRecord':self._GetRecord, - 'Identify':self._Identify, - 'ListIdentifiers':self._ListIdentifiers, - 'ListMetadataFormats':self._ListMetadataFormats, - 'ListRecords':self._ListRecords, - 'ListSets':self._ListSets - } - - self.args = args.copy() - self.request_timestamp = datetime.datetime.now() - self.request_timestamp_string = self.request_timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') - self.record_nodes = [] - - # published dataframe slice parameters - self.start = 0 - self.chunk_size = settings.OAI_RESPONSE_SIZE - if 'set' in self.args.keys() and self.args['set'] != '': - self.publish_set_id = self.args['set'] - else: - self.publish_set_id = None - - # get instance of Published model - self.published = models.PublishedRecords(subset=self.subset) - - # begin scaffolding - self.scaffold() - - - # generate XML root node with OAI-PMH scaffolding - def scaffold(self): - - ''' - Scaffold XML, OAI response - - Args: - None - - Returns: - None - - sets multiple attributes for response building - ''' - - # build root node, nsmap, and attributes - NSMAP = { - None:'http://www.openarchives.org/OAI/2.0/' - } - self.root_node = etree.Element('OAI-PMH', nsmap=NSMAP) - self.root_node.set( - '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation', - 'http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd' - ) - - # set responseDate node - self.responseDate_node = etree.Element('responseDate') - self.responseDate_node.text = self.request_timestamp.strftime('%Y-%m-%dT%H:%M:%SZ') - self.root_node.append(self.responseDate_node) - - # set request node - self.request_node = etree.Element('request') - - # set verb - try: - self.request_node.attrib['verb'] = self.args['verb'] - except: - self.args['verb'] = 'NULL' - self.request_node.attrib['verb'] = 'NULL' - - # capture set if present - if 'set' in self.args.keys(): - self.request_node.attrib['set'] = self.args['set'] - - # metadataPrefix - if 'metadataPrefix' in self.args.keys(): - self.request_node.attrib['metadataPrefix'] = self.args['metadataPrefix'] - - self.request_node.text = 'http://%s%s' % (settings.APP_HOST, reverse('oai')) - self.root_node.append(self.request_node) - - # set verb node - self.verb_node = etree.Element(self.args['verb']) - self.root_node.append(self.verb_node) - - - def retrieve_records(self, include_metadata=False): - - ''' - Retrieve record(s) from DB for response - - Args: - include_metadata (bool): If False, return only identifiers, if True, include record document as well - - Returns: - None - - adds record(s) to self.record_nodes - ''' - - stime = time.time() - logger.debug("retrieving records for verb %s" % (self.args['verb'])) - - # get records - records = self.published.records - - # if set present, filter by this set - if self.publish_set_id: - logger.debug('applying publish_set_id filter: %s' % self.publish_set_id) - records = records.filter(publish_set_id = self.publish_set_id) - - # loop through rows, limited by current OAI transaction start / chunk - - # count records before slice - records_count = records.count() - - # get slice for iteration - records = records[self.start:(self.start+self.chunk_size)] - for record in records: - - record = OAIRecord( - args=self.args, - record_id=record.record_id, - publish_set_id=record.publish_set_id, - document=record.document, - timestamp=self.request_timestamp_string - ) - - # include full 
metadata in record - if include_metadata: - record.include_metadata() - - # append to record_nodes - self.record_nodes.append(record.oai_record_node) - - # add to verb node - for oai_record_node in self.record_nodes: - self.verb_node.append(oai_record_node) - - # finally, set resumption token - self.set_resumption_token(records, completeListSize=records_count) - - # report - record_nodes_num = len(self.record_nodes) - logger.debug("%s record(s) returned in %s" % (record_nodes_num, (float(time.time()) - float(stime)))) - - - def set_resumption_token(self, records, completeListSize=None): - - ''' - Set resumption tokens in DB under OAITransaction model - - Args: - completeListSize (int): total number of records based on passed parameters + # set resumption token node and attributes + self.resumptionToken_node = etree.Element('resumptionToken') + self.resumptionToken_node.attrib['expirationDate'] = (self.request_timestamp + datetime.timedelta(0, 3600)) \ + .strftime('%Y-%m-%dT%H:%M:%SZ') + self.resumptionToken_node.attrib['completeListSize'] = str(completeListSize) + self.resumptionToken_node.attrib['cursor'] = str(self.start) + self.resumptionToken_node.text = token + self.verb_node.append(self.resumptionToken_node) - Returns: - None - - sets attributes related to resumption tokens - ''' + # convenience function to run all internal methods + def generate_response(self): - # set resumption token - if self.start + self.chunk_size < completeListSize: + ''' + Returns OAI response as XML - # set token and slice parameters to DB - token = str(uuid.uuid4()) - logger.debug('setting resumption token: %s' % token) - oai_trans = models.OAITransaction( - verb = self.args['verb'], - start = self.start + self.chunk_size, - chunk_size = self.chunk_size, - publish_set_id = self.publish_set_id, - token = token, - args = json.dumps(self.args) - ) - oai_trans.save() + Args: + None - # set resumption token node and attributes - self.resumptionToken_node = etree.Element('resumptionToken') - self.resumptionToken_node.attrib['expirationDate'] = (self.request_timestamp + datetime.timedelta(0,3600))\ - .strftime('%Y-%m-%dT%H:%M:%SZ') - self.resumptionToken_node.attrib['completeListSize'] = str(completeListSize) - self.resumptionToken_node.attrib['cursor'] = str(self.start) - self.resumptionToken_node.text = token - self.verb_node.append(self.resumptionToken_node) + Returns: + (str): XML response + ''' + + # check verb + if self.args['verb'] not in self.verb_routes.keys(): + return self.raise_error( + 'badVerb', + 'The verb %s is not allowed, must be from: %s' % (self.args['verb'], str(self.verb_routes.keys())) + ) + + # check for resumption token + if 'resumptionToken' in self.args.keys(): + + # retrieve token params and alter args and search_params + ot_query = models.OAITransaction.objects.filter(token=self.args['resumptionToken']) + if ot_query.count() == 1: + ot = ot_query.first() + + # set args and start and chunk_size + self.start = ot.start + self.chunk_size = ot.chunk_size + self.publish_set_id = ot.publish_set_id + + logger.debug('following resumption token, altering dataframe slice params:') + logger.debug([self.start, self.chunk_size, self.publish_set_id]) + + # raise error + else: + return self.raise_error('badResumptionToken', + 'The resumptionToken %s is not found' % self.args['resumptionToken']) + + # fire verb reponse building + self.verb_routes[self.args['verb']]() + return self.serialize() + def raise_error(self, error_code, error_msg): - # convenience function to run all internal methods - def 
generate_response(self): + ''' + Returns error as XML, OAI response + + Args: + error_code (str): OAI-PMH error codes (e.g. badVerb, generic, etc.) + error_msg (str): details about error - ''' - Returns OAI response as XML + Returns: + (str): XML response + ''' + + # remove verb node + try: + self.root_node.remove(self.verb_node) + except: + logger.debug('verb_node not found') + + # create error node and append + error_node = etree.SubElement(self.root_node, 'error') + error_node.attrib['code'] = error_code + error_node.text = error_msg + + # serialize and return + return self.serialize() + + # serialize record nodes as XML response + def serialize(self): + + ''' + Serialize all nodes as XML for returning + + Args: + None + + Returns: + (str): XML response + ''' - Args: - None + return etree.tostring(self.root_node) - Returns: - (str): XML response - ''' + # GetRecord + def _GetRecord(self): - # check verb - if self.args['verb'] not in self.verb_routes.keys(): - return self.raise_error( - 'badVerb', - 'The verb %s is not allowed, must be from: %s' % (self.args['verb'],str(self.verb_routes.keys())) - ) + ''' + OAI-PMH verb: GetRecord + Retrieve a single record based on record id, return - # check for resumption token - if 'resumptionToken' in self.args.keys(): + Args: + None - # retrieve token params and alter args and search_params - ot_query = models.OAITransaction.objects.filter(token=self.args['resumptionToken']) - if ot_query.count() == 1: - ot = ot_query.first() + Returns: + None + sets single record node to self.record_nodes + ''' - # set args and start and chunk_size - self.start = ot.start - self.chunk_size = ot.chunk_size - self.publish_set_id = ot.publish_set_id + stime = time.time() + logger.debug("retrieving record: %s" % (self.args['identifier'])) - logger.debug('following resumption token, altering dataframe slice params:') - logger.debug([self.start, self.chunk_size, self.publish_set_id]) + # get single row + single_record = self.published.get_record(self.args['identifier']) - # raise error - else: - return self.raise_error('badResumptionToken', 'The resumptionToken %s is not found' % self.args['resumptionToken']) + # if single record found + if single_record: - # fire verb reponse building - self.verb_routes[self.args['verb']]() - return self.serialize() + # open as OAIRecord + record = OAIRecord( + args=self.args, + record_id=single_record.record_id, + document=single_record.document, + timestamp=self.request_timestamp_string + ) + # include metadata + record.include_metadata() - def raise_error(self, error_code, error_msg): + # append to record_nodes + self.record_nodes.append(record.oai_record_node) - ''' - Returns error as XML, OAI response + # add to verb node + for oai_record_node in self.record_nodes: + self.verb_node.append(oai_record_node) - Args: - error_code (str): OAI-PMH error codes (e.g. badVerb, generic, etc.) 
- error_msg (str): details about error + else: + logger.debug('record not found for id: %s, not appending node' % self.args['identifier']) - Returns: - (str): XML response - ''' + # report + etime = time.time() + logger.debug("%s record(s) returned in %sms" % (len(self.record_nodes), (float(etime) - float(stime)) * 1000)) - # remove verb node - try: - self.root_node.remove(self.verb_node) - except: - logger.debug('verb_node not found') + # Identify + def _Identify(self): - # create error node and append - error_node = etree.SubElement(self.root_node, 'error') - error_node.attrib['code'] = error_code - error_node.text = error_msg + ''' + OAI-PMH verb: Identify + Provide information about Repository / OAI Server - # serialize and return - return self.serialize() + Args: + None + Returns: + None + sets description node text + ''' - # serialize record nodes as XML response - def serialize(self): + # init OAIRecord + logger.debug('generating identify node') - ''' - Serialize all nodes as XML for returning + # write Identify node + description_node = etree.Element('description') + desc_text = 'Combine, integrated OAI-PMH.' + if self.subset != None: + desc_text += ' Note: You are receiving a published subset of this Combine instance named: %s.' % self.subset + description_node.text = desc_text + self.verb_node.append(description_node) - Args: - None + # ListIdentifiers + def _ListIdentifiers(self): - Returns: - (str): XML response - ''' + ''' + OAI-PMH verb: ListIdentifiers + Lists identifiers - return etree.tostring(self.root_node) + Args: + None + Returns: + None + sets multiple record nodes to self.record.nodes + ''' - # GetRecord - def _GetRecord(self): + self.retrieve_records() - ''' - OAI-PMH verb: GetRecord - Retrieve a single record based on record id, return + # ListMetadataFormats + def _ListMetadataFormats(self): - Args: - None + ''' + # OAI-PMH verb: ListMetadataFormats + # List all metadataformats, or optionally, available metadataformats for + # one item based on published metadata formats - Returns: - None - sets single record node to self.record_nodes - ''' + NOTE: Currently, Combine does not support Metadata Formats for the outgoing OAI-PMH server. + All published Records are undoubtedly of a metadata format, but this is opaque to Combine. 
This + may change in the future, but for now, a shim is in place to return valid OAI-PMH responses for + the verb ListMetadataForamts + ''' - stime = time.time() - logger.debug("retrieving record: %s" % (self.args['identifier'])) + # generic metadata prefix shim + generic_metadata_hash = { + 'prefix': 'generic', + 'schema': 'http://generic.org/schema', + 'namespace': 'gnc' + } - # get single row - single_record = self.published.get_record(self.args['identifier']) + # identifier provided + if 'identifier' in self.args.keys(): - # if single record found - if single_record: + try: + logging.debug("identifier provided for ListMetadataFormats, confirming that identifier exists...") + single_record = self.published.get_record(self.args['identifier']) - # open as OAIRecord - record = OAIRecord( - args=self.args, - record_id=single_record.record_id, - document=single_record.document, - timestamp=self.request_timestamp_string - ) + if single_record != False: - # include metadata - record.include_metadata() + mf_node = etree.Element('metadataFormat') - # append to record_nodes - self.record_nodes.append(record.oai_record_node) + # write metadataPrefix node + prefix = etree.SubElement(mf_node, 'metadataPrefix') + prefix.text = generic_metadata_hash['prefix'] - # add to verb node - for oai_record_node in self.record_nodes: - self.verb_node.append(oai_record_node) + # write schema node + schema = etree.SubElement(mf_node, 'schema') + schema.text = generic_metadata_hash['schema'] - else: - logger.debug('record not found for id: %s, not appending node' % self.args['identifier']) + # write schema node + namespace = etree.SubElement(mf_node, 'metadataNamespace') + namespace.text = generic_metadata_hash['namespace'] - # report - etime = time.time() - logger.debug("%s record(s) returned in %sms" % (len(self.record_nodes), (float(etime) - float(stime)) * 1000)) + # append to verb_node and return + self.verb_node.append(mf_node) + else: + raise Exception('record could not be located') + except: + return self.raise_error('idDoesNotExist', 'The identifier %s is not found.' % self.args['identifier']) - # Identify - def _Identify(self): + # no identifier, return all available metadataPrefixes + else: - ''' - OAI-PMH verb: Identify - Provide information about Repository / OAI Server + mf_node = etree.Element('metadataFormat') - Args: - None + # write metadataPrefix node + prefix = etree.SubElement(mf_node, 'metadataPrefix') + prefix.text = generic_metadata_hash['prefix'] - Returns: - None - sets description node text - ''' + # write schema node + schema = etree.SubElement(mf_node, 'schema') + schema.text = generic_metadata_hash['schema'] - # init OAIRecord - logger.debug('generating identify node') + # write schema node + namespace = etree.SubElement(mf_node, 'metadataNamespace') + namespace.text = generic_metadata_hash['namespace'] - # write Identify node - description_node = etree.Element('description') - desc_text = 'Combine, integrated OAI-PMH.' - if self.subset != None: - desc_text += ' Note: You are receiving a published subset of this Combine instance named: %s.' 
% self.subset - description_node.text = desc_text - self.verb_node.append(description_node) + # append to verb_node and return + self.verb_node.append(mf_node) + # ListRecords + def _ListRecords(self): - # ListIdentifiers - def _ListIdentifiers(self): + ''' + OAI-PMH verb: ListRecords + Lists records; similar to ListIdentifiers, but includes metadata from record.document - ''' - OAI-PMH verb: ListIdentifiers - Lists identifiers + Args: + None - Args: - None + Returns: + None + sets multiple record nodes to self.record.nodes + ''' - Returns: - None - sets multiple record nodes to self.record.nodes - ''' + self.retrieve_records(include_metadata=True) - self.retrieve_records() + # ListSets + def _ListSets(self): + ''' + OAI-PMH verb: ListSets + Lists available sets. Sets are derived from the publish_set_id from a published Job - # ListMetadataFormats - def _ListMetadataFormats(self): + Args: + None - ''' - # OAI-PMH verb: ListMetadataFormats - # List all metadataformats, or optionally, available metadataformats for - # one item based on published metadata formats - - NOTE: Currently, Combine does not support Metadata Formats for the outgoing OAI-PMH server. - All published Records are undoubtedly of a metadata format, but this is opaque to Combine. This - may change in the future, but for now, a shim is in place to return valid OAI-PMH responses for - the verb ListMetadataForamts - ''' - - # generic metadata prefix shim - generic_metadata_hash = { - 'prefix':'generic', - 'schema':'http://generic.org/schema', - 'namespace':'gnc' - } - - - # identifier provided - if 'identifier' in self.args.keys(): - - try: - logging.debug("identifier provided for ListMetadataFormats, confirming that identifier exists...") - single_record = self.published.get_record(self.args['identifier']) - - if single_record != False: - - mf_node = etree.Element('metadataFormat') - - # write metadataPrefix node - prefix = etree.SubElement(mf_node,'metadataPrefix') - prefix.text = generic_metadata_hash['prefix'] - - # write schema node - schema = etree.SubElement(mf_node,'schema') - schema.text = generic_metadata_hash['schema'] - - # write schema node - namespace = etree.SubElement(mf_node,'metadataNamespace') - namespace.text = generic_metadata_hash['namespace'] - - # append to verb_node and return - self.verb_node.append(mf_node) - - else: - raise Exception('record could not be located') - except: - return self.raise_error('idDoesNotExist','The identifier %s is not found.' % self.args['identifier']) - - # no identifier, return all available metadataPrefixes - else: - - mf_node = etree.Element('metadataFormat') - - # write metadataPrefix node - prefix = etree.SubElement(mf_node,'metadataPrefix') - prefix.text = generic_metadata_hash['prefix'] - - # write schema node - schema = etree.SubElement(mf_node,'schema') - schema.text = generic_metadata_hash['schema'] - - # write schema node - namespace = etree.SubElement(mf_node,'metadataNamespace') - namespace.text = generic_metadata_hash['namespace'] - - # append to verb_node and return - self.verb_node.append(mf_node) - - - # ListRecords - def _ListRecords(self): - - ''' - OAI-PMH verb: ListRecords - Lists records; similar to ListIdentifiers, but includes metadata from record.document - - Args: - None - - Returns: - None - sets multiple record nodes to self.record.nodes - ''' - - self.retrieve_records(include_metadata=True) - - - # ListSets - def _ListSets(self): - - ''' - OAI-PMH verb: ListSets - Lists available sets. 
Sets are derived from the publish_set_id from a published Job - - Args: - None - - Returns: - None - sets multiple set nodes - ''' - - # generate response - for publish_set_id in self.published.sets: - set_node = etree.Element('set') - setSpec = etree.SubElement(set_node,'setSpec') - setSpec.text = publish_set_id - setName = etree.SubElement(set_node,'setName') - setName.text = publish_set_id - self.verb_node.append(set_node) + Returns: + None + sets multiple set nodes + ''' + # generate response + for publish_set_id in self.published.sets: + set_node = etree.Element('set') + setSpec = etree.SubElement(set_node, 'setSpec') + setSpec.text = publish_set_id + setName = etree.SubElement(set_node, 'setName') + setName.text = publish_set_id + self.verb_node.append(set_node) class OAIRecord(object): + ''' + Initialize OAIRecord with pid and args + ''' - ''' - Initialize OAIRecord with pid and args - ''' - - def __init__(self, args=None, record_id=None, publish_set_id=None, document=None, timestamp=None): - - self.args = args - self.record_id = record_id - self.publish_set_id = publish_set_id - self.document = document - self.timestamp = timestamp - self.oai_record_node = None - - # build record node - self.init_record_node() - - - def _construct_oai_identifier(self): - - ''' - build OAI identifier - ''' - - # if publish set id include - if self.publish_set_id: - return '%s:%s:%s' % (settings.COMBINE_OAI_IDENTIFIER, self.publish_set_id, self.record_id) - - # else, without - else: - return '%s:%s' % (settings.COMBINE_OAI_IDENTIFIER, self.record_id) - + def __init__(self, args=None, record_id=None, publish_set_id=None, document=None, timestamp=None): - def init_record_node(self): + self.args = args + self.record_id = record_id + self.publish_set_id = publish_set_id + self.document = document + self.timestamp = timestamp + self.oai_record_node = None - ''' - Initialize and scaffold record node + # build record node + self.init_record_node() - Args: - None + def _construct_oai_identifier(self): - Returns: - None - sets self.oai_record_node - ''' + ''' + build OAI identifier + ''' - # init node - self.oai_record_node = etree.Element('record') + # if publish set id include + if self.publish_set_id: + return '%s:%s:%s' % (settings.COMBINE_OAI_IDENTIFIER, self.publish_set_id, self.record_id) - # header node - header_node = etree.Element('header') + # else, without + else: + return '%s:%s' % (settings.COMBINE_OAI_IDENTIFIER, self.record_id) - # identifier - identifier_node = etree.Element('identifier') - identifier_node.text = self._construct_oai_identifier() - header_node.append(identifier_node) + def init_record_node(self): - # datestamp - datestamp_node = etree.Element('datestamp') - datestamp_node.text = self.timestamp - header_node.append(datestamp_node) + ''' + Initialize and scaffold record node - if 'set' in self.args.keys(): - setSpec_node = etree.Element('setSpec') - setSpec_node.text = self.args['set'] - header_node.append(setSpec_node) + Args: + None - self.oai_record_node.append(header_node) + Returns: + None + sets self.oai_record_node + ''' + # init node + self.oai_record_node = etree.Element('record') - def include_metadata(self): + # header node + header_node = etree.Element('header') - ''' - Method to retrieve metadata from record.document, and include in XML response (for GetRecord and ListRecords) + # identifier + identifier_node = etree.Element('identifier') + identifier_node.text = self._construct_oai_identifier() + header_node.append(identifier_node) - Args: - None + # datestamp + 
datestamp_node = etree.Element('datestamp') + datestamp_node.text = self.timestamp + header_node.append(datestamp_node) - Returns: - None - sets self.oai_record_node - ''' + if 'set' in self.args.keys(): + setSpec_node = etree.Element('setSpec') + setSpec_node.text = self.args['set'] + header_node.append(setSpec_node) - # metadate node - metadata_node = etree.Element('metadata') - metadata_node.append(etree.fromstring(self.document.encode('utf-8'))) - self.oai_record_node.append(metadata_node) + self.oai_record_node.append(header_node) + def include_metadata(self): + ''' + Method to retrieve metadata from record.document, and include in XML response (for GetRecord and ListRecords) + Args: + None + Returns: + None + sets self.oai_record_node + ''' + # metadate node + metadata_node = etree.Element('metadata') + metadata_node.append(etree.fromstring(self.document.encode('utf-8'))) + self.oai_record_node.append(metadata_node) diff --git a/core/spark/console.py b/core/spark/console.py index 3560b297..e98f545c 100644 --- a/core/spark/console.py +++ b/core/spark/console.py @@ -1,4 +1,3 @@ - # generic imports import django import json @@ -13,9 +12,9 @@ # check for registered apps signifying readiness, if not, run django.setup() to run as standalone if not hasattr(django, 'apps'): - os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' - sys.path.append('/opt/combine') - django.setup() + os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' + sys.path.append('/opt/combine') + django.setup() # import django settings from django.conf import settings @@ -29,364 +28,358 @@ from xml2kvp import XML2kvp - - - ############################################################################ # Background Tasks ############################################################################ def export_records_as_xml(spark, ct_id): + ''' + Function to export multiple Jobs, with folder hierarchy for each Job - ''' - Function to export multiple Jobs, with folder hierarchy for each Job + Notes: + - exports to s3 as parquet + - with limited columns, can benefit from parquest's compression - Notes: - - exports to s3 as parquet - - with limited columns, can benefit from parquest's compression + Args: + ct_id (int): CombineBackgroundTask id + ''' - Args: - ct_id (int): CombineBackgroundTask id - ''' + # init logging support + spark.sparkContext.setLogLevel('INFO') + log4jLogger = spark.sparkContext._jvm.org.apache.log4j + logger = log4jLogger.LogManager.getLogger(__name__) - # init logging support - spark.sparkContext.setLogLevel('INFO') - log4jLogger = spark.sparkContext._jvm.org.apache.log4j - logger = log4jLogger.LogManager.getLogger(__name__) + # hydrate CombineBackgroundTask + ct = CombineBackgroundTask.objects.get(pk=int(ct_id)) - # hydrate CombineBackgroundTask - ct = CombineBackgroundTask.objects.get(pk=int(ct_id)) + # clean base path + output_path = "file:///%s" % ct.task_params['output_path'].lstrip('file://').rstrip('/') - # clean base path - output_path = "file:///%s" % ct.task_params['output_path'].lstrip('file://').rstrip('/') + # write DataFrame to S3 + if ct.task_params.get('s3_export', False) and ct.task_params.get('s3_export_type', None) == 'spark_df': - # write DataFrame to S3 - if ct.task_params.get('s3_export', False) and ct.task_params.get('s3_export_type', None) == 'spark_df': + # dynamically set credentials + spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", settings.AWS_ACCESS_KEY_ID) + spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", 
settings.AWS_SECRET_ACCESS_KEY) - # dynamically set credentials - spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", settings.AWS_ACCESS_KEY_ID) - spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", settings.AWS_SECRET_ACCESS_KEY) + # init dfs and col_set across all published sets + dfs = [] + col_set = set() - # init dfs and col_set across all published sets - dfs = [] - col_set = set() + # loop through published sets (includes non-set Records) + for folder_name, job_ids in ct.task_params['job_dict'].items(): - # loop through published sets (includes non-set Records) - for folder_name, job_ids in ct.task_params['job_dict'].items(): + # get dfs and columns + for job_id in job_ids: + print("Adding job #%s" % job_id) - # get dfs and columns - for job_id in job_ids: + # get df + df = get_job_as_df(spark, job_id) - print("Adding job #%s" % job_id) + # add to total set of columns + col_set.update(df.columns) - # get df - df = get_job_as_df(spark, job_id) + # append to dfs + dfs.append(df) - # add to total set of columns - col_set.update(df.columns) + # convert col_set to list + col_set = list(col_set) + logger.info("column final set: %s" % col_set) - # append to dfs - dfs.append(df) + # add empty columns to dfs where needed + n_dfs = [] + for df in dfs: + n_df = df + for col in col_set: + if col not in df.columns: + n_df = n_df.withColumn(col, lit('').cast(StringType())) + n_dfs.append(n_df) - # convert col_set to list - col_set = list(col_set) - logger.info("column final set: %s" % col_set) + # get union of all RDDs to write + rdd_to_write = spark.sparkContext.union([df.select(col_set).rdd for df in n_dfs]) - # add empty columns to dfs where needed - n_dfs = [] - for df in dfs: - n_df = df - for col in col_set: - if col not in df.columns: - n_df = n_df.withColumn(col, lit('').cast(StringType())) - n_dfs.append(n_df) + # repartition + rdd_to_write = rdd_to_write.repartition(math.ceil(rdd_to_write.count() / settings.TARGET_RECORDS_PER_PARTITION)) - # get union of all RDDs to write - rdd_to_write = spark.sparkContext.union([ df.select(col_set).rdd for df in n_dfs ]) + # convert to DataFrame and write to s3 as parquet + rdd_to_write.toDF().write.mode('overwrite').parquet( + 's3a://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'])) - # repartition - rdd_to_write = rdd_to_write.repartition(math.ceil(rdd_to_write.count() / settings.TARGET_RECORDS_PER_PARTITION)) + # write to disk + else: - # convert to DataFrame and write to s3 as parquet - rdd_to_write.toDF().write.mode('overwrite').parquet('s3a://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'])) + # determine column subset + col_subset = ['document'] - # write to disk - else: + # loop through keys and export + for folder_name, job_ids in ct.task_params['job_dict'].items(): - # determine column subset - col_subset = ['document'] + # handle single job_id + if len(job_ids) == 1: - # loop through keys and export - for folder_name, job_ids in ct.task_params['job_dict'].items(): + # get Job records as df + rdd_to_write = get_job_as_df(spark, job_ids[0]).select(col_subset).rdd - # handle single job_id - if len(job_ids) == 1: + # handle multiple jobs + else: - # get Job records as df - rdd_to_write = get_job_as_df(spark, job_ids[0]).select(col_subset).rdd + rdds = [get_job_as_df(spark, job_id).select(col_subset).rdd for job_id in job_ids] + rdd_to_write = spark.sparkContext.union(rdds) - # handle multiple jobs - else: - - rdds = [ get_job_as_df(spark, job_id).select(col_subset).rdd for 
job_id in job_ids ] - rdd_to_write = spark.sparkContext.union(rdds) - - # repartition, wrap in XML dec, and write - rdd_to_write.repartition(math.ceil(rdd_to_write.count()/int(ct.task_params['records_per_file'])))\ - .map(lambda row: row.document.replace('',''))\ - .saveAsTextFile('%s/%s' % (output_path, folder_name)) + # repartition, wrap in XML dec, and write + rdd_to_write.repartition(math.ceil(rdd_to_write.count() / int(ct.task_params['records_per_file']))) \ + .map(lambda row: row.document.replace('', '')) \ + .saveAsTextFile('%s/%s' % (output_path, folder_name)) def generate_validation_report(spark, output_path, task_params): - - job_id = task_params['job_id'] - validation_scenarios = [ int(vs_id) for vs_id in task_params['validation_scenarios']] - - # get job validations, limiting by selected validation scenarios - pipeline = json.dumps({'$match': {'job_id': job_id, 'validation_scenario_id':{'$in':validation_scenarios}}}) - rvdf = spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record_validation")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # get job as df - records_df = get_job_as_df(spark, job_id) - - # merge on validation failures - mdf = rvdf.alias('rvdf').join(records_df.alias('records_df'), rvdf['record_id'] == records_df['_id']) - - # select subset of fields for export, and rename - mdf = mdf.select( - 'records_df._id.oid', - 'records_df.record_id', - 'rvdf.validation_scenario_id', - 'rvdf.validation_scenario_name', - 'rvdf.results_payload', - 'rvdf.fail_count' - ) - - # if mapped fields requested, query ES and join - if len(task_params['mapped_field_include']) > 0: - - # get mapped fields - mapped_fields = task_params['mapped_field_include'] - - # get mapped fields as df - if 'db_id' not in mapped_fields: - mapped_fields.append('db_id') - es_df = get_job_es(spark, job_id=job_id).select(mapped_fields) - - # join - mdf = mdf.alias('mdf').join(es_df.alias('es_df'), mdf['oid'] == es_df['db_id']) - - # cleanup columns - mdf = mdf.select([c for c in mdf.columns if c != 'db_id']).withColumnRenamed('oid','db_id') - - # write to output dir - if task_params['report_format'] == 'csv': - mdf.write.format('com.databricks.spark.csv').option("delimiter", ",").save('file://%s' % output_path) - if task_params['report_format'] == 'tsv': - mdf.write.format('com.databricks.spark.csv').option("delimiter", "\t").save('file://%s' % output_path) - if task_params['report_format'] == 'json': - mdf.write.format('json').save('file://%s' % output_path) + job_id = task_params['job_id'] + validation_scenarios = [int(vs_id) for vs_id in task_params['validation_scenarios']] + + # get job validations, limiting by selected validation scenarios + pipeline = json.dumps({'$match': {'job_id': job_id, 'validation_scenario_id': {'$in': validation_scenarios}}}) + rvdf = spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record_validation") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # get job as df + records_df = get_job_as_df(spark, job_id) + + # merge on 
validation failures + mdf = rvdf.alias('rvdf').join(records_df.alias('records_df'), rvdf['record_id'] == records_df['_id']) + + # select subset of fields for export, and rename + mdf = mdf.select( + 'records_df._id.oid', + 'records_df.record_id', + 'rvdf.validation_scenario_id', + 'rvdf.validation_scenario_name', + 'rvdf.results_payload', + 'rvdf.fail_count' + ) + + # if mapped fields requested, query ES and join + if len(task_params['mapped_field_include']) > 0: + + # get mapped fields + mapped_fields = task_params['mapped_field_include'] + + # get mapped fields as df + if 'db_id' not in mapped_fields: + mapped_fields.append('db_id') + es_df = get_job_es(spark, job_id=job_id).select(mapped_fields) + + # join + mdf = mdf.alias('mdf').join(es_df.alias('es_df'), mdf['oid'] == es_df['db_id']) + + # cleanup columns + mdf = mdf.select([c for c in mdf.columns if c != 'db_id']).withColumnRenamed('oid', 'db_id') + + # write to output dir + if task_params['report_format'] == 'csv': + mdf.write.format('com.databricks.spark.csv').option("delimiter", ",").save('file://%s' % output_path) + if task_params['report_format'] == 'tsv': + mdf.write.format('com.databricks.spark.csv').option("delimiter", "\t").save('file://%s' % output_path) + if task_params['report_format'] == 'json': + mdf.write.format('json').save('file://%s' % output_path) def export_records_as_tabular_data(spark, ct_id): + ''' + Function to export multiple Jobs, with folder hierarchy for each Job + Notes: + - writes to s3 as JSONLines to avoid column names which contain characters + that parquet will not accept + - much less efficient storage-wise, but flexible for the field/column variety + that tabular data has - ''' - Function to export multiple Jobs, with folder hierarchy for each Job + Args: + ct_id (int): CombineBackgroundTask id - Notes: - - writes to s3 as JSONLines to avoid column names which contain characters - that parquet will not accept - - much less efficient storage-wise, but flexible for the field/column variety - that tabular data has + Expecting from CombineBackgroundTask: + output_path (str): base location for folder structure + job_dict (dict): dictionary of directory name --> list of Job ids + - e.g. single job: {'j29':[29]} + - e.g. published records: {'foo':[2,42], 'bar':[3]} + - in this case, a union will be performed for all Jobs within a single key + records_per_file (int): number of XML records per file + fm_export_config_json (str): JSON of configurations to be used + tabular_data_export_type (str): 'json' or 'csv' + ''' - Args: - ct_id (int): CombineBackgroundTask id + # hydrate CombineBackgroundTask + ct = CombineBackgroundTask.objects.get(pk=int(ct_id)) - Expecting from CombineBackgroundTask: - output_path (str): base location for folder structure - job_dict (dict): dictionary of directory name --> list of Job ids - - e.g. single job: {'j29':[29]} - - e.g. 
published records: {'foo':[2,42], 'bar':[3]} - - in this case, a union will be performed for all Jobs within a single key - records_per_file (int): number of XML records per file - fm_export_config_json (str): JSON of configurations to be used - tabular_data_export_type (str): 'json' or 'csv' - ''' + # reconstitute fm_export_config_json + fm_config = json.loads(ct.task_params['fm_export_config_json']) - # hydrate CombineBackgroundTask - ct = CombineBackgroundTask.objects.get(pk=int(ct_id)) + # clean base path + output_path = "file:///%s" % ct.task_params['output_path'].lstrip('file://').rstrip('/') - # reconstitute fm_export_config_json - fm_config = json.loads(ct.task_params['fm_export_config_json']) + # write DataFrame to S3 + if ct.task_params.get('s3_export', False) and ct.task_params.get('s3_export_type', None) == 'spark_df': - # clean base path - output_path = "file:///%s" % ct.task_params['output_path'].lstrip('file://').rstrip('/') + # dynamically set credentials + spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", settings.AWS_ACCESS_KEY_ID) + spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", settings.AWS_SECRET_ACCESS_KEY) - # write DataFrame to S3 - if ct.task_params.get('s3_export', False) and ct.task_params.get('s3_export_type', None) == 'spark_df': + # determine column subset + col_subset = ['*'] - # dynamically set credentials - spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", settings.AWS_ACCESS_KEY_ID) - spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", settings.AWS_SECRET_ACCESS_KEY) + # loop through keys and export + rdds = [] + for folder_name, job_ids in ct.task_params['job_dict'].items(): - # determine column subset - col_subset = ['*'] + # handle single job_id + if len(job_ids) == 1: + rdds.extend([get_job_as_df(spark, job_ids[0]).select(['document', 'combine_id', 'record_id']).rdd]) - # loop through keys and export - rdds = [] - for folder_name, job_ids in ct.task_params['job_dict'].items(): + # handle multiple jobs + else: + rdds.extend( + [get_job_as_df(spark, job_id).select(['document', 'combine_id', 'record_id']).rdd for job_id in + job_ids]) - # handle single job_id - if len(job_ids) == 1: - rdds.extend([get_job_as_df(spark, job_ids[0]).select(['document','combine_id','record_id']).rdd]) + # union all + batch_rdd = spark.sparkContext.union(rdds) - # handle multiple jobs - else: - rdds.extend([ get_job_as_df(spark, job_id).select(['document','combine_id','record_id']).rdd for job_id in job_ids ]) + # convert rdd + kvp_batch_rdd = _convert_xml_to_kvp(batch_rdd, fm_config) - # union all - batch_rdd = spark.sparkContext.union(rdds) + # repartition to records per file + kvp_batch_rdd = kvp_batch_rdd.repartition( + math.ceil(kvp_batch_rdd.count() / settings.TARGET_RECORDS_PER_PARTITION)) - # convert rdd - kvp_batch_rdd = _convert_xml_to_kvp(batch_rdd, fm_config) + # convert to dataframe + kvp_batch_df = spark.read.json(kvp_batch_rdd) - # repartition to records per file - kvp_batch_rdd = kvp_batch_rdd.repartition(math.ceil(kvp_batch_rdd.count() / settings.TARGET_RECORDS_PER_PARTITION)) + # write to bucket as jsonl + kvp_batch_df.write.mode('overwrite').json( + 's3a://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'])) - # convert to dataframe - kvp_batch_df = spark.read.json(kvp_batch_rdd) + # write to disk + else: - # write to bucket as jsonl - kvp_batch_df.write.mode('overwrite').json('s3a://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'])) + # loop 
through potential output folders + for folder_name, job_ids in ct.task_params['job_dict'].items(): - # write to disk - else: + # handle single job_id + if len(job_ids) == 1: - # loop through potential output folders - for folder_name, job_ids in ct.task_params['job_dict'].items(): + # get Job records as df + batch_rdd = get_job_as_df(spark, job_ids[0]).select(['document', 'combine_id', 'record_id']).rdd - # handle single job_id - if len(job_ids) == 1: + # handle multiple jobs + else: - # get Job records as df - batch_rdd = get_job_as_df(spark, job_ids[0]).select(['document','combine_id','record_id']).rdd + rdds = [get_job_as_df(spark, job_id).select(['document', 'combine_id', 'record_id']).rdd for job_id in + job_ids] + batch_rdd = spark.sparkContext.union(rdds) - # handle multiple jobs - else: + # convert rdd + kvp_batch_rdd = _convert_xml_to_kvp(batch_rdd, fm_config) - rdds = [ get_job_as_df(spark, job_id).select(['document','combine_id','record_id']).rdd for job_id in job_ids ] - batch_rdd = spark.sparkContext.union(rdds) + # repartition to records per file + kvp_batch_rdd = kvp_batch_rdd.repartition( + math.ceil(kvp_batch_rdd.count() / int(ct.task_params['records_per_file']))) - # convert rdd - kvp_batch_rdd = _convert_xml_to_kvp(batch_rdd, fm_config) + # handle json + if ct.task_params['tabular_data_export_type'] == 'json': + _write_tabular_json(spark, kvp_batch_rdd, output_path, folder_name, fm_config) - # repartition to records per file - kvp_batch_rdd = kvp_batch_rdd.repartition(math.ceil(kvp_batch_rdd.count()/int(ct.task_params['records_per_file']))) - - # handle json - if ct.task_params['tabular_data_export_type'] == 'json': - _write_tabular_json(spark, kvp_batch_rdd, output_path, folder_name, fm_config) - - # handle csv - if ct.task_params['tabular_data_export_type'] == 'csv': - _write_tabular_csv(spark, kvp_batch_rdd, output_path, folder_name, fm_config) + # handle csv + if ct.task_params['tabular_data_export_type'] == 'csv': + _write_tabular_csv(spark, kvp_batch_rdd, output_path, folder_name, fm_config) def _convert_xml_to_kvp(batch_rdd, fm_config): + ''' + Sub-Function to convert RDD of XML to KVP - ''' - Sub-Function to convert RDD of XML to KVP + Args: + batch_rdd (RDD): RDD containing batch of Records rows + fm_config (dict): Dictionary of XML2kvp configurations to use for kvp_to_xml() - Args: - batch_rdd (RDD): RDD containing batch of Records rows - fm_config (dict): Dictionary of XML2kvp configurations to use for kvp_to_xml() + Returns + kvp_batch_rdd (RDD): RDD of JSONlines + ''' - Returns - kvp_batch_rdd (RDD): RDD of JSONlines - ''' + def kvp_writer_udf(row, fm_config): - def kvp_writer_udf(row, fm_config): + ''' + Converts XML to kvpjson, for testing okay? + ''' - ''' - Converts XML to kvpjson, for testing okay? 
- ''' + # get handler, that includes defaults + xml2kvp_defaults = XML2kvp(**fm_config) - # get handler, that includes defaults - xml2kvp_defaults = XML2kvp(**fm_config) + # convert XML to kvp + xml2kvp_handler = XML2kvp.xml_to_kvp(row.document, return_handler=True, handler=xml2kvp_defaults) - # convert XML to kvp - xml2kvp_handler = XML2kvp.xml_to_kvp(row.document, return_handler=True, handler=xml2kvp_defaults) + # loop through and convert lists/tuples to multivalue_delim + for k, v in xml2kvp_handler.kvp_dict.items(): + if type(v) in [list, tuple]: + xml2kvp_handler.kvp_dict[k] = xml2kvp_handler.multivalue_delim.join(v) - # loop through and convert lists/tuples to multivalue_delim - for k,v in xml2kvp_handler.kvp_dict.items(): - if type(v) in [list,tuple]: - xml2kvp_handler.kvp_dict[k] = xml2kvp_handler.multivalue_delim.join(v) + # mixin other row attributes to kvp_dict + xml2kvp_handler.kvp_dict.update({ + 'record_id': row.record_id, + 'combine_id': row.combine_id + }) - # mixin other row attributes to kvp_dict - xml2kvp_handler.kvp_dict.update({ - 'record_id':row.record_id, - 'combine_id':row.combine_id - }) + # return JSON line + return json.dumps(xml2kvp_handler.kvp_dict) - # return JSON line - return json.dumps(xml2kvp_handler.kvp_dict) - - # run UDF - return batch_rdd.map(lambda row: kvp_writer_udf(row, fm_config)) + # run UDF + return batch_rdd.map(lambda row: kvp_writer_udf(row, fm_config)) def _write_tabular_json(spark, kvp_batch_rdd, base_path, folder_name, fm_config): - - # write JSON lines - kvp_batch_rdd.saveAsTextFile('%s/%s' % (base_path, folder_name)) + # write JSON lines + kvp_batch_rdd.saveAsTextFile('%s/%s' % (base_path, folder_name)) def _write_tabular_csv(spark, kvp_batch_rdd, base_path, folder_name, fm_config): + # read rdd to DataFrame + kvp_batch_df = spark.read.json(kvp_batch_rdd) - # read rdd to DataFrame - kvp_batch_df = spark.read.json(kvp_batch_rdd) + # load XML2kvp instance + xml2kvp_defaults = XML2kvp(**fm_config) - # load XML2kvp instance - xml2kvp_defaults = XML2kvp(**fm_config) - - # write to CSV - kvp_batch_df.write.csv('%s/%s' % (base_path, folder_name), header=True) + # write to CSV + kvp_batch_df.write.csv('%s/%s' % (base_path, folder_name), header=True) def _write_rdd_to_s3( - spark, - rdd, - bucket, - key, - access_key=settings.AWS_ACCESS_KEY_ID, - secret_key=settings.AWS_SECRET_ACCESS_KEY): - - ''' - Function to write RDD to S3 - - Args: - rdd (RDD): RDD to write to S3 - bucket (str): bucket string to write to - key (str): key/path to write to in S3 bucket - access_key (str): default to settings, override with access key - secret_key (str): default to settings, override with secret key - ''' - - # dynamically set s3 credentials - spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", access_key) - spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", secret_key) - - # write rdd to S3 - rdd.saveAsTextFile('s3a://%s/%s' % (bucket, key.lstrip('/'))) - + spark, + rdd, + bucket, + key, + access_key=settings.AWS_ACCESS_KEY_ID, + secret_key=settings.AWS_SECRET_ACCESS_KEY): + ''' + Function to write RDD to S3 + + Args: + rdd (RDD): RDD to write to S3 + bucket (str): bucket string to write to + key (str): key/path to write to in S3 bucket + access_key (str): default to settings, override with access key + secret_key (str): default to settings, override with secret key + ''' + + # dynamically set s3 credentials + spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", access_key) + 
spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", secret_key) + + # write rdd to S3 + rdd.saveAsTextFile('s3a://%s/%s' % (bucket, key.lstrip('/'))) ############################################################################ @@ -394,142 +387,132 @@ def _write_rdd_to_s3( ############################################################################ def get_job_as_df(spark, job_id, remove_id=False): + ''' + Convenience method to retrieve set of records as Spark DataFrame + ''' - ''' - Convenience method to retrieve set of records as Spark DataFrame - ''' - - pipeline = json.dumps({'$match': {'job_id': job_id}}) - mdf = spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() + pipeline = json.dumps({'$match': {'job_id': job_id}}) + mdf = spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() - # if remove ID - if remove_id: - mdf = mdf.select([ c for c in mdf.columns if c != '_id' ]) + # if remove ID + if remove_id: + mdf = mdf.select([c for c in mdf.columns if c != '_id']) - return mdf + return mdf def get_job_es(spark, - job_id=None, - indices=None, - doc_type='record', - es_query=None, - field_include=None, - field_exclude=None, - as_rdd=False): - - ''' - Convenience method to retrieve mapped fields from ElasticSearch - - Args: - - job_id (int): job to retrieve - indices (list): list of index strings to retrieve from - doc_type (str): defaults to 'record', but configurable (e.g. 
'item') - es_query (str): JSON string of ES query - field_include (str): comma seperated list of fields to include in response - field_exclude (str): comma seperated list of fields to exclude in response - as_rdd (boolean): boolean to return as RDD, or False to convert to DF - ''' - - # handle indices - if job_id: - es_indexes = 'j%s' % job_id - elif indices: - es_indexes = ','.join(indices) - - # prep conf - conf = { - "es.resource":"%s/%s" % (es_indexes,doc_type), - "es.output.json":"true", - "es.input.max.docs.per.partition":"10000", - "es.nodes":"%s:9200" % settings.ES_HOST, - } - - # handle es_query - if es_query: - conf['es.query'] = es_query - - # handle field exclusion - if field_exclude: - conf['es.read.field.exclude'] = field_exclude - - # handle field inclusion - if field_include: - conf['es.read.field.include'] = field_exclude - - # get es index as RDD - es_rdd = spark.sparkContext.newAPIHadoopRDD( - inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", - keyClass="org.apache.hadoop.io.NullWritable", - valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", - conf=conf) - - # return rdd - if as_rdd: - return es_rdd - - # read json - es_df = spark.read.json(es_rdd.map(lambda row: row[1])) - - # return - return es_df + job_id=None, + indices=None, + doc_type='record', + es_query=None, + field_include=None, + field_exclude=None, + as_rdd=False): + ''' + Convenience method to retrieve mapped fields from ElasticSearch + + Args: + + job_id (int): job to retrieve + indices (list): list of index strings to retrieve from + doc_type (str): defaults to 'record', but configurable (e.g. 'item') + es_query (str): JSON string of ES query + field_include (str): comma seperated list of fields to include in response + field_exclude (str): comma seperated list of fields to exclude in response + as_rdd (boolean): boolean to return as RDD, or False to convert to DF + ''' + + # handle indices + if job_id: + es_indexes = 'j%s' % job_id + elif indices: + es_indexes = ','.join(indices) + + # prep conf + conf = { + "es.resource": "%s/%s" % (es_indexes, doc_type), + "es.output.json": "true", + "es.input.max.docs.per.partition": "10000", + "es.nodes": "%s:9200" % settings.ES_HOST, + } + + # handle es_query + if es_query: + conf['es.query'] = es_query + + # handle field exclusion + if field_exclude: + conf['es.read.field.exclude'] = field_exclude + + # handle field inclusion + if field_include: + conf['es.read.field.include'] = field_exclude + + # get es index as RDD + es_rdd = spark.sparkContext.newAPIHadoopRDD( + inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", + keyClass="org.apache.hadoop.io.NullWritable", + valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", + conf=conf) + + # return rdd + if as_rdd: + return es_rdd + + # read json + es_df = spark.read.json(es_rdd.map(lambda row: row[1])) + + # return + return es_df def get_sql_job_as_df(spark, job_id, remove_id=False): + sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'], 'core_record', properties=settings.COMBINE_DATABASE) + sqldf = sqldf.filter(sqldf['job_id'] == job_id) - sqldf = spark.read.jdbc(settings.COMBINE_DATABASE['jdbc_url'],'core_record',properties=settings.COMBINE_DATABASE) - sqldf = sqldf.filter(sqldf['job_id'] == job_id) + # if remove ID + if remove_id: + sqldf = sqldf.select([c for c in sqldf.columns if c != 'id']) - # if remove ID - if remove_id: - sqldf = sqldf.select([ c for c in sqldf.columns if c != 'id' ]) - - return sqldf + return sqldf def copy_sql_to_mongo(spark, job_id): + # get 
sql job + sdf = get_sql_job_as_df(spark, job_id, remove_id=True) - # get sql job - sdf = get_sql_job_as_df(spark, job_id, remove_id=True) - - # repartition - sdf = sdf.rdd.repartition(200).toDF(schema=sdf.schema) + # repartition + sdf = sdf.rdd.repartition(200).toDF(schema=sdf.schema) - # insert - sdf.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() + # insert + sdf.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() def copy_sql_to_mongo_adv(spark, job_id, lowerBound, upperBound, numPartitions): - - sqldf = spark.read.jdbc( - settings.COMBINE_DATABASE['jdbc_url'], - 'core_record', - properties=settings.COMBINE_DATABASE, - column='id', - lowerBound=lowerBound, - upperBound=upperBound, - numPartitions=numPartitions - ) - db_records = sqldf.filter(sqldf.job_id == int(job_id)) - - db_records.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() - - - - - + sqldf = spark.read.jdbc( + settings.COMBINE_DATABASE['jdbc_url'], + 'core_record', + properties=settings.COMBINE_DATABASE, + column='id', + lowerBound=lowerBound, + upperBound=upperBound, + numPartitions=numPartitions + ) + db_records = sqldf.filter(sqldf.job_id == int(job_id)) + + db_records.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() diff --git a/core/spark/es.py b/core/spark/es.py index 1ee4edff..ecf6383a 100644 --- a/core/spark/es.py +++ b/core/spark/es.py @@ -16,367 +16,352 @@ # import Row from pyspark try: - from pyspark.sql import Row - from pyspark.sql.types import StringType, IntegerType - from pyspark.sql.functions import udf, lit + from pyspark.sql import Row + from pyspark.sql.types import StringType, IntegerType + from pyspark.sql.functions import udf, lit except: - pass + pass # check for registered apps signifying readiness, if not, run django.setup() to run as standalone if not hasattr(django, 'apps'): - os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' - sys.path.append('/opt/combine') - django.setup() + os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' + sys.path.append('/opt/combine') + django.setup() # import django settings from django.conf import settings # import xml2kvp try: - from core.xml2kvp import XML2kvp + from core.xml2kvp import XML2kvp except: - from xml2kvp import XML2kvp + from xml2kvp import XML2kvp class ESIndex(object): - - ''' - Class to organize methods for indexing mapped/flattened metadata into ElasticSearch (ES) - ''' - - @staticmethod - def index_job_to_es_spark(spark, job, records_df, field_mapper_config): - - ''' - Method to index records dataframe into ES - - Args: - spark (pyspark.sql.session.SparkSession): spark instance from static job methods - job (core.models.Job): Job for records - records_df (pyspark.sql.DataFrame): records as pyspark DataFrame - field_mapper_config (dict): XML2kvp field mapper configurations - - Returns: - None - - indexes records to ES - ''' - - # init logging support - spark.sparkContext.setLogLevel('INFO') - log4jLogger = 
spark.sparkContext._jvm.org.apache.log4j - logger = log4jLogger.LogManager.getLogger(__name__) - - # get index mapper - index_mapper_handle = globals()['XML2kvpMapper'] - - # create rdd from index mapper - def es_mapper_pt_udf(pt): - - # init mapper once per partition - mapper = index_mapper_handle(field_mapper_config=field_mapper_config) - - for row in pt: - - yield mapper.map_record( - record_string=row.document, - db_id=row._id.oid, - combine_id=row.combine_id, - record_id=row.record_id, - publish_set_id=job.publish_set_id, - fingerprint=row.fingerprint - ) - - logger.info('###ES 1 -- mapping records') - mapped_records_rdd = records_df.rdd.mapPartitions(es_mapper_pt_udf) - - # attempt to write index mapping failures to DB - # filter our failures - logger.info('###ES 2 -- filtering failures') - failures_rdd = mapped_records_rdd.filter(lambda row: row[0] == 'fail') - - # if failures, write - if not failures_rdd.isEmpty(): - - logger.info('###ES 3 -- writing indexing failures') - - failures_df = failures_rdd.map(lambda row: Row( - db_id=row[1]['db_id'], - record_id=row[1]['record_id'], - mapping_error=row[1]['mapping_error'] - )).toDF() - - # add job_id as column - failures_df = failures_df.withColumn('job_id', lit(job.id)) - - # write mapping failures to DB - failures_df.select(['db_id', 'record_id', 'job_id', 'mapping_error'])\ - .write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://127.0.0.1")\ - .option("database","combine")\ - .option("collection", "index_mapping_failure").save() - - # retrieve successes to index - logger.info('###ES 4 -- filtering successes') - to_index_rdd = mapped_records_rdd.filter(lambda row: row[0] == 'success') - - # create index in advance - index_name = 'j%s' % job.id - es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST]) - if not es_handle_temp.indices.exists(index_name): - - # put combine es index templates - template_body = { - 'template':'*', - 'settings':{ - 'number_of_shards':1, - 'number_of_replicas':0, - 'refresh_interval':-1 - }, - 'mappings':{ - 'record':{ - "dynamic_templates": [ - { - "strings": { - "match_mapping_type": "string", - "mapping": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - } - } - } - } - ], - 'date_detection':False, - 'properties':{ - 'combine_db_id':{ - 'type':'integer' - } - } - } - } - } - es_handle_temp.indices.put_template('combine_template', body=json.dumps(template_body)) - - # create index - es_handle_temp.indices.create(index_name) - - # index to ES - logger.info('###ES 5 -- writing to ES') - to_index_rdd.saveAsNewAPIHadoopFile( - path='-', - outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", - keyClass="org.apache.hadoop.io.NullWritable", - valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", - conf={ - "es.resource":"%s/record" % index_name, - "es.nodes":"%s:9200" % settings.ES_HOST, - "es.mapping.exclude":"temp_id,__class__", - "es.mapping.id":"temp_id", - } - ) - - # refresh index - es_handle_temp.indices.refresh(index_name) - - # return - return to_index_rdd - - - @staticmethod - def copy_es_index( - source_index=None, - target_index=None, - create_target_index=True, - refresh=True, - wait_for_completion=True, - add_copied_from=None): - - ''' - Method to duplicate one ES index to another - - Args: - create_target_index (boolean): If True, check for target and create - source_index (str): Source ES index to copy from - target_index (str): Target ES index to copy to - - Returns: - (dict): results of reindex via 
elasticsearch client reindex request - ''' - - # get ES handle - es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST]) - - # put/confirm combine es index templates - template_body = { - 'template':'*', - 'settings':{ - 'number_of_shards':1, - 'number_of_replicas':0, - 'refresh_interval':-1 - }, - 'mappings':{ - 'record':{ - 'date_detection':False, - 'properties':{ - 'combine_db_id':{ - 'type':'integer' - } - } - } - } - } - es_handle_temp.indices.put_template('combine_template', body=json.dumps(template_body)) - - # if creating target index check if target index exists - if create_target_index and not es_handle_temp.indices.exists(target_index): - es_handle_temp.indices.create(target_index) - - # prepare reindex query - dupe_dict = { - 'source':{ - 'index': source_index, - 'query':{} - }, - 'dest': { - 'index':target_index - } - } - - # if add_copied_from, include in reindexed document - if add_copied_from: - dupe_dict['script'] = { - 'inline': 'ctx._source.source_job_id = %s' % add_copied_from, - 'lang': 'painless' - } - - # reindex using elasticsearch client - reindex = es_handle_temp.reindex(body=dupe_dict, wait_for_completion=wait_for_completion, refresh=refresh) - return reindex - + ''' + Class to organize methods for indexing mapped/flattened metadata into ElasticSearch (ES) + ''' + + @staticmethod + def index_job_to_es_spark(spark, job, records_df, field_mapper_config): + + ''' + Method to index records dataframe into ES + + Args: + spark (pyspark.sql.session.SparkSession): spark instance from static job methods + job (core.models.Job): Job for records + records_df (pyspark.sql.DataFrame): records as pyspark DataFrame + field_mapper_config (dict): XML2kvp field mapper configurations + + Returns: + None + - indexes records to ES + ''' + + # init logging support + spark.sparkContext.setLogLevel('INFO') + log4jLogger = spark.sparkContext._jvm.org.apache.log4j + logger = log4jLogger.LogManager.getLogger(__name__) + + # get index mapper + index_mapper_handle = globals()['XML2kvpMapper'] + + # create rdd from index mapper + def es_mapper_pt_udf(pt): + + # init mapper once per partition + mapper = index_mapper_handle(field_mapper_config=field_mapper_config) + + for row in pt: + yield mapper.map_record( + record_string=row.document, + db_id=row._id.oid, + combine_id=row.combine_id, + record_id=row.record_id, + publish_set_id=job.publish_set_id, + fingerprint=row.fingerprint + ) + + logger.info('###ES 1 -- mapping records') + mapped_records_rdd = records_df.rdd.mapPartitions(es_mapper_pt_udf) + + # attempt to write index mapping failures to DB + # filter our failures + logger.info('###ES 2 -- filtering failures') + failures_rdd = mapped_records_rdd.filter(lambda row: row[0] == 'fail') + + # if failures, write + if not failures_rdd.isEmpty(): + logger.info('###ES 3 -- writing indexing failures') + + failures_df = failures_rdd.map(lambda row: Row( + db_id=row[1]['db_id'], + record_id=row[1]['record_id'], + mapping_error=row[1]['mapping_error'] + )).toDF() + + # add job_id as column + failures_df = failures_df.withColumn('job_id', lit(job.id)) + + # write mapping failures to DB + failures_df.select(['db_id', 'record_id', 'job_id', 'mapping_error']) \ + .write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://127.0.0.1") \ + .option("database", "combine") \ + .option("collection", "index_mapping_failure").save() + + # retrieve successes to index + logger.info('###ES 4 -- filtering successes') + to_index_rdd = mapped_records_rdd.filter(lambda row: 
row[0] == 'success') + + # create index in advance + index_name = 'j%s' % job.id + es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST]) + if not es_handle_temp.indices.exists(index_name): + # put combine es index templates + template_body = { + 'template': '*', + 'settings': { + 'number_of_shards': 1, + 'number_of_replicas': 0, + 'refresh_interval': -1 + }, + 'mappings': { + 'record': { + "dynamic_templates": [ + { + "strings": { + "match_mapping_type": "string", + "mapping": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + } + } + } + ], + 'date_detection': False, + 'properties': { + 'combine_db_id': { + 'type': 'integer' + } + } + } + } + } + es_handle_temp.indices.put_template('combine_template', body=json.dumps(template_body)) + + # create index + es_handle_temp.indices.create(index_name) + + # index to ES + logger.info('###ES 5 -- writing to ES') + to_index_rdd.saveAsNewAPIHadoopFile( + path='-', + outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", + keyClass="org.apache.hadoop.io.NullWritable", + valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", + conf={ + "es.resource": "%s/record" % index_name, + "es.nodes": "%s:9200" % settings.ES_HOST, + "es.mapping.exclude": "temp_id,__class__", + "es.mapping.id": "temp_id", + } + ) + + # refresh index + es_handle_temp.indices.refresh(index_name) + + # return + return to_index_rdd + + @staticmethod + def copy_es_index( + source_index=None, + target_index=None, + create_target_index=True, + refresh=True, + wait_for_completion=True, + add_copied_from=None): + + ''' + Method to duplicate one ES index to another + + Args: + create_target_index (boolean): If True, check for target and create + source_index (str): Source ES index to copy from + target_index (str): Target ES index to copy to + + Returns: + (dict): results of reindex via elasticsearch client reindex request + ''' + + # get ES handle + es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST]) + + # put/confirm combine es index templates + template_body = { + 'template': '*', + 'settings': { + 'number_of_shards': 1, + 'number_of_replicas': 0, + 'refresh_interval': -1 + }, + 'mappings': { + 'record': { + 'date_detection': False, + 'properties': { + 'combine_db_id': { + 'type': 'integer' + } + } + } + } + } + es_handle_temp.indices.put_template('combine_template', body=json.dumps(template_body)) + + # if creating target index check if target index exists + if create_target_index and not es_handle_temp.indices.exists(target_index): + es_handle_temp.indices.create(target_index) + + # prepare reindex query + dupe_dict = { + 'source': { + 'index': source_index, + 'query': {} + }, + 'dest': { + 'index': target_index + } + } + + # if add_copied_from, include in reindexed document + if add_copied_from: + dupe_dict['script'] = { + 'inline': 'ctx._source.source_job_id = %s' % add_copied_from, + 'lang': 'painless' + } + + # reindex using elasticsearch client + reindex = es_handle_temp.reindex(body=dupe_dict, wait_for_completion=wait_for_completion, refresh=refresh) + return reindex class BaseMapper(object): + ''' + All mappers extend this BaseMapper class. - ''' - All mappers extend this BaseMapper class. 
- - Contains some useful methods and attributes that other mappers may use + Contains some useful methods and attributes that other mappers may use - Mappers expected to contain following methods: - - map_record() - ''' + Mappers expected to contain following methods: + - map_record() + ''' - # pre-compiled regex - blank_check_regex = re.compile(r"[^ \t\n]") # checker for blank spaces - namespace_prefix_regex = re.compile(r'(\{.+\})?(.*)') # element tag name + # pre-compiled regex + blank_check_regex = re.compile(r"[^ \t\n]") # checker for blank spaces + namespace_prefix_regex = re.compile(r'(\{.+\})?(.*)') # element tag name + def get_namespaces(self): - def get_namespaces(self): + ''' + Method to parse namespaces from XML document and save to self.nsmap + ''' - ''' - Method to parse namespaces from XML document and save to self.nsmap - ''' - - nsmap = {} - for ns in self.xml_root.xpath('//namespace::*'): - if ns[0]: - nsmap[ns[0]] = ns[1] - self.nsmap = nsmap - - # set inverted nsmap - self.nsmap_inv = {v: k for k, v in self.nsmap.items()} + nsmap = {} + for ns in self.xml_root.xpath('//namespace::*'): + if ns[0]: + nsmap[ns[0]] = ns[1] + self.nsmap = nsmap + # set inverted nsmap + self.nsmap_inv = {v: k for k, v in self.nsmap.items()} class XML2kvpMapper(BaseMapper): + ''' + Map XML to ElasticSearch friendly fields with XML2kvp + ''' - ''' - Map XML to ElasticSearch friendly fields with XML2kvp - ''' - - - def __init__(self, field_mapper_config=None): - - self.field_mapper_config = field_mapper_config - - - def map_record(self, - record_string=None, - db_id=None, - combine_id=None, - record_id=None, - publish_set_id=None, - fingerprint=None - ): - - ''' - Map record + def __init__(self, field_mapper_config=None): - Args: - record_string (str): string of record document - db_id (str): mongo db id - combine_id (str): combine_id id - record_id (str): record id - publish_set_id (str): core.models.RecordGroup.published_set_id, used to build publish identifier - fingerprint (str): fingerprint + self.field_mapper_config = field_mapper_config - Returns: - (tuple): - 0 (str): ['success','fail'] - 1 (dict): details from mapping process, success or failure - ''' + def map_record(self, + record_string=None, + db_id=None, + combine_id=None, + record_id=None, + publish_set_id=None, + fingerprint=None + ): - try: + ''' + Map record - # prepare literals - if 'add_literals' not in self.field_mapper_config.keys(): - self.field_mapper_config['add_literals'] = {} + Args: + record_string (str): string of record document + db_id (str): mongo db id + combine_id (str): combine_id id + record_id (str): record id + publish_set_id (str): core.models.RecordGroup.published_set_id, used to build publish identifier + fingerprint (str): fingerprint - # add literals - self.field_mapper_config['add_literals'].update({ + Returns: + (tuple): + 0 (str): ['success','fail'] + 1 (dict): details from mapping process, success or failure + ''' - # add temporary id field - 'temp_id':db_id, + try: - # add combine_id field - 'combine_id':combine_id, + # prepare literals + if 'add_literals' not in self.field_mapper_config.keys(): + self.field_mapper_config['add_literals'] = {} - # add record_id field - 'record_id':record_id, + # add literals + self.field_mapper_config['add_literals'].update({ - # add publish set id - 'publish_set_id':publish_set_id, + # add temporary id field + 'temp_id': db_id, - # add record's Combine DB id - 'db_id':db_id, + # add combine_id field + 'combine_id': combine_id, - # add record's crc32 document hash, 
aka "fingerprint" - 'fingerprint':fingerprint, + # add record_id field + 'record_id': record_id, - }) + # add publish set id + 'publish_set_id': publish_set_id, - # map with XML2kvp - kvp_dict = XML2kvp.xml_to_kvp(record_string, **self.field_mapper_config) + # add record's Combine DB id + 'db_id': db_id, - return ( - 'success', - kvp_dict - ) + # add record's crc32 document hash, aka "fingerprint" + 'fingerprint': fingerprint, - except Exception as e: + }) - return ( - 'fail', - { - 'db_id':db_id, - 'record_id':record_id, - 'mapping_error':str(e) - } - ) + # map with XML2kvp + kvp_dict = XML2kvp.xml_to_kvp(record_string, **self.field_mapper_config) + return ( + 'success', + kvp_dict + ) + except Exception as e: + return ( + 'fail', + { + 'db_id': db_id, + 'record_id': record_id, + 'mapping_error': str(e) + } + ) diff --git a/core/spark/jobs.py b/core/spark/jobs.py index 67bca2e6..5207f864 100644 --- a/core/spark/jobs.py +++ b/core/spark/jobs.py @@ -25,17 +25,17 @@ # import from core.spark try: - from es import ESIndex - from utils import PythonUDFRecord, refresh_django_db_connection, df_union_all - from record_validation import ValidationScenarioSpark - from console import get_job_as_df, get_job_es - from xml2kvp import XML2kvp + from es import ESIndex + from utils import PythonUDFRecord, refresh_django_db_connection, df_union_all + from record_validation import ValidationScenarioSpark + from console import get_job_as_df, get_job_es + from xml2kvp import XML2kvp except: - from core.spark.es import ESIndex - from core.spark.utils import PythonUDFRecord, refresh_django_db_connection, df_union_all - from core.spark.record_validation import ValidationScenarioSpark - from core.spark.console import get_job_as_df, get_job_es - from core.xml2kvp import XML2kvp + from core.spark.es import ESIndex + from core.spark.utils import PythonUDFRecord, refresh_django_db_connection, df_union_all + from core.spark.record_validation import ValidationScenarioSpark + from core.spark.console import get_job_as_df, get_job_es + from core.xml2kvp import XML2kvp # import Row from pyspark from pyspark import StorageLevel @@ -47,30 +47,28 @@ # check for registered apps signifying readiness, if not, run django.setup() to run as standalone if not hasattr(django, 'apps'): - os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' - sys.path.append('/opt/combine') - django.setup() + os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' + sys.path.append('/opt/combine') + django.setup() # import django settings from django.conf import settings from django.db import connection, transaction # import select models from Core -from core.models import CombineJob, Job, JobInput, JobTrack, Transformation, PublishedRecords, RecordIdentifierTransformationScenario, RecordValidation, DPLABulkDataDownload +from core.models import CombineJob, Job, JobInput, JobTrack, Transformation, PublishedRecords, \ + RecordIdentifierTransformationScenario, RecordValidation, DPLABulkDataDownload # import mongo dependencies from core.mongo import * - - #################################################################### # Custom Exceptions # #################################################################### class AmbiguousIdentifier(Exception): - pass - + pass #################################################################### @@ -78,33 +76,30 @@ class AmbiguousIdentifier(Exception): #################################################################### class CombineRecordSchema(object): - - ''' - Class to organize Combine record spark dataframe 
schemas - ''' - - def __init__(self): - - # schema for Combine records - self.schema = StructType([ - StructField('combine_id', StringType(), True), - StructField('record_id', StringType(), True), - StructField('document', StringType(), True), - StructField('error', StringType(), True), - StructField('unique', BooleanType(), True), - StructField('job_id', IntegerType(), False), - StructField('oai_set', StringType(), True), - StructField('success', BooleanType(), False), - StructField('fingerprint', IntegerType(), False), - StructField('transformed', BooleanType(), False), - StructField('valid', BooleanType(), False), - StructField('dbdm', BooleanType(), False) - ] - ) - - # fields - self.field_names = [f.name for f in self.schema.fields if f.name != 'id'] - + ''' + Class to organize Combine record spark dataframe schemas + ''' + + def __init__(self): + # schema for Combine records + self.schema = StructType([ + StructField('combine_id', StringType(), True), + StructField('record_id', StringType(), True), + StructField('document', StringType(), True), + StructField('error', StringType(), True), + StructField('unique', BooleanType(), True), + StructField('job_id', IntegerType(), False), + StructField('oai_set', StringType(), True), + StructField('success', BooleanType(), False), + StructField('fingerprint', IntegerType(), False), + StructField('transformed', BooleanType(), False), + StructField('valid', BooleanType(), False), + StructField('dbdm', BooleanType(), False) + ] + ) + + # fields + self.field_names = [f.name for f in self.schema.fields if f.name != 'id'] #################################################################### @@ -112,1489 +107,1474 @@ def __init__(self): #################################################################### class CombineSparkJob(object): + ''' + Base class for Combine Spark Jobs. + Provides some usuable components for jobs. + ''' - ''' - Base class for Combine Spark Jobs. - Provides some usuable components for jobs. 
- ''' - - - def __init__(self, spark, **kwargs): - - self.spark = spark - - self.kwargs = kwargs - - # init logging support - spark.sparkContext.setLogLevel('INFO') - log4jLogger = spark.sparkContext._jvm.org.apache.log4j - self.logger = log4jLogger.LogManager.getLogger(__name__) - - - def init_job(self): - - # refresh Django DB Connection - refresh_django_db_connection() - - # get job - results = polling.poll(lambda: Job.objects.filter(id=int(self.kwargs['job_id'])).count() == 1, step=1, timeout=60) - self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) - - # start job_track instance, marking job start - self.job_track = JobTrack( - job_id = self.job.id - ) - self.job_track.save() - - # retrieve job_details - self.job_details = self.job.job_details_dict - - - def close_job(self): - - ''' - Note to Job tracker that finished, and perform other long-running, one-time calculations - to speed up front-end - ''' - - refresh_django_db_connection() - - # if re-run, check if job was previously published and republish - if 'published' in self.job.job_details_dict.keys(): - if self.job.job_details_dict['published']['status'] == True: - self.logger.info('job params flagged for publishing') - self.job.publish(publish_set_id=self.job.publish_set_id) - elif self.job.job_details_dict['published']['status'] == False: - self.logger.info('job params flagged for unpublishing') - self.job.unpublish() - - # finally, update finish_timestamp of job_track instance - self.job_track.finish_timestamp = datetime.datetime.now() - self.job_track.save() - - # count new job validations - for jv in self.job.jobvalidation_set.filter(failure_count=None): - jv.validation_failure_count(force_recount=True) - - # unpersist cached dataframes - self.spark.catalog.clearCache() - - - def update_jobGroup(self, description): - - ''' - Method to update spark jobGroup - ''' - - self.logger.info("### %s" % description) - self.spark.sparkContext.setJobGroup("%s" % self.job.id, "%s, Job #%s" % (description, self.job.id)) - - - def get_input_records(self, filter_input_records=True): - - # get input job ids - input_job_ids = [int(job_id) for job_id in self.job_details['input_job_ids']] - - # if job_specific input filters set, handle - if 'job_specific' in self.job_details['input_filters'].keys() and len(self.job_details['input_filters']['job_specific']) > 0: - - # debug - self.logger.info("Job specific input filters found, handling") - - # convenience dict - job_spec_dicts = self.job_details['input_filters']['job_specific'] - - # init list of dataframes to have union performed - job_spec_dfs = [] - - # remove job_specific from input_jobs - for spec_input_job_id in self.job_details['input_filters']['job_specific'].keys(): - input_job_ids.remove(int(spec_input_job_id)) - - # handle remaining, if any, non-specified jobs as per normal - if len(input_job_ids) > 0: - # retrieve from Mongo - pipeline = json.dumps([ - { - '$match': { - 'job_id':{ - '$in':input_job_ids - } - } - }, - { - '$project': { field_name:1 for field_name in CombineRecordSchema().field_names } - } - ]) - records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # optionally filter - if filter_input_records: - records = 
self.record_input_filters(records) - - # add CombineRecordSchema columns if absent - records = self.add_missing_columns(records) - - # append to list of dataframes - job_spec_dfs.append(records) - - # group like/identical input filter parameters, run together - # https://stackoverflow.com/questions/52484043/group-key-value-pairs-in-python-dictionary-by-value-maintaining-original-key-as - grouped_spec_dicts = [{'input_filters': k, 'job_ids': [int(job_id) for job_id in list(map(itemgetter(0), g))]} for k, g in groupby(sorted(job_spec_dicts.items(), key=lambda t: t[1].items()), itemgetter(1))] - - # next, loop through spec'ed jobs, retrieve and filter - for job_spec_group in grouped_spec_dicts: - - # debug - self.logger.info("Handling specific input filters for job ids: %s" % job_spec_group['job_ids']) - - # handle remaining, non-specified jobs as per normal - # retrieve from Mongo - pipeline = json.dumps([ - { - '$match': { - 'job_id':{ - '$in':job_spec_group['job_ids'] - } - } - }, - { - '$project': { field_name:1 for field_name in CombineRecordSchema().field_names } - } - ]) - job_spec_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # optionally filter - if filter_input_records: - job_spec_records = self.record_input_filters(job_spec_records, input_filters=job_spec_group['input_filters']) - - # add CombineRecordSchema columns if absent - job_spec_records = self.add_missing_columns(job_spec_records) - - # append dataframe - job_spec_dfs.append(job_spec_records) - - # union spec'ed jobs with unspec'ed records - self.logger.info("union-izing all job dataframes") - unioned_records = df_union_all(job_spec_dfs) - - # count breakdown of input jobs/records, save to Job - self.count_input_records(unioned_records) - - # finally, return records - return unioned_records - - # else, handle filtering and retrieval same for each input job - else: - # retrieve from Mongo - pipeline = json.dumps([ - { - '$match': { - 'job_id':{ - '$in':input_job_ids - } - } - }, - { - '$project': { field_name:1 for field_name in CombineRecordSchema().field_names } - } - ]) - records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # optionally filter - if filter_input_records: - records = self.record_input_filters(records) - - # add CombineRecordSchema columns if absent - records = self.add_missing_columns(records) - - # count breakdown of input jobs/records, save to Job - self.count_input_records(records) - - # return - return records - - - def add_missing_columns(self, records): - - ''' - Method to ensure records dataframe has all required columns from CombineRecordSchema - - Args: - records (DataFrame): dataframe of records - ''' - - # loop through required columns from CombineRecordSchema - self.logger.info("check for missing columns from CombineRecordSchema") - for field_name in CombineRecordSchema().field_names: - if field_name not in 
records.columns: - self.logger.info("adding column: %s" % field_name) - records = records.withColumn(field_name, pyspark_sql_functions.lit('')) - return records - - - def save_records(self, - records_df=None, - write_avro=settings.WRITE_AVRO, - index_records=settings.INDEX_TO_ES, - assign_combine_id=False): - - ''' - Method to index records to DB and trigger indexing to ElasticSearch (ES) - - Args: - records_df (pyspark.sql.DataFrame): records as pyspark DataFrame - write_avro (bool): boolean to write avro files to disk after DB indexing - index_records (bool): boolean to index to ES - assign_combine_id (bool): if True, establish `combine_id` column and populate with UUID - - Returns: - None - - determines if record_id unique among records DataFrame - - selects only columns that match CombineRecordSchema - - writes to DB, writes to avro files - ''' - - # assign combine ID - if assign_combine_id: - combine_id_udf = udf(lambda record_id: str(uuid.uuid4()), StringType()) - records_df = records_df.withColumn('combine_id', combine_id_udf(records_df.record_id)) - - # run record identifier transformation scenario if provided - records_df = self.run_rits(records_df) - - # check uniqueness (overwrites if column already exists) - records_df = records_df.withColumn("unique", ( - pyspark_sql_functions.count('record_id')\ - .over(Window.partitionBy('record_id')) == True)\ - .cast('boolean')) - - # add valid column - records_df = records_df.withColumn('valid', pyspark_sql_functions.lit(True)) - - # add DPLA Bulk Data Match (dbdm) column - records_df = records_df.withColumn('dbdm', pyspark_sql_functions.lit(False)) - - # ensure columns to avro and DB - records_df_combine_cols = records_df.select(CombineRecordSchema().field_names) - - # write avro, coalescing for output - if write_avro: - records_df_combine_cols.coalesce(settings.SPARK_REPARTITION)\ - .write.format("com.databricks.spark.avro").save(self.job.job_output) - - # write records to MongoDB - self.update_jobGroup('Saving Records to DB') - records_df_combine_cols.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() - - # check if anything written to DB to continue, else abort - if self.job.get_records().count() > 0: - - # read rows from Mongo with minted ID for future stages - pipeline = json.dumps({'$match': {'job_id': self.job.id, 'success': True}}) - db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # index to ElasticSearch - self.update_jobGroup('Indexing to ElasticSearch') - if index_records and settings.INDEX_TO_ES: - es_rdd = ESIndex.index_job_to_es_spark( - self.spark, - job=self.job, - records_df=db_records, - field_mapper_config=self.job_details['field_mapper_config'] - ) - - # run Validation Scenarios - if 'validation_scenarios' in self.job_details.keys(): - self.update_jobGroup('Running Validation Scenarios') - vs = ValidationScenarioSpark( - spark=self.spark, - job=self.job, - records_df=db_records, - validation_scenarios = self.job_details['validation_scenarios'] - ) - vs.run_record_validation_scenarios() - - # handle DPLA Bulk Data matching, 
rewriting/updating records where match is found - self.dpla_bulk_data_compare(db_records, es_rdd) - - # return - return db_records - - else: - raise Exception("No successful records written to disk for Job: %s" % self.job.name) - - - def record_input_filters(self, filtered_df, input_filters=None): - - ''' - Method to apply filters to input Records - - Args: - spark (pyspark.sql.session.SparkSession): provided by pyspark context - records_df (pyspark.sql.DataFrame): DataFrame of records pre validity filtering - - Returns: - (pyspark.sql.DataFrame): DataFrame of records post filtering - ''' - - # use input filters if provided, else fall back to job - if input_filters == None: - input_filters = self.job_details['input_filters'] - - # filter to input record appropriate field - # filtered_df = filtered_df.select(CombineRecordSchema().field_names) - - # handle validity filters - input_validity_valve = input_filters['input_validity_valve'] - - # filter to valid or invalid records - # return valid records - if input_validity_valve == 'valid': - filtered_df = filtered_df.filter(filtered_df.valid == 1) - - # return invalid records - elif input_validity_valve == 'invalid': - filtered_df = filtered_df.filter(filtered_df.valid == 0) - - # handle numerical filters - input_numerical_valve = input_filters['input_numerical_valve'] - if input_numerical_valve != None: - filtered_df = filtered_df.limit(input_numerical_valve) - - # handle es query valve - if 'input_es_query_valve' in input_filters.keys(): - input_es_query_valve = input_filters['input_es_query_valve'] - if input_es_query_valve not in [None,'{}']: - filtered_df = self.es_query_valve_filter(input_es_query_valve, filtered_df) - - # filter duplicates - if 'filter_dupe_record_ids' in input_filters.keys() and input_filters['filter_dupe_record_ids'] == True: - filtered_df = filtered_df.dropDuplicates(['record_id']) - - # after input filtering which might leverage db_id, drop - filtered_df = filtered_df.select([ c for c in filtered_df.columns if c != '_id' ]) - - # return - return filtered_df - - - def count_input_records(self, records): - - ''' - Method to count records by job_id - - count records from input jobs if > 1 - - otherwise assume Job.udpate_status() will calculate from single input job - - Args: - records (dataframe): Records to count based on job_id - ''' - - refresh_django_db_connection() - if 'input_job_ids' in self.job_details.keys() and len(self.job_details['input_job_ids']) > 1: - - # cache - records.cache() - - # copy input job ids to mark done (cast to int) - input_jobs = [int(job_id) for job_id in self.job_details['input_job_ids'].copy()] - - # group by job_ids - record_counts = records.groupBy('job_id').count() - - # loop through input jobs, init, and write - for input_job_count in record_counts.collect(): + def __init__(self, spark, **kwargs): - # remove from input_jobs - input_jobs.remove(input_job_count['job_id']) + self.spark = spark - # set passed records and save - input_job = JobInput.objects.filter(job_id=self.job.id, input_job_id=int(input_job_count['job_id'])).first() - input_job.passed_records = input_job_count['count'] - input_job.save() + self.kwargs = kwargs - # loop through any remaining jobs, where absence indicates 0 records passed - for input_job_id in input_jobs: + # init logging support + spark.sparkContext.setLogLevel('INFO') + log4jLogger = spark.sparkContext._jvm.org.apache.log4j + self.logger = log4jLogger.LogManager.getLogger(__name__) + + def init_job(self): + + # refresh Django DB Connection + 
refresh_django_db_connection() + + # get job + results = polling.poll(lambda: Job.objects.filter(id=int(self.kwargs['job_id'])).count() == 1, step=1, + timeout=60) + self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) + + # start job_track instance, marking job start + self.job_track = JobTrack( + job_id=self.job.id + ) + self.job_track.save() + + # retrieve job_details + self.job_details = self.job.job_details_dict + + def close_job(self): + + ''' + Note to Job tracker that finished, and perform other long-running, one-time calculations + to speed up front-end + ''' + + refresh_django_db_connection() + + # if re-run, check if job was previously published and republish + if 'published' in self.job.job_details_dict.keys(): + if self.job.job_details_dict['published']['status'] == True: + self.logger.info('job params flagged for publishing') + self.job.publish(publish_set_id=self.job.publish_set_id) + elif self.job.job_details_dict['published']['status'] == False: + self.logger.info('job params flagged for unpublishing') + self.job.unpublish() + + # finally, update finish_timestamp of job_track instance + self.job_track.finish_timestamp = datetime.datetime.now() + self.job_track.save() + + # count new job validations + for jv in self.job.jobvalidation_set.filter(failure_count=None): + jv.validation_failure_count(force_recount=True) + + # unpersist cached dataframes + self.spark.catalog.clearCache() + + def update_jobGroup(self, description): + + ''' + Method to update spark jobGroup + ''' + + self.logger.info("### %s" % description) + self.spark.sparkContext.setJobGroup("%s" % self.job.id, "%s, Job #%s" % (description, self.job.id)) + + def get_input_records(self, filter_input_records=True): + + # get input job ids + input_job_ids = [int(job_id) for job_id in self.job_details['input_job_ids']] + + # if job_specific input filters set, handle + if 'job_specific' in self.job_details['input_filters'].keys() and len( + self.job_details['input_filters']['job_specific']) > 0: + + # debug + self.logger.info("Job specific input filters found, handling") + + # convenience dict + job_spec_dicts = self.job_details['input_filters']['job_specific'] + + # init list of dataframes to have union performed + job_spec_dfs = [] + + # remove job_specific from input_jobs + for spec_input_job_id in self.job_details['input_filters']['job_specific'].keys(): + input_job_ids.remove(int(spec_input_job_id)) + + # handle remaining, if any, non-specified jobs as per normal + if len(input_job_ids) > 0: + # retrieve from Mongo + pipeline = json.dumps([ + { + '$match': { + 'job_id': { + '$in': input_job_ids + } + } + }, + { + '$project': {field_name: 1 for field_name in CombineRecordSchema().field_names} + } + ]) + records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", + settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # optionally filter + if filter_input_records: + records = self.record_input_filters(records) + + # add CombineRecordSchema columns if absent + records = self.add_missing_columns(records) + + # append to list of dataframes + job_spec_dfs.append(records) + + # group like/identical input filter parameters, run together + # 
https://stackoverflow.com/questions/52484043/group-key-value-pairs-in-python-dictionary-by-value-maintaining-original-key-as + grouped_spec_dicts = [ + {'input_filters': k, 'job_ids': [int(job_id) for job_id in list(map(itemgetter(0), g))]} for k, g in + groupby(sorted(job_spec_dicts.items(), key=lambda t: t[1].items()), itemgetter(1))] + + # next, loop through spec'ed jobs, retrieve and filter + for job_spec_group in grouped_spec_dicts: + + # debug + self.logger.info("Handling specific input filters for job ids: %s" % job_spec_group['job_ids']) + + # handle remaining, non-specified jobs as per normal + # retrieve from Mongo + pipeline = json.dumps([ + { + '$match': { + 'job_id': { + '$in': job_spec_group['job_ids'] + } + } + }, + { + '$project': {field_name: 1 for field_name in CombineRecordSchema().field_names} + } + ]) + job_spec_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", + settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # optionally filter + if filter_input_records: + job_spec_records = self.record_input_filters(job_spec_records, + input_filters=job_spec_group['input_filters']) + + # add CombineRecordSchema columns if absent + job_spec_records = self.add_missing_columns(job_spec_records) + + # append dataframe + job_spec_dfs.append(job_spec_records) + + # union spec'ed jobs with unspec'ed records + self.logger.info("union-izing all job dataframes") + unioned_records = df_union_all(job_spec_dfs) + + # count breakdown of input jobs/records, save to Job + self.count_input_records(unioned_records) + + # finally, return records + return unioned_records + + # else, handle filtering and retrieval same for each input job + else: + # retrieve from Mongo + pipeline = json.dumps([ + { + '$match': { + 'job_id': { + '$in': input_job_ids + } + } + }, + { + '$project': {field_name: 1 for field_name in CombineRecordSchema().field_names} + } + ]) + records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # optionally filter + if filter_input_records: + records = self.record_input_filters(records) + + # add CombineRecordSchema columns if absent + records = self.add_missing_columns(records) + + # count breakdown of input jobs/records, save to Job + self.count_input_records(records) + + # return + return records + + def add_missing_columns(self, records): + + ''' + Method to ensure records dataframe has all required columns from CombineRecordSchema + + Args: + records (DataFrame): dataframe of records + ''' + + # loop through required columns from CombineRecordSchema + self.logger.info("check for missing columns from CombineRecordSchema") + for field_name in CombineRecordSchema().field_names: + if field_name not in records.columns: + self.logger.info("adding column: %s" % field_name) + records = records.withColumn(field_name, pyspark_sql_functions.lit('')) + return records + + def save_records(self, + records_df=None, + write_avro=settings.WRITE_AVRO, 
+ index_records=settings.INDEX_TO_ES, + assign_combine_id=False): + + ''' + Method to index records to DB and trigger indexing to ElasticSearch (ES) + + Args: + records_df (pyspark.sql.DataFrame): records as pyspark DataFrame + write_avro (bool): boolean to write avro files to disk after DB indexing + index_records (bool): boolean to index to ES + assign_combine_id (bool): if True, establish `combine_id` column and populate with UUID + + Returns: + None + - determines if record_id unique among records DataFrame + - selects only columns that match CombineRecordSchema + - writes to DB, writes to avro files + ''' + + # assign combine ID + if assign_combine_id: + combine_id_udf = udf(lambda record_id: str(uuid.uuid4()), StringType()) + records_df = records_df.withColumn('combine_id', combine_id_udf(records_df.record_id)) + + # run record identifier transformation scenario if provided + records_df = self.run_rits(records_df) + + # check uniqueness (overwrites if column already exists) + records_df = records_df.withColumn("unique", ( + pyspark_sql_functions.count('record_id') \ + .over(Window.partitionBy('record_id')) == True) \ + .cast('boolean')) + + # add valid column + records_df = records_df.withColumn('valid', pyspark_sql_functions.lit(True)) + + # add DPLA Bulk Data Match (dbdm) column + records_df = records_df.withColumn('dbdm', pyspark_sql_functions.lit(False)) + + # ensure columns to avro and DB + records_df_combine_cols = records_df.select(CombineRecordSchema().field_names) + + # write avro, coalescing for output + if write_avro: + records_df_combine_cols.coalesce(settings.SPARK_REPARTITION) \ + .write.format("com.databricks.spark.avro").save(self.job.job_output) + + # write records to MongoDB + self.update_jobGroup('Saving Records to DB') + records_df_combine_cols.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() + + # check if anything written to DB to continue, else abort + if self.job.get_records().count() > 0: + + # read rows from Mongo with minted ID for future stages + pipeline = json.dumps({'$match': {'job_id': self.job.id, 'success': True}}) + db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # index to ElasticSearch + self.update_jobGroup('Indexing to ElasticSearch') + if index_records and settings.INDEX_TO_ES: + es_rdd = ESIndex.index_job_to_es_spark( + self.spark, + job=self.job, + records_df=db_records, + field_mapper_config=self.job_details['field_mapper_config'] + ) + + # run Validation Scenarios + if 'validation_scenarios' in self.job_details.keys(): + self.update_jobGroup('Running Validation Scenarios') + vs = ValidationScenarioSpark( + spark=self.spark, + job=self.job, + records_df=db_records, + validation_scenarios=self.job_details['validation_scenarios'] + ) + vs.run_record_validation_scenarios() + + # handle DPLA Bulk Data matching, rewriting/updating records where match is found + self.dpla_bulk_data_compare(db_records, es_rdd) + + # return + return db_records + + else: + raise Exception("No successful records written to disk for Job: %s" % self.job.name) 
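The reindented `save_records()` above flags duplicate identifiers with a window count over `record_id` (`pyspark_sql_functions.count('record_id').over(Window.partitionBy('record_id'))`). A minimal standalone sketch of that technique, assuming only a local PySpark session; the sample rows, session name, and the explicit `== 1` comparison are illustrative and not part of the patch:

    # Sketch of the window-count "unique" flag used by save_records();
    # the session and sample data here are illustrative only.
    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.master("local[*]").appName("unique-flag-sketch").getOrCreate()

    records_df = spark.createDataFrame(
        [("rec-1", "<doc/>"), ("rec-2", "<doc/>"), ("rec-1", "<doc/>")],
        ["record_id", "document"],
    )

    # count rows sharing each record_id; a record is unique when that count is exactly 1
    records_df = records_df.withColumn(
        "unique",
        (F.count("record_id").over(Window.partitionBy("record_id")) == 1).cast("boolean"),
    )

    records_df.show()  # rec-2 -> unique=true; both rec-1 rows -> unique=false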
+ + def record_input_filters(self, filtered_df, input_filters=None): + + ''' + Method to apply filters to input Records + + Args: + spark (pyspark.sql.session.SparkSession): provided by pyspark context + records_df (pyspark.sql.DataFrame): DataFrame of records pre validity filtering + + Returns: + (pyspark.sql.DataFrame): DataFrame of records post filtering + ''' + + # use input filters if provided, else fall back to job + if input_filters == None: + input_filters = self.job_details['input_filters'] + + # filter to input record appropriate field + # filtered_df = filtered_df.select(CombineRecordSchema().field_names) + + # handle validity filters + input_validity_valve = input_filters['input_validity_valve'] + + # filter to valid or invalid records + # return valid records + if input_validity_valve == 'valid': + filtered_df = filtered_df.filter(filtered_df.valid == 1) + + # return invalid records + elif input_validity_valve == 'invalid': + filtered_df = filtered_df.filter(filtered_df.valid == 0) + + # handle numerical filters + input_numerical_valve = input_filters['input_numerical_valve'] + if input_numerical_valve != None: + filtered_df = filtered_df.limit(input_numerical_valve) + + # handle es query valve + if 'input_es_query_valve' in input_filters.keys(): + input_es_query_valve = input_filters['input_es_query_valve'] + if input_es_query_valve not in [None, '{}']: + filtered_df = self.es_query_valve_filter(input_es_query_valve, filtered_df) + + # filter duplicates + if 'filter_dupe_record_ids' in input_filters.keys() and input_filters['filter_dupe_record_ids'] == True: + filtered_df = filtered_df.dropDuplicates(['record_id']) + + # after input filtering which might leverage db_id, drop + filtered_df = filtered_df.select([c for c in filtered_df.columns if c != '_id']) + + # return + return filtered_df + + def count_input_records(self, records): + + ''' + Method to count records by job_id + - count records from input jobs if > 1 + - otherwise assume Job.udpate_status() will calculate from single input job - input_job = JobInput.objects.filter(job_id=self.job.id, input_job_id=int(input_job_id)).first() - input_job.passed_records = 0 - input_job.save() + Args: + records (dataframe): Records to count based on job_id + ''' + refresh_django_db_connection() + if 'input_job_ids' in self.job_details.keys() and len(self.job_details['input_job_ids']) > 1: - def es_query_valve_filter(self, input_es_query_valve, filtered_df): + # cache + records.cache() - ''' - Method to handle input valve based on ElasticSearch query + # copy input job ids to mark done (cast to int) + input_jobs = [int(job_id) for job_id in self.job_details['input_job_ids'].copy()] - - perform union if multiple input Jobs are used + # group by job_ids + record_counts = records.groupBy('job_id').count() - ''' + # loop through input jobs, init, and write + for input_job_count in record_counts.collect(): + # remove from input_jobs + input_jobs.remove(input_job_count['job_id']) - # prepare input jobs list - if 'input_job_ids' in self.job_details.keys(): - input_jobs_ids = [int(job_id) for job_id in self.job_details['input_job_ids']] - elif 'input_job_id' in self.job_details: - input_jobs_ids = [int(self.job_details['input_job_id'])] + # set passed records and save + input_job = JobInput.objects.filter(job_id=self.job.id, + input_job_id=int(input_job_count['job_id'])).first() + input_job.passed_records = input_job_count['count'] + input_job.save() - # loop through and create es.resource string - es_indexes = ','.join([ 'j%s' % job_id for 
job_id in input_jobs_ids]) + # loop through any remaining jobs, where absence indicates 0 records passed + for input_job_id in input_jobs: + input_job = JobInput.objects.filter(job_id=self.job.id, input_job_id=int(input_job_id)).first() + input_job.passed_records = 0 + input_job.save() - # get es index as RDD - es_rdd = self.spark.sparkContext.newAPIHadoopRDD( - inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", - keyClass="org.apache.hadoop.io.NullWritable", - valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", - conf={ - "es.resource":"%s/record" % es_indexes, - "es.nodes":"%s:9200" % settings.ES_HOST, - "es.query":input_es_query_valve, - "es.read.field.exclude":"*"}) - es_df = es_rdd.map(lambda row: (row[0], )).toDF() + def es_query_valve_filter(self, input_es_query_valve, filtered_df): - # perform join on ES documents - filtered_df = filtered_df.join(es_df, filtered_df['_id']['oid'] == es_df['_1'], 'leftsemi') + ''' + Method to handle input valve based on ElasticSearch query - # return - return filtered_df + - perform union if multiple input Jobs are used + ''' - def run_rits(self, records_df): + # prepare input jobs list + if 'input_job_ids' in self.job_details.keys(): + input_jobs_ids = [int(job_id) for job_id in self.job_details['input_job_ids']] + elif 'input_job_id' in self.job_details: + input_jobs_ids = [int(self.job_details['input_job_id'])] - ''' - Method to run Record Identifier Transformation Scenarios (rits) if present. - - RITS can be of three types: - 1) 'regex' - Java Regular Expressions - 2) 'python' - Python Code Snippets - 3) 'xpath' - XPath expression - - Each are handled differently, but all strive to return a dataframe (records_df), - with the `record_id` column modified. - - Args: - records_df (pyspark.sql.DataFrame): records as pyspark DataFrame - rits_id (string|int): DB identifier of pre-configured RITS - - Returns: - - ''' - - # get rits ID from kwargs - rits_id = self.job_details.get('rits', False) - - # if rits id provided - if rits_id and rits_id != None: - - # get RITS - rits = RecordIdentifierTransformationScenario.objects.get(pk=int(rits_id)) - - # handle regex - if rits.transformation_type == 'regex': - - # define udf function for python transformation - def regex_record_id_trans_udf(row, match, replace, trans_target): - - try: - - # use python's re module to perform regex - if trans_target == 'record_id': - trans_result = re.sub(match, replace, row.record_id) - if trans_target == 'document': - trans_result = re.sub(match, replace, row.document) - - # run transformation - success = True - error = row.error - - except Exception as e: - trans_result = str(e) - error = 'record_id transformation failure' - success = False - - # return Row - return Row( - combine_id = row.combine_id, - record_id = trans_result, - document = row.document, - error = error, - job_id = row.job_id, - oai_set = row.oai_set, - success = success, - fingerprint = row.fingerprint, - transformed = row.transformed - ) - - # transform via rdd.map and return - match = rits.regex_match_payload - replace = rits.regex_replace_payload - trans_target = rits.transformation_target - records_rdd = records_df.rdd.map(lambda row: regex_record_id_trans_udf(row, match, replace, trans_target)) - records_df = records_rdd.toDF() - - # handle python - if rits.transformation_type == 'python': - - # define udf function for python transformation - def python_record_id_trans_udf(row, python_code, trans_target): - - try: - # get python function from Transformation Scenario - temp_mod = 
ModuleType('temp_mod') - exec(python_code, temp_mod.__dict__) - - # establish python udf record - if trans_target == 'record_id': - pyudfr = PythonUDFRecord(None, non_row_input = True, record_id = row.record_id) - if trans_target == 'document': - pyudfr = PythonUDFRecord(None, non_row_input = True, document = row.document) - - # run transformation - trans_result = temp_mod.transform_identifier(pyudfr) - success = True - error = row.error - - except Exception as e: - trans_result = str(e) - error = 'record_id transformation failure' - success = False - - # return Row - return Row( - combine_id = row.combine_id, - record_id = trans_result, - document = row.document, - error = error, - job_id = row.job_id, - oai_set = row.oai_set, - success = success, - fingerprint = row.fingerprint, - transformed = row.transformed - ) - - # transform via rdd.map and return - python_code = rits.python_payload - trans_target = rits.transformation_target - records_rdd = records_df.rdd.map(lambda row: python_record_id_trans_udf(row, python_code, trans_target)) - records_df = records_rdd.toDF() - - # handle xpath - if rits.transformation_type == 'xpath': - - ''' - Currently XPath RITS are handled via python and etree, - but might be worth investigating if this could be performed - via pyjxslt to support XPath 2.0 - ''' - - # define udf function for xpath expression - def xpath_record_id_trans_udf(row, xpath): - - # establish python udf record, forcing 'document' type trans for XPath - pyudfr = PythonUDFRecord(None, non_row_input = True, document = row.document) - - # run xpath and retrieve value - xpath_query = pyudfr.xml.xpath(xpath, namespaces=pyudfr.nsmap) - if len(xpath_query) == 1: - trans_result = xpath_query[0].text - success = True - error = row.error - elif len(xpath_query) == 0: - trans_result = 'xpath expression found nothing' - success = False - error = 'record_id transformation failure' - else: - trans_result = 'more than one node found for XPath query' - success = False - error = 'record_id transformation failure' - - # return Row - return Row( - combine_id = row.combine_id, - record_id = trans_result, - document = row.document, - error = error, - job_id = row.job_id, - oai_set = row.oai_set, - success = success, - fingerprint = row.fingerprint, - transformed = row.transformed - ) - - # transform via rdd.map and return - xpath = rits.xpath_payload - records_rdd = records_df.rdd.map(lambda row: xpath_record_id_trans_udf(row, xpath)) - records_df = records_rdd.toDF() - - # return - return records_df - - # else return dataframe untouched - else: - return records_df - - - def dpla_bulk_data_compare(self, records_df, es_rdd): - - ''' - Method to compare against bulk data if provided - - Args: - records_df (dataframe): records post-write to DB - es_rdd (rdd): RDD of documents as written to ElasticSearch - Columns: - _1 : boolean, 'success'/'failure' - _2 : map, mapped fields - ''' - - self.logger.info('Running DPLA Bulk Data Compare') - self.update_jobGroup('Running DPLA Bulk Data Compare') - - # check for dbdm params, get dbdd ID from kwargs - if 'dbdm' in self.job_details.keys(): - dbdd_id = self.job_details['dbdm'].get('dbdd', False) - else: - dbdd_id = False - - # if rits id provided - if dbdd_id and dbdd_id != None: - - self.logger.info('DBDD id provided, retrieving and running...') - - # get dbdd instance - dbdd = DPLABulkDataDownload.objects.get(pk=int(dbdd_id)) - self.logger.info('DBDD retrieved: %s @ ES index %s' % (dbdd.s3_key, dbdd.es_index)) - - # get DPLA bulk data from ES as DF - dpla_df = 
get_job_es(self.spark, indices=[dbdd.es_index], doc_type='item') - - # get job mapped fields from es_rdd - es_df = es_rdd.toDF() - - # join on isShownAt - matches_df = es_df.join(dpla_df, es_df['_2']['dpla_isShownAt'] == dpla_df['isShownAt'], 'leftsemi') - - # select records from records_df for updating (writing) - update_dbdm_df = records_df.join(matches_df, records_df['_id']['oid'] == matches_df['_2']['db_id'], 'leftsemi') - - # set dbdm column to True - update_dbdm_df = update_dbdm_df.withColumn('dbdm', pyspark_sql_functions.lit(True)) - - # write to DB - update_dbdm_df.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() - - # else, return with dbdm column all False - else: - return records_df.withColumn('dbdm', pyspark_sql_functions.lit(False)) - - - def fingerprint_records(self, df): - - ''' - Method to generate a crc32 hash "fingerprint" for each Record - ''' - - # fingerprint Record document - df = df.withColumn('fingerprint', crc32(df.document)) - return df + # loop through and create es.resource string + es_indexes = ','.join(['j%s' % job_id for job_id in input_jobs_ids]) + # get es index as RDD + es_rdd = self.spark.sparkContext.newAPIHadoopRDD( + inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", + keyClass="org.apache.hadoop.io.NullWritable", + valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", + conf={ + "es.resource": "%s/record" % es_indexes, + "es.nodes": "%s:9200" % settings.ES_HOST, + "es.query": input_es_query_valve, + "es.read.field.exclude": "*"}) + es_df = es_rdd.map(lambda row: (row[0],)).toDF() + + # perform join on ES documents + filtered_df = filtered_df.join(es_df, filtered_df['_id']['oid'] == es_df['_1'], 'leftsemi') + + # return + return filtered_df + + def run_rits(self, records_df): + + ''' + Method to run Record Identifier Transformation Scenarios (rits) if present. + + RITS can be of three types: + 1) 'regex' - Java Regular Expressions + 2) 'python' - Python Code Snippets + 3) 'xpath' - XPath expression + + Each are handled differently, but all strive to return a dataframe (records_df), + with the `record_id` column modified. 
+ + Args: + records_df (pyspark.sql.DataFrame): records as pyspark DataFrame + rits_id (string|int): DB identifier of pre-configured RITS + + Returns: + + ''' + + # get rits ID from kwargs + rits_id = self.job_details.get('rits', False) + + # if rits id provided + if rits_id and rits_id != None: + + # get RITS + rits = RecordIdentifierTransformationScenario.objects.get(pk=int(rits_id)) + + # handle regex + if rits.transformation_type == 'regex': + + # define udf function for python transformation + def regex_record_id_trans_udf(row, match, replace, trans_target): + + try: + + # use python's re module to perform regex + if trans_target == 'record_id': + trans_result = re.sub(match, replace, row.record_id) + if trans_target == 'document': + trans_result = re.sub(match, replace, row.document) + + # run transformation + success = True + error = row.error + + except Exception as e: + trans_result = str(e) + error = 'record_id transformation failure' + success = False + + # return Row + return Row( + combine_id=row.combine_id, + record_id=trans_result, + document=row.document, + error=error, + job_id=row.job_id, + oai_set=row.oai_set, + success=success, + fingerprint=row.fingerprint, + transformed=row.transformed + ) + + # transform via rdd.map and return + match = rits.regex_match_payload + replace = rits.regex_replace_payload + trans_target = rits.transformation_target + records_rdd = records_df.rdd.map( + lambda row: regex_record_id_trans_udf(row, match, replace, trans_target)) + records_df = records_rdd.toDF() + + # handle python + if rits.transformation_type == 'python': + + # define udf function for python transformation + def python_record_id_trans_udf(row, python_code, trans_target): + + try: + # get python function from Transformation Scenario + temp_mod = ModuleType('temp_mod') + exec(python_code, temp_mod.__dict__) + + # establish python udf record + if trans_target == 'record_id': + pyudfr = PythonUDFRecord(None, non_row_input=True, record_id=row.record_id) + if trans_target == 'document': + pyudfr = PythonUDFRecord(None, non_row_input=True, document=row.document) + + # run transformation + trans_result = temp_mod.transform_identifier(pyudfr) + success = True + error = row.error + + except Exception as e: + trans_result = str(e) + error = 'record_id transformation failure' + success = False + + # return Row + return Row( + combine_id=row.combine_id, + record_id=trans_result, + document=row.document, + error=error, + job_id=row.job_id, + oai_set=row.oai_set, + success=success, + fingerprint=row.fingerprint, + transformed=row.transformed + ) + + # transform via rdd.map and return + python_code = rits.python_payload + trans_target = rits.transformation_target + records_rdd = records_df.rdd.map(lambda row: python_record_id_trans_udf(row, python_code, trans_target)) + records_df = records_rdd.toDF() + + # handle xpath + if rits.transformation_type == 'xpath': + + ''' + Currently XPath RITS are handled via python and etree, + but might be worth investigating if this could be performed + via pyjxslt to support XPath 2.0 + ''' + + # define udf function for xpath expression + def xpath_record_id_trans_udf(row, xpath): + + # establish python udf record, forcing 'document' type trans for XPath + pyudfr = PythonUDFRecord(None, non_row_input=True, document=row.document) + + # run xpath and retrieve value + xpath_query = pyudfr.xml.xpath(xpath, namespaces=pyudfr.nsmap) + if len(xpath_query) == 1: + trans_result = xpath_query[0].text + success = True + error = row.error + elif len(xpath_query) == 
0: + trans_result = 'xpath expression found nothing' + success = False + error = 'record_id transformation failure' + else: + trans_result = 'more than one node found for XPath query' + success = False + error = 'record_id transformation failure' + + # return Row + return Row( + combine_id=row.combine_id, + record_id=trans_result, + document=row.document, + error=error, + job_id=row.job_id, + oai_set=row.oai_set, + success=success, + fingerprint=row.fingerprint, + transformed=row.transformed + ) + + # transform via rdd.map and return + xpath = rits.xpath_payload + records_rdd = records_df.rdd.map(lambda row: xpath_record_id_trans_udf(row, xpath)) + records_df = records_rdd.toDF() + + # return + return records_df + + # else return dataframe untouched + else: + return records_df + + def dpla_bulk_data_compare(self, records_df, es_rdd): + + ''' + Method to compare against bulk data if provided + + Args: + records_df (dataframe): records post-write to DB + es_rdd (rdd): RDD of documents as written to ElasticSearch + Columns: + _1 : boolean, 'success'/'failure' + _2 : map, mapped fields + ''' + + self.logger.info('Running DPLA Bulk Data Compare') + self.update_jobGroup('Running DPLA Bulk Data Compare') + + # check for dbdm params, get dbdd ID from kwargs + if 'dbdm' in self.job_details.keys(): + dbdd_id = self.job_details['dbdm'].get('dbdd', False) + else: + dbdd_id = False + + # if rits id provided + if dbdd_id and dbdd_id != None: + + self.logger.info('DBDD id provided, retrieving and running...') + + # get dbdd instance + dbdd = DPLABulkDataDownload.objects.get(pk=int(dbdd_id)) + self.logger.info('DBDD retrieved: %s @ ES index %s' % (dbdd.s3_key, dbdd.es_index)) + + # get DPLA bulk data from ES as DF + dpla_df = get_job_es(self.spark, indices=[dbdd.es_index], doc_type='item') + + # get job mapped fields from es_rdd + es_df = es_rdd.toDF() + + # join on isShownAt + matches_df = es_df.join(dpla_df, es_df['_2']['dpla_isShownAt'] == dpla_df['isShownAt'], 'leftsemi') + + # select records from records_df for updating (writing) + update_dbdm_df = records_df.join(matches_df, records_df['_id']['oid'] == matches_df['_2']['db_id'], + 'leftsemi') + + # set dbdm column to True + update_dbdm_df = update_dbdm_df.withColumn('dbdm', pyspark_sql_functions.lit(True)) + + # write to DB + update_dbdm_df.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() + + # else, return with dbdm column all False + else: + return records_df.withColumn('dbdm', pyspark_sql_functions.lit(False)) + + def fingerprint_records(self, df): + + ''' + Method to generate a crc32 hash "fingerprint" for each Record + ''' + + # fingerprint Record document + df = df.withColumn('fingerprint', crc32(df.document)) + return df class HarvestOAISpark(CombineSparkJob): - - ''' - Spark code for harvesting OAI records - ''' - - def spark_function(self): - - ''' - Harvest records via OAI. 
- - As a harvest type job, unlike other jobs, this introduces various fields to the Record for the first time: - - record_id - - job_id - - oai_set - - publish_set_id - - unique (TBD) - - Args: - spark (pyspark.sql.session.SparkSession): provided by pyspark context - job_id (int): Job ID - - Returns: - None: - - harvests OAI records and writes to disk as avro files - - indexes records into DB - - map / flatten records and indexes to ES - ''' - - # init job - self.init_job() - self.update_jobGroup('Running Harvest OAI Job') - - # prepare to harvest OAI records via Ingestion3 - df = self.spark.read.format("dpla.ingestion3.harvesters.oai")\ - .option("endpoint", self.job_details['oai_params']['endpoint'])\ - .option("verb", self.job_details['oai_params']['verb'])\ - .option("metadataPrefix", self.job_details['oai_params']['metadataPrefix']) - - # remove scope entirely if harvesting all records, not sets - if self.job_details['oai_params']['scope_type'] != 'harvestAllRecords': - df = df.option(self.job_details['oai_params']['scope_type'], self.job_details['oai_params']['scope_value'])\ - - # harvest - df = df.load() - - # select records with content - records = df.select("record.*").where("record is not null") - - # repartition - records = records.repartition(settings.SPARK_REPARTITION) - - # if removing OAI record
- if not self.job_details['oai_params']['include_oai_record_header']: - # attempt to find and select element from OAI record, else filter out - def find_metadata_udf(document): - if type(document) == str: - xml_root = etree.fromstring(document) - m_root = xml_root.find('{http://www.openarchives.org/OAI/2.0/}metadata') - if m_root is not None: - # expecting only one child to element - m_children = m_root.getchildren() - if len(m_children) == 1: - m_child = m_children[0] - m_string = etree.tostring(m_child).decode('utf-8') - return m_string - else: - return 'none' - else: - return 'none' - - metadata_udf = udf(lambda col_val: find_metadata_udf(col_val), StringType()) - records = records.select(*[metadata_udf(col).alias('document') if col == 'document' else col for col in records.columns]) - - # filter where not none - records = records.filter(records.document != 'none') - - # establish 'success' column, setting all success for Harvest - records = records.withColumn('success', pyspark_sql_functions.lit(True)) - - # copy 'id' from OAI harvest to 'record_id' column - records = records.withColumn('record_id', records.id) - - # add job_id as column - job_id = self.job.id - job_id_udf = udf(lambda id: job_id, IntegerType()) - records = records.withColumn('job_id', job_id_udf(records.id)) - - # add oai_set, accomodating multiple sets - records = records.withColumn('oai_set', records.setIds) - - # add blank error column - error = udf(lambda id: '', StringType()) - records = records.withColumn('error', error(records.id)) - - # fingerprint records and set transformed - records = self.fingerprint_records(records) - records = records.withColumn('transformed', pyspark_sql_functions.lit(True)) - - # index records to DB and index to ElasticSearch - self.save_records( - records_df=records, - assign_combine_id=True - ) - - # close job - self.close_job() - + ''' + Spark code for harvesting OAI records + ''' + + def spark_function(self): + + ''' + Harvest records via OAI. + + As a harvest type job, unlike other jobs, this introduces various fields to the Record for the first time: + - record_id + - job_id + - oai_set + - publish_set_id + - unique (TBD) + + Args: + spark (pyspark.sql.session.SparkSession): provided by pyspark context + job_id (int): Job ID + + Returns: + None: + - harvests OAI records and writes to disk as avro files + - indexes records into DB + - map / flatten records and indexes to ES + ''' + + # init job + self.init_job() + self.update_jobGroup('Running Harvest OAI Job') + + # prepare to harvest OAI records via Ingestion3 + df = self.spark.read.format("dpla.ingestion3.harvesters.oai") \ + .option("endpoint", self.job_details['oai_params']['endpoint']) \ + .option("verb", self.job_details['oai_params']['verb']) \ + .option("metadataPrefix", self.job_details['oai_params']['metadataPrefix']) + + # remove scope entirely if harvesting all records, not sets + if self.job_details['oai_params']['scope_type'] != 'harvestAllRecords': + df = df.option(self.job_details['oai_params']['scope_type'], self.job_details['oai_params']['scope_value']) \ + \ + # harvest + df = df.load() + + # select records with content + records = df.select("record.*").where("record is not null") + + # repartition + records = records.repartition(settings.SPARK_REPARTITION) + + # if removing OAI record
+ if not self.job_details['oai_params']['include_oai_record_header']: + # attempt to find and select element from OAI record, else filter out + def find_metadata_udf(document): + if type(document) == str: + xml_root = etree.fromstring(document) + m_root = xml_root.find('{http://www.openarchives.org/OAI/2.0/}metadata') + if m_root is not None: + # expecting only one child to element + m_children = m_root.getchildren() + if len(m_children) == 1: + m_child = m_children[0] + m_string = etree.tostring(m_child).decode('utf-8') + return m_string + else: + return 'none' + else: + return 'none' + + metadata_udf = udf(lambda col_val: find_metadata_udf(col_val), StringType()) + records = records.select( + *[metadata_udf(col).alias('document') if col == 'document' else col for col in records.columns]) + + # filter where not none + records = records.filter(records.document != 'none') + + # establish 'success' column, setting all success for Harvest + records = records.withColumn('success', pyspark_sql_functions.lit(True)) + + # copy 'id' from OAI harvest to 'record_id' column + records = records.withColumn('record_id', records.id) + + # add job_id as column + job_id = self.job.id + job_id_udf = udf(lambda id: job_id, IntegerType()) + records = records.withColumn('job_id', job_id_udf(records.id)) + + # add oai_set, accomodating multiple sets + records = records.withColumn('oai_set', records.setIds) + + # add blank error column + error = udf(lambda id: '', StringType()) + records = records.withColumn('error', error(records.id)) + + # fingerprint records and set transformed + records = self.fingerprint_records(records) + records = records.withColumn('transformed', pyspark_sql_functions.lit(True)) + + # index records to DB and index to ElasticSearch + self.save_records( + records_df=records, + assign_combine_id=True + ) + + # close job + self.close_job() class HarvestStaticXMLSpark(CombineSparkJob): - - ''' - Spark code for harvesting static xml records - ''' - - def spark_function(self): - - ''' - Harvest static XML records provided by user. 
- - Expected input structure: - /foo/bar <-- self.static_payload - baz1.xml <-- record at self.xpath_query within file - baz2.xml - baz3.xml - - As a harvest type job, unlike other jobs, this introduces various fields to the Record for the first time: - - record_id - - job_id - - oai_set - - publish_set_id - - unique (TBD) - - Args: - spark (pyspark.sql.session.SparkSession): provided by pyspark context - kwargs: - job_id (int): Job ID - static_payload (str): path of static payload on disk - # TODO: add other kwargs here from static job - index_mapper (str): class name from core.spark.es, extending BaseMapper - validation_scenarios (list): list of Validadtion Scenario IDs - - Returns: - None: - - opens and parses static files from payload - - indexes records into DB - - map / flatten records and indexes to ES - ''' - - # init job - self.init_job() - self.update_jobGroup('Running Harvest Static Job') - - # use Spark-XML's XmlInputFormat to stream globbed files, parsing with user provided `document_element_root` - static_rdd = self.spark.sparkContext.newAPIHadoopFile( - 'file://%s/**' % self.job_details['payload_dir'].rstrip('/'), - 'com.databricks.spark.xml.XmlInputFormat', - 'org.apache.hadoop.io.LongWritable', - 'org.apache.hadoop.io.Text', - conf = { - 'xmlinput.start':'<%s>' % self.job_details['document_element_root'], - 'xmlinput.end':'' % self.job_details['document_element_root'], - 'xmlinput.encoding': 'utf-8' - } - ) - - - # parse namespaces - def get_namespaces(xml_node): - nsmap = {} - for ns in xml_node.xpath('//namespace::*'): - if ns[0]: - nsmap[ns[0]] = ns[1] - return nsmap - - - def parse_records_udf(job_id, row, job_details): - - # get doc string - doc_string = row[1] - - # if optional (additional) namespace declaration provided, use - if job_details['additional_namespace_decs']: - doc_string = re.sub( - ns_regex, - r'<%s %s>' % (job_details['document_element_root'], job_details['additional_namespace_decs']), - doc_string - ) - - try: - - # parse with lxml - try: - xml_root = etree.fromstring(doc_string.encode('utf-8')) - except Exception as e: - raise Exception('Could not parse record XML: %s' % str(e)) - - # get namespaces - nsmap = get_namespaces(xml_root) - - # get unique identifier - if job_details['xpath_record_id'] != '': - record_id = xml_root.xpath(job_details['xpath_record_id'], namespaces=nsmap) - if len(record_id) == 1: - record_id = record_id[0].text - elif len(xml_root) > 1: - raise AmbiguousIdentifier('multiple elements found for identifier xpath: %s' % job_details['xpath_record_id']) - elif len(xml_root) == 0: - raise AmbiguousIdentifier('no elements found for identifier xpath: %s' % job_details['xpath_record_id']) - else: - record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest() - - # return success Row - return Row( - record_id = record_id, - document = etree.tostring(xml_root).decode('utf-8'), - error = '', - job_id = int(job_id), - oai_set = '', - success = True - ) - - # catch missing or ambiguous identifiers - except AmbiguousIdentifier as e: - - # hash record string to produce a unique id - record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest() - - # return error Row - return Row( - record_id = record_id, - document = etree.tostring(xml_root).decode('utf-8'), - error = "AmbiguousIdentifier: %s" % str(e), - job_id = int(job_id), - oai_set = '', - success = True - ) - - # handle all other exceptions - except Exception as e: - - # hash record string to produce a unique id - record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest() - 
- # return error Row - return Row( - record_id = record_id, - document = doc_string, - error = str(e), - job_id = int(job_id), - oai_set = '', - success = False - ) - - # map with parse_records_udf - job_id = self.job.id - job_details = self.job_details - ns_regex = re.compile(r'<%s(.?|.+?)>' % self.job_details['document_element_root']) - records = static_rdd.map(lambda row: parse_records_udf(job_id, row, job_details)) - - # convert back to DF - records = records.toDF() - - # fingerprint records and set transformed - records = self.fingerprint_records(records) - records = records.withColumn('transformed', pyspark_sql_functions.lit(True)) - - # index records to DB and index to ElasticSearch - self.save_records( - records_df=records, - assign_combine_id=True - ) - - # close job - self.close_job() - + ''' + Spark code for harvesting static xml records + ''' + + def spark_function(self): + + ''' + Harvest static XML records provided by user. + + Expected input structure: + /foo/bar <-- self.static_payload + baz1.xml <-- record at self.xpath_query within file + baz2.xml + baz3.xml + + As a harvest type job, unlike other jobs, this introduces various fields to the Record for the first time: + - record_id + - job_id + - oai_set + - publish_set_id + - unique (TBD) + + Args: + spark (pyspark.sql.session.SparkSession): provided by pyspark context + kwargs: + job_id (int): Job ID + static_payload (str): path of static payload on disk + # TODO: add other kwargs here from static job + index_mapper (str): class name from core.spark.es, extending BaseMapper + validation_scenarios (list): list of Validadtion Scenario IDs + + Returns: + None: + - opens and parses static files from payload + - indexes records into DB + - map / flatten records and indexes to ES + ''' + + # init job + self.init_job() + self.update_jobGroup('Running Harvest Static Job') + + # use Spark-XML's XmlInputFormat to stream globbed files, parsing with user provided `document_element_root` + static_rdd = self.spark.sparkContext.newAPIHadoopFile( + 'file://%s/**' % self.job_details['payload_dir'].rstrip('/'), + 'com.databricks.spark.xml.XmlInputFormat', + 'org.apache.hadoop.io.LongWritable', + 'org.apache.hadoop.io.Text', + conf={ + 'xmlinput.start': '<%s>' % self.job_details['document_element_root'], + 'xmlinput.end': '' % self.job_details['document_element_root'], + 'xmlinput.encoding': 'utf-8' + } + ) + + # parse namespaces + def get_namespaces(xml_node): + nsmap = {} + for ns in xml_node.xpath('//namespace::*'): + if ns[0]: + nsmap[ns[0]] = ns[1] + return nsmap + + def parse_records_udf(job_id, row, job_details): + + # get doc string + doc_string = row[1] + + # if optional (additional) namespace declaration provided, use + if job_details['additional_namespace_decs']: + doc_string = re.sub( + ns_regex, + r'<%s %s>' % (job_details['document_element_root'], job_details['additional_namespace_decs']), + doc_string + ) + + try: + + # parse with lxml + try: + xml_root = etree.fromstring(doc_string.encode('utf-8')) + except Exception as e: + raise Exception('Could not parse record XML: %s' % str(e)) + + # get namespaces + nsmap = get_namespaces(xml_root) + + # get unique identifier + if job_details['xpath_record_id'] != '': + record_id = xml_root.xpath(job_details['xpath_record_id'], namespaces=nsmap) + if len(record_id) == 1: + record_id = record_id[0].text + elif len(xml_root) > 1: + raise AmbiguousIdentifier( + 'multiple elements found for identifier xpath: %s' % job_details['xpath_record_id']) + elif len(xml_root) == 0: + raise 
AmbiguousIdentifier( + 'no elements found for identifier xpath: %s' % job_details['xpath_record_id']) + else: + record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest() + + # return success Row + return Row( + record_id=record_id, + document=etree.tostring(xml_root).decode('utf-8'), + error='', + job_id=int(job_id), + oai_set='', + success=True + ) + + # catch missing or ambiguous identifiers + except AmbiguousIdentifier as e: + + # hash record string to produce a unique id + record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest() + + # return error Row + return Row( + record_id=record_id, + document=etree.tostring(xml_root).decode('utf-8'), + error="AmbiguousIdentifier: %s" % str(e), + job_id=int(job_id), + oai_set='', + success=True + ) + + # handle all other exceptions + except Exception as e: + + # hash record string to produce a unique id + record_id = hashlib.md5(doc_string.encode('utf-8')).hexdigest() + + # return error Row + return Row( + record_id=record_id, + document=doc_string, + error=str(e), + job_id=int(job_id), + oai_set='', + success=False + ) + + # map with parse_records_udf + job_id = self.job.id + job_details = self.job_details + ns_regex = re.compile(r'<%s(.?|.+?)>' % self.job_details['document_element_root']) + records = static_rdd.map(lambda row: parse_records_udf(job_id, row, job_details)) + + # convert back to DF + records = records.toDF() + + # fingerprint records and set transformed + records = self.fingerprint_records(records) + records = records.withColumn('transformed', pyspark_sql_functions.lit(True)) + + # index records to DB and index to ElasticSearch + self.save_records( + records_df=records, + assign_combine_id=True + ) + + # close job + self.close_job() class HarvestTabularDataSpark(CombineSparkJob): - - ''' - Spark code for harvesting tabular data (e.g. spreadsheets) - ''' - - def spark_function(self): - - ''' - Harvest tabular data provided by user, convert to XML records. - - handles Delimited data (e.g. 
csv, tsv) or JSON lines - - Args: - spark (pyspark.sql.session.SparkSession): provided by pyspark context - kwargs: - job_id (int): Job ID - static_payload (str): path of static payload on disk - # TODO: add other kwargs here from static job - index_mapper (str): class name from core.spark.es, extending BaseMapper - validation_scenarios (list): list of Validadtion Scenario IDs - - Returns: - None: - - opens and parses static files from payload - - indexes records into DB - - map / flatten records and indexes to ES - ''' - - # init job - self.init_job() - self.update_jobGroup('Running Harvest Tabular Data Job') - - # load CSV - if self.job_details['payload_filepath'].endswith('.csv'): - dc_df = self.spark.read.format('com.databricks.spark.csv')\ - .options(header=True, inferschema=True, multiLine=True)\ - .load('file://%s' % self.job_details['payload_filepath']) - - # load JSON - elif self.job_details['payload_filepath'].endswith('.json'): - dc_df = self.spark.read.json('file://%s' % self.job_details['payload_filepath']) - - # repartition - dc_df = dc_df.repartition(settings.SPARK_REPARTITION) - - # partition udf - def kvp_to_xml_pt_udf(pt): - - for row in pt: - - # get as dict - row_dict = row.asDict() - - # pop combine fields if exist, ascribe to new dictionary - fields = ['combine_id', 'db_id', 'fingerprint', 'publish_set_id', 'record_id', 'xml2kvp_meta'] - combine_vals_dict = { field:row_dict.pop(field, None) for field in fields } - - try: - - # convert dictionary to XML with XML2kvp - xml_record_str = XML2kvp.kvp_to_xml(row_dict, serialize_xml=True, **xml2kvp_config.__dict__) - - # return success Row - yield Row( - record_id = combine_vals_dict.get('record_id'), - document = xml_record_str, - error = '', - job_id = int(job_id), - oai_set = '', - success = True - ) - - # handle all other exceptions - except Exception as e: - - # return error Row - yield Row( - record_id = combine_vals_dict.get('record_id'), - document = '', - error = str(e), - job_id = int(job_id), - oai_set = '', - success = False - ) - - # mixin passed configurations with defaults - fm_config = json.loads(self.job_details['fm_harvest_config_json']) - xml2kvp_config = XML2kvp(**fm_config) - - # map partitions - job_id = self.job.id - job_details = self.job_details - records = dc_df.rdd.mapPartitions(kvp_to_xml_pt_udf) - - # convert back to DF - records = records.toDF() - - # fingerprint records and set transformed - records = self.fingerprint_records(records) - records = records.withColumn('transformed', pyspark_sql_functions.lit(True)) - - # index records to DB and index to ElasticSearch - self.save_records( - records_df=records, - assign_combine_id=True - ) - - # close job - self.close_job() - + ''' + Spark code for harvesting tabular data (e.g. spreadsheets) + ''' + + def spark_function(self): + + ''' + Harvest tabular data provided by user, convert to XML records. + - handles Delimited data (e.g. 
csv, tsv) or JSON lines + + Args: + spark (pyspark.sql.session.SparkSession): provided by pyspark context + kwargs: + job_id (int): Job ID + static_payload (str): path of static payload on disk + # TODO: add other kwargs here from static job + index_mapper (str): class name from core.spark.es, extending BaseMapper + validation_scenarios (list): list of Validadtion Scenario IDs + + Returns: + None: + - opens and parses static files from payload + - indexes records into DB + - map / flatten records and indexes to ES + ''' + + # init job + self.init_job() + self.update_jobGroup('Running Harvest Tabular Data Job') + + # load CSV + if self.job_details['payload_filepath'].endswith('.csv'): + dc_df = self.spark.read.format('com.databricks.spark.csv') \ + .options(header=True, inferschema=True, multiLine=True) \ + .load('file://%s' % self.job_details['payload_filepath']) + + # load JSON + elif self.job_details['payload_filepath'].endswith('.json'): + dc_df = self.spark.read.json('file://%s' % self.job_details['payload_filepath']) + + # repartition + dc_df = dc_df.repartition(settings.SPARK_REPARTITION) + + # partition udf + def kvp_to_xml_pt_udf(pt): + + for row in pt: + + # get as dict + row_dict = row.asDict() + + # pop combine fields if exist, ascribe to new dictionary + fields = ['combine_id', 'db_id', 'fingerprint', 'publish_set_id', 'record_id', 'xml2kvp_meta'] + combine_vals_dict = {field: row_dict.pop(field, None) for field in fields} + + try: + + # convert dictionary to XML with XML2kvp + xml_record_str = XML2kvp.kvp_to_xml(row_dict, serialize_xml=True, **xml2kvp_config.__dict__) + + # return success Row + yield Row( + record_id=combine_vals_dict.get('record_id'), + document=xml_record_str, + error='', + job_id=int(job_id), + oai_set='', + success=True + ) + + # handle all other exceptions + except Exception as e: + + # return error Row + yield Row( + record_id=combine_vals_dict.get('record_id'), + document='', + error=str(e), + job_id=int(job_id), + oai_set='', + success=False + ) + + # mixin passed configurations with defaults + fm_config = json.loads(self.job_details['fm_harvest_config_json']) + xml2kvp_config = XML2kvp(**fm_config) + + # map partitions + job_id = self.job.id + job_details = self.job_details + records = dc_df.rdd.mapPartitions(kvp_to_xml_pt_udf) + + # convert back to DF + records = records.toDF() + + # fingerprint records and set transformed + records = self.fingerprint_records(records) + records = records.withColumn('transformed', pyspark_sql_functions.lit(True)) + + # index records to DB and index to ElasticSearch + self.save_records( + records_df=records, + assign_combine_id=True + ) + + # close job + self.close_job() class TransformSpark(CombineSparkJob): + ''' + Spark code for Transform jobs + ''' - ''' - Spark code for Transform jobs - ''' - - def spark_function(self): - - ''' - Transform records based on Transformation Scenario. 
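# A minimal sketch of the kvp-to-XML conversion performed per row in kvp_to_xml_pt_udf above,
# run outside Spark. Assumes Combine's XML2kvp class (core/xml2kvp.py) is importable and that
# kvp_to_xml accepts the arguments used above; the sample row and field values are made up.
import json
from core.xml2kvp import XML2kvp

fm_config = json.loads('{}')                      # field mapper config stored on the Job
xml2kvp_config = XML2kvp(**fm_config)

row_dict = {'record_id': 'rec-001', 'dc_title': 'Example title', 'dc_creator': 'Example creator'}
combine_fields = ['combine_id', 'db_id', 'fingerprint', 'publish_set_id', 'record_id', 'xml2kvp_meta']
combine_vals = {field: row_dict.pop(field, None) for field in combine_fields}

# remaining key/value pairs become one serialized XML record
xml_record_str = XML2kvp.kvp_to_xml(row_dict, serialize_xml=True, **xml2kvp_config.__dict__)
print(combine_vals['record_id'], xml_record_str[:80])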
- - Args: - spark (pyspark.sql.session.SparkSession): provided by pyspark context - kwargs: - job_id (int): Job ID - job_input (str): location of avro files on disk - transformation_id (str): id of Transformation Scenario - index_mapper (str): class name from core.spark.es, extending BaseMapper - validation_scenarios (list): list of Validadtion Scenario IDs - - Returns: - None - - transforms records via XSL, writes new records to avro files on disk - - indexes records into DB - - map / flatten records and indexes to ES - ''' - - # init job - self.init_job() - self.update_jobGroup('Running Transform Job') - - # get input records - records = self.get_input_records(filter_input_records=True) - - # fork as input_records - input_records = records - - # get transformation json - sel_trans = json.loads(self.job_details['transformation']['scenarios_json']) - - # loop through oredered transformations - for trans in sel_trans: - - # load transformation - transformation = Transformation.objects.get(pk=int(trans['trans_id'])) - self.logger.info('Applying transformation #%s: %s' % (trans['index'], transformation.name)) - - # if xslt type transformation - if transformation.transformation_type == 'xslt': - records = self.transform_xslt(transformation, records) - - # if python type transformation - if transformation.transformation_type == 'python': - records = self.transform_python(transformation, records) - - # if OpenRefine type transformation - if transformation.transformation_type == 'openrefine': - - # get XML2kvp settings from input Job - input_job_details = input_job.job_details_dict - input_job_fm_config = input_job_details['field_mapper_config'] - - # pass config json - records = self.transform_openrefineactions(transformation, records, input_job_fm_config) - - # convert back to DataFrame - records = records.toDF() - - # fingerprint Record document - records = self.fingerprint_records(records) - - # assign to records_trans - records_trans = records - - # write `transformed` column based on new fingerprint - records_trans = records_trans.alias("records_trans").join(input_records.alias("input_records"), input_records.combine_id == records_trans.combine_id, 'left').select(*['records_trans.%s' % c for c in records_trans.columns if c not in ['transformed']], pyspark_sql_functions.when(records_trans.fingerprint != input_records.fingerprint, pyspark_sql_functions.lit(True)).otherwise(pyspark_sql_functions.lit(False)).alias('transformed')) + def spark_function(self): - # index records to DB and index to ElasticSearch - self.save_records( - records_df=records_trans - ) + ''' + Transform records based on Transformation Scenario. 
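# Illustrative shape of job_details['transformation']['scenarios_json'] consumed by the
# ordered loop below; only the keys read there (index, trans_id) are shown, and the ids are
# made up rather than taken from a real Combine instance.
import json

scenarios_json = json.dumps([
    {'index': 0, 'trans_id': 3},      # e.g. an 'xslt' Transformation Scenario
    {'index': 1, 'trans_id': 8},      # e.g. a 'python' Transformation Scenario
])

for trans in json.loads(scenarios_json):
    print('applying transformation #%s (id %s)' % (trans['index'], trans['trans_id']))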
- # close job - self.close_job() + Args: + spark (pyspark.sql.session.SparkSession): provided by pyspark context + kwargs: + job_id (int): Job ID + job_input (str): location of avro files on disk + transformation_id (str): id of Transformation Scenario + index_mapper (str): class name from core.spark.es, extending BaseMapper + validation_scenarios (list): list of Validadtion Scenario IDs + Returns: + None + - transforms records via XSL, writes new records to avro files on disk + - indexes records into DB + - map / flatten records and indexes to ES + ''' - def transform_xslt(self, transformation, records): + # init job + self.init_job() + self.update_jobGroup('Running Transform Job') + + # get input records + records = self.get_input_records(filter_input_records=True) + + # fork as input_records + input_records = records + + # get transformation json + sel_trans = json.loads(self.job_details['transformation']['scenarios_json']) + + # loop through oredered transformations + for trans in sel_trans: + + # load transformation + transformation = Transformation.objects.get(pk=int(trans['trans_id'])) + self.logger.info('Applying transformation #%s: %s' % (trans['index'], transformation.name)) + + # if xslt type transformation + if transformation.transformation_type == 'xslt': + records = self.transform_xslt(transformation, records) + + # if python type transformation + if transformation.transformation_type == 'python': + records = self.transform_python(transformation, records) + + # if OpenRefine type transformation + if transformation.transformation_type == 'openrefine': + # get XML2kvp settings from input Job + input_job_details = input_job.job_details_dict + input_job_fm_config = input_job_details['field_mapper_config'] - ''' - Method to transform records with XSLT, using pyjxslt server + # pass config json + records = self.transform_openrefineactions(transformation, records, input_job_fm_config) + + # convert back to DataFrame + records = records.toDF() + + # fingerprint Record document + records = self.fingerprint_records(records) + + # assign to records_trans + records_trans = records - Args: - job: job from parent job - transformation: Transformation Scenario from parent job - records (pyspark.sql.DataFrame): DataFrame of records pre-transformation + # write `transformed` column based on new fingerprint + records_trans = records_trans.alias("records_trans").join(input_records.alias("input_records"), + input_records.combine_id == records_trans.combine_id, + 'left').select( + *['records_trans.%s' % c for c in records_trans.columns if c not in ['transformed']], + pyspark_sql_functions.when(records_trans.fingerprint != input_records.fingerprint, + pyspark_sql_functions.lit(True)).otherwise( + pyspark_sql_functions.lit(False)).alias('transformed')) - Return: - records_trans (rdd): transformed records as RDD - ''' + # index records to DB and index to ElasticSearch + self.save_records( + records_df=records_trans + ) - def transform_xslt_pt_udf(pt): + # close job + self.close_job() - # transform with pyjxslt gateway - gw = pyjxslt.Gateway(6767) - gw.add_transform('xslt_transform', xslt_string) + def transform_xslt(self, transformation, records): - # loop through rows in partition - for row in pt: + ''' + Method to transform records with XSLT, using pyjxslt server - try: - result = gw.transform('xslt_transform', row.document) - # attempt XML parse to confirm well-formedness - # error will bubble up in try/except - valid_xml = etree.fromstring(result.encode('utf-8')) + Args: + job: job from parent 
job + transformation: Transformation Scenario from parent job + records (pyspark.sql.DataFrame): DataFrame of records pre-transformation - # set trans_result tuple - trans_result = (result, '', True) + Return: + records_trans (rdd): transformed records as RDD + ''' + + def transform_xslt_pt_udf(pt): + + # transform with pyjxslt gateway + gw = pyjxslt.Gateway(6767) + gw.add_transform('xslt_transform', xslt_string) + + # loop through rows in partition + for row in pt: + + try: + result = gw.transform('xslt_transform', row.document) + # attempt XML parse to confirm well-formedness + # error will bubble up in try/except + valid_xml = etree.fromstring(result.encode('utf-8')) + + # set trans_result tuple + trans_result = (result, '', True) + + # catch transformation exception and save exception to 'error' + except Exception as e: + # set trans_result tuple + trans_result = (row.document, str(e), False) - # catch transformation exception and save exception to 'error' - except Exception as e: - # set trans_result tuple - trans_result = (row.document, str(e), False) + # yield each Row in mapPartition + yield Row( + combine_id=row.combine_id, + record_id=row.record_id, + document=trans_result[0], + error=trans_result[1], + job_id=int(job_id), + oai_set=row.oai_set, + success=trans_result[2], + fingerprint=row.fingerprint, + transformed=row.transformed + ) - # yield each Row in mapPartition - yield Row( - combine_id = row.combine_id, - record_id = row.record_id, - document = trans_result[0], - error = trans_result[1], - job_id = int(job_id), - oai_set = row.oai_set, - success = trans_result[2], - fingerprint = row.fingerprint, - transformed = row.transformed - ) + # drop transform + gw.drop_transform('xslt_transform') - # drop transform - gw.drop_transform('xslt_transform') + # get XSLT transformation as string + xslt_string = transformation.payload - # get XSLT transformation as string - xslt_string = transformation.payload + # transform via rdd.map and return + job_id = self.job.id - # transform via rdd.map and return - job_id = self.job.id + # perform transformations a la mapPartitions + records_trans = records.rdd.mapPartitions(transform_xslt_pt_udf) + return records_trans - # perform transformations a la mapPartitions - records_trans = records.rdd.mapPartitions(transform_xslt_pt_udf) - return records_trans + def transform_python(self, transformation, records): + ''' + Transform records via python code snippet. - def transform_python(self, transformation, records): + Required: + - a function named `python_record_transformation(record)` in transformation.payload python code - ''' - Transform records via python code snippet. 
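# A hedged, single-record sketch of the pyjxslt round trip used by transform_xslt above,
# run outside Spark. Assumes a pyjxslt server is listening on port 6767 and that example.xsl
# and record.xml exist on disk; both file names are placeholders.
import pyjxslt
from lxml import etree

xslt_string = open('example.xsl').read()          # transformation.payload in Combine
doc_string = open('record.xml').read()            # row.document in the partition UDF

gw = pyjxslt.Gateway(6767)
gw.add_transform('xslt_transform', xslt_string)
try:
    result = gw.transform('xslt_transform', doc_string)
    etree.fromstring(result.encode('utf-8'))      # confirm well-formedness, as above
    trans_result = (result, '', True)
except Exception as e:
    trans_result = (doc_string, str(e), False)
finally:
    gw.drop_transform('xslt_transform')

print(trans_result[1] or 'transformed OK')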
+ Args: + job: job from parent job + transformation: Transformation Scenario from parent job + records (pyspark.sql.DataFrame): DataFrame of records pre-transformation - Required: - - a function named `python_record_transformation(record)` in transformation.payload python code + Return: + records_trans (rdd): transformed records as RDD + ''' - Args: - job: job from parent job - transformation: Transformation Scenario from parent job - records (pyspark.sql.DataFrame): DataFrame of records pre-transformation + # define udf function for python transformation + def transform_python_pt_udf(pt): - Return: - records_trans (rdd): transformed records as RDD - ''' + # get python function from Transformation Scenario + temp_pyts = ModuleType('temp_pyts') + exec(python_code, temp_pyts.__dict__) - # define udf function for python transformation - def transform_python_pt_udf(pt): + for row in pt: - # get python function from Transformation Scenario - temp_pyts = ModuleType('temp_pyts') - exec(python_code, temp_pyts.__dict__) + # try: - for row in pt: + # prepare row as parsed document with PythonUDFRecord class + prtb = PythonUDFRecord(row) - # try: + # run transformation + trans_result = temp_pyts.python_record_transformation(prtb) - # prepare row as parsed document with PythonUDFRecord class - prtb = PythonUDFRecord(row) + # convert any possible byte responses to string + if type(trans_result[0]) == bytes: + trans_result[0] = trans_result[0].decode('utf-8') + if type(trans_result[1]) == bytes: + trans_result[1] = trans_result[1].decode('utf-8') - # run transformation - trans_result = temp_pyts.python_record_transformation(prtb) + # except Exception as e: + # # set trans_result tuple + # trans_result = (row.document, str(e), False) - # convert any possible byte responses to string - if type(trans_result[0]) == bytes: - trans_result[0] = trans_result[0].decode('utf-8') - if type(trans_result[1]) == bytes: - trans_result[1] = trans_result[1].decode('utf-8') + # return Row + yield Row( + combine_id=row.combine_id, + record_id=row.record_id, + document=trans_result[0], + error=trans_result[1], + job_id=int(job_id), + oai_set=row.oai_set, + success=trans_result[2], + fingerprint=row.fingerprint, + transformed=row.transformed + ) - # except Exception as e: - # # set trans_result tuple - # trans_result = (row.document, str(e), False) + # transform via rdd.mapPartitions and return + job_id = self.job.id + python_code = transformation.payload + records_trans = records.rdd.mapPartitions(transform_python_pt_udf) + return records_trans - # return Row - yield Row( - combine_id = row.combine_id, - record_id = row.record_id, - document = trans_result[0], - error = trans_result[1], - job_id = int(job_id), - oai_set = row.oai_set, - success = trans_result[2], - fingerprint = row.fingerprint, - transformed = row.transformed - ) + def transform_openrefineactions(self, transformation, records, input_job_fm_config): - # transform via rdd.mapPartitions and return - job_id = self.job.id - python_code = transformation.payload - records_trans = records.rdd.mapPartitions(transform_python_pt_udf) - return records_trans + ''' + Transform records per OpenRefine Actions JSON + Args: + job: job from parent job + transformation: Transformation Scenario from parent job + records (pyspark.sql.DataFrame): DataFrame of records pre-transformation - def transform_openrefineactions(self, transformation, records, input_job_fm_config): + Return: + records_trans (rdd): transformed records as RDD + ''' - ''' - Transform records per OpenRefine 
Actions JSON + # define udf function for python transformation + def transform_openrefine_pt_udf(pt): - Args: - job: job from parent job - transformation: Transformation Scenario from parent job - records (pyspark.sql.DataFrame): DataFrame of records pre-transformation + # parse OpenRefine actions JSON + or_actions = json.loads(or_actions_json) - Return: - records_trans (rdd): transformed records as RDD - ''' + # loop through rows + for row in pt: - # define udf function for python transformation - def transform_openrefine_pt_udf(pt): + try: - # parse OpenRefine actions JSON - or_actions = json.loads(or_actions_json) + # prepare row as parsed document with PythonUDFRecord class + prtb = PythonUDFRecord(row) - # loop through rows - for row in pt: + # loop through actions + for event in or_actions: - try: + # handle mass edits + if event['op'] == 'core/mass-edit': - # prepare row as parsed document with PythonUDFRecord class - prtb = PythonUDFRecord(row) + # for each column, reconstitue columnName --> XPath + xpath = XML2kvp.k_to_xpath(event['columnName'], **input_job_fm_config) - # loop through actions - for event in or_actions: + # find elements for potential edits + eles = prtb.xml.xpath(xpath, namespaces=prtb.nsmap) - # handle mass edits - if event['op'] == 'core/mass-edit': + # loop through elements + for ele in eles: - # for each column, reconstitue columnName --> XPath - xpath = XML2kvp.k_to_xpath(event['columnName'], **input_job_fm_config) + # loop through edits + for edit in event['edits']: - # find elements for potential edits - eles = prtb.xml.xpath(xpath, namespaces=prtb.nsmap) + # check if element text in from, change + if ele.text in edit['from']: + ele.text = edit['to'] - # loop through elements - for ele in eles: + # handle jython + if event['op'] == 'core/text-transform' and event['expression'].startswith('jython:'): - # loop through edits - for edit in event['edits']: + # fire up temp module + temp_pyts = ModuleType('temp_pyts') - # check if element text in from, change - if ele.text in edit['from']: - ele.text = edit['to'] + # parse code + code = event['expression'].split('jython:')[1] - # handle jython - if event['op'] == 'core/text-transform' and event['expression'].startswith('jython:'): + # wrap in function and write to temp module + code = 'def temp_func(value):\n%s' % textwrap.indent(code, prefix=' ') + exec(code, temp_pyts.__dict__) - # fire up temp module - temp_pyts = ModuleType('temp_pyts') + # get xpath (unique to action, can't pre learn) + xpath = XML2kvp.k_to_xpath(event['columnName'], **input_job_fm_config) - # parse code - code = event['expression'].split('jython:')[1] + # find elements for potential edits + eles = prtb.xml.xpath(xpath, namespaces=prtb.nsmap) - # wrap in function and write to temp module - code = 'def temp_func(value):\n%s' % textwrap.indent(code, prefix=' ') - exec(code, temp_pyts.__dict__) + # loop through elements + for ele in eles: + ele.text = temp_pyts.temp_func(ele.text) - # get xpath (unique to action, can't pre learn) - xpath = XML2kvp.k_to_xpath(event['columnName'], **input_job_fm_config) + # re-serialize as trans_result + trans_result = (etree.tostring(prtb.xml).decode('utf-8'), '', 1) - # find elements for potential edits - eles = prtb.xml.xpath(xpath, namespaces=prtb.nsmap) + except Exception as e: + # set trans_result tuple + trans_result = (row.document, str(e), False) - # loop through elements - for ele in eles: - ele.text = temp_pyts.temp_func(ele.text) - - # re-serialize as trans_result - trans_result = 
(etree.tostring(prtb.xml).decode('utf-8'), '', 1) - - except Exception as e: - # set trans_result tuple - trans_result = (row.document, str(e), False) - - # return Row - yield Row( - combine_id = row.combine_id, - record_id = row.record_id, - document = trans_result[0], - error = trans_result[1], - job_id = int(job_id), - oai_set = row.oai_set, - success = trans_result[2], - fingerprint = row.fingerprint, - transformed = row.transformed - ) - - - # transform via rdd.mapPartitions and return - job_id = self.job.id - or_actions_json = transformation.payload - records_trans = records.rdd.mapPartitions(transform_openrefine_pt_udf) - return records_trans + # return Row + yield Row( + combine_id=row.combine_id, + record_id=row.record_id, + document=trans_result[0], + error=trans_result[1], + job_id=int(job_id), + oai_set=row.oai_set, + success=trans_result[2], + fingerprint=row.fingerprint, + transformed=row.transformed + ) + # transform via rdd.mapPartitions and return + job_id = self.job.id + or_actions_json = transformation.payload + records_trans = records.rdd.mapPartitions(transform_openrefine_pt_udf) + return records_trans class MergeSpark(CombineSparkJob): - - ''' - Spark code for running Merge type jobs. Also used for duplciation, analysis, and others. - Note: Merge jobs merge only successful documents from an input job, not the errors - ''' - - def spark_function(self): - - ''' - Harvest records, select non-null, and write to avro files - - Args: - spark (pyspark.sql.session.SparkSession): provided by pyspark context - kwargs: - job_id (int): Job ID - job_inputs (list): list of locations of avro files on disk - index_mapper (str): class name from core.spark.es, extending BaseMapper - validation_scenarios (list): list of Validadtion Scenario IDs - - Returns: - None - - merges records from previous jobs, writes new aggregated records to avro files on disk - - indexes records into DB - - map / flatten records and indexes to ES - ''' - - # init job - self.init_job() - self.update_jobGroup('Running Merge/Duplicate Job') - - # get input records - records = self.get_input_records(filter_input_records=True) - - # update job column, overwriting job_id from input jobs in merge - job_id = self.job.id - job_id_udf = udf(lambda record_id: job_id, IntegerType()) - records = records.withColumn('job_id', job_id_udf(records.record_id)) - - # set transformed column to False - records = records.withColumn('transformed', pyspark_sql_functions.lit(False)) - - # if Analysis Job, do not write avro - if self.job.job_type == 'AnalysisJob': - write_avro = False - else: - write_avro = settings.WRITE_AVRO - - # index records to DB and index to ElasticSearch - self.save_records( - records_df=records, - write_avro=write_avro - ) - - # close job - self.close_job() - + ''' + Spark code for running Merge type jobs. Also used for duplciation, analysis, and others. 
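# A hedged sketch of the 'core/mass-edit' branch in transform_openrefineactions above,
# applied to one record outside Spark. In Combine the XPath comes from
# XML2kvp.k_to_xpath(event['columnName'], **input_job_fm_config); it is hard-coded here to
# keep the sketch self-contained, and the action, field name, and record are made up.
import json
from lxml import etree

or_actions = json.loads('''[{
    "op": "core/mass-edit",
    "columnName": "mods_titleInfo_title",
    "edits": [{"from": ["Untitled"], "to": "Known title"}]
}]''')

record_xml = etree.fromstring('<mods><titleInfo><title>Untitled</title></titleInfo></mods>')

for event in or_actions:
    if event['op'] == 'core/mass-edit':
        xpath = '/mods/titleInfo/title'           # stand-in for XML2kvp.k_to_xpath(...)
        for ele in record_xml.xpath(xpath):
            for edit in event['edits']:
                if ele.text in edit['from']:
                    ele.text = edit['to']

print(etree.tostring(record_xml).decode('utf-8'))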
+ Note: Merge jobs merge only successful documents from an input job, not the errors + ''' + + def spark_function(self): + + ''' + Harvest records, select non-null, and write to avro files + + Args: + spark (pyspark.sql.session.SparkSession): provided by pyspark context + kwargs: + job_id (int): Job ID + job_inputs (list): list of locations of avro files on disk + index_mapper (str): class name from core.spark.es, extending BaseMapper + validation_scenarios (list): list of Validadtion Scenario IDs + + Returns: + None + - merges records from previous jobs, writes new aggregated records to avro files on disk + - indexes records into DB + - map / flatten records and indexes to ES + ''' + + # init job + self.init_job() + self.update_jobGroup('Running Merge/Duplicate Job') + + # get input records + records = self.get_input_records(filter_input_records=True) + + # update job column, overwriting job_id from input jobs in merge + job_id = self.job.id + job_id_udf = udf(lambda record_id: job_id, IntegerType()) + records = records.withColumn('job_id', job_id_udf(records.record_id)) + + # set transformed column to False + records = records.withColumn('transformed', pyspark_sql_functions.lit(False)) + + # if Analysis Job, do not write avro + if self.job.job_type == 'AnalysisJob': + write_avro = False + else: + write_avro = settings.WRITE_AVRO + + # index records to DB and index to ElasticSearch + self.save_records( + records_df=records, + write_avro=write_avro + ) + + # close job + self.close_job() #################################################################### @@ -1603,195 +1583,178 @@ def spark_function(self): class CombineSparkPatch(object): + ''' + Base class for Combine Spark Patches. + - these are considered categorically "secondary" to the main + CombineSparkJobs above, but may be just as long running + ''' - ''' - Base class for Combine Spark Patches. 
- - these are considered categorically "secondary" to the main - CombineSparkJobs above, but may be just as long running - ''' - - - def __init__(self, spark, **kwargs): - - self.spark = spark - - self.kwargs = kwargs - - # init logging support - spark.sparkContext.setLogLevel('INFO') - log4jLogger = spark.sparkContext._jvm.org.apache.log4j - self.logger = log4jLogger.LogManager.getLogger(__name__) + def __init__(self, spark, **kwargs): + self.spark = spark + self.kwargs = kwargs - def update_jobGroup(self, description, job_id): + # init logging support + spark.sparkContext.setLogLevel('INFO') + log4jLogger = spark.sparkContext._jvm.org.apache.log4j + self.logger = log4jLogger.LogManager.getLogger(__name__) - ''' - Method to update spark jobGroup - ''' - - self.logger.info("### %s" % description) - self.spark.sparkContext.setJobGroup("%s" % job_id, "%s, Job #%s" % (description, job_id)) + def update_jobGroup(self, description, job_id): + ''' + Method to update spark jobGroup + ''' + self.logger.info("### %s" % description) + self.spark.sparkContext.setJobGroup("%s" % job_id, "%s, Job #%s" % (description, job_id)) class ReindexSparkPatch(CombineSparkPatch): - - ''' - Class to handle Job re-indexing - - Args: - kwargs(dict): - - job_id (int): ID of Job to reindex - ''' - - def spark_function(self): - - # get job and set to self - self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) - self.update_jobGroup('Running Re-Index Job', self.job.id) - - # get records as DF - pipeline = json.dumps({'$match': {'job_id': self.job.id}}) - db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # reindex - ESIndex.index_job_to_es_spark( - self.spark, - job=self.job, - records_df=db_records, - field_mapper_config=json.loads(self.kwargs['fm_config_json']) - ) - + ''' + Class to handle Job re-indexing + + Args: + kwargs(dict): + - job_id (int): ID of Job to reindex + ''' + + def spark_function(self): + # get job and set to self + self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) + self.update_jobGroup('Running Re-Index Job', self.job.id) + + # get records as DF + pipeline = json.dumps({'$match': {'job_id': self.job.id}}) + db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # reindex + ESIndex.index_job_to_es_spark( + self.spark, + job=self.job, + records_df=db_records, + field_mapper_config=json.loads(self.kwargs['fm_config_json']) + ) class RunNewValidationsSpark(CombineSparkPatch): - - ''' - Class to run new validations for Job - - Args: - kwargs(dict): - - job_id (int): ID of Job - - validation_scenarios (list): list of validation scenarios to run - ''' - - def spark_function(self): - - # get job and set to self - self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) - self.update_jobGroup('Running New Validation Scenarios', self.job.id) - - pipeline = json.dumps({'$match': {'job_id': self.job.id}}) - 
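# A standalone, hedged sketch of the MongoDB read pattern used throughout these patch
# classes: a $match aggregation pipeline is serialized to JSON and handed to the connector
# so only one Job's records are loaded. Assumes a SparkSession built with the mongo-spark
# connector and a reachable MongoDB host; the host, job id, and partition size are made up.
import json
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

job_id = 42
pipeline = json.dumps({'$match': {'job_id': job_id}})

db_records = spark.read.format("com.mongodb.spark.sql.DefaultSource") \
    .option("uri", "mongodb://127.0.0.1") \
    .option("database", "combine") \
    .option("collection", "record") \
    .option("partitioner", "MongoSamplePartitioner") \
    .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", 32) \
    .option("pipeline", pipeline).load()

print(db_records.count())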
db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # run Validation Scenarios - if 'validation_scenarios' in self.kwargs.keys(): - vs = ValidationScenarioSpark( - spark=self.spark, - job=self.job, - records_df=db_records, - validation_scenarios = ast.literal_eval(self.kwargs['validation_scenarios']) - ) - vs.run_record_validation_scenarios() - + ''' + Class to run new validations for Job + + Args: + kwargs(dict): + - job_id (int): ID of Job + - validation_scenarios (list): list of validation scenarios to run + ''' + + def spark_function(self): + # get job and set to self + self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) + self.update_jobGroup('Running New Validation Scenarios', self.job.id) + + pipeline = json.dumps({'$match': {'job_id': self.job.id}}) + db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # run Validation Scenarios + if 'validation_scenarios' in self.kwargs.keys(): + vs = ValidationScenarioSpark( + spark=self.spark, + job=self.job, + records_df=db_records, + validation_scenarios=ast.literal_eval(self.kwargs['validation_scenarios']) + ) + vs.run_record_validation_scenarios() class RemoveValidationsSpark(CombineSparkPatch): - - ''' - Class to remove validations for Job - - Args: - kwargs(dict): - - job_id (int): ID of Job - - validation_scenarios (list): list of validation scenarios to run - ''' - - def spark_function(self): - - # get job and set to self - self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) - self.update_jobGroup('Removing Validation Scenario', self.job.id) - - # create pipeline to select INVALID records, that may become valid - pipeline = json.dumps({'$match':{'$and':[{'job_id': self.job.id},{'valid':False}]}}) - db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # if not nothing to update, skip - if not db_records.rdd.isEmpty(): - # run Validation Scenarios - if 'validation_scenarios' in self.kwargs.keys(): - vs = ValidationScenarioSpark( - spark=self.spark, - job=self.job, - records_df=db_records, - validation_scenarios = ast.literal_eval(self.kwargs['validation_scenarios']) - ) - vs.remove_validation_scenarios() - + ''' + Class to remove validations for Job + + Args: + kwargs(dict): + - job_id (int): ID of Job + - validation_scenarios (list): list of validation scenarios to run + ''' + + def spark_function(self): + + # get job and set to self + self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) + self.update_jobGroup('Removing Validation Scenario', self.job.id) + + # create pipeline to select INVALID 
records, that may become valid + pipeline = json.dumps({'$match': {'$and': [{'job_id': self.job.id}, {'valid': False}]}}) + db_records = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # if not nothing to update, skip + if not db_records.rdd.isEmpty(): + # run Validation Scenarios + if 'validation_scenarios' in self.kwargs.keys(): + vs = ValidationScenarioSpark( + spark=self.spark, + job=self.job, + records_df=db_records, + validation_scenarios=ast.literal_eval(self.kwargs['validation_scenarios']) + ) + vs.remove_validation_scenarios() class RunDBDM(CombineSparkPatch): + ''' + Class to run DPLA Bulk Data Match as patch job - ''' - Class to run DPLA Bulk Data Match as patch job - - Args: - kwargs(dict): - - job_id (int): ID of Job - - dbdd_id (int): int of DBDD instance to use - ''' - - def spark_function(self): + Args: + kwargs(dict): + - job_id (int): ID of Job + - dbdd_id (int): int of DBDD instance to use + ''' - # get job and set to self - self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) - self.update_jobGroup('Running DPLA Bulk Data Match', self.job.id) + def spark_function(self): + # get job and set to self + self.job = Job.objects.get(pk=int(self.kwargs['job_id'])) + self.update_jobGroup('Running DPLA Bulk Data Match', self.job.id) - # get full dbdd es - dbdd = DPLABulkDataDownload.objects.get(pk=int(self.kwargs['dbdd_id'])) - dpla_df = get_job_es(self.spark, indices=[dbdd.es_index], doc_type='item') + # get full dbdd es + dbdd = DPLABulkDataDownload.objects.get(pk=int(self.kwargs['dbdd_id'])) + dpla_df = get_job_es(self.spark, indices=[dbdd.es_index], doc_type='item') - # get job mapped fields - es_df = get_job_es(self.spark, job_id=self.job.id) + # get job mapped fields + es_df = get_job_es(self.spark, job_id=self.job.id) - # get job records - records_df = get_job_as_df(self.spark, self.job.id) + # get job records + records_df = get_job_as_df(self.spark, self.job.id) - # join on isShownAt - matches_df = es_df.join(dpla_df, es_df['dpla_isShownAt'] == dpla_df['isShownAt'], 'leftsemi') + # join on isShownAt + matches_df = es_df.join(dpla_df, es_df['dpla_isShownAt'] == dpla_df['isShownAt'], 'leftsemi') - # select records_df for writing - update_dbdm_df = records_df.join(matches_df, records_df['_id']['oid'] == matches_df['db_id'], 'leftsemi') + # select records_df for writing + update_dbdm_df = records_df.join(matches_df, records_df['_id']['oid'] == matches_df['db_id'], 'leftsemi') - # set dbdm column to match - update_dbdm_df = update_dbdm_df.withColumn('dbdm', pyspark_sql_functions.lit(True)) - - # write to DB - update_dbdm_df.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() + # set dbdm column to match + update_dbdm_df = update_dbdm_df.withColumn('dbdm', pyspark_sql_functions.lit(True)) + # write to DB + update_dbdm_df.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() 
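The 'leftsemi' joins above deserve a brief aside: a left-semi join keeps only the left-hand rows that have a match on the right, without pulling in any right-hand columns, which is what the DPLA Bulk Data Match needs (flag matching records while leaving their schema untouched). A minimal, self-contained sketch with made-up data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

records = spark.createDataFrame(
    [('rec-1', 'http://example.org/1'), ('rec-2', 'http://example.org/2')],
    ['record_id', 'dpla_isShownAt'])
dpla = spark.createDataFrame([('http://example.org/1',)], ['isShownAt'])

matches = records.join(dpla, records['dpla_isShownAt'] == dpla['isShownAt'], 'leftsemi')
matches.show()    # only rec-1 survives, with the original records schema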
#################################################################### @@ -1799,325 +1762,308 @@ def spark_function(self): #################################################################### class CombineStateIO(object): + ''' + Base class for Combine State IO work. + ''' - ''' - Base class for Combine State IO work. - ''' - - def __init__(self, spark, **kwargs): - - self.spark = spark + def __init__(self, spark, **kwargs): + self.spark = spark - self.kwargs = kwargs + self.kwargs = kwargs - # capture common params - self.import_path = kwargs.get('import_path', None) - self.import_manifest = kwargs.get('import_manifest', None) + # capture common params + self.import_path = kwargs.get('import_path', None) + self.import_manifest = kwargs.get('import_manifest', None) - # init logging support - spark.sparkContext.setLogLevel('INFO') - log4jLogger = spark.sparkContext._jvm.org.apache.log4j - self.logger = log4jLogger.LogManager.getLogger(__name__) + # init logging support + spark.sparkContext.setLogLevel('INFO') + log4jLogger = spark.sparkContext._jvm.org.apache.log4j + self.logger = log4jLogger.LogManager.getLogger(__name__) + def update_jobGroup(self, group_id, description): + ''' + Method to update spark jobGroup + ''' - def update_jobGroup(self, group_id, description): - - ''' - Method to update spark jobGroup - ''' - - self.spark.sparkContext.setJobGroup(group_id, description) + self.spark.sparkContext.setJobGroup(group_id, description) class CombineStateIOImport(CombineStateIO): + ''' + Class to handle state imports - ''' - Class to handle state imports - - Args: - kwargs(dict): - - import_path (str): string of unzipped export directory on disk - - import_manifest (dict): dictionary containing import information, including hash of old:new primary keys - ''' - - def spark_function(self): - - # import records - self._import_records() - - # import validations - self._import_validations() - - # import mapped fields (ES) - self._import_mapped_fields() - - - def _import_records(self): - - ''' - Method to import records to Mongo - ''' - - # import records - self.update_jobGroup(self.import_manifest.get('import_id', uuid.uuid4().hex), 'StateIO: Importing Records') - - # loop through jobs - for orig_job_id, clone_job_id in self.import_manifest['pk_hash']['jobs'].items(): - - # assemple location of export - records_json_filepath = '%s/record_exports/j%s_mongo_records.json' % (self.import_path, orig_job_id) - - # load as dataframe - records_df = self.spark.read.json(records_json_filepath) - - # copy original _id - records_df = records_df\ - .withColumn('orig_id', records_df['_id']['$oid']) - - # flatten fingerprint column - try: - records_df = records_df.withColumn('fingerprint', records_df.fingerprint['$numberLong']) - except: - records_df = records_df.withColumn('fingerprint', records_df.fingerprint) - - # update job_id - records_df = records_df.withColumn('job_id', pyspark_sql_functions.lit(int(clone_job_id))) - - # write records to MongoDB, dropping _id in process - records_df.select([col for col in records_df.columns if col != '_id'])\ - .write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() - - - def _import_validations(self): - - ''' - Method to import validations to Mongo - ''' - - # import validations - self.update_jobGroup(self.import_manifest.get('import_id', uuid.uuid4().hex), 'StateIO: Importing Validations') - - # loop through jobs - for 
orig_job_id, clone_job_id in self.import_manifest['pk_hash']['jobs'].items(): - - # assemple location of export - validations_json_filepath = '%s/validation_exports/j%s_mongo_validations.json' % (self.import_path, orig_job_id) - - # load as dataframe - validations_df = self.spark.read.json(validations_json_filepath) - - # check for dataframe rows to proceed - if len(validations_df.take(1)) > 0: - - # read first row to get old validation_scenario_id, and run through pk_hash for new one - row = validations_df.take(1)[0] - vs_id = int(row.validation_scenario_id['$numberLong']) - new_vs_id = self.import_manifest['pk_hash']['validations'][vs_id] - - # flatten record_id - validations_df = validations_df.withColumn('record_id', validations_df['record_id']['$oid']) - - # retrieve newly written records for this Job - pipeline = json.dumps({'$match': {'job_id': clone_job_id, 'success': True}}) - records_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # join on validations_df.record_id : records_df.orig_id - updated_validations_df = validations_df.drop('_id').alias('validations_df').join(records_df.select('_id','orig_id').alias('records_df'), validations_df['record_id'] == records_df['orig_id']) - - # update record_id - updated_validations_df = updated_validations_df.withColumn('record_id', updated_validations_df['_id']) - - # limit to validation columns - updated_validations_df = updated_validations_df.select(validations_df.columns).drop('_id') - - # flatten - updated_validations_df = updated_validations_df.withColumn('fail_count', updated_validations_df.fail_count['$numberLong'].cast(LongType())) - updated_validations_df = updated_validations_df.withColumn('job_id', pyspark_sql_functions.lit(int(clone_job_id)).cast(LongType())) - - # update validation scenario id - updated_validations_df = updated_validations_df.withColumn('validation_scenario_id', pyspark_sql_functions.lit(int(new_vs_id)).cast(LongType())) - - # write records to MongoDB - updated_validations_df.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record_validation").save() - - - def _import_mapped_fields(self, reindex=True): - - ''' - Method to import mapped fields to ElasticSearch - - re-map and index, based on saved Job params - - inefficient to export/import ElasticSearch records, when modifying values - - QUESTION: Why is this partitioned to 200, when reading from Mongo appears to be - the same for Re-Indexing? 
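# Illustrative shape of the import_manifest consumed by these _import_* methods; only the
# keys actually read in this module (import_id, pk_hash -> jobs / validations) are shown,
# and every id value below is made up.
import_manifest = {
    'import_id': 'a1b2c3d4e5f64789aabbccddeeff0011',
    'pk_hash': {
        'jobs': {101: 201, 102: 202},         # original Job id -> cloned Job id
        'validations': {7: 12},               # original ValidationScenario id -> new id
    },
}

for orig_job_id, clone_job_id in import_manifest['pk_hash']['jobs'].items():
    print('records exported for Job %s are rewritten to cloned Job %s' % (orig_job_id, clone_job_id))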
- ''' - - # import mapped fields - self.update_jobGroup(self.import_manifest.get('import_id', uuid.uuid4().hex), 'StateIO: Importing Mapped Fields') - - # loop through jobs - for orig_job_id, clone_job_id in self.import_manifest['pk_hash']['jobs'].items(): - - # re-index (default) - if reindex: - - # get job and set to self - job = Job.objects.get(pk=int(clone_job_id)) - - # get records as DF - pipeline = json.dumps({'$match': {'job_id': job.id}}) - records_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load() - - # reindex - ESIndex.index_job_to_es_spark( - self.spark, - job=job, - records_df=records_df, - field_mapper_config=job.job_details_dict.get('field_mapper_config') - ) - - # else, fallback on slower, import of serialized records - else: - - # assemple location of export - mapped_fields_json_filepath = '%s/mapped_fields_exports/j%s_mapped_fields.json' % (self.import_path, orig_job_id) - - # read raw JSON lines - json_lines_rdd = self.spark.sparkContext.textFile(mapped_fields_json_filepath) - - # parse to expose record db_id - def parser_udf(row): - - # parse JSON - d = json.loads(row) - - # return tuple with exposed original id - return (d['db_id'], row) - - orig_id_rdd = json_lines_rdd.map(lambda row: parser_udf(row)) - - # to dataframe for join - orig_id_df = orig_id_rdd.toDF() - - # retrieve newly written records for this Job - pipeline = json.dumps({'$match': {'job_id': clone_job_id, 'success': True}}) - records_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",4)\ - .option("pipeline",pipeline).load() - - # join on id - join_id_df = orig_id_df.join(records_df, orig_id_df['_1'] == records_df['orig_id']) - - # rewrite _1 as new id for Record - new_id_df = join_id_df.withColumn('_1', join_id_df['_id']['oid']) - - # select only what's needed - new_id_df = new_id_df.select('_1','_2') - - # convert back to RDD - new_id_rdd = new_id_df.rdd - - # update db_id in JSON destined for ES - def update_db_id_udf(row): - - # load json - d = json.loads(row['_2']) - - # set identifiers - d['db_id'] = row['_1'] - d['temp_id'] = row['_1'] - - # convert lists to tuples - for k,v in d.items(): - if type(v) == list: - d[k] = tuple(v) - - return (row['_1'], d) - - new_id_rdd = new_id_rdd.map(lambda row: update_db_id_udf(row)) - - # create index in advance - index_name = 'j%s' % clone_job_id - es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST]) - if not es_handle_temp.indices.exists(index_name): - - # put combine es index templates - template_body = { - 'template':'*', - 'settings':{ - 'number_of_shards':1, - 'number_of_replicas':0, - 'refresh_interval':-1 - }, - 'mappings':{ - 'record':{ - 'date_detection':False, - 'properties':{ - 'combine_db_id':{ - 'type':'integer' - } - } - } - } - } - es_handle_temp.indices.put_template('combine_template', body=json.dumps(template_body)) - - # create index - es_handle_temp.indices.create(index_name) - - # index - new_id_rdd.saveAsNewAPIHadoopFile( - path='-', - 
outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", - keyClass="org.apache.hadoop.io.NullWritable", - valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", - conf={ - "es.resource":"%s/record" % index_name, - "es.nodes":"%s:9200" % settings.ES_HOST, - "es.mapping.exclude":"temp_id", - "es.mapping.id":"temp_id", - } - ) - - - - - - - - - - - - - + Args: + kwargs(dict): + - import_path (str): string of unzipped export directory on disk + - import_manifest (dict): dictionary containing import information, including hash of old:new primary keys + ''' + def spark_function(self): + # import records + self._import_records() + # import validations + self._import_validations() + # import mapped fields (ES) + self._import_mapped_fields() + + def _import_records(self): + ''' + Method to import records to Mongo + ''' + + # import records + self.update_jobGroup(self.import_manifest.get('import_id', uuid.uuid4().hex), 'StateIO: Importing Records') + + # loop through jobs + for orig_job_id, clone_job_id in self.import_manifest['pk_hash']['jobs'].items(): + + # assemple location of export + records_json_filepath = '%s/record_exports/j%s_mongo_records.json' % (self.import_path, orig_job_id) + + # load as dataframe + records_df = self.spark.read.json(records_json_filepath) + + # copy original _id + records_df = records_df \ + .withColumn('orig_id', records_df['_id']['$oid']) + + # flatten fingerprint column + try: + records_df = records_df.withColumn('fingerprint', records_df.fingerprint['$numberLong']) + except: + records_df = records_df.withColumn('fingerprint', records_df.fingerprint) + + # update job_id + records_df = records_df.withColumn('job_id', pyspark_sql_functions.lit(int(clone_job_id))) + # write records to MongoDB, dropping _id in process + records_df.select([col for col in records_df.columns if col != '_id']) \ + .write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() + + def _import_validations(self): + + ''' + Method to import validations to Mongo + ''' + + # import validations + self.update_jobGroup(self.import_manifest.get('import_id', uuid.uuid4().hex), 'StateIO: Importing Validations') + + # loop through jobs + for orig_job_id, clone_job_id in self.import_manifest['pk_hash']['jobs'].items(): + + # assemple location of export + validations_json_filepath = '%s/validation_exports/j%s_mongo_validations.json' % ( + self.import_path, orig_job_id) + + # load as dataframe + validations_df = self.spark.read.json(validations_json_filepath) + + # check for dataframe rows to proceed + if len(validations_df.take(1)) > 0: + # read first row to get old validation_scenario_id, and run through pk_hash for new one + row = validations_df.take(1)[0] + vs_id = int(row.validation_scenario_id['$numberLong']) + new_vs_id = self.import_manifest['pk_hash']['validations'][vs_id] + + # flatten record_id + validations_df = validations_df.withColumn('record_id', validations_df['record_id']['$oid']) + + # retrieve newly written records for this Job + pipeline = json.dumps({'$match': {'job_id': clone_job_id, 'success': True}}) + records_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", + 
settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # join on validations_df.record_id : records_df.orig_id + updated_validations_df = validations_df.drop('_id').alias('validations_df').join( + records_df.select('_id', 'orig_id').alias('records_df'), + validations_df['record_id'] == records_df['orig_id']) + + # update record_id + updated_validations_df = updated_validations_df.withColumn('record_id', updated_validations_df['_id']) + + # limit to validation columns + updated_validations_df = updated_validations_df.select(validations_df.columns).drop('_id') + + # flatten + updated_validations_df = updated_validations_df.withColumn('fail_count', + updated_validations_df.fail_count[ + '$numberLong'].cast(LongType())) + updated_validations_df = updated_validations_df.withColumn('job_id', pyspark_sql_functions.lit( + int(clone_job_id)).cast(LongType())) + + # update validation scenario id + updated_validations_df = updated_validations_df.withColumn('validation_scenario_id', + pyspark_sql_functions.lit( + int(new_vs_id)).cast(LongType())) + + # write records to MongoDB + updated_validations_df.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record_validation").save() + + def _import_mapped_fields(self, reindex=True): + + ''' + Method to import mapped fields to ElasticSearch + - re-map and index, based on saved Job params + - inefficient to export/import ElasticSearch records, when modifying values + + QUESTION: Why is this partitioned to 200, when reading from Mongo appears to be + the same for Re-Indexing? + ''' + + # import mapped fields + self.update_jobGroup(self.import_manifest.get('import_id', uuid.uuid4().hex), + 'StateIO: Importing Mapped Fields') + + # loop through jobs + for orig_job_id, clone_job_id in self.import_manifest['pk_hash']['jobs'].items(): + + # re-index (default) + if reindex: + + # get job and set to self + job = Job.objects.get(pk=int(clone_job_id)) + + # get records as DF + pipeline = json.dumps({'$match': {'job_id': job.id}}) + records_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", + settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() + + # reindex + ESIndex.index_job_to_es_spark( + self.spark, + job=job, + records_df=records_df, + field_mapper_config=job.job_details_dict.get('field_mapper_config') + ) + + # else, fallback on slower, import of serialized records + else: + + # assemple location of export + mapped_fields_json_filepath = '%s/mapped_fields_exports/j%s_mapped_fields.json' % ( + self.import_path, orig_job_id) + + # read raw JSON lines + json_lines_rdd = self.spark.sparkContext.textFile(mapped_fields_json_filepath) + + # parse to expose record db_id + def parser_udf(row): + + # parse JSON + d = json.loads(row) + + # return tuple with exposed original id + return (d['db_id'], row) + + orig_id_rdd = json_lines_rdd.map(lambda row: parser_udf(row)) + + # to dataframe for join + orig_id_df = orig_id_rdd.toDF() + + # retrieve newly written records for this Job + pipeline = json.dumps({'$match': {'job_id': clone_job_id, 'success': True}}) + records_df = 
self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", 4) \ + .option("pipeline", pipeline).load() + + # join on id + join_id_df = orig_id_df.join(records_df, orig_id_df['_1'] == records_df['orig_id']) + + # rewrite _1 as new id for Record + new_id_df = join_id_df.withColumn('_1', join_id_df['_id']['oid']) + + # select only what's needed + new_id_df = new_id_df.select('_1', '_2') + + # convert back to RDD + new_id_rdd = new_id_df.rdd + + # update db_id in JSON destined for ES + def update_db_id_udf(row): + + # load json + d = json.loads(row['_2']) + + # set identifiers + d['db_id'] = row['_1'] + d['temp_id'] = row['_1'] + + # convert lists to tuples + for k, v in d.items(): + if type(v) == list: + d[k] = tuple(v) + + return (row['_1'], d) + + new_id_rdd = new_id_rdd.map(lambda row: update_db_id_udf(row)) + + # create index in advance + index_name = 'j%s' % clone_job_id + es_handle_temp = Elasticsearch(hosts=[settings.ES_HOST]) + if not es_handle_temp.indices.exists(index_name): + # put combine es index templates + template_body = { + 'template': '*', + 'settings': { + 'number_of_shards': 1, + 'number_of_replicas': 0, + 'refresh_interval': -1 + }, + 'mappings': { + 'record': { + 'date_detection': False, + 'properties': { + 'combine_db_id': { + 'type': 'integer' + } + } + } + } + } + es_handle_temp.indices.put_template('combine_template', body=json.dumps(template_body)) + + # create index + es_handle_temp.indices.create(index_name) + + # index + new_id_rdd.saveAsNewAPIHadoopFile( + path='-', + outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat", + keyClass="org.apache.hadoop.io.NullWritable", + valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", + conf={ + "es.resource": "%s/record" % index_name, + "es.nodes": "%s:9200" % settings.ES_HOST, + "es.mapping.exclude": "temp_id", + "es.mapping.id": "temp_id", + } + ) diff --git a/core/spark/record_validation.py b/core/spark/record_validation.py index c3b12dca..d7dc208a 100644 --- a/core/spark/record_validation.py +++ b/core/spark/record_validation.py @@ -19,9 +19,9 @@ # import from core.spark try: - from utils import PythonUDFRecord, refresh_django_db_connection + from utils import PythonUDFRecord, refresh_django_db_connection except: - from core.spark.utils import PythonUDFRecord, refresh_django_db_connection + from core.spark.utils import PythonUDFRecord, refresh_django_db_connection # init django settings file to retrieve settings os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' @@ -35,539 +35,522 @@ from core.models import Job, ValidationScenario - #################################################################### # Record Validation # #################################################################### class ValidationScenarioSpark(object): + ''' + Class to organize methods and attributes used for running validation scenarios + ''' + + def __init__(self, + spark=None, + job=None, + records_df=None, + validation_scenarios=None): - ''' - Class to organize methods and attributes used for running validation scenarios - ''' - - def __init__(self, - spark=None, - job=None, - records_df=None, - validation_scenarios=None): - - ''' - Args: - spark (pyspark.sql.session.SparkSession): spark instance from static job methods - job (core.models.Job): Job instance - 
records_df (pyspark.sql.DataFrame): records as pyspark DataFrame - validation_scenarios (list): list of ValidationScenario job ids as integers - ''' - - self.spark = spark - self.job = job - self.records_df = records_df - self.validation_scenarios = validation_scenarios - - # init logging support - spark.sparkContext.setLogLevel('INFO') - log4jLogger = spark.sparkContext._jvm.org.apache.log4j - self.logger = log4jLogger.LogManager.getLogger(__name__) - - - def run_record_validation_scenarios(self): - - ''' - Function to run validation scenarios - Results are written to RecordValidation table, one result, per record, per failed validation test. - - Validation tests may be of type: - - 'sch': Schematron based validation, performed with lxml etree - - 'python': custom python code snippets - - Args: - None - - Returns: - None - - writes validation fails to RecordValidation table - ''' - - # refresh Django DB Connection - refresh_django_db_connection() - - # loop through validation scenarios and fire validation type specific method - failure_rdds = [] - for vs_id in self.validation_scenarios: - - # get validation scenario - vs = ValidationScenario.objects.get(pk=int(vs_id)) - vs_id = vs.id - vs_name = vs.name - vs_filepath = vs.filepath - - # schematron based validation scenario - if vs.validation_type == 'sch': - validation_fails_rdd = self._sch_validation(vs, vs_id, vs_name, vs_filepath) - - # python based validation scenario - elif vs.validation_type == 'python': - validation_fails_rdd = self._python_validation(vs, vs_id, vs_name, vs_filepath) - - # ElasticSearch DSL query based validation scenario - elif vs.validation_type == 'es_query': - validation_fails_rdd = self._es_query_validation(vs, vs_id, vs_name, vs_filepath) - - # XML Schema (XSD) based validation scenario - elif vs.validation_type == 'xsd': - validation_fails_rdd = self._xsd_validation(vs, vs_id, vs_name, vs_filepath) - - # if results, append - if validation_fails_rdd and not validation_fails_rdd.isEmpty(): - failure_rdds.append(validation_fails_rdd) - - # if rdds, union and write - if len(failure_rdds) > 0: - - # merge rdds - failures_union_rdd = self.spark.sparkContext.union(failure_rdds) - failures_df = failures_union_rdd.toDF() - - # write failures - failures_df.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record_validation").save() - - # update validity for Job - self.update_job_record_validity() - - - def _sch_validation(self, vs, vs_id, vs_name, vs_filepath): - - self.logger.info('running schematron validation: %s' % vs.name) - - def validate_schematron_pt_udf(pt): - - # parse schematron - sct_doc = etree.parse(vs_filepath) - validator = isoschematron.Schematron(sct_doc, store_report=True) - - for row in pt: - - try: - - # get document xml - record_xml = etree.fromstring(row.document.encode('utf-8')) - - # validate - is_valid = validator.validate(record_xml) - - # if not valid, prepare Row - if not is_valid: - - # prepare results_dict - results_dict = { - 'fail_count':0, - 'failed':[] - } - - # get failed - report_root = validator.validation_report.getroot() - fails = report_root.findall('svrl:failed-assert', namespaces=report_root.nsmap) - - # log fail_count - results_dict['fail_count'] = len(fails) - - # loop through fails and add to dictionary - for fail in fails: - fail_text_elem = fail.find('svrl:text', namespaces=fail.nsmap) - results_dict['failed'].append(fail_text_elem.text) 
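# Standalone sketch of what the schematron partition UDF does per record, outside Spark.
# The scenario path and record XML are placeholders; the lxml calls mirror the code above.
from lxml import etree, isoschematron

sct_doc = etree.parse('/tmp/example_scenario.sch')            # hypothetical scenario file
validator = isoschematron.Schematron(sct_doc, store_report=True)

record_xml = etree.fromstring(b'<record><title>Example</title></record>')
if not validator.validate(record_xml):
    report_root = validator.validation_report.getroot()
    fails = report_root.findall('svrl:failed-assert', namespaces=report_root.nsmap)
    failed_messages = [f.find('svrl:text', namespaces=f.nsmap).text for f in fails]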
- - yield Row( - record_id=row._id, - record_identifier=row.record_id, - job_id=row.job_id, - validation_scenario_id=int(vs_id), - validation_scenario_name=vs_name, - valid=False, - results_payload=json.dumps(results_dict), - fail_count=results_dict['fail_count'] - ) - - except Exception as e: - - results_dict = { - 'fail_count':0, - 'failed':[] - } - results_dict['fail_count'] += 1 - results_dict['failed'].append("Schematron validation exception: %s" % (str(e))) - - yield Row( - record_id=row._id, - record_identifier=row.record_id, - job_id=row.job_id, - validation_scenario_id=int(vs_id), - validation_scenario_name=vs_name, - valid=False, - results_payload=json.dumps(results_dict), - fail_count=results_dict['fail_count'] - ) - - # run pt_udf map - validation_fails_rdd = self.records_df.rdd.mapPartitions(validate_schematron_pt_udf).filter(lambda row: row is not None) - - # return - return validation_fails_rdd - - - def _python_validation(self, vs, vs_id, vs_name, vs_filepath): - - self.logger.info('running python validation: %s' % vs.name) - - def validate_python_udf(vs_id, vs_name, pyvs_funcs, row): - - ''' - Loop through test functions and aggregate in fail_dict to return with Row - - Args: - vs_id (int): integer of validation scenario - pyvs_funcs (list): list of functions imported from user created python validation scenario payload - row (): - ''' - - # prepare row as parsed document with PythonUDFRecord class - prvb = PythonUDFRecord(row) - - # prepare results_dict - results_dict = { - 'fail_count':0, - 'failed':[] - } - - # loop through functions - for func in pyvs_funcs: - - # get name as string - func_name = func.__name__ - - # get func test message - func_signature = signature(func) - t_msg = func_signature.parameters['test_message'].default - - # attempt to run user-defined validation function - try: - - # run test - test_result = func(prvb) - - # if fail, append - if test_result != True: - - # bump fail count - results_dict['fail_count'] += 1 - - # if custom message override provided, use - if test_result != False: - results_dict['failed'].append(test_result) - - # else, default to test message - else: - results_dict['failed'].append(t_msg) - - # if problem, report as failure with Exception string - except Exception as e: - results_dict['fail_count'] += 1 - results_dict['failed'].append("test '%s' had exception: %s" % (func_name, str(e))) - - # if failed, return Row - if results_dict['fail_count'] > 0: - - # return row - return Row( - record_id=row._id, - record_identifier=row.record_id, - job_id=row.job_id, - validation_scenario_id=int(vs_id), - validation_scenario_name=vs_name, - valid=False, - results_payload=json.dumps(results_dict), - fail_count=results_dict['fail_count'] - ) - - # parse user defined functions from validation scenario payload - temp_pyvs = ModuleType('temp_pyvs') - exec(vs.payload, temp_pyvs.__dict__) - - # get defined functions - pyvs_funcs = [] - test_labeled_attrs = [ attr for attr in dir(temp_pyvs) if attr.lower().startswith('test') ] - for attr in test_labeled_attrs: - attr = getattr(temp_pyvs, attr) - if isfunction(attr): - pyvs_funcs.append(attr) - - validation_fails_rdd = self.records_df.rdd.\ - map(lambda row: validate_python_udf(vs_id, vs_name, pyvs_funcs, row))\ - .filter(lambda row: row is not None) - - # return - return validation_fails_rdd - - - def _es_query_validation(self, vs, vs_id, vs_name, vs_filepath): - - self.logger.info('running es_query validation: %s' % vs.name) - - # set es index - # TODO: how handle published Jobs? 
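# Hypothetical example of a 'python' type ValidationScenario payload, written the way
# _python_validation executes it: any function whose name starts with 'test' is run against
# a PythonUDFRecord; returning True passes, returning False fails with the default
# test_message, and any other return value is used as a custom failure message.
def test_has_title(record, test_message="record has no title element"):
    titles = record.xml.xpath('//dc:title',
                              namespaces={'dc': 'http://purl.org/dc/elements/1.1/'})
    return True if len(titles) > 0 else False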
- es_index = 'j%s' % self.job.id - - # loads validation payload as dictionary - validations = json.loads(vs.payload) - - # failure dfs - fail_dfs = [] - - # loop through validations - for v in validations: - - # prepare query - es_val_query = json.dumps(v['es_query']) - - # perform es query - es_rdd = self.spark.sparkContext.newAPIHadoopRDD( - inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", - keyClass="org.apache.hadoop.io.NullWritable", - valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", - conf={ - "es.resource":"%s/record" % es_index, - "es.query":es_val_query, - "es.read.field.include":"db_id" - } - ) - - # if query is not empty, map to DataFrame - if not es_rdd.isEmpty(): - es_df = es_rdd.map(lambda row: (row[1]['db_id'], )).toDF() - - # handle validity matching - # NOTE: matching on records_df['_id']['oid'] to get str cast of Mongo ObjectId - # if a match is valid, report all Records that don't match - if v['matches'] == 'valid': - - # if empty, assume all Records in Job are invalid - if es_rdd.isEmpty(): - fail_df = self.records_df.select('_id','record_id') - - # else, perform join - else: - fail_df = self.records_df.alias('records_df').join(es_df, self.records_df['_id']['oid'] == es_df['_1'], 'leftanti').select('_id','records_df.record_id') - - # if a match is invalid, report all Records that match - elif v['matches'] == 'invalid': - - # if empty, nothing to report, return None - if es_rdd.isEmpty(): - return None - - # else, perform join - else: - fail_df = self.records_df.alias('records_df').join(es_df, self.records_df['_id']['oid'] == es_df['_1'], 'leftsemi').select('_id','records_df.record_id') - - # add columns to df to return - fail_df = fail_df.withColumn('failed', pyspark_sql_functions.array(pyspark_sql_functions.lit(v['test_name']))) - fail_df = fail_df.withColumn('fail_count', pyspark_sql_functions.lit(1)) - - # append to validations dictionary - fail_dfs.append(fail_df) - - # if dataframes to reduce and return, perform - if len(fail_dfs) > 0: - - # merge and format - new_df = reduce(lambda a, b: a.unionAll(b), fail_dfs)\ - .select("_id", "record_id", pyspark_sql_functions.explode("failed").alias("failed_values"), "fail_count")\ - .groupBy("_id","record_id")\ - .agg(pyspark_sql_functions.collect_list("failed_values").alias("failed"), pyspark_sql_functions.sum("fail_count").alias("fail_count"))\ - .select("_id", "record_id", pyspark_sql_functions.to_json(pyspark_sql_functions.struct("failed", "fail_count")).alias("data"), "fail_count") - - # write return failures as validation_fails_rdd - job_id = self.job.id - validation_fails_rdd = new_df.rdd.map(lambda row: Row( - record_id=row._id, - record_identifier=row.record_id, - job_id=job_id, - validation_scenario_id=int(vs_id), - validation_scenario_name=vs_name, - valid=False, - results_payload=row.data, - fail_count=int(row['fail_count'])) - ) - return validation_fails_rdd - - else: - return None - - - def _xsd_validation(self, vs, vs_id, vs_name, vs_filepath): - - self.logger.info('running xsd validation: %s' % vs.name) - - def validate_xsd_pt_udf(pt): - - # parse xsd - xmlschema_doc = etree.parse(vs_filepath) - xmlschema = etree.XMLSchema(xmlschema_doc) - - for row in pt: - - try: - - # get document xml - record_xml = etree.fromstring(row.document.encode('utf-8')) - - # validate - try: - xmlschema.assertValid(record_xml) - - except etree.DocumentInvalid as e: - - # prepare results_dict - results_dict = { - 'fail_count':1, - 'failed':[str(e)] - } - - yield Row( - record_id=row._id, - 
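# Hypothetical example of an 'es_query' type ValidationScenario payload: a JSON list of tests,
# each pairing an ElasticSearch DSL query with 'matches' ('valid' or 'invalid') and a
# 'test_name', matching the keys read in _es_query_validation. The mapped field name
# 'dc_title' is a placeholder.
import json

payload = json.dumps([
    {
        "test_name": "has title mapped field",
        "matches": "valid",
        "es_query": {"query": {"exists": {"field": "dc_title"}}}
    }
])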
record_identifier=row.record_id, - job_id=row.job_id, - validation_scenario_id=int(vs_id), - validation_scenario_name=vs_name, - valid=False, - results_payload=json.dumps(results_dict), - fail_count=results_dict['fail_count'] - ) - - except Exception as e: - - results_dict = { - 'fail_count':1, - 'failed':[] - } - results_dict['failed'].append("XSD validation exception: %s" % (str(e))) - - yield Row( - record_id=row._id, - record_identifier=row.record_id, - job_id=row.job_id, - validation_scenario_id=int(vs_id), - validation_scenario_name=vs_name, - valid=False, - results_payload=json.dumps(results_dict), - fail_count=results_dict['fail_count'] - ) - - # run pt_udf map - validation_fails_rdd = self.records_df.rdd.mapPartitions(validate_xsd_pt_udf).filter(lambda row: row is not None) - - # return - return validation_fails_rdd - - - def remove_validation_scenarios(self): - - ''' - Method to update validity attribute of records after removal of validation scenarios - - approach is to update all INVALID Records that may now be valid by lack of - matching record_id in remaining validation failures - ''' - - # read current failures from Mongo - failures_pipeline = json.dumps({'$match': {'job_id': self.job.id}}) - failures_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record_validation")\ - .option("pipeline",failures_pipeline).load() - - # if failures to work with, rewrite records with valid = True if NOT in remaining DB failures - if not failures_df.rdd.isEmpty(): - set_valid_df = self.records_df.alias('records_df').join( - failures_df.select('record_id').distinct().alias('failures_df'), - failures_df['record_id'] == self.records_df['_id'], - 'leftanti')\ - .select(self.records_df.columns)\ - .withColumn('valid',pyspark_sql_functions.lit(True)) - else: - # will write all previously invalid, as valid - set_valid_df = self.records_df.withColumn('valid',pyspark_sql_functions.lit(True)) - - # update validity of Records - set_valid_df.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() - - - def update_job_record_validity(self): - - ''' - Method to update validity of Records in Job based on found RecordValidadtions - ''' - - # get failures - pipeline = json.dumps({'$match':{'$and':[{'job_id': self.job.id}]}}) - all_failures_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection","record_validation")\ - .option("partitioner","MongoSamplePartitioner")\ - .option("spark.mongodb.input.partitionerOptions.partitionSizeMB",settings.MONGO_READ_PARTITION_SIZE_MB)\ - .option("pipeline",pipeline).load()\ - .select('record_id')\ - .withColumnRenamed('record_id','fail_id') - - # join, writing potentially null `fail_id` column - fail_join = self.records_df.alias('records_df').join( - all_failures_df.select('fail_id').distinct().alias('all_failures_df'), - self.records_df['_id'] == all_failures_df['fail_id'], - 'leftouter') - - # set valid column based on join and drop column - updated_validity = fail_join.withColumn('update_valid', pyspark_sql_functions.when(fail_join['fail_id'].isNotNull(), False).otherwise(True)) - - # subset those that need updating and flip validity - to_update = 
updated_validity.where(updated_validity['valid'] != updated_validity['update_valid'])\ - .select(self.records_df.columns)\ - .withColumn('valid', pyspark_sql_functions.when(self.records_df.valid == True, False).otherwise(True)) - - # update in DB by overwriting - to_update.write.format("com.mongodb.spark.sql.DefaultSource")\ - .mode("append")\ - .option("uri","mongodb://%s" % settings.MONGO_HOST)\ - .option("database","combine")\ - .option("collection", "record").save() - - - def export_job_validation_report(self): - - pass - - - + ''' + Args: + spark (pyspark.sql.session.SparkSession): spark instance from static job methods + job (core.models.Job): Job instance + records_df (pyspark.sql.DataFrame): records as pyspark DataFrame + validation_scenarios (list): list of ValidationScenario job ids as integers + ''' + self.spark = spark + self.job = job + self.records_df = records_df + self.validation_scenarios = validation_scenarios + # init logging support + spark.sparkContext.setLogLevel('INFO') + log4jLogger = spark.sparkContext._jvm.org.apache.log4j + self.logger = log4jLogger.LogManager.getLogger(__name__) + def run_record_validation_scenarios(self): + ''' + Function to run validation scenarios + Results are written to RecordValidation table, one result, per record, per failed validation test. + Validation tests may be of type: + - 'sch': Schematron based validation, performed with lxml etree + - 'python': custom python code snippets + Args: + None + + Returns: + None + - writes validation fails to RecordValidation table + ''' + + # refresh Django DB Connection + refresh_django_db_connection() + + # loop through validation scenarios and fire validation type specific method + failure_rdds = [] + for vs_id in self.validation_scenarios: + + # get validation scenario + vs = ValidationScenario.objects.get(pk=int(vs_id)) + vs_id = vs.id + vs_name = vs.name + vs_filepath = vs.filepath + + # schematron based validation scenario + if vs.validation_type == 'sch': + validation_fails_rdd = self._sch_validation(vs, vs_id, vs_name, vs_filepath) + + # python based validation scenario + elif vs.validation_type == 'python': + validation_fails_rdd = self._python_validation(vs, vs_id, vs_name, vs_filepath) + + # ElasticSearch DSL query based validation scenario + elif vs.validation_type == 'es_query': + validation_fails_rdd = self._es_query_validation(vs, vs_id, vs_name, vs_filepath) + + # XML Schema (XSD) based validation scenario + elif vs.validation_type == 'xsd': + validation_fails_rdd = self._xsd_validation(vs, vs_id, vs_name, vs_filepath) + + # if results, append + if validation_fails_rdd and not validation_fails_rdd.isEmpty(): + failure_rdds.append(validation_fails_rdd) + + # if rdds, union and write + if len(failure_rdds) > 0: + # merge rdds + failures_union_rdd = self.spark.sparkContext.union(failure_rdds) + failures_df = failures_union_rdd.toDF() + + # write failures + failures_df.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record_validation").save() + + # update validity for Job + self.update_job_record_validity() + + def _sch_validation(self, vs, vs_id, vs_name, vs_filepath): + + self.logger.info('running schematron validation: %s' % vs.name) + + def validate_schematron_pt_udf(pt): + + # parse schematron + sct_doc = etree.parse(vs_filepath) + validator = isoschematron.Schematron(sct_doc, store_report=True) + + for row in pt: + + try: + + # get document 
xml + record_xml = etree.fromstring(row.document.encode('utf-8')) + + # validate + is_valid = validator.validate(record_xml) + # if not valid, prepare Row + if not is_valid: + # prepare results_dict + results_dict = { + 'fail_count': 0, + 'failed': [] + } + # get failed + report_root = validator.validation_report.getroot() + fails = report_root.findall('svrl:failed-assert', namespaces=report_root.nsmap) + # log fail_count + results_dict['fail_count'] = len(fails) + # loop through fails and add to dictionary + for fail in fails: + fail_text_elem = fail.find('svrl:text', namespaces=fail.nsmap) + results_dict['failed'].append(fail_text_elem.text) + yield Row( + record_id=row._id, + record_identifier=row.record_id, + job_id=row.job_id, + validation_scenario_id=int(vs_id), + validation_scenario_name=vs_name, + valid=False, + results_payload=json.dumps(results_dict), + fail_count=results_dict['fail_count'] + ) + + except Exception as e: + + results_dict = { + 'fail_count': 0, + 'failed': [] + } + results_dict['fail_count'] += 1 + results_dict['failed'].append("Schematron validation exception: %s" % (str(e))) + + yield Row( + record_id=row._id, + record_identifier=row.record_id, + job_id=row.job_id, + validation_scenario_id=int(vs_id), + validation_scenario_name=vs_name, + valid=False, + results_payload=json.dumps(results_dict), + fail_count=results_dict['fail_count'] + ) + + # run pt_udf map + validation_fails_rdd = self.records_df.rdd.mapPartitions(validate_schematron_pt_udf).filter( + lambda row: row is not None) + + # return + return validation_fails_rdd + + def _python_validation(self, vs, vs_id, vs_name, vs_filepath): + + self.logger.info('running python validation: %s' % vs.name) + + def validate_python_udf(vs_id, vs_name, pyvs_funcs, row): + + ''' + Loop through test functions and aggregate in fail_dict to return with Row + Args: + vs_id (int): integer of validation scenario + pyvs_funcs (list): list of functions imported from user created python validation scenario payload + row (): + ''' + # prepare row as parsed document with PythonUDFRecord class + prvb = PythonUDFRecord(row) + # prepare results_dict + results_dict = { + 'fail_count': 0, + 'failed': [] + } + + # loop through functions + for func in pyvs_funcs: + + # get name as string + func_name = func.__name__ + + # get func test message + func_signature = signature(func) + t_msg = func_signature.parameters['test_message'].default + + # attempt to run user-defined validation function + try: + + # run test + test_result = func(prvb) + + # if fail, append + if test_result != True: + + # bump fail count + results_dict['fail_count'] += 1 + + # if custom message override provided, use + if test_result != False: + results_dict['failed'].append(test_result) + + # else, default to test message + else: + results_dict['failed'].append(t_msg) + + # if problem, report as failure with Exception string + except Exception as e: + results_dict['fail_count'] += 1 + results_dict['failed'].append("test '%s' had exception: %s" % (func_name, str(e))) + + # if failed, return Row + if results_dict['fail_count'] > 0: + # return row + return Row( + record_id=row._id, + record_identifier=row.record_id, + job_id=row.job_id, + validation_scenario_id=int(vs_id), + validation_scenario_name=vs_name, + valid=False, + results_payload=json.dumps(results_dict), + fail_count=results_dict['fail_count'] + ) + + # parse user defined functions from validation scenario payload + temp_pyvs = ModuleType('temp_pyvs') + exec(vs.payload, temp_pyvs.__dict__) + + # get defined 
functions + pyvs_funcs = [] + test_labeled_attrs = [attr for attr in dir(temp_pyvs) if attr.lower().startswith('test')] + for attr in test_labeled_attrs: + attr = getattr(temp_pyvs, attr) + if isfunction(attr): + pyvs_funcs.append(attr) + + validation_fails_rdd = self.records_df.rdd. \ + map(lambda row: validate_python_udf(vs_id, vs_name, pyvs_funcs, row)) \ + .filter(lambda row: row is not None) + + # return + return validation_fails_rdd + + def _es_query_validation(self, vs, vs_id, vs_name, vs_filepath): + + self.logger.info('running es_query validation: %s' % vs.name) + + # set es index + # TODO: how handle published Jobs? + es_index = 'j%s' % self.job.id + + # loads validation payload as dictionary + validations = json.loads(vs.payload) + + # failure dfs + fail_dfs = [] + + # loop through validations + for v in validations: + + # prepare query + es_val_query = json.dumps(v['es_query']) + + # perform es query + es_rdd = self.spark.sparkContext.newAPIHadoopRDD( + inputFormatClass="org.elasticsearch.hadoop.mr.EsInputFormat", + keyClass="org.apache.hadoop.io.NullWritable", + valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", + conf={ + "es.resource": "%s/record" % es_index, + "es.query": es_val_query, + "es.read.field.include": "db_id" + } + ) + + # if query is not empty, map to DataFrame + if not es_rdd.isEmpty(): + es_df = es_rdd.map(lambda row: (row[1]['db_id'],)).toDF() + + # handle validity matching + # NOTE: matching on records_df['_id']['oid'] to get str cast of Mongo ObjectId + # if a match is valid, report all Records that don't match + if v['matches'] == 'valid': + + # if empty, assume all Records in Job are invalid + if es_rdd.isEmpty(): + fail_df = self.records_df.select('_id', 'record_id') + + # else, perform join + else: + fail_df = self.records_df.alias('records_df').join(es_df, + self.records_df['_id']['oid'] == es_df['_1'], + 'leftanti').select('_id', 'records_df.record_id') + + # if a match is invalid, report all Records that match + elif v['matches'] == 'invalid': + + # if empty, nothing to report, return None + if es_rdd.isEmpty(): + return None + + # else, perform join + else: + fail_df = self.records_df.alias('records_df').join(es_df, + self.records_df['_id']['oid'] == es_df['_1'], + 'leftsemi').select('_id', 'records_df.record_id') + + # add columns to df to return + fail_df = fail_df.withColumn('failed', + pyspark_sql_functions.array(pyspark_sql_functions.lit(v['test_name']))) + fail_df = fail_df.withColumn('fail_count', pyspark_sql_functions.lit(1)) + + # append to validations dictionary + fail_dfs.append(fail_df) + + # if dataframes to reduce and return, perform + if len(fail_dfs) > 0: + + # merge and format + new_df = reduce(lambda a, b: a.unionAll(b), fail_dfs) \ + .select("_id", "record_id", pyspark_sql_functions.explode("failed").alias("failed_values"), + "fail_count") \ + .groupBy("_id", "record_id") \ + .agg(pyspark_sql_functions.collect_list("failed_values").alias("failed"), + pyspark_sql_functions.sum("fail_count").alias("fail_count")) \ + .select("_id", "record_id", + pyspark_sql_functions.to_json(pyspark_sql_functions.struct("failed", "fail_count")).alias( + "data"), "fail_count") + + # write return failures as validation_fails_rdd + job_id = self.job.id + validation_fails_rdd = new_df.rdd.map(lambda row: Row( + record_id=row._id, + record_identifier=row.record_id, + job_id=job_id, + validation_scenario_id=int(vs_id), + validation_scenario_name=vs_name, + valid=False, + results_payload=row.data, + fail_count=int(row['fail_count'])) + ) + return 
validation_fails_rdd + + else: + return None + + def _xsd_validation(self, vs, vs_id, vs_name, vs_filepath): + + self.logger.info('running xsd validation: %s' % vs.name) + + def validate_xsd_pt_udf(pt): + + # parse xsd + xmlschema_doc = etree.parse(vs_filepath) + xmlschema = etree.XMLSchema(xmlschema_doc) + + for row in pt: + + try: + + # get document xml + record_xml = etree.fromstring(row.document.encode('utf-8')) + + # validate + try: + xmlschema.assertValid(record_xml) + + except etree.DocumentInvalid as e: + + # prepare results_dict + results_dict = { + 'fail_count': 1, + 'failed': [str(e)] + } + + yield Row( + record_id=row._id, + record_identifier=row.record_id, + job_id=row.job_id, + validation_scenario_id=int(vs_id), + validation_scenario_name=vs_name, + valid=False, + results_payload=json.dumps(results_dict), + fail_count=results_dict['fail_count'] + ) + + except Exception as e: + + results_dict = { + 'fail_count': 1, + 'failed': [] + } + results_dict['failed'].append("XSD validation exception: %s" % (str(e))) + + yield Row( + record_id=row._id, + record_identifier=row.record_id, + job_id=row.job_id, + validation_scenario_id=int(vs_id), + validation_scenario_name=vs_name, + valid=False, + results_payload=json.dumps(results_dict), + fail_count=results_dict['fail_count'] + ) + + # run pt_udf map + validation_fails_rdd = self.records_df.rdd.mapPartitions(validate_xsd_pt_udf).filter( + lambda row: row is not None) + + # return + return validation_fails_rdd + + def remove_validation_scenarios(self): + + ''' + Method to update validity attribute of records after removal of validation scenarios + - approach is to update all INVALID Records that may now be valid by lack of + matching record_id in remaining validation failures + ''' + + # read current failures from Mongo + failures_pipeline = json.dumps({'$match': {'job_id': self.job.id}}) + failures_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record_validation") \ + .option("pipeline", failures_pipeline).load() + + # if failures to work with, rewrite records with valid = True if NOT in remaining DB failures + if not failures_df.rdd.isEmpty(): + set_valid_df = self.records_df.alias('records_df').join( + failures_df.select('record_id').distinct().alias('failures_df'), + failures_df['record_id'] == self.records_df['_id'], + 'leftanti') \ + .select(self.records_df.columns) \ + .withColumn('valid', pyspark_sql_functions.lit(True)) + else: + # will write all previously invalid, as valid + set_valid_df = self.records_df.withColumn('valid', pyspark_sql_functions.lit(True)) + + # update validity of Records + set_valid_df.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() + + def update_job_record_validity(self): + + ''' + Method to update validity of Records in Job based on found RecordValidadtions + ''' + + # get failures + pipeline = json.dumps({'$match': {'$and': [{'job_id': self.job.id}]}}) + all_failures_df = self.spark.read.format("com.mongodb.spark.sql.DefaultSource") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record_validation") \ + .option("partitioner", "MongoSamplePartitioner") \ + .option("spark.mongodb.input.partitionerOptions.partitionSizeMB", 
settings.MONGO_READ_PARTITION_SIZE_MB) \ + .option("pipeline", pipeline).load() \ + .select('record_id') \ + .withColumnRenamed('record_id', 'fail_id') + + # join, writing potentially null `fail_id` column + fail_join = self.records_df.alias('records_df').join( + all_failures_df.select('fail_id').distinct().alias('all_failures_df'), + self.records_df['_id'] == all_failures_df['fail_id'], + 'leftouter') + + # set valid column based on join and drop column + updated_validity = fail_join.withColumn('update_valid', + pyspark_sql_functions.when(fail_join['fail_id'].isNotNull(), + False).otherwise(True)) + + # subset those that need updating and flip validity + to_update = updated_validity.where(updated_validity['valid'] != updated_validity['update_valid']) \ + .select(self.records_df.columns) \ + .withColumn('valid', pyspark_sql_functions.when(self.records_df.valid == True, False).otherwise(True)) + + # update in DB by overwriting + to_update.write.format("com.mongodb.spark.sql.DefaultSource") \ + .mode("append") \ + .option("uri", "mongodb://%s" % settings.MONGO_HOST) \ + .option("database", "combine") \ + .option("collection", "record").save() + + def export_job_validation_report(self): + + pass diff --git a/core/spark/utils.py b/core/spark/utils.py index 3ab11d81..34c08611 100644 --- a/core/spark/utils.py +++ b/core/spark/utils.py @@ -1,140 +1,133 @@ - # generic imports import django from lxml import etree import os import sys - # check for registered apps signifying readiness, if not, run django.setup() to run as standalone if not hasattr(django, 'apps'): - os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' - sys.path.append('/opt/combine') - django.setup() + os.environ['DJANGO_SETTINGS_MODULE'] = 'combine.settings' + sys.path.append('/opt/combine') + django.setup() # import django settings from django.conf import settings from django.db import connection - def refresh_django_db_connection(): - - ''' - Function to refresh connection to Django DB. - - Behavior with python files uploaded to Spark context via Livy is atypical when - it comes to opening/closing connections with MySQL. Specifically, if jobs are run farther - apart than MySQL's `wait_timeout` setting, it will result in the error, (2006, 'MySQL server has gone away'). + ''' + Function to refresh connection to Django DB. - Running this function before jobs ensures that the connection is fresh between these python files - operating in the Livy context, and Django's DB connection to MySQL. + Behavior with python files uploaded to Spark context via Livy is atypical when + it comes to opening/closing connections with MySQL. Specifically, if jobs are run farther + apart than MySQL's `wait_timeout` setting, it will result in the error, (2006, 'MySQL server has gone away'). - Args: - None + Running this function before jobs ensures that the connection is fresh between these python files + operating in the Livy context, and Django's DB connection to MySQL. 
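# Toy, self-contained illustration of the validity-flip join in update_job_record_validity
# above, using in-memory DataFrames instead of the Mongo-backed record collections.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master('local[1]').appName('validity-sketch').getOrCreate()

records = spark.createDataFrame([('r1', True), ('r2', True), ('r3', False)], ['_id', 'valid'])
failures = spark.createDataFrame([('r2',)], ['fail_id'])

# left outer join exposes a null fail_id for records with no remaining failures
joined = records.join(failures, records['_id'] == failures['fail_id'], 'leftouter')
updated = joined.withColumn('update_valid',
                            F.when(joined['fail_id'].isNotNull(), False).otherwise(True))

# only rows whose validity actually changed would be written back
updated.where(updated['valid'] != updated['update_valid']).select('_id', 'update_valid').show()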
- Returns: - None - ''' + Args: + None - connection.close() - connection.connect() + Returns: + None + ''' + connection.close() + connection.connect() class PythonUDFRecord(object): + ''' + Class to provide a slim-downed version of core.models.Record that is used for spark UDF functions, + and for previewing python based validations and transformations + ''' - ''' - Class to provide a slim-downed version of core.models.Record that is used for spark UDF functions, - and for previewing python based validations and transformations - ''' + def __init__(self, record_input, non_row_input=False, record_id=None, document=None): - def __init__(self, record_input, non_row_input=False, record_id=None, document=None): + ''' + Instantiated in one of two ways + 1) from a DB row representing a Record in its entirety + 2) manually passed record_id or document (or both), triggered by non_row_input Flag + - for example, this is used for testing record_id transformations + ''' - ''' - Instantiated in one of two ways - 1) from a DB row representing a Record in its entirety - 2) manually passed record_id or document (or both), triggered by non_row_input Flag - - for example, this is used for testing record_id transformations - ''' + if non_row_input: - if non_row_input: + # if record_id provided, set + if record_id: + self.record_id = record_id - # if record_id provided, set - if record_id: - self.record_id = record_id + # if document provided, set and parse + if document: + self.document = document - # if document provided, set and parse - if document: - self.document = document + try: - try: + # parse XML string, save + self.xml = etree.fromstring(self.document.encode('utf-8')) - # parse XML string, save - self.xml = etree.fromstring(self.document.encode('utf-8')) + # get namespace map, popping None values + _nsmap = self.xml.nsmap.copy() + try: + _nsmap.pop(None) + except: + pass + self.nsmap = _nsmap - # get namespace map, popping None values - _nsmap = self.xml.nsmap.copy() - try: - _nsmap.pop(None) - except: - pass - self.nsmap = _nsmap + except: - except: + self.xml = None + self.nsmap = None - self.xml = None - self.nsmap = None + else: - else: + # row + self._row = record_input - # row - self._row = record_input + # get db id + try: + self.id = self._row._id + except: + pass - # get db id - try: - self.id = self._row._id - except: - pass + # get record id + self.record_id = self._row.record_id - # get record id - self.record_id = self._row.record_id + # document string + self.document = self._row.document - # document string - self.document = self._row.document + # set error + self.error = self._row.error - # set error - self.error = self._row.error + try: - try: + # parse XML string, save + self.xml = etree.fromstring(self.document.encode('utf-8')) - # parse XML string, save - self.xml = etree.fromstring(self.document.encode('utf-8')) + # get namespace map, popping None values + _nsmap = self.xml.nsmap.copy() + try: + _nsmap.pop(None) + except: + pass + self.nsmap = _nsmap - # get namespace map, popping None values - _nsmap = self.xml.nsmap.copy() - try: - _nsmap.pop(None) - except: - pass - self.nsmap = _nsmap + # set inverted nsmap + self.nsmap_inv = {v: k for k, v in self.nsmap.items()} - # set inverted nsmap - self.nsmap_inv = {v: k for k, v in self.nsmap.items()} + except: - except: + self.xml = None + self.nsmap = None - self.xml = None - self.nsmap = None - def df_union_all(dfs): - - ''' - Function to merge list of DataFrames - ''' - - if len(dfs) > 1: - return 
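# Usage sketch for PythonUDFRecord above, using the non_row_input path intended for previewing
# validations and transformations. Assumes Combine's environment is importable (/opt/combine on
# sys.path with Django settings configured, as handled at the top of this module); the record
# id and XML string below are placeholders.
from core.spark.utils import PythonUDFRecord

doc = '<record xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:title>Example</dc:title></record>'
record = PythonUDFRecord(None, non_row_input=True, record_id='preview-id', document=doc)

record.nsmap                                                    # {'dc': 'http://purl.org/dc/elements/1.1/'}
record.xml.xpath('//dc:title/text()', namespaces=record.nsmap)  # ['Example']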
dfs[0].unionAll(df_union_all(dfs[1:])) - else: - return dfs[0] + ''' + Function to merge list of DataFrames + ''' + + if len(dfs) > 1: + return dfs[0].unionAll(df_union_all(dfs[1:])) + else: + return dfs[0] diff --git a/core/tasks.py b/core/tasks.py index b3e7f8d7..c41ff85d 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -19,6 +19,7 @@ # Get an instance of a logger import logging + logger = logging.getLogger(__name__) # import celery app @@ -30,1374 +31,1359 @@ # AWS import boto3 + # TODO: need some handling for failed Jobs which may not be available, but will not be changing, # to prevent infinite polling (https://github.com/WSULib/combine/issues/192) def spark_job_done(response): - return response['state'] == 'available' + return response['state'] == 'available' @celery_app.task() def delete_model_instance(instance_model, instance_id): + ''' + Background task to delete generic DB model instance + ''' - ''' - Background task to delete generic DB model instance - ''' - - # try: + # try: - # get model - m = getattr(models, instance_model, None) + # get model + m = getattr(models, instance_model, None) - if m: + if m: - # get model instance - i = m.objects.get(pk=int(instance_id)) - logger.info('retrieved %s model, instance ID %s, deleting' % (m.__name__, instance_id)) + # get model instance + i = m.objects.get(pk=int(instance_id)) + logger.info('retrieved %s model, instance ID %s, deleting' % (m.__name__, instance_id)) - # delete - return i.delete() + # delete + return i.delete() - else: - logger.info('Combine model %s not found, aborting' % (instance_model)) + else: + logger.info('Combine model %s not found, aborting' % (instance_model)) @celery_app.task() def download_and_index_bulk_data(dbdd_id): + ''' + Background task driver to manage downloading and indexing of bulk data - ''' - Background task driver to manage downloading and indexing of bulk data + Args: + dbdd_id (int): ID of DPLABulkDataDownload (dbdd) instance + ''' - Args: - dbdd_id (int): ID of DPLABulkDataDownload (dbdd) instance - ''' + # init bulk download instance + dbdd = models.DPLABulkDataDownload.objects.get(pk=dbdd_id) - # init bulk download instance - dbdd = models.DPLABulkDataDownload.objects.get(pk=dbdd_id) + # init data client with filepath + dbdc = models.DPLABulkDataClient() - # init data client with filepath - dbdc = models.DPLABulkDataClient() + # download data + logger.info('downloading %s' % dbdd.s3_key) + dbdd.status = 'downloading' + dbdd.save() + download_results = dbdc.download_bulk_data(dbdd.s3_key, dbdd.filepath) - # download data - logger.info('downloading %s' % dbdd.s3_key) - dbdd.status = 'downloading' - dbdd.save() - download_results = dbdc.download_bulk_data(dbdd.s3_key, dbdd.filepath) + # index data + logger.info('indexing %s' % dbdd.filepath) + dbdd.status = 'indexing' + dbdd.save() + es_index = dbdc.index_to_es(dbdd.s3_key, dbdd.filepath) - # index data - logger.info('indexing %s' % dbdd.filepath) - dbdd.status = 'indexing' - dbdd.save() - es_index = dbdc.index_to_es(dbdd.s3_key, dbdd.filepath) - - # update and return - dbdd.es_index = es_index - dbdd.status = 'finished' - dbdd.save() + # update and return + dbdd.es_index = es_index + dbdd.status = 'finished' + dbdd.save() @celery_app.task() def create_validation_report(ct_id): + ''' + Function to generate a Validation Report for a Job as a bg task - ''' - Function to generate a Validation Report for a Job as a bg task - - Args: - request (django.request): request object with parameters needed for report generation + Args: + request 
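# Sketch of the submit-and-poll pattern used by these background tasks, written directly
# against the Livy REST API that models.LivyClient is assumed to wrap. The Livy URL, session
# id, and code string are placeholders; the completion check is the same 'available' test as
# spark_job_done above.
import polling
import requests

LIVY_URL = 'http://localhost:8998'      # assumed Livy endpoint
SESSION_ID = 0                          # assumed existing, idle Livy session

submit = requests.post('%s/sessions/%s/statements' % (LIVY_URL, SESSION_ID),
                       json={'code': 'print("hello from Spark")'})
statement_url = '%s%s' % (LIVY_URL, submit.headers['Location'])

results = polling.poll(lambda: requests.get(statement_url).json(),
                       check_success=lambda response: response['state'] == 'available',
                       step=5, poll_forever=True)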
(django.request): request object with parameters needed for report generation - Returns: - location on disk - ''' + Returns: + location on disk + ''' - # get CombineTask (ct) - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + # get CombineTask (ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - logger.info(ct.task_params) + logger.info(ct.task_params) - try: + try: - # check for livy session - _check_livy_session() + # check for livy session + _check_livy_session() - # set output path - output_path = '/tmp/%s' % uuid.uuid4().hex + # set output path + output_path = '/tmp/%s' % uuid.uuid4().hex - # generate spark code - spark_code = "from console import *\ngenerate_validation_report(spark, '%(output_path)s', %(task_params)s)" % { - 'output_path':output_path, - 'task_params':ct.task_params - } - logger.info(spark_code) + # generate spark code + spark_code = "from console import *\ngenerate_validation_report(spark, '%(output_path)s', %(task_params)s)" % { + 'output_path': output_path, + 'task_params': ct.task_params + } + logger.info(spark_code) - # submit to livy - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) + # submit to livy + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - logger.info(results) + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) - # set archive filename of loose XML files - archive_filename_root = '/tmp/%s.%s' % (ct.task_params['report_name'],ct.task_params['report_format']) + # set archive filename of loose XML files + archive_filename_root = '/tmp/%s.%s' % (ct.task_params['report_name'], ct.task_params['report_format']) - # loop through partitioned parts, coalesce and write to single file - logger.info('coalescing output parts') + # loop through partitioned parts, coalesce and write to single file + logger.info('coalescing output parts') - # glob parts - export_parts = glob.glob('%s/part*' % output_path) - logger.info('found %s documents to group' % len(export_parts)) + # glob parts + export_parts = glob.glob('%s/part*' % output_path) + logger.info('found %s documents to group' % len(export_parts)) - # if output not found, exit - if len(export_parts) == 0: - ct.task_output_json = json.dumps({ - 'error':'no output found', - 'spark_output':results - }) - ct.save() + # if output not found, exit + if len(export_parts) == 0: + ct.task_output_json = json.dumps({ + 'error': 'no output found', + 'spark_output': results + }) + ct.save() - # else, continue - else: + # else, continue + else: - # set report_format - report_format = ct.task_params['report_format'] + # set report_format + report_format = ct.task_params['report_format'] - # open new file for writing and loop through files - with open(archive_filename_root, 'w') as fout, 
fileinput.input(export_parts) as fin: + # open new file for writing and loop through files + with open(archive_filename_root, 'w') as fout, fileinput.input(export_parts) as fin: - # if CSV or TSV, write first line of headers - if report_format == 'csv': - header_string = 'db_id,record_id,validation_scenario_id,validation_scenario_name,results_payload,fail_count' - if len(ct.task_params['mapped_field_include']) > 0: - header_string += ',' + ','.join(ct.task_params['mapped_field_include']) - fout.write('%s\n' % header_string) + # if CSV or TSV, write first line of headers + if report_format == 'csv': + header_string = 'db_id,record_id,validation_scenario_id,validation_scenario_name,results_payload,fail_count' + if len(ct.task_params['mapped_field_include']) > 0: + header_string += ',' + ','.join(ct.task_params['mapped_field_include']) + fout.write('%s\n' % header_string) - if report_format == 'tsv': - header_string = 'db_id\trecord_id\tvalidation_scenario_id\tvalidation_scenario_name\tresults_payload\tfail_count' - if len(ct.task_params['mapped_field_include']) > 0: - header_string += '\t' + '\t'.join(ct.task_params['mapped_field_include']) - fout.write('%s\n' % header_string) + if report_format == 'tsv': + header_string = 'db_id\trecord_id\tvalidation_scenario_id\tvalidation_scenario_name\tresults_payload\tfail_count' + if len(ct.task_params['mapped_field_include']) > 0: + header_string += '\t' + '\t'.join(ct.task_params['mapped_field_include']) + fout.write('%s\n' % header_string) - # loop through output and write - for line in fin: - fout.write(line) + # loop through output and write + for line in fin: + fout.write(line) - # removing partitioned output - logger.info('removing dir: %s' % output_path) - shutil.rmtree(output_path) + # removing partitioned output + logger.info('removing dir: %s' % output_path) + shutil.rmtree(output_path) - # optionally, compress file - if ct.task_params['compression_type'] == 'none': - logger.info('no compression requested, continuing') - output_filename = archive_filename_root + # optionally, compress file + if ct.task_params['compression_type'] == 'none': + logger.info('no compression requested, continuing') + output_filename = archive_filename_root - elif ct.task_params['compression_type'] == 'zip': + elif ct.task_params['compression_type'] == 'zip': - logger.info('creating compressed zip archive') - report_format = 'zip' + logger.info('creating compressed zip archive') + report_format = 'zip' - # establish output archive file - output_filename = '%s.zip' % (archive_filename_root) + # establish output archive file + output_filename = '%s.zip' % (archive_filename_root) - with zipfile.ZipFile(output_filename,'w', zipfile.ZIP_DEFLATED) as zip: - zip.write(archive_filename_root, archive_filename_root.split('/')[-1]) + with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zip: + zip.write(archive_filename_root, archive_filename_root.split('/')[-1]) - # tar.gz - elif ct.task_params['compression_type'] == 'targz': + # tar.gz + elif ct.task_params['compression_type'] == 'targz': - logger.info('creating compressed tar archive') - report_format = 'targz' + logger.info('creating compressed tar archive') + report_format = 'targz' - # establish output archive file - output_filename = '%s.tar.gz' % (archive_filename_root) + # establish output archive file + output_filename = '%s.tar.gz' % (archive_filename_root) - with tarfile.open(output_filename, 'w:gz') as tar: - tar.add(archive_filename_root, arcname=archive_filename_root.split('/')[-1]) + with 
tarfile.open(output_filename, 'w:gz') as tar: + tar.add(archive_filename_root, arcname=archive_filename_root.split('/')[-1]) - # save validation report output to Combine Task output - ct.task_output_json = json.dumps({ - 'report_format':report_format, - 'mapped_field_include':ct.task_params['mapped_field_include'], - 'output_dir':output_path, - 'output_filename':output_filename, - 'results':results - }) - ct.save() + # save validation report output to Combine Task output + ct.task_output_json = json.dumps({ + 'report_format': report_format, + 'mapped_field_include': ct.task_params['mapped_field_include'], + 'output_dir': output_path, + 'output_filename': output_filename, + 'results': results + }) + ct.save() - except Exception as e: + except Exception as e: - logger.info(str(e)) + logger.info(str(e)) - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def export_mapped_fields(ct_id): + # get CombineTask (ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + + try: + + # JSON export + if ct.task_params['mapped_fields_export_type'] == 'json': + + # handle single Job + if 'job_id' in ct.task_params.keys(): + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + + # set output filename + output_path = '/tmp/%s' % uuid.uuid4().hex + os.mkdir(output_path) + export_output = '%s/job_%s_mapped_fields.json' % (output_path, cjob.job.id) + + # build command list + cmd = [ + "elasticdump", + "--input=http://%s:9200/j%s" % (settings.ES_HOST, cjob.job.id), + "--output=%s" % export_output, + "--type=data", + "--sourceOnly", + "--ignore-errors", + "--noRefresh" + ] + + # handle published records + if 'published' in ct.task_params.keys(): + # set output filename + output_path = '/tmp/%s' % uuid.uuid4().hex + os.mkdir(output_path) + export_output = '%s/published_mapped_fields.json' % (output_path) + + # get list of jobs ES indices to export + pr = models.PublishedRecords(subset=ct.task_params['subset']) + es_list = ','.join(['j%s' % job.id for job in pr.published_jobs]) + + # build command list + cmd = [ + "elasticdump", + "--input=http://%s:9200/%s" % (settings.ES_HOST, es_list), + "--output=%s" % export_output, + "--type=data", + "--sourceOnly", + "--ignore-errors", + "--noRefresh" + ] + + # if fields provided, limit + if ct.task_params['mapped_field_include']: + logger.info('specific fields selected, adding to elasticdump command:') + searchBody = { + "_source": ct.task_params['mapped_field_include'] + } + cmd.append("--searchBody='%s'" % json.dumps(searchBody)) + + # CSV export + if ct.task_params['mapped_fields_export_type'] == 'csv': + + # handle single Job + if 'job_id' in ct.task_params.keys(): + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + + # set output filename + output_path = '/tmp/%s' % uuid.uuid4().hex + os.mkdir(output_path) + export_output = '%s/job_%s_mapped_fields.csv' % (output_path, cjob.job.id) + + # build command list + cmd = [ + "es2csv", + "-u http://%s:9200" % settings.ES_HOST, + "-q '*'", + "-i 'j%s'" % cjob.job.id, + "-D 'record'", + "-o '%s'" % export_output + ] + + # handle published records + if 'published' in ct.task_params.keys(): + # set output filename + output_path = '/tmp/%s' % uuid.uuid4().hex + os.mkdir(output_path) + export_output = '%s/published_mapped_fields.csv' 
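# Illustration of the --searchBody argument appended above when specific mapped fields are
# selected for an elasticdump export; the field names here are placeholders.
import json

search_body = {"_source": ["dc_title", "dc_identifier"]}
print("--searchBody='%s'" % json.dumps(search_body))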
% (output_path) + + # get list of jobs ES indices to export + pr = models.PublishedRecords(subset=ct.task_params['subset']) + es_list = ','.join(['j%s' % job.id for job in pr.published_jobs]) + + # build command list + cmd = [ + "es2csv", + "-u http://%s:9200" % settings.ES_HOST, + "-q '*'", + "-i '%s'" % es_list, + "-D 'record'", + "-o '%s'" % export_output + ] + + # handle kibana style + if ct.task_params['kibana_style']: + cmd.append('-k') + cmd.append("-kd '|'") + + # if fields provided, limit + if ct.task_params['mapped_field_include']: + logger.info('specific fields selected, adding to es2csv command:') + cmd.append('-f ' + " ".join(["'%s'" % field for field in ct.task_params['mapped_field_include']])) + + # execute compiled command + logger.info(cmd) + os.system(" ".join(cmd)) + + # handle compression + if ct.task_params['archive_type'] == 'none': + logger.info('uncompressed csv file requested, continuing') + + elif ct.task_params['archive_type'] == 'zip': + + logger.info('creating compressed zip archive') + content_type = 'application/zip' + + # establish output archive file + export_output_archive = '%s/%s.zip' % (output_path, export_output.split('/')[-1]) + + with zipfile.ZipFile(export_output_archive, 'w', zipfile.ZIP_DEFLATED) as zip: + zip.write(export_output, export_output.split('/')[-1]) + + # set export output to archive file + export_output = export_output_archive + + # tar.gz + elif ct.task_params['archive_type'] == 'targz': + + logger.info('creating compressed tar archive') + content_type = 'application/gzip' + + # establish output archive file + export_output_archive = '%s/%s.tar.gz' % (output_path, export_output.split('/')[-1]) + + with tarfile.open(export_output_archive, 'w:gz') as tar: + tar.add(export_output, arcname=export_output.split('/')[-1]) + + # set export output to archive file + export_output = export_output_archive + + # handle s3 bucket + if ct.task_params.get('s3_export', False): + + logger.debug('writing archive file to S3') + + # upload to s3 + s3 = boto3.resource('s3', + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) + s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key']) \ + .put(Body=open(export_output, 'rb')) + + # delete all traces from local output + shutil.rmtree(output_path) + + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 's3_export_type': ct.task_params['s3_export_type'], + 'export_output': 's3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')), + }) + ct.save() + logger.info(ct.task_output_json) + + # handle local filesystem + else: + + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'export_output': export_output, + 'name': export_output.split('/')[-1], + 'export_dir': "/".join(export_output.split('/')[:-1]) + }) + ct.save() - # get CombineTask (ct) - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - - try: - - # JSON export - if ct.task_params['mapped_fields_export_type'] == 'json': - - # handle single Job - if 'job_id' in ct.task_params.keys(): - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - - # set output filename - output_path = '/tmp/%s' % uuid.uuid4().hex - os.mkdir(output_path) - export_output = '%s/job_%s_mapped_fields.json' % (output_path, cjob.job.id) - - # build command list - cmd = [ - "elasticdump", - "--input=http://%s:9200/j%s" % (settings.ES_HOST, 
cjob.job.id), - "--output=%s" % export_output, - "--type=data", - "--sourceOnly", - "--ignore-errors", - "--noRefresh" - ] - - # handle published records - if 'published' in ct.task_params.keys(): - - # set output filename - output_path = '/tmp/%s' % uuid.uuid4().hex - os.mkdir(output_path) - export_output = '%s/published_mapped_fields.json' % (output_path) - - # get list of jobs ES indices to export - pr = models.PublishedRecords(subset=ct.task_params['subset']) - es_list = ','.join(['j%s' % job.id for job in pr.published_jobs]) - - # build command list - cmd = [ - "elasticdump", - "--input=http://%s:9200/%s" % (settings.ES_HOST, es_list), - "--output=%s" % export_output, - "--type=data", - "--sourceOnly", - "--ignore-errors", - "--noRefresh" - ] - - # if fields provided, limit - if ct.task_params['mapped_field_include']: - logger.info('specific fields selected, adding to elasticdump command:') - searchBody = { - "_source":ct.task_params['mapped_field_include'] - } - cmd.append("--searchBody='%s'" % json.dumps(searchBody)) - - - # CSV export - if ct.task_params['mapped_fields_export_type'] == 'csv': - - # handle single Job - if 'job_id' in ct.task_params.keys(): - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - - # set output filename - output_path = '/tmp/%s' % uuid.uuid4().hex - os.mkdir(output_path) - export_output = '%s/job_%s_mapped_fields.csv' % (output_path, cjob.job.id) - - # build command list - cmd = [ - "es2csv", - "-u http://%s:9200" % settings.ES_HOST, - "-q '*'", - "-i 'j%s'" % cjob.job.id, - "-D 'record'", - "-o '%s'" % export_output - ] - - # handle published records - if 'published' in ct.task_params.keys(): - - # set output filename - output_path = '/tmp/%s' % uuid.uuid4().hex - os.mkdir(output_path) - export_output = '%s/published_mapped_fields.csv' % (output_path) - - # get list of jobs ES indices to export - pr = models.PublishedRecords(subset=ct.task_params['subset']) - es_list = ','.join(['j%s' % job.id for job in pr.published_jobs]) - - # build command list - cmd = [ - "es2csv", - "-u http://%s:9200" % settings.ES_HOST, - "-q '*'", - "-i '%s'" % es_list, - "-D 'record'", - "-o '%s'" % export_output - ] - - # handle kibana style - if ct.task_params['kibana_style']: - cmd.append('-k') - cmd.append("-kd '|'") - - # if fields provided, limit - if ct.task_params['mapped_field_include']: - logger.info('specific fields selected, adding to es2csv command:') - cmd.append('-f ' + " ".join(["'%s'" % field for field in ct.task_params['mapped_field_include']])) - - # execute compiled command - logger.info(cmd) - os.system(" ".join(cmd)) - - # handle compression - if ct.task_params['archive_type'] == 'none': - logger.info('uncompressed csv file requested, continuing') - - elif ct.task_params['archive_type'] == 'zip': - - logger.info('creating compressed zip archive') - content_type = 'application/zip' - - # establish output archive file - export_output_archive = '%s/%s.zip' % (output_path, export_output.split('/')[-1]) - - with zipfile.ZipFile(export_output_archive,'w', zipfile.ZIP_DEFLATED) as zip: - zip.write(export_output, export_output.split('/')[-1]) - - # set export output to archive file - export_output = export_output_archive - - # tar.gz - elif ct.task_params['archive_type'] == 'targz': - - logger.info('creating compressed tar archive') - content_type = 'application/gzip' - - # establish output archive file - export_output_archive = '%s/%s.tar.gz' % (output_path, export_output.split('/')[-1]) - - with 
tarfile.open(export_output_archive, 'w:gz') as tar: - tar.add(export_output, arcname=export_output.split('/')[-1]) - - # set export output to archive file - export_output = export_output_archive - - # handle s3 bucket - if ct.task_params.get('s3_export', False): - - logger.debug('writing archive file to S3') - - # upload to s3 - s3 = boto3.resource('s3', - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) - s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\ - .put(Body=open(export_output,'rb')) - - # delete all traces from local output - shutil.rmtree(output_path) - - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 's3_export_type':ct.task_params['s3_export_type'], - 'export_output':'s3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')), - }) - ct.save() - logger.info(ct.task_output_json) - - # handle local filesystem - else: - - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'export_output':export_output, - 'name':export_output.split('/')[-1], - 'export_dir':"/".join(export_output.split('/')[:-1]) - }) - ct.save() + except Exception as e: - except Exception as e: + logger.info(str(e)) - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def export_tabular_data(ct_id): + # get CombineTask (ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - # get CombineTask (ct) - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - - # generate spark code - output_path = '/tmp/%s' % str(uuid.uuid4()) + # generate spark code + output_path = '/tmp/%s' % str(uuid.uuid4()) - # handle single Job - if 'job_id' in ct.task_params.keys(): + # handle single Job + if 'job_id' in ct.task_params.keys(): + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + # set archive filename of loose XML files + archive_filename_root = 'j_%s_tabular_data' % cjob.job.id - # set archive filename of loose XML files - archive_filename_root = 'j_%s_tabular_data' % cjob.job.id + # build job_dictionary + job_dict = {'j%s' % cjob.job.id: [cjob.job.id]} + logger.info(job_dict) - # build job_dictionary - job_dict = {'j%s' % cjob.job.id: [cjob.job.id]} - logger.info(job_dict) + # handle published records + if 'published' in ct.task_params.keys(): - # handle published records - if 'published' in ct.task_params.keys(): + # set archive filename of loose XML files + archive_filename_root = 'published_tabular_data' - # set archive filename of loose XML files - archive_filename_root = 'published_tabular_data' + # get anonymous CombineJob + cjob = models.CombineJob() - # get anonymous CombineJob - cjob = models.CombineJob() + # get published records to determine sets + pr = models.PublishedRecords(subset=ct.task_params['subset']) - # get published records to determine sets - pr = models.PublishedRecords(subset=ct.task_params['subset']) + # init job dictionary + job_dict = {} - # init job dictionary - job_dict = {} + # handle published jobs with publish set ids + for publish_id, jobs in pr.sets.items(): + job_dict[publish_id] = [job.id for job in jobs] - # 
handle published jobs with publish set ids - for publish_id, jobs in pr.sets.items(): - job_dict[publish_id] = [ job.id for job in jobs ] + # handle "loose" Jobs + job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')] - # handle "loose" Jobs - job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')] + # update task params + ct.refresh_from_db() + ct.update_task_params({ + 'output_path': output_path, + 'archive_filename_root': archive_filename_root, + 'job_dict': job_dict + }) - # update task params - ct.refresh_from_db() - ct.update_task_params({ - 'output_path':output_path, - 'archive_filename_root':archive_filename_root, - 'job_dict':job_dict - }) + # prepare spark code + spark_code = "from console import *\nexport_records_as_tabular_data(spark, %d)" % (int(ct_id)) + logger.info(spark_code) - # prepare spark code - spark_code = "from console import *\nexport_records_as_tabular_data(spark, %d)" % (int(ct_id)) - logger.info(spark_code) + # submit spark code to livy + try: - # submit spark code to livy - try: + # check for livy session + _check_livy_session() - # check for livy session - _check_livy_session() + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - logger.info(results) + # handle s3 bucket + if ct.task_params.get('s3_export', False): - # handle s3 bucket - if ct.task_params.get('s3_export', False): + if ct.task_params.get('s3_export_type') == 'archive': - if ct.task_params.get('s3_export_type') == 'archive': + # create single archive file + ct = _create_export_tabular_data_archive(ct) - # create single archive file - ct = _create_export_tabular_data_archive(ct) + # upload to s3 + s3 = boto3.resource('s3', + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) + s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key']) \ + .put(Body=open(ct.task_params['export_output_archive'], 'rb')) - # upload to s3 - s3 = boto3.resource('s3', - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) - s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\ - .put(Body=open(ct.task_params['export_output_archive'],'rb')) + # delete all traces from local output + shutil.rmtree(ct.task_params['output_path']) - # delete all traces from local output - shutil.rmtree(ct.task_params['output_path']) + elif ct.task_params.get('s3_export_type') == 'spark_df': + logger.debug('s3 export type was spark_df, nothing to cleanup or do') - elif ct.task_params.get('s3_export_type') == 'spark_df': - logger.debug('s3 export type was spark_df, nothing to cleanup or do') + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 's3_export_type': 
ct.task_params['s3_export_type'], + 'export_output': 's3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')), + }) + ct.save() + logger.info(ct.task_output_json) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 's3_export_type':ct.task_params['s3_export_type'], - 'export_output':'s3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')), - }) - ct.save() - logger.info(ct.task_output_json) + # handle local filesystem + else: - # handle local filesystem - else: + # create single archive file + ct = _create_export_tabular_data_archive(ct) - # create single archive file - ct = _create_export_tabular_data_archive(ct) + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'export_output': ct.task_params['export_output_archive'], + 'name': ct.task_params['export_output_archive'].split('/')[-1], + 'content_type': ct.task_params['content_type'], + 'export_dir': "/".join(ct.task_params['export_output_archive'].split('/')[:-1]) + }) + ct.save() + logger.info(ct.task_output_json) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'export_output':ct.task_params['export_output_archive'], - 'name':ct.task_params['export_output_archive'].split('/')[-1], - 'content_type':ct.task_params['content_type'], - 'export_dir':"/".join(ct.task_params['export_output_archive'].split('/')[:-1]) - }) - ct.save() - logger.info(ct.task_output_json) + except Exception as e: - except Exception as e: + logger.info(str(e)) - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() def _create_export_tabular_data_archive(ct): + # rewrite with extensions + export_parts = glob.glob('%s/**/part*' % ct.task_params['output_path']) + for part in export_parts: + if not part.endswith(ct.task_params['tabular_data_export_type']): + os.rename(part, '%s.%s' % (part, ct.task_params['tabular_data_export_type'])) - # rewrite with extensions - export_parts = glob.glob('%s/**/part*' % ct.task_params['output_path']) - for part in export_parts: - if not part.endswith(ct.task_params['tabular_data_export_type']): - os.rename(part, '%s.%s' % (part, ct.task_params['tabular_data_export_type'])) - - # save list of directories to remove - pre_archive_dirs = glob.glob('%s/**' % ct.task_params['output_path']) + # save list of directories to remove + pre_archive_dirs = glob.glob('%s/**' % ct.task_params['output_path']) - # zip - if ct.task_params['archive_type'] == 'zip': + # zip + if ct.task_params['archive_type'] == 'zip': - logger.info('creating compressed zip archive') - content_type = 'application/zip' + logger.info('creating compressed zip archive') + content_type = 'application/zip' - # establish output archive file - export_output_archive = '%s/%s.zip' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) + # establish output archive file + export_output_archive = '%s/%s.zip' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) - with zipfile.ZipFile(export_output_archive,'w', zipfile.ZIP_DEFLATED) as zip: - for f in glob.glob('%s/**/*.%s' % (ct.task_params['output_path'], ct.task_params['tabular_data_export_type'])): - zip.write(f, '/'.join(f.split('/')[-2:])) + with 
zipfile.ZipFile(export_output_archive, 'w', zipfile.ZIP_DEFLATED) as zip: + for f in glob.glob( + '%s/**/*.%s' % (ct.task_params['output_path'], ct.task_params['tabular_data_export_type'])): + zip.write(f, '/'.join(f.split('/')[-2:])) - # tar - elif ct.task_params['archive_type'] == 'tar': + # tar + elif ct.task_params['archive_type'] == 'tar': - logger.info('creating uncompressed tar archive') - content_type = 'application/tar' + logger.info('creating uncompressed tar archive') + content_type = 'application/tar' - # establish output archive file - export_output_archive = '%s/%s.tar' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) + # establish output archive file + export_output_archive = '%s/%s.tar' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) - with tarfile.open(export_output_archive, 'w') as tar: - for f in glob.glob('%s/**/*.%s' % (ct.task_params['output_path'], ct.task_params['tabular_data_export_type'])): - tar.add(f, arcname='/'.join(f.split('/')[-2:])) + with tarfile.open(export_output_archive, 'w') as tar: + for f in glob.glob( + '%s/**/*.%s' % (ct.task_params['output_path'], ct.task_params['tabular_data_export_type'])): + tar.add(f, arcname='/'.join(f.split('/')[-2:])) - # tar.gz - elif ct.task_params['archive_type'] == 'targz': + # tar.gz + elif ct.task_params['archive_type'] == 'targz': - logger.info('creating compressed tar archive') - content_type = 'application/gzip' + logger.info('creating compressed tar archive') + content_type = 'application/gzip' - # establish output archive file - export_output_archive = '%s/%s.tar.gz' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) + # establish output archive file + export_output_archive = '%s/%s.tar.gz' % ( + ct.task_params['output_path'], ct.task_params['archive_filename_root']) - with tarfile.open(export_output_archive, 'w:gz') as tar: - for f in glob.glob('%s/**/*.%ss' % (ct.task_params['output_path'], ct.task_params['tabular_data_export_type'])): - tar.add(f, arcname='/'.join(f.split('/')[-2:])) + with tarfile.open(export_output_archive, 'w:gz') as tar: + for f in glob.glob( + '%s/**/*.%ss' % (ct.task_params['output_path'], ct.task_params['tabular_data_export_type'])): + tar.add(f, arcname='/'.join(f.split('/')[-2:])) - # cleanup directory - for d in pre_archive_dirs: - logger.info('removing dir: %s' % d) - shutil.rmtree(d) + # cleanup directory + for d in pre_archive_dirs: + logger.info('removing dir: %s' % d) + shutil.rmtree(d) - # update task params - ct.refresh_from_db() - ct.update_task_params({ - 'export_output_archive':export_output_archive, - 'content_type':content_type - }) + # update task params + ct.refresh_from_db() + ct.update_task_params({ + 'export_output_archive': export_output_archive, + 'content_type': content_type + }) - # return - return ct + # return + return ct @celery_app.task() def export_documents(ct_id): + ''' + - submit livy job and poll until complete + - use livy session from cjob (works, but awkward way to get this) + - add wrapper element to file parts + - rename file parts + - tar/zip together + ''' - ''' - - submit livy job and poll until complete - - use livy session from cjob (works, but awkward way to get this) - - add wrapper element to file parts - - rename file parts - - tar/zip together - ''' - - # get CombineBackgroundTask - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) - - # generate spark code - output_path = '/tmp/%s' % str(uuid.uuid4()) - - # handle single 
Job - if 'job_id' in ct.task_params.keys(): - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + # get CombineBackgroundTask + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # set archive filename of loose XML files - archive_filename_root = 'j_%s_documents' % cjob.job.id + # generate spark code + output_path = '/tmp/%s' % str(uuid.uuid4()) - # build job_dictionary - job_dict = {'j%s' % cjob.job.id: [cjob.job.id]} - logger.info(job_dict) + # handle single Job + if 'job_id' in ct.task_params.keys(): + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - # handle published records - if 'published' in ct.task_params.keys(): + # set archive filename of loose XML files + archive_filename_root = 'j_%s_documents' % cjob.job.id - # set archive filename of loose XML files - archive_filename_root = 'published_documents' + # build job_dictionary + job_dict = {'j%s' % cjob.job.id: [cjob.job.id]} + logger.info(job_dict) + + # handle published records + if 'published' in ct.task_params.keys(): - # get anonymous CombineJob - cjob = models.CombineJob() + # set archive filename of loose XML files + archive_filename_root = 'published_documents' - # get published records to determine sets - pr = models.PublishedRecords(subset=ct.task_params['subset']) + # get anonymous CombineJob + cjob = models.CombineJob() + + # get published records to determine sets + pr = models.PublishedRecords(subset=ct.task_params['subset']) - # init job dictionary - job_dict = {} + # init job dictionary + job_dict = {} - # handle published jobs with publish set ids - for publish_id, jobs in pr.sets.items(): - job_dict[publish_id] = [ job.id for job in jobs ] + # handle published jobs with publish set ids + for publish_id, jobs in pr.sets.items(): + job_dict[publish_id] = [job.id for job in jobs] - # handle "loose" Jobs - job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')] + # handle "loose" Jobs + job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')] - # debug - logger.info(job_dict) + # debug + logger.info(job_dict) - # update task params - ct.refresh_from_db() - ct.update_task_params({ - 'output_path':output_path, - 'archive_filename_root':archive_filename_root, - 'job_dict':job_dict - }) + # update task params + ct.refresh_from_db() + ct.update_task_params({ + 'output_path': output_path, + 'archive_filename_root': archive_filename_root, + 'job_dict': job_dict + }) - # prepare spark code - spark_code = "import math,uuid\nfrom console import *\nexport_records_as_xml(spark, %d)" % (int(ct_id)) - logger.info(spark_code) + # prepare spark code + spark_code = "import math,uuid\nfrom console import *\nexport_records_as_xml(spark, %d)" % (int(ct_id)) + logger.info(spark_code) - try: + try: - # check for livy session - _check_livy_session() + # check for livy session + _check_livy_session() - # submit to livy - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) + # submit to livy + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - 
logger.info(results) + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) - # handle s3 bucket - if ct.task_params.get('s3_export', False): + # handle s3 bucket + if ct.task_params.get('s3_export', False): - if ct.task_params.get('s3_export_type') == 'archive': + if ct.task_params.get('s3_export_type') == 'archive': - logger.debug('writing archive file to S3') + logger.debug('writing archive file to S3') - # create single archive file - ct = _create_export_documents_archive(ct) + # create single archive file + ct = _create_export_documents_archive(ct) - # upload to s3 - s3 = boto3.resource('s3', - aws_access_key_id=settings.AWS_ACCESS_KEY_ID, - aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) - s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key'])\ - .put(Body=open(ct.task_params['export_output_archive'],'rb')) + # upload to s3 + s3 = boto3.resource('s3', + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY) + s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key']) \ + .put(Body=open(ct.task_params['export_output_archive'], 'rb')) - # delete all traces from local output - shutil.rmtree(ct.task_params['output_path']) + # delete all traces from local output + shutil.rmtree(ct.task_params['output_path']) - elif ct.task_params.get('s3_export_type') == 'spark_df': - logger.debug('s3 export type was spark_df, nothing to cleanup or do') + elif ct.task_params.get('s3_export_type') == 'spark_df': + logger.debug('s3 export type was spark_df, nothing to cleanup or do') - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 's3_export_type':ct.task_params['s3_export_type'], - 'export_output':'s3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')), - }) - ct.save() - logger.info(ct.task_output_json) + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 's3_export_type': ct.task_params['s3_export_type'], + 'export_output': 's3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')), + }) + ct.save() + logger.info(ct.task_output_json) - # handle local filesystem - else: + # handle local filesystem + else: - # create single archive file - ct = _create_export_documents_archive(ct) + # create single archive file + ct = _create_export_documents_archive(ct) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'export_output':ct.task_params['export_output_archive'], - 'name':ct.task_params['export_output_archive'].split('/')[-1], - 'content_type':ct.task_params['content_type'], - 'export_dir':"/".join(ct.task_params['export_output_archive'].split('/')[:-1]) - }) - ct.save() - logger.info(ct.task_output_json) + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'export_output': ct.task_params['export_output_archive'], + 'name': ct.task_params['export_output_archive'].split('/')[-1], + 'content_type': ct.task_params['content_type'], + 'export_dir': "/".join(ct.task_params['export_output_archive'].split('/')[:-1]) + }) + ct.save() + logger.info(ct.task_output_json) - except Exception as e: + except Exception as e: - logger.info(str(e)) + logger.info(str(e)) - # attempt to 
capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() def _create_export_documents_archive(ct): + # loop through parts, group XML docs with rool XML element, and save as new XML file + logger.info('grouping documents in XML files') - # loop through parts, group XML docs with rool XML element, and save as new XML file - logger.info('grouping documents in XML files') + export_parts = glob.glob('%s/**/part*' % ct.task_params['output_path']) + logger.info('found %s documents to write as XML' % len(export_parts)) + for part in export_parts: + with open('%s.xml' % part, 'w') as f: + f.write('') + with open(part) as f_part: + f.write(f_part.read()) + f.write('') - export_parts = glob.glob('%s/**/part*' % ct.task_params['output_path']) - logger.info('found %s documents to write as XML' % len(export_parts)) - for part in export_parts: - with open('%s.xml' % part, 'w') as f: - f.write('') - with open(part) as f_part: - f.write(f_part.read()) - f.write('') + # save list of directories to remove + pre_archive_dirs = glob.glob('%s/**' % ct.task_params['output_path']) - # save list of directories to remove - pre_archive_dirs = glob.glob('%s/**' % ct.task_params['output_path']) + # zip + if ct.task_params['archive_type'] == 'zip': - # zip - if ct.task_params['archive_type'] == 'zip': + logger.info('creating compressed zip archive') + content_type = 'application/zip' - logger.info('creating compressed zip archive') - content_type = 'application/zip' + # establish output archive file + export_output_archive = '%s/%s.zip' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) - # establish output archive file - export_output_archive = '%s/%s.zip' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) + with zipfile.ZipFile(export_output_archive, 'w', zipfile.ZIP_DEFLATED) as zip: + for f in glob.glob('%s/**/*.xml' % ct.task_params['output_path']): + zip.write(f, '/'.join(f.split('/')[-2:])) - with zipfile.ZipFile(export_output_archive,'w', zipfile.ZIP_DEFLATED) as zip: - for f in glob.glob('%s/**/*.xml' % ct.task_params['output_path']): - zip.write(f, '/'.join(f.split('/')[-2:])) + # tar + elif ct.task_params['archive_type'] == 'tar': - # tar - elif ct.task_params['archive_type'] == 'tar': + logger.info('creating uncompressed tar archive') + content_type = 'application/tar' - logger.info('creating uncompressed tar archive') - content_type = 'application/tar' + # establish output archive file + export_output_archive = '%s/%s.tar' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) - # establish output archive file - export_output_archive = '%s/%s.tar' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) + with tarfile.open(export_output_archive, 'w') as tar: + for f in glob.glob('%s/**/*.xml' % ct.task_params['output_path']): + tar.add(f, arcname='/'.join(f.split('/')[-2:])) - with tarfile.open(export_output_archive, 'w') as tar: - for f in glob.glob('%s/**/*.xml' % ct.task_params['output_path']): - tar.add(f, arcname='/'.join(f.split('/')[-2:])) + # tar.gz + elif ct.task_params['archive_type'] == 'targz': - # tar.gz - elif ct.task_params['archive_type'] == 'targz': + logger.info('creating compressed tar archive') + content_type = 'application/gzip' - logger.info('creating compressed tar archive') - content_type = 'application/gzip' + # establish output 
archive file + export_output_archive = '%s/%s.tar.gz' % ( + ct.task_params['output_path'], ct.task_params['archive_filename_root']) - # establish output archive file - export_output_archive = '%s/%s.tar.gz' % (ct.task_params['output_path'], ct.task_params['archive_filename_root']) + with tarfile.open(export_output_archive, 'w:gz') as tar: + for f in glob.glob('%s/**/*.xml' % ct.task_params['output_path']): + tar.add(f, arcname='/'.join(f.split('/')[-2:])) - with tarfile.open(export_output_archive, 'w:gz') as tar: - for f in glob.glob('%s/**/*.xml' % ct.task_params['output_path']): - tar.add(f, arcname='/'.join(f.split('/')[-2:])) + # cleanup directory + for d in pre_archive_dirs: + logger.info('removing dir: %s' % d) + shutil.rmtree(d) - # cleanup directory - for d in pre_archive_dirs: - logger.info('removing dir: %s' % d) - shutil.rmtree(d) + # update task params + ct.refresh_from_db() + ct.update_task_params({ + 'export_output_archive': export_output_archive, + 'content_type': content_type + }) - # update task params - ct.refresh_from_db() - ct.update_task_params({ - 'export_output_archive':export_output_archive, - 'content_type':content_type - }) - - # return - return ct + # return + return ct @celery_app.task() def job_reindex(ct_id): + ''' - ''' - - Background tasks to re-index Job + Background tasks to re-index Job - - submit livy job and poll until complete - - use livy session from cjob (works, but awkward way to get this) - ''' + - submit livy job and poll until complete + - use livy session from cjob (works, but awkward way to get this) + ''' - # get CombineTask (ct) - try: + # get CombineTask (ct) + try: - # check for livy session - _check_livy_session() + # check for livy session + _check_livy_session() - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - # drop Job's ES index - cjob.job.drop_es_index(clear_mapped_field_analysis=False) + # drop Job's ES index + cjob.job.drop_es_index(clear_mapped_field_analysis=False) - # drop previous index mapping failures - cjob.job.remove_mapping_failures_from_db() + # drop previous index mapping failures + cjob.job.remove_mapping_failures_from_db() - # generate spark code - spark_code = 'from jobs import ReindexSparkPatch\nReindexSparkPatch(spark, job_id="%(job_id)s", fm_config_json=\'\'\'%(fm_config_json)s\'\'\').spark_function()' % { - 'job_id':cjob.job.id, - 'fm_config_json':ct.task_params['fm_config_json'] - } + # generate spark code + spark_code = 'from jobs import ReindexSparkPatch\nReindexSparkPatch(spark, job_id="%(job_id)s", fm_config_json=\'\'\'%(fm_config_json)s\'\'\').spark_function()' % { + 'job_id': cjob.job.id, + 'fm_config_json': ct.task_params['fm_config_json'] + } - # submit to livy - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) + # submit to livy + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - 
logger.info(results) + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) - # get new mapping - mapped_field_analysis = cjob.count_indexed_fields() - cjob.job.update_job_details({ - 'field_mapper_config':json.loads(ct.task_params['fm_config_json']), - 'mapped_field_analysis':mapped_field_analysis - }, save=True) + # get new mapping + mapped_field_analysis = cjob.count_indexed_fields() + cjob.job.update_job_details({ + 'field_mapper_config': json.loads(ct.task_params['fm_config_json']), + 'mapped_field_analysis': mapped_field_analysis + }, save=True) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'reindex_results':results - }) - ct.save() + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'reindex_results': results + }) + ct.save() - except Exception as e: + except Exception as e: - logger.info(str(e)) + logger.info(str(e)) - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def job_new_validations(ct_id): - - ''' - - submit livy job and poll until complete - - use livy session from cjob (works, but awkward way to get this) - ''' - - # get CombineTask (ct) - try: - - # check for livy session - _check_livy_session() - - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - - # generate spark code - spark_code = 'from jobs import RunNewValidationsSpark\nRunNewValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % { - 'job_id':cjob.job.id, - 'validation_scenarios':str([ int(vs_id) for vs_id in ct.task_params['validation_scenarios'] ]), - } - logger.info(spark_code) - - # submit to livy - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) - - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - logger.info(results) - - # loop through validation jobs, and remove from DB if share validation scenario - cjob.job.remove_validation_jobs(validation_scenarios=[ int(vs_id) for vs_id in ct.task_params['validation_scenarios'] ]) - - # update job_details - cjob.job.refresh_from_db() - # remove validation results - cjob.job.job_details = json.dumps({ k:v for k,v in cjob.job.job_details_dict.items() if k != 'validation_results' }) - cjob.job.save() - # update scenarios - validation_scenarios = cjob.job.job_details_dict['validation_scenarios'] - validation_scenarios.extend(ct.task_params['validation_scenarios']) - cjob.job.update_job_details({ - 'validation_scenarios':validation_scenarios - }, save=True) - - # write validation links - logger.info('writing validations job links') - for vs_id in ct.task_params['validation_scenarios']: - val_job = models.JobValidation( - job=cjob.job, - 
validation_scenario=models.ValidationScenario.objects.get(pk=vs_id) - ) - val_job.save() - - # update failure counts - logger.info('updating failure counts for new validation jobs') - for jv in cjob.job.jobvalidation_set.filter(failure_count=None): - jv.validation_failure_count(force_recount=True) - - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'run_new_validations':results - }) - ct.save() - - except Exception as e: - - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + ''' + - submit livy job and poll until complete + - use livy session from cjob (works, but awkward way to get this) + ''' + + # get CombineTask (ct) + try: + + # check for livy session + _check_livy_session() + + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) + + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + + # generate spark code + spark_code = 'from jobs import RunNewValidationsSpark\nRunNewValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % { + 'job_id': cjob.job.id, + 'validation_scenarios': str([int(vs_id) for vs_id in ct.task_params['validation_scenarios']]), + } + logger.info(spark_code) + + # submit to livy + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) + + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) + + # loop through validation jobs, and remove from DB if share validation scenario + cjob.job.remove_validation_jobs( + validation_scenarios=[int(vs_id) for vs_id in ct.task_params['validation_scenarios']]) + + # update job_details + cjob.job.refresh_from_db() + # remove validation results + cjob.job.job_details = json.dumps( + {k: v for k, v in cjob.job.job_details_dict.items() if k != 'validation_results'}) + cjob.job.save() + # update scenarios + validation_scenarios = cjob.job.job_details_dict['validation_scenarios'] + validation_scenarios.extend(ct.task_params['validation_scenarios']) + cjob.job.update_job_details({ + 'validation_scenarios': validation_scenarios + }, save=True) + + # write validation links + logger.info('writing validations job links') + for vs_id in ct.task_params['validation_scenarios']: + val_job = models.JobValidation( + job=cjob.job, + validation_scenario=models.ValidationScenario.objects.get(pk=vs_id) + ) + val_job.save() + + # update failure counts + logger.info('updating failure counts for new validation jobs') + for jv in cjob.job.jobvalidation_set.filter(failure_count=None): + jv.validation_failure_count(force_recount=True) + + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'run_new_validations': results + }) + ct.save() + + except Exception as e: + + logger.info(str(e)) + + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def job_remove_validation(ct_id): - - ''' - Task to remove a validation, and all failures, from a Job - ''' - - # get CombineTask (ct) - try: - - # check for livy session - _check_livy_session() - - ct = 
models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - - # get Job Validation and delete - jv = models.JobValidation.objects.get(pk=int(ct.task_params['jv_id'])) - - # delete validation failures associated with Validation Scenario and Job - delete_results = jv.delete_record_validation_failures() - - # update valid field in Records via Spark - # generate spark code - spark_code = 'from jobs import RemoveValidationsSpark\nRemoveValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % { - 'job_id':cjob.job.id, - 'validation_scenarios':str([ jv.validation_scenario.id ]), - } - logger.info(spark_code) - - # submit to livy - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) - - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - logger.info(results) - - # remove Job Validation from job_details - cjob.job.refresh_from_db() - # remove validation results - cjob.job.job_details = json.dumps({ k:v for k,v in cjob.job.job_details_dict.items() if k != 'validation_results' }) - cjob.job.save() - validation_scenarios = cjob.job.job_details_dict['validation_scenarios'] - if jv.validation_scenario.id in validation_scenarios: - validation_scenarios.remove(jv.validation_scenario.id) - cjob.job.update_job_details({ - 'validation_scenarios':validation_scenarios - }, save=True) - - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'delete_job_validation':str(jv), - 'validation_failures_removed_':delete_results - }) - ct.save() - - # remove job validation link - jv.delete() - - except Exception as e: - - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + ''' + Task to remove a validation, and all failures, from a Job + ''' + + # get CombineTask (ct) + try: + + # check for livy session + _check_livy_session() + + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) + + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + + # get Job Validation and delete + jv = models.JobValidation.objects.get(pk=int(ct.task_params['jv_id'])) + + # delete validation failures associated with Validation Scenario and Job + delete_results = jv.delete_record_validation_failures() + + # update valid field in Records via Spark + # generate spark code + spark_code = 'from jobs import RemoveValidationsSpark\nRemoveValidationsSpark(spark, job_id="%(job_id)s", validation_scenarios="%(validation_scenarios)s").spark_function()' % { + 'job_id': cjob.job.id, + 'validation_scenarios': str([jv.validation_scenario.id]), + } + logger.info(spark_code) + + # submit to livy + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) + + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) + + # remove Job 
Validation from job_details + cjob.job.refresh_from_db() + # remove validation results + cjob.job.job_details = json.dumps( + {k: v for k, v in cjob.job.job_details_dict.items() if k != 'validation_results'}) + cjob.job.save() + validation_scenarios = cjob.job.job_details_dict['validation_scenarios'] + if jv.validation_scenario.id in validation_scenarios: + validation_scenarios.remove(jv.validation_scenario.id) + cjob.job.update_job_details({ + 'validation_scenarios': validation_scenarios + }, save=True) + + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'delete_job_validation': str(jv), + 'validation_failures_removed_': delete_results + }) + ct.save() + + # remove job validation link + jv.delete() + + except Exception as e: + + logger.info(str(e)) + + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def job_publish(ct_id): + # get CombineTask (ct) + try: + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # get CombineTask (ct) - try: - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - # publish job - publish_results = cjob.job.publish(publish_set_id=ct.task_params['publish_set_id']) + # publish job + publish_results = cjob.job.publish(publish_set_id=ct.task_params['publish_set_id']) - # remove from published subsets - cjob.job.remove_from_published_precounts() + # remove from published subsets + cjob.job.remove_from_published_precounts() - # add publish_set_id to published subsets if present, and remove precount - for published_subset in ct.task_params['in_published_subsets']: - logger.debug('adding publish_set_id to Published Subset: %s' % published_subset) - pr = models.PublishedRecords(subset=published_subset) - pr.add_publish_set_id_to_subset(publish_set_id=ct.task_params['publish_set_id']) + # add publish_set_id to published subsets if present, and remove precount + for published_subset in ct.task_params['in_published_subsets']: + logger.debug('adding publish_set_id to Published Subset: %s' % published_subset) + pr = models.PublishedRecords(subset=published_subset) + pr.add_publish_set_id_to_subset(publish_set_id=ct.task_params['publish_set_id']) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'job_id':ct.task_params['job_id'], - 'publish_results':publish_results - }) - ct.save() + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'job_id': ct.task_params['job_id'], + 'publish_results': publish_results + }) + ct.save() - except Exception as e: + except Exception as e: - logger.info(str(e)) + logger.info(str(e)) - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def job_unpublish(ct_id): + # get CombineTask (ct) + try: + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # get CombineTask (ct) - try: - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) + # get 
CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + # publish job + unpublish_results = cjob.job.unpublish() - # publish job - unpublish_results = cjob.job.unpublish() + # REEVALUATE SUBSET HIERARCHY + # If job|# exists in any subset, remove - # REEVALUATE SUBSET HIERARCHY - # If job|# exists in any subset, remove + # remove from published subsets + cjob.job.remove_from_published_precounts() - # remove from published subsets - cjob.job.remove_from_published_precounts() + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'job_id': ct.task_params['job_id'], + 'unpublish_results': unpublish_results + }) + ct.save() - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'job_id':ct.task_params['job_id'], - 'unpublish_results':unpublish_results - }) - ct.save() + except Exception as e: - except Exception as e: + logger.info(str(e)) - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def job_dbdm(ct_id): - - # get CombineTask (ct) - try: - - # check for livy session - _check_livy_session() - - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) - - # get CombineJob - cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) - - # set dbdm as False for all Records in Job - clear_result = models.mc_handle.combine.record.update_many({'job_id':cjob.job.id},{'$set':{'dbdm':False}}, upsert=False) - - # generate spark code - spark_code = 'from jobs import RunDBDM\nRunDBDM(spark, job_id="%(job_id)s", dbdd_id=%(dbdd_id)s).spark_function()' % { - 'job_id':cjob.job.id, - 'dbdd_id':int(ct.task_params['dbdd_id']) - } - logger.info(spark_code) - - # submit to livy - logger.info('submitting code to Spark') - submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code}) - - # poll until complete - logger.info('polling for Spark job to complete...') - results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True) - logger.info(results) - - # update job_details - cjob.job.refresh_from_db() - - # get dbdd - dbdd = models.DPLABulkDataDownload.objects.get(pk=int(ct.task_params['dbdd_id'])) - cjob.job.update_job_details({ - 'dbdm':{ - 'dbdd':int(ct.task_params['dbdd_id']), - 'dbdd_s3_key':dbdd.s3_key, - 'matches':None, - 'misses':None - } - }) - - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'job_id':ct.task_params['job_id'], - 'dbdd_id':ct.task_params['dbdd_id'], - 'dbdd_results':results - }) - ct.save() - logger.info(ct.task_output_json) - - except Exception as e: - - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # get CombineTask (ct) + try: + + # check for livy session + _check_livy_session() + + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) + + # get CombineJob + cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id'])) + + # set dbdm as False for all Records in Job 
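+        # NOTE: 'dbdm' here appears to be the per-Record DPLA Bulk Data Match flag stored on each
+        # Record document in MongoDB; it is cleared for every Record in this Job before the RunDBDM
+        # Spark job below recomputes matches against the selected DPLABulkDataDownload (dbdd).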
+ clear_result = models.mc_handle.combine.record.update_many({'job_id': cjob.job.id}, {'$set': {'dbdm': False}}, + upsert=False) + + # generate spark code + spark_code = 'from jobs import RunDBDM\nRunDBDM(spark, job_id="%(job_id)s", dbdd_id=%(dbdd_id)s).spark_function()' % { + 'job_id': cjob.job.id, + 'dbdd_id': int(ct.task_params['dbdd_id']) + } + logger.info(spark_code) + + # submit to livy + logger.info('submitting code to Spark') + submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code': spark_code}) + + # poll until complete + logger.info('polling for Spark job to complete...') + results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), + check_success=spark_job_done, step=5, poll_forever=True) + logger.info(results) + + # update job_details + cjob.job.refresh_from_db() + + # get dbdd + dbdd = models.DPLABulkDataDownload.objects.get(pk=int(ct.task_params['dbdd_id'])) + cjob.job.update_job_details({ + 'dbdm': { + 'dbdd': int(ct.task_params['dbdd_id']), + 'dbdd_s3_key': dbdd.s3_key, + 'matches': None, + 'misses': None + } + }) + + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'job_id': ct.task_params['job_id'], + 'dbdd_id': ct.task_params['dbdd_id'], + 'dbdd_results': results + }) + ct.save() + logger.info(ct.task_output_json) + + except Exception as e: + + logger.info(str(e)) + + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def rerun_jobs_prep(ct_id): + # get CombineTask (ct) + try: - # get CombineTask (ct) - try: - - # check for livy session - _check_livy_session() + # check for livy session + _check_livy_session() - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # loop through and run - for job_id in ct.task_params['ordered_job_rerun_set']: + # loop through and run + for job_id in ct.task_params['ordered_job_rerun_set']: + # cjob + cjob = models.CombineJob.get_combine_job(job_id) - # cjob - cjob = models.CombineJob.get_combine_job(job_id) + # rerun + cjob.rerun(rerun_downstream=False, set_gui_status=False) - # rerun - cjob.rerun(rerun_downstream=False, set_gui_status=False) + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'ordered_job_rerun_set': ct.task_params['ordered_job_rerun_set'], + 'msg': 'Jobs prepared for rerunning, running or queued as Spark jobs' + }) + ct.save() + logger.info(ct.task_output_json) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'ordered_job_rerun_set':ct.task_params['ordered_job_rerun_set'], - 'msg':'Jobs prepared for rerunning, running or queued as Spark jobs' - }) - ct.save() - logger.info(ct.task_output_json) + except Exception as e: - except Exception as e: + logger.info(str(e)) - logger.info(str(e)) - - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def clone_jobs(ct_id): + ''' + Background task to clone Job(s) - ''' - Background task to clone Job(s) - - - because multiple Jobs can be run through this method, - that might result in newly created clones as downstream for Jobs - 
run through later, need to pass newly created clones under skip_clones[] - list to cjob.clone() to pass on - ''' + - because multiple Jobs can be run through this method, + that might result in newly created clones as downstream for Jobs + run through later, need to pass newly created clones under skip_clones[] + list to cjob.clone() to pass on + ''' - # get CombineTask (ct) - try: + # get CombineTask (ct) + try: - # check for livy session - _check_livy_session() + # check for livy session + _check_livy_session() - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # loop through and run - skip_clones = [] - for job_id in ct.task_params['ordered_job_clone_set']: + # loop through and run + skip_clones = [] + for job_id in ct.task_params['ordered_job_clone_set']: - # cjob - cjob = models.CombineJob.get_combine_job(job_id) + # cjob + cjob = models.CombineJob.get_combine_job(job_id) - # clone - clones = cjob.clone( - rerun=ct.task_params['rerun_on_clone'], - clone_downstream=ct.task_params['downstream_toggle'], - skip_clones=skip_clones) + # clone + clones = cjob.clone( + rerun=ct.task_params['rerun_on_clone'], + clone_downstream=ct.task_params['downstream_toggle'], + skip_clones=skip_clones) - # append newly created clones to skip_clones - for job, clone in clones.items(): - skip_clones.append(clone) + # append newly created clones to skip_clones + for job, clone in clones.items(): + skip_clones.append(clone) - # save export output to Combine Task output - ct.refresh_from_db() - ct.task_output_json = json.dumps({ - 'ordered_job_clone_set':ct.task_params['ordered_job_clone_set'], - 'msg':'Jobs cloned' - }) - ct.save() - logger.info(ct.task_output_json) + # save export output to Combine Task output + ct.refresh_from_db() + ct.task_output_json = json.dumps({ + 'ordered_job_clone_set': ct.task_params['ordered_job_clone_set'], + 'msg': 'Jobs cloned' + }) + ct.save() + logger.info(ct.task_output_json) - except Exception as e: + except Exception as e: - logger.info(str(e)) + logger.info(str(e)) - # attempt to capture error and return for task - ct.task_output_json = json.dumps({ - 'error':str(e) - }) - ct.save() + # attempt to capture error and return for task + ct.task_output_json = json.dumps({ + 'error': str(e) + }) + ct.save() @celery_app.task() def stateio_export(ct_id): + ''' + Background task to export state + ''' - ''' - Background task to export state - ''' + # check for livy session + _check_livy_session() - # check for livy session - _check_livy_session() + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) - - # begin export - sio_client = models.StateIOClient() - sio_client.export_state( - stateio_id=ct.task_params['stateio_id'], - jobs=ct.task_params['jobs'], - record_groups=ct.task_params['record_groups'], - orgs=ct.task_params['orgs'], - config_scenarios=ct.task_params['config_scenarios'], - export_name=ct.task_params['export_name'], - compress=True, - compression_format='zip') + # begin export + sio_client = models.StateIOClient() + sio_client.export_state( + stateio_id=ct.task_params['stateio_id'], + jobs=ct.task_params['jobs'], + record_groups=ct.task_params['record_groups'], + orgs=ct.task_params['orgs'], + config_scenarios=ct.task_params['config_scenarios'], + export_name=ct.task_params['export_name'], + 
compress=True, + compression_format='zip') @celery_app.task() def stateio_import(ct_id): + ''' + Background task to import state + ''' - ''' - Background task to import state - ''' - - # check for livy session - _check_livy_session() + # check for livy session + _check_livy_session() - ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) - logger.info('using %s' % ct) + ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id)) + logger.info('using %s' % ct) - # begin import - sio_client = models.StateIOClient() - sio_client.import_state( - stateio_id=ct.task_params['stateio_id'], - import_name=ct.task_params['import_name'], - export_path=ct.task_params['export_path']) + # begin import + sio_client = models.StateIOClient() + sio_client.import_state( + stateio_id=ct.task_params['stateio_id'], + import_name=ct.task_params['import_name'], + export_path=ct.task_params['export_path']) def _check_livy_session(): - - ''' - Function to check for Livy session if spark is needed, - and if not, raise Exception - ''' - - # check for presence of session - ls = models.LivySession.get_active_session() - - # if False, attempt to start livy session - if not ls: - - try: - ls_id = models.LivySession.ensure_active_session_id(None) - ls = models.LivySession.get_active_session() - except: - raise Exception('Error while attempting to start new Livy session') - - # if still failing, raise exception - if not ls: - raise Exception('Spark required for this task, but could not start Livy session.') - - - - - + ''' + Function to check for Livy session if spark is needed, + and if not, raise Exception + ''' + + # check for presence of session + ls = models.LivySession.get_active_session() + + # if False, attempt to start livy session + if not ls: + + try: + ls_id = models.LivySession.ensure_active_session_id(None) + ls = models.LivySession.get_active_session() + except: + raise Exception('Error while attempting to start new Livy session') + + # if still failing, raise exception + if not ls: + raise Exception('Spark required for this task, but could not start Livy session.') diff --git a/core/templatetags/core_template_filters.py b/core/templatetags/core_template_filters.py index 128bd17f..93675bcb 100644 --- a/core/templatetags/core_template_filters.py +++ b/core/templatetags/core_template_filters.py @@ -1,4 +1,3 @@ - import logging import re from django import template @@ -12,46 +11,42 @@ def get_obj_attr(value, arg): - ''' - Gets an attribute of an object dynamically from a string name - https://stackoverflow.com/questions/844746/performing-a-getattr-style-lookup-in-a-django-template - ''' - - if hasattr(value, str(arg)): - return getattr(value, arg) - elif hasattr(value, 'has_key') and value.has_key(arg): - return value[arg] - elif numeric_test.match(str(arg)) and len(value) > int(arg): - return value[int(arg)] - else: - # return settings.TEMPLATE_STRING_IF_INVALID - return None + ''' + Gets an attribute of an object dynamically from a string name + https://stackoverflow.com/questions/844746/performing-a-getattr-style-lookup-in-a-django-template + ''' + + if hasattr(value, str(arg)): + return getattr(value, arg) + elif hasattr(value, 'has_key') and value.has_key(arg): + return value[arg] + elif numeric_test.match(str(arg)) and len(value) > int(arg): + return value[int(arg)] + else: + # return settings.TEMPLATE_STRING_IF_INVALID + return None def get_dict_value(dictionary, key): - - ''' - Return value from dictionary with variable key - ''' + ''' + Return value from dictionary with variable key + ''' - 
return dictionary.get(key, False) + return dictionary.get(key, False) def es_field_name_format(field_name): + ''' + Template filter to convert ES friendly field names + into human friendly XML-like paths + ''' - ''' - Template filter to convert ES friendly field names - into human friendly XML-like paths - ''' - - # add slashes - field_name = re.sub('\|','/',field_name) + # add slashes + field_name = re.sub('\|', '/', field_name) - return '/%s' % field_name + return '/%s' % field_name register.filter('get_obj_attr', get_obj_attr) register.filter('get_dict_value', get_dict_value) register.filter('es_field_name_format', es_field_name_format) - - diff --git a/core/urls.py b/core/urls.py index e2b8eefb..c8b91ca7 100644 --- a/core/urls.py +++ b/core/urls.py @@ -7,156 +7,228 @@ urlpatterns = [ - # System - url(r'^system$', views.system, name='system'), - - # User Livy sessions - url(r'^system/livy_sessions/start$', views.livy_session_start, name='livy_session_start'), - url(r'^system/livy_sessions/(?P[0-9]+)/stop$', views.livy_session_stop, name='livy_session_stop'), - url(r'^system/bg_status$', views.system_bg_status, name='system_bg_status'), - - # Organizations - url(r'^organization/all$', views.organizations, name='organizations'), - url(r'^organization/(?P[0-9]+)$', views.organization, name='organization'), - url(r'^organization/(?P[0-9]+)/delete$', views.organization_delete, name='organization_delete'), - - # Record Groups - url(r'^record_group/(?P[0-9]+)$', views.record_group_id_redirect, name='record_group_id_redirect'), - url(r'^organization/(?P[0-9]+)/record_group/new$', views.record_group_new, name='record_group_new'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)$', views.record_group, name='record_group'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/delete$', views.record_group_delete, name='record_group_delete'), - - # Jobs - url(r'^job/(?P[0-9]+)$', views.job_id_redirect, name='job_id_redirect'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)$', views.job_details, name='job_details'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/details$', views.job_details, name='job_details'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/delete$', views.job_delete, name='job_delete'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/publish$', views.job_publish, name='job_publish'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/unpublish$', views.job_unpublish, name='job_unpublish'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/errors$', views.job_errors, name='job_errors'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/update_note$', views.job_update_note, name='job_update_note'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/update_name$', views.job_update_name, name='job_update_name'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/harvest/oai/new$', views.job_harvest_oai, name='job_harvest_oai'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/harvest/static/xml/new$', views.job_harvest_static_xml, name='job_harvest_static_xml'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/harvest/static/tabular/new$', views.job_harvest_tabular_data, name='job_harvest_tabular_data'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/transform/new$', views.job_transform, name='job_transform'), - 
url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/merge/new$', views.job_merge, name='job_merge'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/reports/create_validation_report$', views.job_reports_create_validation, name='job_reports_create_validation'), - url(r'^organization/(?P([0-9]|(DYNAMIC_ORG_ID))+)/record_group/(?P([0-9]|(DYNAMIC_RG_ID))+)/job/(?P([0-9]|(DYNAMIC_ID))+)/job_lineage_json$', views.job_lineage_json, name='job_lineage_json'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/update$', views.job_update, name='job_update'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/job_parameters$', views.job_parameters, name='job_parameters'), - - # Job Record Validation Scenarios - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/job_validation_scenario_failures/(?P[0-9]+)$', views.job_validation_scenario_failures, name='job_validation_scenario_failures'), - - # Record Group Job Analysis - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/analysis/indexing_failures$', views.job_indexing_failures, name='job_indexing_failures'), - - # ElasticSearch Analysis - url(r'^analysis/es/index/(?P.+)/field_analysis$', views.field_analysis, name='field_analysis'), - url(r'^analysis/es/index/(?P.+)/field_analysis/docs/(?P.+)$', views.field_analysis_docs, name='field_analysis_docs'), - - # Jobs General - url(r'^jobs/all$', views.all_jobs, name='all_jobs'), - url(r'^jobs/move_jobs$', views.move_jobs, name='move_jobs'), - url(r'^jobs/stop_jobs$', views.stop_jobs, name='stop_jobs'), - url(r'^jobs/delete_jobs$', views.delete_jobs, name='delete_jobs'), - url(r'^jobs/rerun_jobs$', views.rerun_jobs, name='rerun_jobs'), - url(r'^jobs/clone_jobs$', views.clone_jobs, name='clone_jobs'), - - # Records - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)$', views.record, name='record'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/document$', views.record_document, name='record_document'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/indexed_document$', views.record_indexed_document, name='record_indexed_document'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/error$', views.record_error, name='record_error'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/validation_scenario/(?P[0-9]+)$', views.record_validation_scenario, name='record_validation_scenario'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/diff/combined$', views.record_combined_diff_html, name='record_combined_diff_html'), - url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/diff/side_by_side$', views.record_side_by_side_diff_html, name='record_side_by_side_diff_html'), - - # Configuration - url(r'^configuration$', views.configuration, name='configuration'), - url(r'^configuration/transformation/(?P[0-9]+)/payload$', views.transformation_scenario_payload, name='transformation_scenario_payload'), - url(r'^configuration/oai_endpoint/(?P[0-9]+)/payload$', views.oai_endpoint_payload, name='oai_endpoint_payload'), - url(r'^configuration/validation/(?P[0-9]+)/payload$', views.validation_scenario_payload, name='validation_scenario_payload'), - url(r'^configuration/test_validation_scenario$', 
views.test_validation_scenario, name='test_validation_scenario'), - url(r'^configuration/test_transformation_scenario$', views.test_transformation_scenario, name='test_transformation_scenario'), - url(r'^configuration/rits/(?P[0-9]+)/payload$', views.rits_payload, name='rits_payload'), - url(r'^configuration/test_rits$', views.test_rits, name='test_rits'), - url(r'^configuration/field_mapper/(?P[0-9]+)/payload$', views.field_mapper_payload, name='field_mapper_payload'), - url(r'^configuration/field_mapper/update$', views.field_mapper_update, name='field_mapper_update'), - url(r'^configuration/test_field_mapper$', views.test_field_mapper, name='test_field_mapper'), - url(r'^configuration/dpla_bulk_data/download$', views.dpla_bulk_data_download, name='dpla_bulk_data_download'), - - # Publish - url(r'^published$', views.published, name='published'), - url(r'^published/published_dt_json$', views.DTPublishedJson.as_view(), name='published_dt_json'), - url(r'^published/published_dt_json/subset/(?P.+)$', views.DTPublishedJson.as_view(), name='published_dt_json'), - url(r'^published/subsets/create$', views.published_subset_create, name='published_subset_create'), - url(r'^published/subsets/edit/(?P.+)$', views.published_subset_edit, name='published_subset_edit'), - url(r'^published/subsets/delete/(?P.+)$', views.published_subset_delete, name='published_subset_delete'), - url(r'^published/subset/(?P.+)$', views.published, name='published_subset'), - - # Export - url(r'^export/mapped_fields/(?P[a-zA-Z]+)/(?P[0-9]+)$', views.export_mapped_fields, name='export_mapped_fields'), - url(r'^export/mapped_fields/(?P[a-zA-Z]+)$', views.export_mapped_fields, name='export_mapped_fields'), - url(r'^export/mapped_fields/(?P[a-zA-Z]+)/subset/(?P.+)$', views.export_mapped_fields, name='export_mapped_fields'), - url(r'^export/documents/(?P[a-zA-Z]+)/(?P[0-9]+)$', views.export_documents, name='export_documents'), - url(r'^export/documents/(?P[a-zA-Z]+)$', views.export_documents, name='export_documents'), - url(r'^export/documents/(?P[a-zA-Z]+)/subset/(?P.+)$', views.export_documents, name='export_documents'), - url(r'^export/tabular_data/(?P[a-zA-Z]+)/(?P[0-9]+)$', views.export_tabular_data, name='export_tabular_data'), - url(r'^export/tabular_data/(?P[a-zA-Z]+)$', views.export_tabular_data, name='export_tabular_data'), - url(r'^export/tabular_data/(?P[a-zA-Z]+)/subset/(?P.+)$', views.export_tabular_data, name='export_tabular_data'), - - # OAI - url(r'^oai$', views.oai, name='oai'), - url(r'^oai/subset/(?P.+)$', views.oai, name='oai_subset'), - - # Global Search - url(r'^search$', views.search, name='search'), - - # Datatables Endpoints - url(r'^datatables/all_records/records_dt_json$', views.DTRecordsJson.as_view(), name='all_records_dt_json'), - url(r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/records_dt_json$', views.DTRecordsJson.as_view(), name='records_dt_json'), - url(r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/records_dt_json/(?P[0-1]+)$', views.DTRecordsJson.as_view(), name='records_dt_json'), - url(r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/indexing_failures_dt_json$', views.DTIndexingFailuresJson.as_view(), name='indexing_failures_dt_json'), - url(r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/job_validation_scenario_failures_json/(?P[0-9]+)$', views.DTJobValidationScenarioFailuresJson.as_view(), name='job_validation_scenario_failures_json'), - 
url(r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/dpla_bulk_data/(?P.*)$', views.DTDPLABulkDataMatches.as_view(), name='dpla_bulk_data_matches'), - url(r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record_diffs$', views.JobRecordDiffs.as_view(), name='job_record_diffs'), - url(r'^datatables/background_tasks$', views.CombineBackgroundTasksDT.as_view(), name='bg_tasks_dt'), - url(r'^datatables/es/index/(?P.+)/(?P.+)/records_es_field_dt_json$', DTElasticFieldSearch.as_view(), name='records_es_field_dt_json'), - url(r'^datatables/es/search$', DTElasticGenericSearch.as_view(), name='records_es_generic_dt_json'), - - # Analysis - url(r'^analysis$', views.analysis, name='analysis'), - url(r'^analysis/new$', views.job_analysis, name='job_analysis'), - - # Background Tasks - url(r'^background_tasks$', views.bg_tasks, name='bg_tasks'), - url(r'^background_tasks/process/action/(?P[0-9a-z]+)$', views.bgtasks_proc_action, name='bgtasks_proc_action'), - url(r'^background_tasks/process/logs/err$', views.bgtasks_proc_stderr_log, name='bgtasks_proc_stderr_log'), - url(r'^background_tasks/delete_all$', views.bg_tasks_delete_all, name='bg_tasks_delete_all'), - url(r'^background_tasks/task/(?P[0-9]+)$', views.bg_task, name='bg_task'), - url(r'^background_tasks/task/(?P[0-9]+)/delete$', views.bg_task_delete, name='bg_task_delete'), - url(r'^background_tasks/task/(?P[0-9]+)/cancel$', views.bg_task_cancel, name='bg_task_cancel'), - - # Document Download - url(r'^document_download$', views.document_download, name='document_download'), - - # Global Messages (GMs) - url(r'^gm/delete$', views.gm_delete, name='gm_delete'), - - # StateIO - url(r'^stateio$', views.stateio, name='stateio'), - url(r'^stateio/state/(?P[0-9a-z]+)$', views.stateio_state, name='stateio_state'), - url(r'^stateio/state/(?P[0-9a-z]+)/manifest/(?P.+)$', views.stateio_state_manifest, name='stateio_state_manifest'), - url(r'^stateio/state/(?P[0-9a-z]+)/delete$', views.stateio_state_delete, name='stateio_state_delete'), - url(r'^stateio/state/(?P[0-9a-z]+)/download$', views.stateio_state_download, name='stateio_state_download'), - url(r'^stateio/state/(?P[0-9a-z]+)/stop$', views.stateio_state_stop, name='stateio_state_stop'), - url(r'^stateio/export$', views.stateio_export, name='stateio_export'), - url(r'^stateio/import$', views.stateio_import, name='stateio_import'), - - # General - url(r'^login$', auth_views.login, name='login'), - url(r'^logout$', auth_views.logout, name='logout'), - url(r'^', views.index, name='combine_home'), + # System + url(r'^system$', views.system, name='system'), + + # User Livy sessions + url(r'^system/livy_sessions/start$', views.livy_session_start, name='livy_session_start'), + url(r'^system/livy_sessions/(?P[0-9]+)/stop$', views.livy_session_stop, name='livy_session_stop'), + url(r'^system/bg_status$', views.system_bg_status, name='system_bg_status'), + + # Organizations + url(r'^organization/all$', views.organizations, name='organizations'), + url(r'^organization/(?P[0-9]+)$', views.organization, name='organization'), + url(r'^organization/(?P[0-9]+)/delete$', views.organization_delete, name='organization_delete'), + + # Record Groups + url(r'^record_group/(?P[0-9]+)$', views.record_group_id_redirect, name='record_group_id_redirect'), + url(r'^organization/(?P[0-9]+)/record_group/new$', views.record_group_new, name='record_group_new'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)$', views.record_group, + name='record_group'), + 
url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/delete$', views.record_group_delete, + name='record_group_delete'), + + # Jobs + url(r'^job/(?P[0-9]+)$', views.job_id_redirect, name='job_id_redirect'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)$', + views.job_details, name='job_details'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/details$', + views.job_details, name='job_details'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/delete$', + views.job_delete, name='job_delete'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/publish$', + views.job_publish, name='job_publish'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/unpublish$', + views.job_unpublish, name='job_unpublish'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/errors$', + views.job_errors, name='job_errors'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/update_note$', + views.job_update_note, name='job_update_note'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/update_name$', + views.job_update_name, name='job_update_name'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/harvest/oai/new$', + views.job_harvest_oai, name='job_harvest_oai'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/harvest/static/xml/new$', + views.job_harvest_static_xml, name='job_harvest_static_xml'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/harvest/static/tabular/new$', + views.job_harvest_tabular_data, name='job_harvest_tabular_data'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/transform/new$', + views.job_transform, name='job_transform'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/merge/new$', views.job_merge, + name='job_merge'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/reports/create_validation_report$', + views.job_reports_create_validation, name='job_reports_create_validation'), + url( + r'^organization/(?P([0-9]|(DYNAMIC_ORG_ID))+)/record_group/(?P([0-9]|(DYNAMIC_RG_ID))+)/job/(?P([0-9]|(DYNAMIC_ID))+)/job_lineage_json$', + views.job_lineage_json, name='job_lineage_json'), + url(r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/update$', + views.job_update, name='job_update'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/job_parameters$', + views.job_parameters, name='job_parameters'), + + # Job Record Validation Scenarios + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/job_validation_scenario_failures/(?P[0-9]+)$', + views.job_validation_scenario_failures, name='job_validation_scenario_failures'), + + # Record Group Job Analysis + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/analysis/indexing_failures$', + views.job_indexing_failures, name='job_indexing_failures'), + + # ElasticSearch Analysis + url(r'^analysis/es/index/(?P.+)/field_analysis$', views.field_analysis, name='field_analysis'), + url(r'^analysis/es/index/(?P.+)/field_analysis/docs/(?P.+)$', views.field_analysis_docs, + name='field_analysis_docs'), + + # Jobs General + url(r'^jobs/all$', views.all_jobs, name='all_jobs'), + url(r'^jobs/move_jobs$', views.move_jobs, name='move_jobs'), + url(r'^jobs/stop_jobs$', views.stop_jobs, name='stop_jobs'), + url(r'^jobs/delete_jobs$', views.delete_jobs, name='delete_jobs'), + url(r'^jobs/rerun_jobs$', 
views.rerun_jobs, name='rerun_jobs'), + url(r'^jobs/clone_jobs$', views.clone_jobs, name='clone_jobs'), + + # Records + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)$', + views.record, name='record'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/document$', + views.record_document, name='record_document'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/indexed_document$', + views.record_indexed_document, name='record_indexed_document'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/error$', + views.record_error, name='record_error'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/validation_scenario/(?P[0-9]+)$', + views.record_validation_scenario, name='record_validation_scenario'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/diff/combined$', + views.record_combined_diff_html, name='record_combined_diff_html'), + url( + r'^organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record/(?P[0-9a-z]+)/diff/side_by_side$', + views.record_side_by_side_diff_html, name='record_side_by_side_diff_html'), + + # Configuration + url(r'^configuration$', views.configuration, name='configuration'), + url(r'^configuration/transformation/(?P[0-9]+)/payload$', views.transformation_scenario_payload, + name='transformation_scenario_payload'), + url(r'^configuration/oai_endpoint/(?P[0-9]+)/payload$', views.oai_endpoint_payload, + name='oai_endpoint_payload'), + url(r'^configuration/validation/(?P[0-9]+)/payload$', views.validation_scenario_payload, + name='validation_scenario_payload'), + url(r'^configuration/test_validation_scenario$', views.test_validation_scenario, name='test_validation_scenario'), + url(r'^configuration/test_transformation_scenario$', views.test_transformation_scenario, + name='test_transformation_scenario'), + url(r'^configuration/rits/(?P[0-9]+)/payload$', views.rits_payload, name='rits_payload'), + url(r'^configuration/test_rits$', views.test_rits, name='test_rits'), + url(r'^configuration/field_mapper/(?P[0-9]+)/payload$', views.field_mapper_payload, + name='field_mapper_payload'), + url(r'^configuration/field_mapper/update$', views.field_mapper_update, name='field_mapper_update'), + url(r'^configuration/test_field_mapper$', views.test_field_mapper, name='test_field_mapper'), + url(r'^configuration/dpla_bulk_data/download$', views.dpla_bulk_data_download, name='dpla_bulk_data_download'), + + # Publish + url(r'^published$', views.published, name='published'), + url(r'^published/published_dt_json$', views.DTPublishedJson.as_view(), name='published_dt_json'), + url(r'^published/published_dt_json/subset/(?P.+)$', views.DTPublishedJson.as_view(), + name='published_dt_json'), + url(r'^published/subsets/create$', views.published_subset_create, name='published_subset_create'), + url(r'^published/subsets/edit/(?P.+)$', views.published_subset_edit, name='published_subset_edit'), + url(r'^published/subsets/delete/(?P.+)$', views.published_subset_delete, name='published_subset_delete'), + url(r'^published/subset/(?P.+)$', views.published, name='published_subset'), + + # Export + url(r'^export/mapped_fields/(?P[a-zA-Z]+)/(?P[0-9]+)$', views.export_mapped_fields, + name='export_mapped_fields'), + url(r'^export/mapped_fields/(?P[a-zA-Z]+)$', views.export_mapped_fields, + 
name='export_mapped_fields'), + url(r'^export/mapped_fields/(?P[a-zA-Z]+)/subset/(?P.+)$', views.export_mapped_fields, + name='export_mapped_fields'), + url(r'^export/documents/(?P[a-zA-Z]+)/(?P[0-9]+)$', views.export_documents, + name='export_documents'), + url(r'^export/documents/(?P[a-zA-Z]+)$', views.export_documents, name='export_documents'), + url(r'^export/documents/(?P[a-zA-Z]+)/subset/(?P.+)$', views.export_documents, + name='export_documents'), + url(r'^export/tabular_data/(?P[a-zA-Z]+)/(?P[0-9]+)$', views.export_tabular_data, + name='export_tabular_data'), + url(r'^export/tabular_data/(?P[a-zA-Z]+)$', views.export_tabular_data, name='export_tabular_data'), + url(r'^export/tabular_data/(?P[a-zA-Z]+)/subset/(?P.+)$', views.export_tabular_data, + name='export_tabular_data'), + + # OAI + url(r'^oai$', views.oai, name='oai'), + url(r'^oai/subset/(?P.+)$', views.oai, name='oai_subset'), + + # Global Search + url(r'^search$', views.search, name='search'), + + # Datatables Endpoints + url(r'^datatables/all_records/records_dt_json$', views.DTRecordsJson.as_view(), name='all_records_dt_json'), + url( + r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/records_dt_json$', + views.DTRecordsJson.as_view(), name='records_dt_json'), + url( + r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/records_dt_json/(?P[0-1]+)$', + views.DTRecordsJson.as_view(), name='records_dt_json'), + url( + r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/indexing_failures_dt_json$', + views.DTIndexingFailuresJson.as_view(), name='indexing_failures_dt_json'), + url( + r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/job_validation_scenario_failures_json/(?P[0-9]+)$', + views.DTJobValidationScenarioFailuresJson.as_view(), name='job_validation_scenario_failures_json'), + url( + r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/dpla_bulk_data/(?P.*)$', + views.DTDPLABulkDataMatches.as_view(), name='dpla_bulk_data_matches'), + url( + r'^datatables/organization/(?P[0-9]+)/record_group/(?P[0-9]+)/job/(?P[0-9]+)/record_diffs$', + views.JobRecordDiffs.as_view(), name='job_record_diffs'), + url(r'^datatables/background_tasks$', views.CombineBackgroundTasksDT.as_view(), name='bg_tasks_dt'), + url(r'^datatables/es/index/(?P.+)/(?P.+)/records_es_field_dt_json$', + DTElasticFieldSearch.as_view(), name='records_es_field_dt_json'), + url(r'^datatables/es/search$', DTElasticGenericSearch.as_view(), name='records_es_generic_dt_json'), + + # Analysis + url(r'^analysis$', views.analysis, name='analysis'), + url(r'^analysis/new$', views.job_analysis, name='job_analysis'), + + # Background Tasks + url(r'^background_tasks$', views.bg_tasks, name='bg_tasks'), + url(r'^background_tasks/process/action/(?P[0-9a-z]+)$', views.bgtasks_proc_action, + name='bgtasks_proc_action'), + url(r'^background_tasks/process/logs/err$', views.bgtasks_proc_stderr_log, name='bgtasks_proc_stderr_log'), + url(r'^background_tasks/delete_all$', views.bg_tasks_delete_all, name='bg_tasks_delete_all'), + url(r'^background_tasks/task/(?P[0-9]+)$', views.bg_task, name='bg_task'), + url(r'^background_tasks/task/(?P[0-9]+)/delete$', views.bg_task_delete, name='bg_task_delete'), + url(r'^background_tasks/task/(?P[0-9]+)/cancel$', views.bg_task_cancel, name='bg_task_cancel'), + + # Document Download + url(r'^document_download$', views.document_download, name='document_download'), + + # Global Messages (GMs) + url(r'^gm/delete$', 
views.gm_delete, name='gm_delete'), + + # StateIO + url(r'^stateio$', views.stateio, name='stateio'), + url(r'^stateio/state/(?P[0-9a-z]+)$', views.stateio_state, name='stateio_state'), + url(r'^stateio/state/(?P[0-9a-z]+)/manifest/(?P.+)$', views.stateio_state_manifest, + name='stateio_state_manifest'), + url(r'^stateio/state/(?P[0-9a-z]+)/delete$', views.stateio_state_delete, name='stateio_state_delete'), + url(r'^stateio/state/(?P[0-9a-z]+)/download$', views.stateio_state_download, + name='stateio_state_download'), + url(r'^stateio/state/(?P[0-9a-z]+)/stop$', views.stateio_state_stop, name='stateio_state_stop'), + url(r'^stateio/export$', views.stateio_export, name='stateio_export'), + url(r'^stateio/import$', views.stateio_import, name='stateio_import'), + + # General + url(r'^login$', auth_views.login, name='login'), + url(r'^logout$', auth_views.logout, name='logout'), + url(r'^', views.index, name='combine_home'), ] diff --git a/core/views.py b/core/views.py index cc67dd3d..36eb58a9 100644 --- a/core/views.py +++ b/core/views.py @@ -59,157 +59,170 @@ logging.getLogger("requests").setLevel(logging.WARNING) - # breadcrumb parser def breadcrumb_parser(request): - - ''' + ''' Rudimentary breadcrumbs parser ''' - crumbs = [] - - # livy/spark - regex_match = re.match(r'(.+?/livy_sessions)', request.path) - if regex_match: - crumbs.append(("Livy/Spark", reverse('livy_sessions'))) - - # configurations - regex_match = re.match(r'(.+?/configuration)', request.path) - if regex_match: - crumbs.append(("Configuration", reverse('configuration'))) - - # search - regex_match = re.match(r'(.+?/search)', request.path) - if regex_match: - crumbs.append(("Search", reverse('search'))) - - # configurations/test_validation_scenario - regex_match = re.match(r'(.+?/configuration/test_validation_scenario)', request.path) - if regex_match: - crumbs.append(("Test Validation Scenario", reverse('test_validation_scenario'))) - - # all jobs - regex_match = re.match(r'(.+?/jobs/all)', request.path) - if regex_match: - crumbs.append(("All Jobs", reverse('all_jobs'))) - - # analysis - regex_match = re.match(r'(.+?/analysis)', request.path) - if regex_match: - crumbs.append(("Analysis", reverse('analysis'))) - - # field analysis - regex_match = re.match(r'(.+?/analysis/es/index/j([0-9]+)/field_analysis.*)', request.path) - if regex_match: - - # get job - j = models.Job.objects.get(pk=int(regex_match.group(2))) - - # get field for analysis - field_name = request.GET.get('field_name', None) - - # append crumbs - if j.record_group.organization.for_analysis: - logger.debug("breadcrumbs: org is for analysis, skipping") - else: - crumbs.append(("Organzation - %s" % j.record_group.organization.name, reverse('organization', kwargs={'org_id':j.record_group.organization.id}))) - if j.record_group.for_analysis: - logger.debug("breadcrumbs: rg is for analysis, skipping") - else: - crumbs.append(("RecordGroup - %s" % j.record_group.name, reverse('record_group', kwargs={'org_id':j.record_group.organization.id, 'record_group_id':j.record_group.id}))) - crumbs.append(("Job - %s" % j.name, reverse('job_details', kwargs={'org_id':j.record_group.organization.id, 'record_group_id':j.record_group.id, 'job_id':j.id}))) - crumbs.append(("Field Analysis - %s" % field_name, '%s?%s' % (regex_match.group(1), request.META['QUERY_STRING']))) - - # published - pub_m = re.match(r'(.+?/published.*)', request.path) - if pub_m: - crumbs.append(("Published", reverse('published'))) - - # published subset create - pub_m = 
re.match(r'(.+?/published/subsets/create)', request.path) - if pub_m: - crumbs.append(("Published Subset Create", reverse('published_subset_create'))) - - # published subset - pub_m = re.match(r'(.+?/published/subset/(.+))', request.path) - if pub_m: - crumbs.append(("Published Subset: %s" % pub_m.group(2), reverse('published_subset', kwargs={'subset':pub_m.group(2)}))) - - # organization - pub_m = re.match(r'(.+?/organization/.*)', request.path) - if pub_m: - crumbs.append(("Organizations", reverse('organizations'))) - - # org - org_m = re.match(r'(.+?/organization/([0-9]+))', request.path) - if org_m: - org = models.Organization.objects.get(pk=int(org_m.group(2))) - if org.for_analysis: - logger.debug("breadcrumbs: org is for analysis, converting breadcrumbs") - crumbs.append(("Analysis", reverse('analysis'))) - else: - crumbs.append(("Organzation - %s" % org.name, org_m.group(1))) - - # record_group - rg_m = re.match(r'(.+?/record_group/([0-9]+))', request.path) - if rg_m: - rg = models.RecordGroup.objects.get(pk=int(rg_m.group(2))) - if rg.for_analysis: - logger.debug("breadcrumbs: rg is for analysis, converting breadcrumbs") - else: - crumbs.append(("RecordGroup - %s" % rg.name, rg_m.group(1))) - - # job - j_m = re.match(r'(.+?/job/([0-9]+))', request.path) - if j_m: - j = models.Job.objects.get(pk=int(j_m.group(2))) - if j.record_group.for_analysis: - crumbs.append(("Analysis - %s" % j.name, j_m.group(1))) - else: - crumbs.append(("Job - %s" % j.name, j_m.group(1))) - - # record - r_m = re.match(r'(.+?/record/([0-9a-z]+))', request.path) - if r_m: - r = models.Record.objects.get(id=r_m.group(2)) - crumbs.append(("Record - %s" % r.record_id, r_m.group(1))) - - # background tasks - regex_match = re.match(r'(.+?/background_tasks)', request.path) - if regex_match: - crumbs.append(("Background Tasks", reverse('bg_tasks'))) - - # background task - regex_match = re.match(r'(.+?/background_tasks/task/([0-9]+))', request.path) - if regex_match: - bg_task = models.CombineBackgroundTask.objects.get(pk=int(regex_match.group(2))) - crumbs.append(("Task - %s" % (bg_task.name), reverse('bg_tasks'))) - - # stateio - regex_match = re.match(r'(.+?/stateio.*)', request.path) - if regex_match: - crumbs.append(("State Export/Import", reverse('stateio'))) - - # stateio - state details - regex_match = re.match(r'(.+?/stateio/state/([0-9a-z].*))', request.path) - if regex_match: - state = models.StateIO.objects.get(id=regex_match.group(2)) - crumbs.append(("State - %s" % (state.name), reverse('stateio_state', kwargs={'state_id':regex_match.group(2)}))) - - # stateio - export - regex_match = re.match(r'(.+?/stateio/export.*)', request.path) - if regex_match: - crumbs.append(("Export", reverse('stateio_export'))) - - # stateio - export - regex_match = re.match(r'(.+?/stateio/import.*)', request.path) - if regex_match: - crumbs.append(("Import", reverse('stateio_import'))) - - # return - return crumbs - + crumbs = [] + + # livy/spark + regex_match = re.match(r'(.+?/livy_sessions)', request.path) + if regex_match: + crumbs.append(("Livy/Spark", reverse('livy_sessions'))) + + # configurations + regex_match = re.match(r'(.+?/configuration)', request.path) + if regex_match: + crumbs.append(("Configuration", reverse('configuration'))) + + # search + regex_match = re.match(r'(.+?/search)', request.path) + if regex_match: + crumbs.append(("Search", reverse('search'))) + + # configurations/test_validation_scenario + regex_match = re.match(r'(.+?/configuration/test_validation_scenario)', request.path) + if regex_match: 
+ crumbs.append( + ("Test Validation Scenario", reverse('test_validation_scenario'))) + + # all jobs + regex_match = re.match(r'(.+?/jobs/all)', request.path) + if regex_match: + crumbs.append(("All Jobs", reverse('all_jobs'))) + + # analysis + regex_match = re.match(r'(.+?/analysis)', request.path) + if regex_match: + crumbs.append(("Analysis", reverse('analysis'))) + + # field analysis + regex_match = re.match(r'(.+?/analysis/es/index/j([0-9]+)/field_analysis.*)', request.path) + if regex_match: + + # get job + j = models.Job.objects.get(pk=int(regex_match.group(2))) + + # get field for analysis + field_name = request.GET.get('field_name', None) + + # append crumbs + if j.record_group.organization.for_analysis: + logger.debug("breadcrumbs: org is for analysis, skipping") + else: + crumbs.append(( + "Organzation - %s" % j.record_group.organization.name, + reverse('organization', kwargs={'org_id': j.record_group.organization.id}))) + if j.record_group.for_analysis: + logger.debug("breadcrumbs: rg is for analysis, skipping") + else: + crumbs.append(("RecordGroup - %s" % j.record_group.name, + reverse('record_group', kwargs={'org_id': j.record_group.organization.id, + 'record_group_id': j.record_group.id}))) + crumbs.append(("Job - %s" % j.name, reverse('job_details', + kwargs={ + 'org_id': j.record_group.organization.id, + 'record_group_id': j.record_group.id, + 'job_id': j.id}))) + crumbs.append(("Field Analysis - %s" % field_name, + '%s?%s' % (regex_match.group(1), request.META['QUERY_STRING']))) + + # published + pub_m = re.match(r'(.+?/published.*)', request.path) + if pub_m: + crumbs.append(("Published", reverse('published'))) + + # published subset create + pub_m = re.match(r'(.+?/published/subsets/create)', request.path) + if pub_m: + crumbs.append( + ("Published Subset Create", reverse('published_subset_create'))) + + # published subset + pub_m = re.match(r'(.+?/published/subset/(.+))', request.path) + if pub_m: + crumbs.append(("Published Subset: %s" % pub_m.group(2), + reverse('published_subset', kwargs={'subset': pub_m.group(2)}))) + + # organization + pub_m = re.match(r'(.+?/organization/.*)', request.path) + if pub_m: + crumbs.append(("Organizations", reverse('organizations'))) + + # org + org_m = re.match(r'(.+?/organization/([0-9]+))', request.path) + if org_m: + org = models.Organization.objects.get(pk=int(org_m.group(2))) + if org.for_analysis: + logger.debug("breadcrumbs: org is for analysis, converting breadcrumbs") + crumbs.append(("Analysis", reverse('analysis'))) + else: + crumbs.append( + ("Organzation - %s" % org.name, org_m.group(1))) + + # record_group + rg_m = re.match(r'(.+?/record_group/([0-9]+))', request.path) + if rg_m: + rg = models.RecordGroup.objects.get(pk=int(rg_m.group(2))) + if rg.for_analysis: + logger.debug("breadcrumbs: rg is for analysis, converting breadcrumbs") + else: + crumbs.append( + ("RecordGroup - %s" % rg.name, rg_m.group(1))) + + # job + j_m = re.match(r'(.+?/job/([0-9]+))', request.path) + if j_m: + j = models.Job.objects.get(pk=int(j_m.group(2))) + if j.record_group.for_analysis: + crumbs.append(("Analysis - %s" % j.name, j_m.group(1))) + else: + crumbs.append(("Job - %s" % j.name, j_m.group(1))) + + # record + r_m = re.match(r'(.+?/record/([0-9a-z]+))', request.path) + if r_m: + r = models.Record.objects.get(id=r_m.group(2)) + crumbs.append(("Record - %s" % r.record_id, r_m.group(1))) + + # background tasks + regex_match = re.match(r'(.+?/background_tasks)', request.path) + if regex_match: + crumbs.append(("Background Tasks", 
reverse('bg_tasks'))) + + # background task + regex_match = re.match(r'(.+?/background_tasks/task/([0-9]+))', request.path) + if regex_match: + bg_task = models.CombineBackgroundTask.objects.get(pk=int(regex_match.group(2))) + crumbs.append( + ("Task - %s" % (bg_task.name), reverse('bg_tasks'))) + + # stateio + regex_match = re.match(r'(.+?/stateio.*)', request.path) + if regex_match: + crumbs.append(("State Export/Import", reverse('stateio'))) + + # stateio - state details + regex_match = re.match(r'(.+?/stateio/state/([0-9a-z].*))', request.path) + if regex_match: + state = models.StateIO.objects.get(id=regex_match.group(2)) + crumbs.append(("State - %s" % (state.name), + reverse('stateio_state', kwargs={'state_id': regex_match.group(2)}))) + + # stateio - export + regex_match = re.match(r'(.+?/stateio/export.*)', request.path) + if regex_match: + crumbs.append(("Export", reverse('stateio_export'))) + + # stateio - export + regex_match = re.match(r'(.+?/stateio/import.*)', request.path) + if regex_match: + crumbs.append(("Import", reverse('stateio_import'))) + + # return + return crumbs #################################################################### @@ -218,31 +231,29 @@ def breadcrumb_parser(request): @login_required def index(request): + # get username + username = request.user.username - # get username - username = request.user.username + # get all organizations + orgs = models.Organization.objects.exclude(for_analysis=True).all() - # get all organizations - orgs = models.Organization.objects.exclude(for_analysis=True).all() + # get record count + record_count = models.Record.objects.all().count() - # get record count - record_count = models.Record.objects.all().count() + # get published records count + pr = models.PublishedRecords() + published_record_count = pr.records.count() - # get published records count - pr = models.PublishedRecords() - published_record_count = pr.records.count() - - # get job count - job_count = models.Job.objects.all().count() - - return render(request, 'core/index.html', { - 'username':username, - 'orgs':orgs, - 'record_count':"{:,}".format(record_count), - 'published_record_count':"{:,}".format(published_record_count), - 'job_count':"{:,}".format(job_count) - }) + # get job count + job_count = models.Job.objects.all().count() + return render(request, 'core/index.html', { + 'username': username, + 'orgs': orgs, + 'record_count': "{:,}".format(record_count), + 'published_record_count': "{:,}".format(published_record_count), + 'job_count': "{:,}".format(job_count) + }) #################################################################### @@ -251,174 +262,167 @@ def index(request): @login_required def system(request): - - # single Livy session - logger.debug("checking or active Livy session") - livy_session = models.LivySession.get_active_session() - - # if session found, refresh - if type(livy_session) == models.LivySession: - - # refresh - livy_session.refresh_from_livy() - - # create and append to list - livy_sessions = [livy_session] - - elif type(livy_session) == QuerySet: - - # loop and refresh - for s in livy_session: - s.refresh_from_livy() - - # set as list - livy_sessions = livy_session - - else: - livy_sessions = livy_session - - # get status of background jobs - if not hasattr(settings,'COMBINE_DEPLOYMENT') or settings.COMBINE_DEPLOYMENT != 'docker': - try: - sp = models.SupervisorRPCClient() - bgtasks_proc = sp.check_process('celery') - except: - logger.debug('supervisor might be down?') - bgtasks_proc = None - else: - bgtasks_proc = None - - 
# get celery worker status - active_tasks = celery_app.control.inspect().active() - - if active_tasks == None: - celery_status = 'stopped' - else: - if len(next(iter(active_tasks.values()))) == 0: - celery_status = 'idle' - elif len(next(iter(active_tasks.values()))) > 0: - celery_status = 'busy' - - # return - return render(request, 'core/system.html', { - 'livy_session':livy_session, - 'livy_sessions':livy_sessions, - 'celery_status':celery_status, - 'bgtasks_proc':bgtasks_proc, - 'breadcrumbs':breadcrumb_parser(request) - }) + # single Livy session + logger.debug("checking or active Livy session") + livy_session = models.LivySession.get_active_session() + + # if session found, refresh + if type(livy_session) == models.LivySession: + + # refresh + livy_session.refresh_from_livy() + + # create and append to list + livy_sessions = [livy_session] + + elif type(livy_session) == QuerySet: + + # loop and refresh + for s in livy_session: + s.refresh_from_livy() + + # set as list + livy_sessions = livy_session + + else: + livy_sessions = livy_session + + # get status of background jobs + if not hasattr(settings, 'COMBINE_DEPLOYMENT') or settings.COMBINE_DEPLOYMENT != 'docker': + try: + sp = models.SupervisorRPCClient() + bgtasks_proc = sp.check_process('celery') + except: + logger.debug('supervisor might be down?') + bgtasks_proc = None + else: + bgtasks_proc = None + + # get celery worker status + active_tasks = celery_app.control.inspect().active() + + if active_tasks == None: + celery_status = 'stopped' + else: + if len(next(iter(active_tasks.values()))) == 0: + celery_status = 'idle' + elif len(next(iter(active_tasks.values()))) > 0: + celery_status = 'busy' + + # return + return render(request, 'core/system.html', { + 'livy_session': livy_session, + 'livy_sessions': livy_sessions, + 'celery_status': celery_status, + 'bgtasks_proc': bgtasks_proc, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def livy_session_start(request): + logger.debug('Checking for pre-existing livy sessions') - logger.debug('Checking for pre-existing livy sessions') + # get active livy sessions + active_ls = models.LivySession.get_active_session() - # get active livy sessions - active_ls = models.LivySession.get_active_session() + # none found + if not active_ls: + logger.debug('active livy session not found, starting') + livy_session = models.LivySession() + livy_session.start_session() - # none found - if not active_ls: - logger.debug('active livy session not found, starting') - livy_session = models.LivySession() - livy_session.start_session() + elif type(active_ls) == models.LivySession and request.GET.get('restart') == 'true': + logger.debug('single, active session found, and restart flag passed, restarting') - elif type(active_ls) == models.LivySession and request.GET.get('restart') == 'true': - logger.debug('single, active session found, and restart flag passed, restarting') + # restart + new_ls = active_ls.restart_session() - # restart - new_ls = active_ls.restart_session() - - # redirect - return redirect('system') + # redirect + return redirect('system') @login_required def livy_session_stop(request, session_id): + logger.debug('stopping Livy session by Combine ID: %s' % session_id) - logger.debug('stopping Livy session by Combine ID: %s' % session_id) - - livy_session = models.LivySession.objects.filter(id=session_id).first() + livy_session = models.LivySession.objects.filter(id=session_id).first() - # attempt to stop with Livy - models.LivyClient.stop_session(livy_session.session_id) + # 
attempt to stop with Livy + models.LivyClient.stop_session(livy_session.session_id) - # remove from DB - livy_session.delete() + # remove from DB + livy_session.delete() - # redirect - return redirect('system') + # redirect + return redirect('system') @login_required def bgtasks_proc_action(request, proc_action): + logger.debug('performing %s on bgtasks_proc' % proc_action) - logger.debug('performing %s on bgtasks_proc' % proc_action) + # get supervisor handle + sp = models.SupervisorRPCClient() - # get supervisor handle - sp = models.SupervisorRPCClient() + # fire action + actions = { + 'start': sp.start_process, + 'restart': sp.restart_process, + 'stop': sp.stop_process + } + results = actions[proc_action]('celery') + logger.debug(results) - # fire action - actions = { - 'start':sp.start_process, - 'restart':sp.restart_process, - 'stop':sp.stop_process - } - results = actions[proc_action]('celery') - logger.debug(results) - - # redirect - return redirect('system') + # redirect + return redirect('system') @login_required def bgtasks_proc_stderr_log(request): + # get supervisor handle + sp = models.SupervisorRPCClient() - # get supervisor handle - sp = models.SupervisorRPCClient() - - log_tail = sp.stderr_log_tail('celery') + log_tail = sp.stderr_log_tail('celery') - # redirect - return HttpResponse(log_tail, content_type='text/plain') + # redirect + return HttpResponse(log_tail, content_type='text/plain') def system_bg_status(request): - - ''' + ''' View to return status on: - Livy session - celery worker ''' - # get livy status - lv = models.LivySession.get_active_session() - if lv: - if type(lv) == models.LivySession: - # refresh single session - lv.refresh_from_livy() - # set status - livy_status = lv.status - else: - livy_status = 'stopped' - - # get celery worker status - active_tasks = celery_app.control.inspect().active() - - if active_tasks == None: - celery_status = 'stopped' - else: - if len(next(iter(active_tasks.values()))) == 0: - celery_status = 'idle' - elif len(next(iter(active_tasks.values()))) > 0: - celery_status = 'busy' - - # return json - return JsonResponse({ - 'celery_status':celery_status, - 'livy_status':livy_status - }) - + # get livy status + lv = models.LivySession.get_active_session() + if lv: + if type(lv) == models.LivySession: + # refresh single session + lv.refresh_from_livy() + # set status + livy_status = lv.status + else: + livy_status = 'stopped' + + # get celery worker status + active_tasks = celery_app.control.inspect().active() + + if active_tasks == None: + celery_status = 'stopped' + else: + if len(next(iter(active_tasks.values()))) == 0: + celery_status = 'idle' + elif len(next(iter(active_tasks.values()))) > 0: + celery_status = 'busy' + + # return json + return JsonResponse({ + 'celery_status': celery_status, + 'livy_status': livy_status + }) #################################################################### @@ -426,60 +430,54 @@ def system_bg_status(request): #################################################################### def organizations(request): - - ''' + ''' View all Organizations ''' - # show organizations - if request.method == 'GET': - - logger.debug('retrieving organizations') - - # get all organizations - orgs = models.Organization.objects.exclude(for_analysis=True).all() - - # render page - return render(request, 'core/organizations.html', { - 'orgs':orgs, - 'breadcrumbs':breadcrumb_parser(request) - }) + # show organizations + if request.method == 'GET': + logger.debug('retrieving organizations') + # get all organizations + 
orgs = models.Organization.objects.exclude(for_analysis=True).all() - # create new organization - if request.method == 'POST': + # render page + return render(request, 'core/organizations.html', { + 'orgs': orgs, + 'breadcrumbs': breadcrumb_parser(request) + }) - # create new org - logger.debug(request.POST) - f = forms.OrganizationForm(request.POST) - new_org = f.save() + # create new organization + if request.method == 'POST': + # create new org + logger.debug(request.POST) + f = forms.OrganizationForm(request.POST) + new_org = f.save() - return redirect('organization', org_id=new_org.id) + return redirect('organization', org_id=new_org.id) def organization(request, org_id): - - ''' + ''' Details for Organization ''' - # get organization - org = models.Organization.objects.get(pk=org_id) + # get organization + org = models.Organization.objects.get(pk=org_id) - # get record groups for this organization - record_groups = models.RecordGroup.objects.filter(organization=org).exclude(for_analysis=True) + # get record groups for this organization + record_groups = models.RecordGroup.objects.filter(organization=org).exclude(for_analysis=True) - # render page - return render(request, 'core/organization.html', { - 'org':org, - 'record_groups':record_groups, - 'breadcrumbs':breadcrumb_parser(request) - }) + # render page + return render(request, 'core/organization.html', { + 'org': org, + 'record_groups': record_groups, + 'breadcrumbs': breadcrumb_parser(request) + }) def organization_delete(request, org_id): - - ''' + ''' Delete Organization Note: Through cascade deletes, would remove: - RecordGroup @@ -487,32 +485,31 @@ def organization_delete(request, org_id): - Record ''' - # get organization - org = models.Organization.objects.get(pk=org_id) - - # set job status to deleting - org.name = "%s (DELETING)" % org.name - org.save() + # get organization + org = models.Organization.objects.get(pk=org_id) - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Delete Organization: %s' % org.name, - task_type = 'delete_model_instance', - task_params_json = json.dumps({ - 'model':'Organization', - 'org_id':org.id - }) - ) - ct.save() + # set job status to deleting + org.name = "%s (DELETING)" % org.name + org.save() - # run celery task - bg_task = tasks.delete_model_instance.delay('Organization',org.id,) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Delete Organization: %s' % org.name, + task_type='delete_model_instance', + task_params_json=json.dumps({ + 'model': 'Organization', + 'org_id': org.id + }) + ) + ct.save() - return redirect('organizations') + # run celery task + bg_task = tasks.delete_model_instance.delay('Organization', org.id, ) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + return redirect('organizations') #################################################################### @@ -521,116 +518,109 @@ def organization_delete(request, org_id): @login_required def record_group_id_redirect(request, record_group_id): - - ''' + ''' Route to redirect to more verbose Record Group URL ''' - # get job - record_group = models.RecordGroup.objects.get(pk=record_group_id) + # get job + record_group = models.RecordGroup.objects.get(pk=record_group_id) - # redirect - return redirect('record_group', - org_id=record_group.organization.id, - record_group_id=record_group.id) + # redirect + return 
redirect('record_group', + org_id=record_group.organization.id, + record_group_id=record_group.id) def record_group_new(request, org_id): - - ''' + ''' Create new Record Group ''' - # create new organization - if request.method == 'POST': - - # create new record group - logger.debug(request.POST) - f = forms.RecordGroupForm(request.POST) - new_rg = f.save() - - # redirect to organization page - return redirect('record_group', org_id=org_id, record_group_id=new_rg.id) + # create new organization + if request.method == 'POST': + # create new record group + logger.debug(request.POST) + f = forms.RecordGroupForm(request.POST) + new_rg = f.save() + # redirect to organization page + return redirect('record_group', org_id=org_id, record_group_id=new_rg.id) def record_group_delete(request, org_id, record_group_id): - - ''' + ''' Create new Record Group ''' - # retrieve record group - record_group = models.RecordGroup.objects.get(pk=record_group_id) + # retrieve record group + record_group = models.RecordGroup.objects.get(pk=record_group_id) - # set job status to deleting - record_group.name = "%s (DELETING)" % record_group.name - record_group.save() + # set job status to deleting + record_group.name = "%s (DELETING)" % record_group.name + record_group.save() - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Delete RecordGroup: %s' % record_group.name, - task_type = 'delete_model_instance', - task_params_json = json.dumps({ - 'model':'RecordGroup', - 'record_group_id':record_group.id - }) - ) - ct.save() + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Delete RecordGroup: %s' % record_group.name, + task_type='delete_model_instance', + task_params_json=json.dumps({ + 'model': 'RecordGroup', + 'record_group_id': record_group.id + }) + ) + ct.save() - # run celery task - bg_task = tasks.delete_model_instance.delay('RecordGroup',record_group.id,) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() + # run celery task + bg_task = tasks.delete_model_instance.delay('RecordGroup', record_group.id, ) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() - # redirect to organization page - return redirect('organization', org_id=org_id) + # redirect to organization page + return redirect('organization', org_id=org_id) @login_required def record_group(request, org_id, record_group_id): - - ''' + ''' View information about a single record group, including any and all jobs run Args: record_group_id (str/int): PK for RecordGroup table ''' - logger.debug('retrieving record group ID: %s' % record_group_id) - - # retrieve record group - record_group = models.RecordGroup.objects.get(pk=int(record_group_id)) + logger.debug('retrieving record group ID: %s' % record_group_id) - # get all jobs associated with record group - jobs = models.Job.objects.filter(record_group=record_group_id) + # retrieve record group + record_group = models.RecordGroup.objects.get(pk=int(record_group_id)) - # get all currently applied publish set ids - publish_set_ids = models.PublishedRecords.get_publish_set_ids() + # get all jobs associated with record group + jobs = models.Job.objects.filter(record_group=record_group_id) - # loop through jobs - for job in jobs: + # get all currently applied publish set ids + publish_set_ids = models.PublishedRecords.get_publish_set_ids() - # update status - job.update_status() + # loop through jobs + for job in jobs: + # update status + job.update_status() - # get record 
group job lineage - job_lineage = record_group.get_jobs_lineage() + # get record group job lineage + job_lineage = record_group.get_jobs_lineage() - # get all record groups for this organization - record_groups = models.RecordGroup.objects.filter(organization=org_id).exclude(id=record_group_id).exclude(for_analysis=True) - - # render page - return render(request, 'core/record_group.html', { - 'record_group':record_group, - 'jobs':jobs, - 'job_lineage_json':json.dumps(job_lineage), - 'publish_set_ids':publish_set_ids, - 'record_groups':record_groups, - 'breadcrumbs':breadcrumb_parser(request) - }) + # get all record groups for this organization + record_groups = models.RecordGroup.objects.filter(organization=org_id).exclude(id=record_group_id).exclude( + for_analysis=True) + # render page + return render(request, 'core/record_group.html', { + 'record_group': record_group, + 'jobs': jobs, + 'job_lineage_json': json.dumps(job_lineage), + 'publish_set_ids': publish_set_ids, + 'record_groups': record_groups, + 'breadcrumbs': breadcrumb_parser(request) + }) #################################################################### @@ -640,1011 +630,982 @@ def record_group(request, org_id, record_group_id): @login_required def job_id_redirect(request, job_id): - - ''' + ''' Route to redirect to more verbose Jobs URL ''' - # get job - job = models.Job.objects.get(pk=job_id) + # get job + job = models.Job.objects.get(pk=job_id) - # redirect - return redirect('job_details', - org_id=job.record_group.organization.id, - record_group_id=job.record_group.id, - job_id=job.id) + # redirect + return redirect('job_details', + org_id=job.record_group.organization.id, + record_group_id=job.record_group.id, + job_id=job.id) @login_required def all_jobs(request): + # get all the record groups. + record_groups = models.RecordGroup.objects.exclude(for_analysis=True) - # get all the record groups. 
- record_groups = models.RecordGroup.objects.exclude(for_analysis=True) - - ''' + ''' View to show all jobs, across all Organizations, RecordGroups, and Job types GET Args: include_analysis: if true, include Analysis type jobs ''' - # capture include_analysis GET param if present - include_analysis = request.GET.get('include_analysis', False) + # capture include_analysis GET param if present + include_analysis = request.GET.get('include_analysis', False) - # get all jobs associated with record group - if include_analysis: - jobs = models.Job.objects.all() - else: - jobs = models.Job.objects.exclude(job_type='AnalysisJob').all() + # get all jobs associated with record group + if include_analysis: + jobs = models.Job.objects.all() + else: + jobs = models.Job.objects.exclude(job_type='AnalysisJob').all() - # get job lineage for all jobs - if include_analysis: - ld = models.Job.get_all_jobs_lineage(exclude_analysis_jobs=False) - else: - ld = models.Job.get_all_jobs_lineage(exclude_analysis_jobs=True) + # get job lineage for all jobs + if include_analysis: + ld = models.Job.get_all_jobs_lineage(exclude_analysis_jobs=False) + else: + ld = models.Job.get_all_jobs_lineage(exclude_analysis_jobs=True) - # loop through jobs and update status - for job in jobs: - job.update_status() + # loop through jobs and update status + for job in jobs: + job.update_status() - # render page - return render(request, 'core/all_jobs.html', { - 'jobs':jobs, - 'record_groups':record_groups, - 'job_lineage_json':json.dumps(ld), - 'breadcrumbs':breadcrumb_parser(request) - }) + # render page + return render(request, 'core/all_jobs.html', { + 'jobs': jobs, + 'record_groups': record_groups, + 'job_lineage_json': json.dumps(ld), + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def job_delete(request, org_id, record_group_id, job_id): - - logger.debug('deleting job by id: %s' % job_id) - - # get job - job = models.Job.objects.get(pk=job_id) - - # set job status to deleting - job.name = "%s (DELETING)" % job.name - job.deleted = True - job.status = 'deleting' - job.save() - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Delete Job: %s' % job.name, - task_type = 'delete_model_instance', - task_params_json = json.dumps({ - 'model':'Job', - 'job_id':job.id - }) - ) - ct.save() - - # run celery task - bg_task = tasks.delete_model_instance.delay('Job',job.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # redirect - return redirect(request.META.get('HTTP_REFERER')) + logger.debug('deleting job by id: %s' % job_id) + + # get job + job = models.Job.objects.get(pk=job_id) + + # set job status to deleting + job.name = "%s (DELETING)" % job.name + job.deleted = True + job.status = 'deleting' + job.save() + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Delete Job: %s' % job.name, + task_type='delete_model_instance', + task_params_json=json.dumps({ + 'model': 'Job', + 'job_id': job.id + }) + ) + ct.save() + + # run celery task + bg_task = tasks.delete_model_instance.delay('Job', job.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # redirect + return redirect(request.META.get('HTTP_REFERER')) @login_required def stop_jobs(request): + logger.debug('stopping jobs') - logger.debug('stopping jobs') - - job_ids = request.POST.getlist('job_ids[]') - logger.debug(job_ids) - - # get downstream toggle - downstream_toggle = request.POST.get('downstream_stop_toggle', 
False); - if downstream_toggle == 'true': - downstream_toggle = True - elif downstream_toggle == 'false': - downstream_toggle = False + job_ids = request.POST.getlist('job_ids[]') + logger.debug(job_ids) - # set of jobs to rerun - job_stop_set = set() + # get downstream toggle + downstream_toggle = request.POST.get('downstream_stop_toggle', False); + if downstream_toggle == 'true': + downstream_toggle = True + elif downstream_toggle == 'false': + downstream_toggle = False - # loop through job_ids - for job_id in job_ids: + # set of jobs to rerun + job_stop_set = set() - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # loop through job_ids + for job_id in job_ids: - # if including downstream - if downstream_toggle: + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # add rerun lineage for this job to set - job_stop_set.update(cjob.job.get_downstream_jobs()) + # if including downstream + if downstream_toggle: - # else, just job - else: + # add rerun lineage for this job to set + job_stop_set.update(cjob.job.get_downstream_jobs()) - job_stop_set.add(cjob.job) + # else, just job + else: - # sort and run - ordered_job_delete_set = sorted(list(job_stop_set), key=lambda j: j.id) + job_stop_set.add(cjob.job) - # # loop through and update visible elements of Job for front-end - for job in ordered_job_delete_set: + # sort and run + ordered_job_delete_set = sorted(list(job_stop_set), key=lambda j: j.id) - logger.debug('stopping Job: %s' % job) + # # loop through and update visible elements of Job for front-end + for job in ordered_job_delete_set: + logger.debug('stopping Job: %s' % job) - # stop job - job.stop_job() + # stop job + job.stop_job() - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'
<p><strong>Stopped Job(s):</strong><br>%s</p>' % ('<br>
'.join([j.name for j in ordered_job_delete_set ])), - 'class':'danger' - }) + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '
<p><strong>Stopped Job(s):</strong><br>%s</p>' % ( + '<br>
'.join([j.name for j in ordered_job_delete_set])), + 'class': 'danger' + }) - # return - return JsonResponse({'results':True}) + # return + return JsonResponse({'results': True}) @login_required def delete_jobs(request): + logger.debug('deleting jobs') - logger.debug('deleting jobs') + job_ids = request.POST.getlist('job_ids[]') + logger.debug(job_ids) - job_ids = request.POST.getlist('job_ids[]') - logger.debug(job_ids) + # get downstream toggle + downstream_toggle = request.POST.get('downstream_delete_toggle', False); + if downstream_toggle == 'true': + downstream_toggle = True + elif downstream_toggle == 'false': + downstream_toggle = False - # get downstream toggle - downstream_toggle = request.POST.get('downstream_delete_toggle', False); - if downstream_toggle == 'true': - downstream_toggle = True - elif downstream_toggle == 'false': - downstream_toggle = False + # set of jobs to rerun + job_delete_set = set() - # set of jobs to rerun - job_delete_set = set() + # loop through job_ids + for job_id in job_ids: - # loop through job_ids - for job_id in job_ids: + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # if including downstream + if downstream_toggle: - # if including downstream - if downstream_toggle: + # add rerun lineage for this job to set + job_delete_set.update(cjob.job.get_downstream_jobs()) - # add rerun lineage for this job to set - job_delete_set.update(cjob.job.get_downstream_jobs()) + # else, just job + else: - # else, just job - else: + job_delete_set.add(cjob.job) - job_delete_set.add(cjob.job) + # sort and run + ordered_job_delete_set = sorted(list(job_delete_set), key=lambda j: j.id) - # sort and run - ordered_job_delete_set = sorted(list(job_delete_set), key=lambda j: j.id) + # # loop through and update visible elements of Job for front-end + for job in ordered_job_delete_set: + logger.debug('deleting Job: %s' % job) - # # loop through and update visible elements of Job for front-end - for job in ordered_job_delete_set: + # set job status to deleting + job.name = "%s (DELETING)" % job.name + job.deleted = True + job.status = 'deleting' + job.save() - logger.debug('deleting Job: %s' % job) + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Delete Job: #%s' % job.name, + task_type='delete_model_instance', + task_params_json=json.dumps({ + 'model': 'Job', + 'job_id': job.id + }) + ) + ct.save() - # set job status to deleting - job.name = "%s (DELETING)" % job.name - job.deleted = True - job.status = 'deleting' - job.save() + # run celery task + bg_task = tasks.delete_model_instance.delay('Job', job.id, ) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Delete Job: #%s' % job.name, - task_type = 'delete_model_instance', - task_params_json = json.dumps({ - 'model':'Job', - 'job_id':job.id - }) - ) - ct.save() + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '
<p><strong>Deleting Job(s):</strong><br>%s</p><p>Refresh this page to update status of removing Jobs.</p>' % ( + '<br>
'.join([j.name for j in ordered_job_delete_set])), + 'class': 'danger' + }) - # run celery task - bg_task = tasks.delete_model_instance.delay('Job',job.id,) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Deleting Job(s): %s Refresh this page to update status of removing Jobs.
' % ('
'.join([j.name for j in ordered_job_delete_set ])), - 'class':'danger' - }) - - # return - return JsonResponse({'results':True}) + # return + return JsonResponse({'results': True}) @login_required def move_jobs(request): + logger.debug('moving jobs') - logger.debug('moving jobs') - - job_ids = request.POST.getlist('job_ids[]') - record_group_id = request.POST.getlist('record_group_id')[0] + job_ids = request.POST.getlist('job_ids[]') + record_group_id = request.POST.getlist('record_group_id')[0] - # get downstream toggle - downstream_toggle = request.POST.get('downstream_move_toggle', False); - if downstream_toggle == 'true': - downstream_toggle = True - elif downstream_toggle == 'false': - downstream_toggle = False + # get downstream toggle + downstream_toggle = request.POST.get('downstream_move_toggle', False); + if downstream_toggle == 'true': + downstream_toggle = True + elif downstream_toggle == 'false': + downstream_toggle = False - # set of jobs to rerun - job_move_set = set() + # set of jobs to rerun + job_move_set = set() - # loop through job_ids - for job_id in job_ids: + # loop through job_ids + for job_id in job_ids: - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # if including downstream - if downstream_toggle: + # if including downstream + if downstream_toggle: - # add rerun lineage for this job to set - job_move_set.update(cjob.job.get_downstream_jobs()) + # add rerun lineage for this job to set + job_move_set.update(cjob.job.get_downstream_jobs()) - # else, just job - else: + # else, just job + else: - job_move_set.add(cjob.job) + job_move_set.add(cjob.job) - # sort and run - ordered_job_move_set = sorted(list(job_move_set), key=lambda j: j.id) + # sort and run + ordered_job_move_set = sorted(list(job_move_set), key=lambda j: j.id) - # loop through jobs - for job in ordered_job_move_set: + # loop through jobs + for job in ordered_job_move_set: + logger.debug('moving Job: %s' % job) - logger.debug('moving Job: %s' % job) + new_record_group = models.RecordGroup.objects.get(pk=record_group_id) + job.record_group = new_record_group + job.save() - new_record_group = models.RecordGroup.objects.get(pk=record_group_id) - job.record_group = new_record_group - job.save() + logger.debug('Job %s has been moved' % job) - logger.debug('Job %s has been moved' % job) - - # redirect - return JsonResponse({'results':True}) + # redirect + return JsonResponse({'results': True}) @login_required def job_details(request, org_id, record_group_id, job_id): - - logger.debug('details for job id: %s' % job_id) - - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) - - # update status - cjob.job.update_status() - - # detailed record count - record_count_details = cjob.job.get_detailed_job_record_count() - - # get job lineage - job_lineage = cjob.job.get_lineage() - - # get dpla_bulk_data_match - dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches() - - # check if limiting to one, pre-existing record - q = request.GET.get('q', None) - - # job details and job type specific augment - job_details = cjob.job.job_details_dict - - # mapped field analysis, generate if not part of job_details - if 'mapped_field_analysis' in job_details.keys(): - field_counts = job_details['mapped_field_analysis'] - else: - if cjob.job.finished: - field_counts = cjob.count_indexed_fields() - cjob.job.update_job_details({'mapped_field_analysis':field_counts}, save=True) - else: - logger.debug('job not finished, 
not setting') - field_counts = {} - - # OAI Harvest - if type(cjob) == models.HarvestOAIJob: - pass - - # Static Harvest - elif type(cjob) == models.HarvestStaticXMLJob: - pass - - # Transform - elif type(cjob) == models.TransformJob: - pass - - # Merge/Duplicate - elif type(cjob) == models.MergeJob: - pass - - # Analysis - elif type(cjob) == models.AnalysisJob: - pass - - # get published records, primarily for published sets - pr = models.PublishedRecords() - - # get published subsets with PublishedRecords static method - published_subsets = models.PublishedRecords.get_subsets() - - # loop through subsets and enrich - for _ in published_subsets: - - # add counts - counts = mc_handle.combine.misc.find_one({'_id':'published_field_counts_%s' % _['name']}) - - # if counts not yet calculated, do now - if counts == None: - counts = models.PublishedRecords(subset=_['name']).count_indexed_fields() - _['counts'] = counts - - # get field mappers - field_mappers = models.FieldMapper.objects.all() - - # return - return render(request, 'core/job_details.html', { - 'cjob':cjob, - 'record_group':cjob.job.record_group, - 'record_count_details':record_count_details, - 'field_counts':field_counts, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'job_lineage_json':json.dumps(job_lineage), - 'dpla_bulk_data_matches':dpla_bulk_data_matches, - 'q':q, - 'job_details':job_details, - 'pr':pr, - 'published_subsets':published_subsets, - 'es_index_str':cjob.esi.es_index_str, - 'breadcrumbs':breadcrumb_parser(request) - }) + logger.debug('details for job id: %s' % job_id) + + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) + + # update status + cjob.job.update_status() + + # detailed record count + record_count_details = cjob.job.get_detailed_job_record_count() + + # get job lineage + job_lineage = cjob.job.get_lineage() + + # get dpla_bulk_data_match + dpla_bulk_data_matches = cjob.job.get_dpla_bulk_data_matches() + + # check if limiting to one, pre-existing record + q = request.GET.get('q', None) + + # job details and job type specific augment + job_details = cjob.job.job_details_dict + + # mapped field analysis, generate if not part of job_details + if 'mapped_field_analysis' in job_details.keys(): + field_counts = job_details['mapped_field_analysis'] + else: + if cjob.job.finished: + field_counts = cjob.count_indexed_fields() + cjob.job.update_job_details({'mapped_field_analysis': field_counts}, save=True) + else: + logger.debug('job not finished, not setting') + field_counts = {} + + # OAI Harvest + if type(cjob) == models.HarvestOAIJob: + pass + + # Static Harvest + elif type(cjob) == models.HarvestStaticXMLJob: + pass + + # Transform + elif type(cjob) == models.TransformJob: + pass + + # Merge/Duplicate + elif type(cjob) == models.MergeJob: + pass + + # Analysis + elif type(cjob) == models.AnalysisJob: + pass + + # get published records, primarily for published sets + pr = models.PublishedRecords() + + # get published subsets with PublishedRecords static method + published_subsets = models.PublishedRecords.get_subsets() + + # loop through subsets and enrich + for _ in published_subsets: + + # add counts + counts = mc_handle.combine.misc.find_one({'_id': 'published_field_counts_%s' % _['name']}) + + # if counts not yet calculated, do now + if counts == None: + counts = models.PublishedRecords(subset=_['name']).count_indexed_fields() + _['counts'] = counts + + # get field mappers + field_mappers = models.FieldMapper.objects.all() + + # return + return render(request, 
'core/job_details.html', { + 'cjob': cjob, + 'record_group': cjob.job.record_group, + 'record_count_details': record_count_details, + 'field_counts': field_counts, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'job_lineage_json': json.dumps(job_lineage), + 'dpla_bulk_data_matches': dpla_bulk_data_matches, + 'q': q, + 'job_details': job_details, + 'pr': pr, + 'published_subsets': published_subsets, + 'es_index_str': cjob.esi.es_index_str, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def job_errors(request, org_id, record_group_id, job_id): + logger.debug('retrieving errors for job id: %s' % job_id) - logger.debug('retrieving errors for job id: %s' % job_id) - - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - job_errors = cjob.get_job_errors() + job_errors = cjob.get_job_errors() - # return - return render(request, 'core/job_errors.html', { - 'cjob':cjob, - 'job_errors':job_errors, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/job_errors.html', { + 'cjob': cjob, + 'job_errors': job_errors, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def job_update_note(request, org_id, record_group_id, job_id): + if request.method == 'POST': - if request.method == 'POST': + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # get job note + job_note = request.POST.get('job_note') + if job_note == '': + job_note = None - # get job note - job_note = request.POST.get('job_note') - if job_note == '': - job_note = None + # update job note + cjob.job.note = job_note + cjob.job.save() - # update job note - cjob.job.note = job_note - cjob.job.save() - - # redirect - return redirect(request.META.get('HTTP_REFERER')) + # redirect + return redirect(request.META.get('HTTP_REFERER')) @login_required def job_update_name(request, org_id, record_group_id, job_id): + if request.method == 'POST': - if request.method == 'POST': - - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get job note - job_name = request.POST.get('job_name') - if job_name == '': - job_name = None + # get job note + job_name = request.POST.get('job_name') + if job_name == '': + job_name = None - # update job note - cjob.job.name = job_name - cjob.job.save() + # update job note + cjob.job.name = job_name + cjob.job.save() - # redirect - return redirect(request.META.get('HTTP_REFERER')) + # redirect + return redirect(request.META.get('HTTP_REFERER')) @login_required def job_publish(request, org_id, record_group_id, job_id): + logger.debug(request.POST) - logger.debug(request.POST) - - # capture entered publish set id - publish_set_id = request.POST.get('publish_set_id', None) + # capture entered publish set id + publish_set_id = request.POST.get('publish_set_id', None) - # override with pre-existing publish set id is selected - if request.POST.get('existing_publish_set_id', None) != None: - publish_set_id = request.POST.get('existing_publish_set_id') + # override with pre-existing publish set id is selected + if request.POST.get('existing_publish_set_id', None) != None: + publish_set_id = request.POST.get('existing_publish_set_id') - # get published subsets to include in - published_subsets = request.POST.getlist('published_subsets', []) + # get published subsets to include 
in + published_subsets = request.POST.getlist('published_subsets', []) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # init publish - bg_task = cjob.publish_bg_task( - publish_set_id=publish_set_id, - in_published_subsets=published_subsets) + # init publish + bg_task = cjob.publish_bg_task( + publish_set_id=publish_set_id, + in_published_subsets=published_subsets) - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Publishing Job: %s Publish Set ID: %s
' % (cjob.job.name, publish_set_id, reverse('published')), - 'class':'success' - }) + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Publishing Job: %s Publish Set ID: %s
' % ( + cjob.job.name, publish_set_id, reverse('published')), + 'class': 'success' + }) - return redirect('record_group', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id) + return redirect('record_group', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id) @login_required def job_unpublish(request, org_id, record_group_id, job_id): + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # init unpublish + bg_task = cjob.unpublish_bg_task() - # init unpublish - bg_task = cjob.unpublish_bg_task() + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Unpublishing Job: %s
' % ( + cjob.job.name, reverse('published')), + 'class': 'success' + }) - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Unpublishing Job: %s
' % (cjob.job.name, reverse('published')), - 'class':'success' - }) - - return redirect('record_group', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id) + return redirect('record_group', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id) @login_required def rerun_jobs(request): - - logger.debug('re-running jobs') - - # get job ids - job_ids = request.POST.getlist('job_ids[]') - - # get downstream toggle - downstream_toggle = request.POST.get('downstream_rerun_toggle', False); - if downstream_toggle == 'true': - downstream_toggle = True - elif downstream_toggle == 'false': - downstream_toggle = False - - # set of jobs to rerun - job_rerun_set = set() - - # loop through job_ids - for job_id in job_ids: - - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) - - # if including downstream - if downstream_toggle: - - # add rerun lineage for this job to set - job_rerun_set.update(cjob.job.get_downstream_jobs()) - - # else, just job - else: - - job_rerun_set.add(cjob.job) - - # sort and run - ordered_job_rerun_set = sorted(list(job_rerun_set), key=lambda j: j.id) - - # # loop through and update visible elements of Job for front-end - for re_job in ordered_job_rerun_set: - - re_job.timestamp = datetime.datetime.now() - re_job.status = 'initializing' - re_job.record_count = 0 - re_job.finished = False - re_job.elapsed = 0 - re_job.deleted = True - re_job.save() - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = "Rerun Jobs Prep", - task_type = 'rerun_jobs_prep', - task_params_json = json.dumps({ - 'ordered_job_rerun_set':[j.id for j in ordered_job_rerun_set] - }) - ) - ct.save() - - # run celery task - bg_task = tasks.rerun_jobs_prep.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'Preparing to Rerun Job(s):
%s Refresh this page to update status of Jobs rerunning. ' % '
'.join([str(j.name) for j in ordered_job_rerun_set]), - 'class':'success' - }) - - # return, as requested via Ajax which will reload page - return JsonResponse({'results':True}) + logger.debug('re-running jobs') + + # get job ids + job_ids = request.POST.getlist('job_ids[]') + + # get downstream toggle + downstream_toggle = request.POST.get('downstream_rerun_toggle', False); + if downstream_toggle == 'true': + downstream_toggle = True + elif downstream_toggle == 'false': + downstream_toggle = False + + # set of jobs to rerun + job_rerun_set = set() + + # loop through job_ids + for job_id in job_ids: + + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) + + # if including downstream + if downstream_toggle: + + # add rerun lineage for this job to set + job_rerun_set.update(cjob.job.get_downstream_jobs()) + + # else, just job + else: + + job_rerun_set.add(cjob.job) + + # sort and run + ordered_job_rerun_set = sorted(list(job_rerun_set), key=lambda j: j.id) + + # # loop through and update visible elements of Job for front-end + for re_job in ordered_job_rerun_set: + re_job.timestamp = datetime.datetime.now() + re_job.status = 'initializing' + re_job.record_count = 0 + re_job.finished = False + re_job.elapsed = 0 + re_job.deleted = True + re_job.save() + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name="Rerun Jobs Prep", + task_type='rerun_jobs_prep', + task_params_json=json.dumps({ + 'ordered_job_rerun_set': [j.id for j in ordered_job_rerun_set] + }) + ) + ct.save() + + # run celery task + bg_task = tasks.rerun_jobs_prep.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': 'Preparing to Rerun Job(s):
%s Refresh this page to update status of Jobs rerunning. ' % '
'.join( + [str(j.name) for j in ordered_job_rerun_set]), + 'class': 'success' + }) + + # return, as requested via Ajax which will reload page + return JsonResponse({'results': True}) @login_required def clone_jobs(request): - - logger.debug('cloning jobs') - - job_ids = request.POST.getlist('job_ids[]') - - # get downstream toggle - downstream_toggle = request.POST.get('downstream_clone_toggle', False); - if downstream_toggle == 'true': - downstream_toggle = True - elif downstream_toggle == 'false': - downstream_toggle = False - - # get rerun toggle - rerun_on_clone = request.POST.get('rerun_on_clone', False); - if rerun_on_clone == 'true': - rerun_on_clone = True - elif rerun_on_clone == 'false': - rerun_on_clone = False - - # set of jobs to rerun - job_clone_set = set() - - # loop through job_ids and add - for job_id in job_ids: - cjob = models.CombineJob.get_combine_job(job_id) - job_clone_set.add(cjob.job) - - # sort and run - ordered_job_clone_set = sorted(list(job_clone_set), key=lambda j: j.id) - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = "Clone Jobs", - task_type = 'clone_jobs', - task_params_json = json.dumps({ - 'ordered_job_clone_set':[j.id for j in ordered_job_clone_set], - 'downstream_toggle':downstream_toggle, - 'rerun_on_clone':rerun_on_clone - }) - ) - ct.save() - - # run celery task - bg_task = tasks.clone_jobs.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'Cloning Job(s):
%s Including downstream? %s Refresh this page to update status of Jobs cloning. ' % ('
'.join([str(j.name) for j in ordered_job_clone_set]), downstream_toggle), - 'class':'success' - }) - - # return, as requested via Ajax which will reload page - return JsonResponse({'results':True}) + logger.debug('cloning jobs') + + job_ids = request.POST.getlist('job_ids[]') + + # get downstream toggle + downstream_toggle = request.POST.get('downstream_clone_toggle', False); + if downstream_toggle == 'true': + downstream_toggle = True + elif downstream_toggle == 'false': + downstream_toggle = False + + # get rerun toggle + rerun_on_clone = request.POST.get('rerun_on_clone', False); + if rerun_on_clone == 'true': + rerun_on_clone = True + elif rerun_on_clone == 'false': + rerun_on_clone = False + + # set of jobs to rerun + job_clone_set = set() + + # loop through job_ids and add + for job_id in job_ids: + cjob = models.CombineJob.get_combine_job(job_id) + job_clone_set.add(cjob.job) + + # sort and run + ordered_job_clone_set = sorted(list(job_clone_set), key=lambda j: j.id) + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name="Clone Jobs", + task_type='clone_jobs', + task_params_json=json.dumps({ + 'ordered_job_clone_set': [j.id for j in ordered_job_clone_set], + 'downstream_toggle': downstream_toggle, + 'rerun_on_clone': rerun_on_clone + }) + ) + ct.save() + + # run celery task + bg_task = tasks.clone_jobs.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': 'Cloning Job(s):
%s Including downstream? %s Refresh this page to update status of Jobs cloning. ' % (
'.join([str(j.name) for j in ordered_job_clone_set]), downstream_toggle), + 'class': 'success' + }) + + # return, as requested via Ajax which will reload page + return JsonResponse({'results': True}) @login_required def job_parameters(request, org_id, record_group_id, job_id): + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) - - # if GET, return JSON - if request.method == 'GET': + # if GET, return JSON + if request.method == 'GET': + # return + return JsonResponse(cjob.job.job_details_dict) - # return - return JsonResponse(cjob.job.job_details_dict) + # if POST, udpate + if request.method == 'POST': + # get job_details as JSON + job_details_json = request.POST.get('job_details_json', None) - # if POST, udpate - if request.method == 'POST': - - # get job_details as JSON - job_details_json = request.POST.get('job_details_json', None) - - if job_details_json != None: - - cjob.job.job_details = job_details_json - cjob.job.save() - - return JsonResponse({"msg":"Job Parameters updated!"}) + if job_details_json != None: + cjob.job.job_details = job_details_json + cjob.job.save() + return JsonResponse({"msg": "Job Parameters updated!"}) @login_required def job_harvest_oai(request, org_id, record_group_id): - - ''' + ''' Create a new OAI Harvest Job ''' - # retrieve record group - record_group = models.RecordGroup.objects.filter(id=record_group_id).first() - - # if GET, prepare form - if request.method == 'GET': + # retrieve record group + record_group = models.RecordGroup.objects.filter(id=record_group_id).first() - # retrieve all OAI endoints - oai_endpoints = models.OAIEndpoint.objects.all() + # if GET, prepare form + if request.method == 'GET': + # retrieve all OAI endoints + oai_endpoints = models.OAIEndpoint.objects.all() - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() - # render page - return render(request, 'core/job_harvest_oai.html', { - 'record_group':record_group, - 'oai_endpoints':oai_endpoints, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'bulk_downloads':bulk_downloads, - 'breadcrumbs':breadcrumb_parser(request) - }) + # render page + return render(request, 'core/job_harvest_oai.html', { + 'record_group': record_group, + 'oai_endpoints': oai_endpoints, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'bulk_downloads': bulk_downloads, + 'breadcrumbs': breadcrumb_parser(request) + }) - # if POST, submit job - if request.method == 'POST': + # if POST, submit job + if request.method == 'POST': - cjob = models.CombineJob.init_combine_job( - user = request.user, - record_group = record_group, - job_type_class = models.HarvestOAIJob, - 
job_params = request.POST - ) + cjob = models.CombineJob.init_combine_job( + user=request.user, + record_group=record_group, + job_type_class=models.HarvestOAIJob, + job_params=request.POST + ) - # start job and update status - job_status = cjob.start_job() + # start job and update status + job_status = cjob.start_job() - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() - return redirect('record_group', org_id=org_id, record_group_id=record_group.id) + return redirect('record_group', org_id=org_id, record_group_id=record_group.id) @login_required def job_harvest_static_xml(request, org_id, record_group_id, hash_payload_filename=False): - - ''' + ''' Create a new static XML Harvest Job ''' - # retrieve record group - record_group = models.RecordGroup.objects.filter(id=record_group_id).first() - - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() - - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # retrieve record group + record_group = models.RecordGroup.objects.filter(id=record_group_id).first() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # if GET, prepare form - if request.method == 'GET': + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # render page - return render(request, 'core/job_harvest_static_xml.html', { - 'record_group':record_group, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'bulk_downloads':bulk_downloads, - 'breadcrumbs':breadcrumb_parser(request) - }) + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() + # if GET, prepare form + if request.method == 'GET': + # render page + return render(request, 'core/job_harvest_static_xml.html', { + 'record_group': record_group, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'bulk_downloads': bulk_downloads, + 'breadcrumbs': breadcrumb_parser(request) + }) - # if POST, submit job - if request.method == 'POST': + # if POST, submit job + if request.method == 'POST': - cjob = models.CombineJob.init_combine_job( - user = request.user, - record_group = record_group, - job_type_class = models.HarvestStaticXMLJob, - job_params = request.POST, - files = request.FILES, - hash_payload_filename = hash_payload_filename - ) + cjob = models.CombineJob.init_combine_job( + user=request.user, + record_group=record_group, + job_type_class=models.HarvestStaticXMLJob, + job_params=request.POST, + files=request.FILES, + hash_payload_filename=hash_payload_filename + ) - # start job and update status - job_status = cjob.start_job() + # start job and update status + job_status = cjob.start_job() - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() + # if job_status is absent, report job status as failed 
+ if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() - return redirect('record_group', org_id=org_id, record_group_id=record_group.id) + return redirect('record_group', org_id=org_id, record_group_id=record_group.id) @login_required def job_harvest_tabular_data(request, org_id, record_group_id, hash_payload_filename=False): - - ''' + ''' Create a new static XML Harvest Job ''' - # retrieve record group - record_group = models.RecordGroup.objects.filter(id=record_group_id).first() - - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + # retrieve record group + record_group = models.RecordGroup.objects.filter(id=record_group_id).first() - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # if GET, prepare form - if request.method == 'GET': + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() - # render page - return render(request, 'core/job_harvest_tabular_data.html', { - 'record_group':record_group, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'bulk_downloads':bulk_downloads, - 'breadcrumbs':breadcrumb_parser(request) - }) + # if GET, prepare form + if request.method == 'GET': + # render page + return render(request, 'core/job_harvest_tabular_data.html', { + 'record_group': record_group, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'bulk_downloads': bulk_downloads, + 'breadcrumbs': breadcrumb_parser(request) + }) + # if POST, submit job + if request.method == 'POST': - # if POST, submit job - if request.method == 'POST': + cjob = models.CombineJob.init_combine_job( + user=request.user, + record_group=record_group, + job_type_class=models.HarvestTabularDataJob, + job_params=request.POST, + files=request.FILES, + hash_payload_filename=hash_payload_filename + ) - cjob = models.CombineJob.init_combine_job( - user = request.user, - record_group = record_group, - job_type_class = models.HarvestTabularDataJob, - job_params = request.POST, - files = request.FILES, - hash_payload_filename = hash_payload_filename - ) + # start job and update status + job_status = cjob.start_job() - # start job and update status - job_status = cjob.start_job() + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() - - return redirect('record_group', org_id=org_id, record_group_id=record_group.id) + return redirect('record_group', org_id=org_id, record_group_id=record_group.id) @login_required def job_transform(request, org_id, record_group_id): - - ''' + ''' Create a new Transform Job ''' - # retrieve record group - record_group = models.RecordGroup.objects.filter(id=record_group_id).first() + # retrieve record group + record_group = 
models.RecordGroup.objects.filter(id=record_group_id).first() - # if GET, prepare form - if request.method == 'GET': + # if GET, prepare form + if request.method == 'GET': - # get scope of input jobs and retrieve - input_job_scope = request.GET.get('scope', None) + # get scope of input jobs and retrieve + input_job_scope = request.GET.get('scope', None) - # if all jobs, retrieve all jobs - if input_job_scope == 'all_jobs': - input_jobs = models.Job.objects.exclude(job_type='AnalysisJob').all() + # if all jobs, retrieve all jobs + if input_job_scope == 'all_jobs': + input_jobs = models.Job.objects.exclude(job_type='AnalysisJob').all() - # else, limit to RecordGroup - else: - input_jobs = record_group.job_set.all() + # else, limit to RecordGroup + else: + input_jobs = record_group.job_set.all() - # get all transformation scenarios - transformations = models.Transformation.objects.filter(use_as_include=False) + # get all transformation scenarios + transformations = models.Transformation.objects.filter(use_as_include=False) - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # get job lineage for all jobs (filtered to input jobs scope) - ld = models.Job.get_all_jobs_lineage(jobs_query_set=input_jobs) + # get job lineage for all jobs (filtered to input jobs scope) + ld = models.Job.get_all_jobs_lineage(jobs_query_set=input_jobs) - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() - # render page - return render(request, 'core/job_transform.html', { - 'record_group':record_group, - 'input_jobs':input_jobs, - 'input_job_scope':input_job_scope, - 'transformations':transformations, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'job_lineage_json':json.dumps(ld), - 'bulk_downloads':bulk_downloads, - 'breadcrumbs':breadcrumb_parser(request) - }) + # render page + return render(request, 'core/job_transform.html', { + 'record_group': record_group, + 'input_jobs': input_jobs, + 'input_job_scope': input_job_scope, + 'transformations': transformations, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'job_lineage_json': json.dumps(ld), + 'bulk_downloads': bulk_downloads, + 'breadcrumbs': breadcrumb_parser(request) + }) - # if POST, submit job - if request.method == 'POST': + # if POST, submit job + if request.method == 'POST': - cjob = models.CombineJob.init_combine_job( - user = request.user, - record_group = record_group, - job_type_class = models.TransformJob, - job_params = request.POST) + cjob = models.CombineJob.init_combine_job( + user=request.user, + record_group=record_group, + job_type_class=models.TransformJob, + job_params=request.POST) - # start job and update status - job_status = cjob.start_job() + # start job and update status + job_status = cjob.start_job() - # if job_status is 
absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() - return redirect('record_group', org_id=org_id, record_group_id=record_group.id) + return redirect('record_group', org_id=org_id, record_group_id=record_group.id) @login_required def job_merge(request, org_id, record_group_id): - - ''' + ''' Merge multiple jobs into a single job ''' - # retrieve record group - record_group = models.RecordGroup.objects.get(pk=record_group_id) + # retrieve record group + record_group = models.RecordGroup.objects.get(pk=record_group_id) - # if GET, prepare form - if request.method == 'GET': + # if GET, prepare form + if request.method == 'GET': - # get scope of input jobs and retrieve - input_job_scope = request.GET.get('scope', None) + # get scope of input jobs and retrieve + input_job_scope = request.GET.get('scope', None) - # if all jobs, retrieve all jobs - if input_job_scope == 'all_jobs': - input_jobs = models.Job.objects.exclude(job_type='AnalysisJob').all() + # if all jobs, retrieve all jobs + if input_job_scope == 'all_jobs': + input_jobs = models.Job.objects.exclude(job_type='AnalysisJob').all() - # else, limit to RecordGroup - else: - input_jobs = record_group.job_set.all() + # else, limit to RecordGroup + else: + input_jobs = record_group.job_set.all() - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get job lineage for all jobs (filtered to input jobs scope) - ld = models.Job.get_all_jobs_lineage(jobs_query_set=input_jobs) + # get job lineage for all jobs (filtered to input jobs scope) + ld = models.Job.get_all_jobs_lineage(jobs_query_set=input_jobs) - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() - # render page - return render(request, 'core/job_merge.html', { - 'job_select_type':'multiple', - 'record_group':record_group, - 'input_jobs':input_jobs, - 'input_job_scope':input_job_scope, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'job_lineage_json':json.dumps(ld), - 'bulk_downloads':bulk_downloads, - 'breadcrumbs':breadcrumb_parser(request) - }) + # render page + return render(request, 'core/job_merge.html', { + 'job_select_type': 'multiple', + 'record_group': record_group, + 'input_jobs': input_jobs, + 'input_job_scope': input_job_scope, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'job_lineage_json': json.dumps(ld), + 'bulk_downloads': bulk_downloads, + 'breadcrumbs': breadcrumb_parser(request) + }) - # if POST, submit job - if request.method == 'POST': + # if POST, submit job + if request.method == 'POST': - cjob = models.CombineJob.init_combine_job( - user = request.user, - 
record_group = record_group, - job_type_class = models.MergeJob, - job_params = request.POST) + cjob = models.CombineJob.init_combine_job( + user=request.user, + record_group=record_group, + job_type_class=models.MergeJob, + job_params=request.POST) - # start job and update status - job_status = cjob.start_job() + # start job and update status + job_status = cjob.start_job() - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() - return redirect('record_group', org_id=org_id, record_group_id=record_group.id) + return redirect('record_group', org_id=org_id, record_group_id=record_group.id) def job_lineage_json(request, org_id, record_group_id, job_id): - - ''' + ''' Return job lineage as JSON ''' - # get job - job = models.Job.objects.get(pk=int(job_id)) - - # get lineage - job_lineage = job.get_lineage() + # get job + job = models.Job.objects.get(pk=int(job_id)) - return JsonResponse({ - 'job_id_list':[ node['id'] for node in job_lineage['nodes'] ], - 'nodes':job_lineage['nodes'], - 'edges':job_lineage['edges'] - }) + # get lineage + job_lineage = job.get_lineage() + return JsonResponse({ + 'job_id_list': [node['id'] for node in job_lineage['nodes']], + 'nodes': job_lineage['nodes'], + 'edges': job_lineage['edges'] + }) #################################################################### @@ -1653,222 +1614,220 @@ def job_lineage_json(request, org_id, record_group_id, job_id): @login_required def job_reports_create_validation(request, org_id, record_group_id, job_id): - - ''' + ''' Generate job report based on validation results ''' - # retrieve job - cjob = models.CombineJob.get_combine_job(int(job_id)) - - # if GET, prepare form - if request.method == 'GET': - - # mapped field analysis, generate if not part of job_details - if 'mapped_field_analysis' in cjob.job.job_details_dict.keys(): - field_counts = cjob.job.job_details_dict['mapped_field_analysis'] - else: - if cjob.job.finished: - field_counts = cjob.count_indexed_fields() - cjob.job.update_job_details({'mapped_field_analysis':field_counts}, save=True) - else: - logger.debug('job not finished, not setting') - field_counts = {} - - # render page - return render(request, 'core/job_reports_create_validation.html', { - 'cjob':cjob, - 'field_counts':field_counts, - 'breadcrumbs':breadcrumb_parser(request) - }) - - # if POST, generate report - if request.method == 'POST': - - # get job name for Combine Task - report_name = request.POST.get('report_name') - if report_name == '': - report_name = 'j_%s_validation_report' % cjob.job.id - combine_task_name = "Validation Report: %s" % cjob.job.name - else: - combine_task_name = "Validation Report: %s" % report_name - - # handle POST params and save as Combine task params - task_params = { - 'job_id':cjob.job.id, - 'report_name':report_name, - 'report_format':request.POST.get('report_format'), - 'compression_type':request.POST.get('compression_type'), - 'validation_scenarios':request.POST.getlist('validation_scenario', []), - 'mapped_field_include':request.POST.getlist('mapped_field_include', []) - } - - # cast to int - task_params['validation_scenarios'] = [int(vs_id) for vs_id in task_params['validation_scenarios']] - - # remove select, reserved fields if in mapped field request - task_params['mapped_field_include'] = [ f for f in task_params['mapped_field_include'] if f not in 
['record_id','db_id','oid','_id']] - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = combine_task_name, - task_type = 'validation_report', - task_params_json = json.dumps(task_params) - ) - ct.save() - - # run celery task - bg_task = tasks.create_validation_report.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # redirect to Background Tasks - return redirect('bg_tasks') + # retrieve job + cjob = models.CombineJob.get_combine_job(int(job_id)) + + # if GET, prepare form + if request.method == 'GET': + + # mapped field analysis, generate if not part of job_details + if 'mapped_field_analysis' in cjob.job.job_details_dict.keys(): + field_counts = cjob.job.job_details_dict['mapped_field_analysis'] + else: + if cjob.job.finished: + field_counts = cjob.count_indexed_fields() + cjob.job.update_job_details({'mapped_field_analysis': field_counts}, save=True) + else: + logger.debug('job not finished, not setting') + field_counts = {} + + # render page + return render(request, 'core/job_reports_create_validation.html', { + 'cjob': cjob, + 'field_counts': field_counts, + 'breadcrumbs': breadcrumb_parser(request) + }) + + # if POST, generate report + if request.method == 'POST': + + # get job name for Combine Task + report_name = request.POST.get('report_name') + if report_name == '': + report_name = 'j_%s_validation_report' % cjob.job.id + combine_task_name = "Validation Report: %s" % cjob.job.name + else: + combine_task_name = "Validation Report: %s" % report_name + + # handle POST params and save as Combine task params + task_params = { + 'job_id': cjob.job.id, + 'report_name': report_name, + 'report_format': request.POST.get('report_format'), + 'compression_type': request.POST.get('compression_type'), + 'validation_scenarios': request.POST.getlist('validation_scenario', []), + 'mapped_field_include': request.POST.getlist('mapped_field_include', []) + } + + # cast to int + task_params['validation_scenarios'] = [int(vs_id) for vs_id in task_params['validation_scenarios']] + + # remove select, reserved fields if in mapped field request + task_params['mapped_field_include'] = [f for f in task_params['mapped_field_include'] if + f not in ['record_id', 'db_id', 'oid', '_id']] + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name=combine_task_name, + task_type='validation_report', + task_params_json=json.dumps(task_params) + ) + ct.save() + + # run celery task + bg_task = tasks.create_validation_report.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # redirect to Background Tasks + return redirect('bg_tasks') @login_required def job_update(request, org_id, record_group_id, job_id): - - ''' + ''' Update Job in one of several ways: - re-map and index - run new / different validations ''' - # retrieve job - cjob = models.CombineJob.get_combine_job(int(job_id)) - - # if GET, prepare form - if request.method == 'GET': - - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() - - # get field mappers - field_mappers = models.FieldMapper.objects.all() - orig_fm_config_json = cjob.job.get_fm_config_json() - - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() - - # get uptdate type from GET params - update_type = request.GET.get('update_type', None) - - # render page - return render(request, 'core/job_update.html', { - 'cjob':cjob, - 'update_type':update_type, - 
'validation_scenarios':validation_scenarios, - 'field_mappers':field_mappers, - 'bulk_downloads':bulk_downloads, - 'xml2kvp_handle':models.XML2kvp(), - 'orig_fm_config_json':orig_fm_config_json, - 'breadcrumbs':breadcrumb_parser(request) - }) - - # if POST, submit job - if request.method == 'POST': - - logger.debug('updating job') - logger.debug(request.POST) - - # retrieve job - cjob = models.CombineJob.get_combine_job(int(job_id)) - - # get update type - update_type = request.POST.get('update_type', None) - logger.debug('running job update: %s' % update_type) - - # handle re-index - if update_type == 'reindex': - - # get preferred metadata index mapper - fm_config_json = request.POST.get('fm_config_json') - - # init re-index - ct = cjob.reindex_bg_task(fm_config_json=fm_config_json) - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Re-Indexing Job: %s
' % (cjob.job.name, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - - # handle new validations - if update_type == 'validations': - - # get requested validation scenarios - validation_scenarios = request.POST.getlist('validation_scenario', []) - - # get validations - validations = models.ValidationScenario.objects.filter(id__in=[ int(vs_id) for vs_id in validation_scenarios ]) - - # init bg task - bg_task = cjob.new_validations_bg_task([ vs.id for vs in validations ]) - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Running New Validations for Job: %s Validation Scenarios: %s
' % (cjob.job.name, '
'.join([vs.name for vs in validations]), reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - - # handle validation removal - if update_type == 'remove_validation': - - # get validation scenario to remove - jv_id = request.POST.get('jv_id', False) - - # initiate Combine BG Task - bg_task = cjob.remove_validation_bg_task(jv_id) - - # set gms - vs = models.JobValidation.objects.get(pk=int(jv_id)).validation_scenario - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Removing Validation for Job: %s Validation Scenario: %s
' % (cjob.job.name, vs.name, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - - # handle validation removal - if update_type == 'dbdm': - - # get validation scenario to remove - dbdd_id = request.POST.get('dbdd', False) - - # initiate Combine BG Task - bg_task = cjob.dbdm_bg_task(dbdd_id) - - # set gms - dbdd = models.DPLABulkDataDownload.objects.get(pk=int(dbdd_id)) - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Running DPLA Bulk Data comparison for Job: %s Bulk Data S3 key: %s
' % (cjob.job.name, dbdd.s3_key, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - + # retrieve job + cjob = models.CombineJob.get_combine_job(int(job_id)) + + # if GET, prepare form + if request.method == 'GET': + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() + + # get field mappers + field_mappers = models.FieldMapper.objects.all() + orig_fm_config_json = cjob.job.get_fm_config_json() + + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() + + # get uptdate type from GET params + update_type = request.GET.get('update_type', None) + + # render page + return render(request, 'core/job_update.html', { + 'cjob': cjob, + 'update_type': update_type, + 'validation_scenarios': validation_scenarios, + 'field_mappers': field_mappers, + 'bulk_downloads': bulk_downloads, + 'xml2kvp_handle': models.XML2kvp(), + 'orig_fm_config_json': orig_fm_config_json, + 'breadcrumbs': breadcrumb_parser(request) + }) + + # if POST, submit job + if request.method == 'POST': + + logger.debug('updating job') + logger.debug(request.POST) + + # retrieve job + cjob = models.CombineJob.get_combine_job(int(job_id)) + + # get update type + update_type = request.POST.get('update_type', None) + logger.debug('running job update: %s' % update_type) + + # handle re-index + if update_type == 'reindex': + # get preferred metadata index mapper + fm_config_json = request.POST.get('fm_config_json') + + # init re-index + ct = cjob.reindex_bg_task(fm_config_json=fm_config_json) + + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Re-Indexing Job: %s
' % ( + cjob.job.name, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) + + # handle new validations + if update_type == 'validations': + # get requested validation scenarios + validation_scenarios = request.POST.getlist('validation_scenario', []) + + # get validations + validations = models.ValidationScenario.objects.filter( + id__in=[int(vs_id) for vs_id in validation_scenarios]) + + # init bg task + bg_task = cjob.new_validations_bg_task([vs.id for vs in validations]) + + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Running New Validations for Job: %s Validation Scenarios: %s
' % ( + cjob.job.name, '
'.join([vs.name for vs in validations]), reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) + + # handle validation removal + if update_type == 'remove_validation': + # get validation scenario to remove + jv_id = request.POST.get('jv_id', False) + + # initiate Combine BG Task + bg_task = cjob.remove_validation_bg_task(jv_id) + + # set gms + vs = models.JobValidation.objects.get(pk=int(jv_id)).validation_scenario + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Removing Validation for Job: %s Validation Scenario: %s
' % ( + cjob.job.name, vs.name, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) + + # handle validation removal + if update_type == 'dbdm': + # get validation scenario to remove + dbdd_id = request.POST.get('dbdd', False) + + # initiate Combine BG Task + bg_task = cjob.dbdm_bg_task(dbdd_id) + + # set gms + dbdd = models.DPLABulkDataDownload.objects.get(pk=int(dbdd_id)) + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Running DPLA Bulk Data comparison for Job: %s Bulk Data S3 key: %s
' % ( + cjob.job.name, dbdd.s3_key, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) #################################################################### @@ -1876,62 +1835,59 @@ def job_update(request, org_id, record_group_id, job_id): #################################################################### def document_download(request): - - ''' + ''' Args (GET params): file_location: location on disk for file file_download_name: desired download name content_type: ContentType Headers ''' - # known download format params - download_format_hash = { - 'excel':{ - 'extension':'.xlsx', - 'content_type':'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' - }, - 'csv':{ - 'extension':'.csv', - 'content_type':'text/plain' - }, - 'tsv':{ - 'extension':'.tsv', - 'content_type':'text/plain' - }, - 'json':{ - 'extension':'.json', - 'content_type':'text/plain' - }, - 'zip':{ - 'extension':'.zip', - 'content_type':'application/zip' - }, - 'targz':{ - 'extension':'.tar.gz', - 'content_type':'application/gzip' - } - } - - # get params - download_format = request.GET.get('download_format', None) - filepath = request.GET.get('filepath', None) - name = request.GET.get('name', 'download') - content_type = request.GET.get('content_type', 'text/plain') - preview = request.GET.get('preview', False) - - # if known download format, use hash and overwrite provided or defaults - if download_format and download_format in download_format_hash.keys(): - - format_params = download_format_hash[download_format] - name = '%s%s' % (name, format_params['extension']) - content_type = format_params['content_type'] - - # generate response - response = FileResponse(open(filepath, 'rb')) - if not preview: - response['Content-Disposition'] = 'attachment; filename="%s"' % name - return response - + # known download format params + download_format_hash = { + 'excel': { + 'extension': '.xlsx', + 'content_type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' + }, + 'csv': { + 'extension': '.csv', + 'content_type': 'text/plain' + }, + 'tsv': { + 'extension': '.tsv', + 'content_type': 'text/plain' + }, + 'json': { + 'extension': '.json', + 'content_type': 'text/plain' + }, + 'zip': { + 'extension': '.zip', + 'content_type': 'application/zip' + }, + 'targz': { + 'extension': '.tar.gz', + 'content_type': 'application/gzip' + } + } + + # get params + download_format = request.GET.get('download_format', None) + filepath = request.GET.get('filepath', None) + name = request.GET.get('name', 'download') + content_type = request.GET.get('content_type', 'text/plain') + preview = request.GET.get('preview', False) + + # if known download format, use hash and overwrite provided or defaults + if download_format and download_format in download_format_hash.keys(): + format_params = download_format_hash[download_format] + name = '%s%s' % (name, format_params['extension']) + content_type = format_params['content_type'] + + # generate response + response = FileResponse(open(filepath, 'rb')) + if not preview: + response['Content-Disposition'] = 'attachment; filename="%s"' % name + return response #################################################################### @@ -1940,42 +1896,39 @@ def document_download(request): @login_required def field_analysis(request, es_index): + # get field name + field_name = request.GET.get('field_name') - # get field name - field_name = 
request.GET.get('field_name') - - # get ESIndex, evaluating stringified list - esi = models.ESIndex(ast.literal_eval(es_index)) + # get ESIndex, evaluating stringified list + esi = models.ESIndex(ast.literal_eval(es_index)) - # get analysis for field - field_metrics = esi.field_analysis(field_name, metrics_only=True) + # get analysis for field + field_metrics = esi.field_analysis(field_name, metrics_only=True) - # return - return render(request, 'core/field_analysis.html', { - 'esi':esi, - 'field_name':field_name, - 'field_metrics':field_metrics, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/field_analysis.html', { + 'esi': esi, + 'field_name': field_name, + 'field_metrics': field_metrics, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def job_indexing_failures(request, org_id, record_group_id, job_id): + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) - - # return - return render(request, 'core/job_indexing_failures.html', { - 'cjob':cjob, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/job_indexing_failures.html', { + 'cjob': cjob, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def field_analysis_docs(request, es_index, filter_type): - - ''' + ''' Table of documents that match a filtered ES query. @@ -1984,85 +1937,82 @@ def field_analysis_docs(request, es_index, filter_type): filter_type (str): what kind of filtering to impose on documents returned ''' - # regardless of filtering type, get field name - field_name = request.GET.get('field_name') - - # get ESIndex - esi = models.ESIndex(ast.literal_eval(es_index)) - - # begin construction of DT GET params with 'fields_names' - dt_get_params = [ - ('field_names', 'db_id'), # get DB ID - ('field_names', 'combine_id'), # get Combine ID - ('field_names', 'record_id'), # get ID from ES index document - ('field_names', field_name), # add field to returned fields - ('filter_field', field_name), - ('filter_type', filter_type) - ] - - # analysis scenario dict - analysis_scenario = { - 'exists':None, - 'matches':None, - 'value':None - } - - # field existence - if filter_type == 'exists': - - # if check exists, get expected GET params - exists = request.GET.get('exists') - dt_get_params.append(('exists', exists)) - - # update analysis scenario dict - analysis_scenario['exists'] = exists - - # field equals - if filter_type == 'equals': - - # if check equals, get expected GET params - matches = request.GET.get('matches') - dt_get_params.append(('matches', matches)) - - value = request.GET.get('value', None) # default None if checking non-matches to value - if value: - dt_get_params.append(('filter_value', value)) - - # update analysis scenario dict - analysis_scenario['matches'] = matches - analysis_scenario['value'] = value - - - # construct DT Ajax GET parameters string from tuples - dt_get_params_string = urlencode(dt_get_params) - - # return - return render(request, 'core/field_analysis_docs.html', { - 'esi':esi, - 'field_name':field_name, - 'filter_type':filter_type, - 'analysis_scenario':analysis_scenario, - 'msg':None, - 'dt_get_params_string':dt_get_params_string, - 'breadcrumbs':breadcrumb_parser(request) - }) + # regardless of filtering type, get field name + field_name = request.GET.get('field_name') + + # get ESIndex + esi = models.ESIndex(ast.literal_eval(es_index)) + + # begin construction of DT GET params with 'fields_names' + 
dt_get_params = [ + ('field_names', 'db_id'), # get DB ID + ('field_names', 'combine_id'), # get Combine ID + ('field_names', 'record_id'), # get ID from ES index document + ('field_names', field_name), # add field to returned fields + ('filter_field', field_name), + ('filter_type', filter_type) + ] + + # analysis scenario dict + analysis_scenario = { + 'exists': None, + 'matches': None, + 'value': None + } + + # field existence + if filter_type == 'exists': + # if check exists, get expected GET params + exists = request.GET.get('exists') + dt_get_params.append(('exists', exists)) + + # update analysis scenario dict + analysis_scenario['exists'] = exists + + # field equals + if filter_type == 'equals': + + # if check equals, get expected GET params + matches = request.GET.get('matches') + dt_get_params.append(('matches', matches)) + + value = request.GET.get('value', None) # default None if checking non-matches to value + if value: + dt_get_params.append(('filter_value', value)) + + # update analysis scenario dict + analysis_scenario['matches'] = matches + analysis_scenario['value'] = value + + # construct DT Ajax GET parameters string from tuples + dt_get_params_string = urlencode(dt_get_params) + + # return + return render(request, 'core/field_analysis_docs.html', { + 'esi': esi, + 'field_name': field_name, + 'filter_type': filter_type, + 'analysis_scenario': analysis_scenario, + 'msg': None, + 'dt_get_params_string': dt_get_params_string, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def job_validation_scenario_failures(request, org_id, record_group_id, job_id, job_validation_id): + # get CombineJob + cjob = models.CombineJob.get_combine_job(job_id) - # get CombineJob - cjob = models.CombineJob.get_combine_job(job_id) + # get job validation instance + jv = models.JobValidation.objects.get(pk=int(job_validation_id)) - # get job validation instance - jv = models.JobValidation.objects.get(pk=int(job_validation_id)) - - # return - return render(request, 'core/job_validation_scenario_failures.html', { - 'cjob':cjob, - 'jv':jv, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/job_validation_scenario_failures.html', { + 'cjob': cjob, + 'jv': jv, + 'breadcrumbs': breadcrumb_parser(request) + }) #################################################################### @@ -2070,197 +2020,186 @@ def job_validation_scenario_failures(request, org_id, record_group_id, job_id, j #################################################################### def record(request, org_id, record_group_id, job_id, record_id): - - ''' + ''' Single Record page ''' - # get record - record = models.Record.objects.get(id=record_id) - - # build ancestry in both directions - record_stages = record.get_record_stages() - - # get details depending on job type - logger.debug('Job type is %s, retrieving details' % record.job.job_type) - try: - job_details = record.job.job_details_dict - except: - logger.debug('could not load job details') - job_details = {} - - # attempt to retrieve pre-existing DPLA document - dpla_api_doc = record.dpla_api_record_match() - if dpla_api_doc is not None: - dpla_api_json = json.dumps(dpla_api_doc, indent=4, sort_keys=True) - else: - dpla_api_json = None - - # retrieve diffs, if any, from input record - # request only combined diff at this point - record_diff_dict = record.get_input_record_diff(output='combined_gen', combined_as_html=True) - - # retrieve field mapper config json used - try: - job_fm_config_json = 
json.dumps(job_details['field_mapper_config']) - except: - job_fm_config_json = json.dumps({'error':'job field mapping configuration json could not be found'}) - - # attempt to get document as pretty print - try: - pretty_document = record.document_pretty_print() - pretty_format_msg = False - except Exception as e: - pretty_document = record.document - pretty_format_msg = str(e) - - # return - return render(request, 'core/record.html', { - 'record_id':record_id, - 'record':record, - 'record_stages':record_stages, - 'job_details':job_details, - 'dpla_api_doc':dpla_api_doc, - 'dpla_api_json':dpla_api_json, - 'record_diff_dict':record_diff_dict, - 'pretty_document':pretty_document, - 'pretty_format_msg':pretty_format_msg, - 'job_fm_config_json':job_fm_config_json, - 'breadcrumbs':breadcrumb_parser(request) - }) + # get record + record = models.Record.objects.get(id=record_id) + + # build ancestry in both directions + record_stages = record.get_record_stages() + + # get details depending on job type + logger.debug('Job type is %s, retrieving details' % record.job.job_type) + try: + job_details = record.job.job_details_dict + except: + logger.debug('could not load job details') + job_details = {} + + # attempt to retrieve pre-existing DPLA document + dpla_api_doc = record.dpla_api_record_match() + if dpla_api_doc is not None: + dpla_api_json = json.dumps(dpla_api_doc, indent=4, sort_keys=True) + else: + dpla_api_json = None + + # retrieve diffs, if any, from input record + # request only combined diff at this point + record_diff_dict = record.get_input_record_diff(output='combined_gen', combined_as_html=True) + + # retrieve field mapper config json used + try: + job_fm_config_json = json.dumps(job_details['field_mapper_config']) + except: + job_fm_config_json = json.dumps({'error': 'job field mapping configuration json could not be found'}) + + # attempt to get document as pretty print + try: + pretty_document = record.document_pretty_print() + pretty_format_msg = False + except Exception as e: + pretty_document = record.document + pretty_format_msg = str(e) + + # return + return render(request, 'core/record.html', { + 'record_id': record_id, + 'record': record, + 'record_stages': record_stages, + 'job_details': job_details, + 'dpla_api_doc': dpla_api_doc, + 'dpla_api_json': dpla_api_json, + 'record_diff_dict': record_diff_dict, + 'pretty_document': pretty_document, + 'pretty_format_msg': pretty_format_msg, + 'job_fm_config_json': job_fm_config_json, + 'breadcrumbs': breadcrumb_parser(request) + }) def record_document(request, org_id, record_group_id, job_id, record_id): - - ''' + ''' View document for record ''' - # get record - record = models.Record.objects.get(id=record_id) + # get record + record = models.Record.objects.get(id=record_id) - # return document as XML - return HttpResponse(record.document, content_type='text/xml') + # return document as XML + return HttpResponse(record.document, content_type='text/xml') def record_indexed_document(request, org_id, record_group_id, job_id, record_id): - - ''' + ''' View indexed, ES document for record ''' - # get record - record = models.Record.objects.get(id=record_id) - - # return ES document as JSON - return JsonResponse(record.get_es_doc()) + # get record + record = models.Record.objects.get(id=record_id) + # return ES document as JSON + return JsonResponse(record.get_es_doc()) def record_error(request, org_id, record_group_id, job_id, record_id): - - ''' + ''' View document for record ''' - # get record - record = 
models.Record.objects.get(id=record_id) + # get record + record = models.Record.objects.get(id=record_id) - # return document as XML - return HttpResponse("
%s
" % record.error) + # return document as XML + return HttpResponse("
%s
" % record.error) def record_validation_scenario(request, org_id, record_group_id, job_id, record_id, job_validation_id): - - ''' + ''' Re-run validation test for single record Returns: results of validation ''' - # get record - record = models.Record.objects.get(id=record_id) - - # get validation scenario - vs = models.ValidationScenario.objects.get(pk=int(job_validation_id)) + # get record + record = models.Record.objects.get(id=record_id) - # schematron type validation - if vs.validation_type == 'sch': + # get validation scenario + vs = models.ValidationScenario.objects.get(pk=int(job_validation_id)) - vs_result = vs.validate_record(record) + # schematron type validation + if vs.validation_type == 'sch': + vs_result = vs.validate_record(record) - # return - return HttpResponse(vs_result['raw'], content_type='text/xml') + # return + return HttpResponse(vs_result['raw'], content_type='text/xml') - # python type validation - if vs.validation_type == 'python': + # python type validation + if vs.validation_type == 'python': + vs_result = vs.validate_record(record) - vs_result = vs.validate_record(record) - - # return - return JsonResponse(vs_result['parsed'], safe=False) + # return + return JsonResponse(vs_result['parsed'], safe=False) def record_combined_diff_html(request, org_id, record_group_id, job_id, record_id): - - ''' + ''' Return combined diff of Record against Input Record ''' - # get record - record = models.Record.objects.get(id=record_id) + # get record + record = models.Record.objects.get(id=record_id) - # get side_by_side diff as HTML - diff_dict = record.get_input_record_diff(output='combined_gen', combined_as_html=True) + # get side_by_side diff as HTML + diff_dict = record.get_input_record_diff(output='combined_gen', combined_as_html=True) - if diff_dict: + if diff_dict: - # get combined output as html from output - html = diff_dict['combined_gen'] + # get combined output as html from output + html = diff_dict['combined_gen'] - # return document as HTML - return HttpResponse(html, content_type='text/html') + # return document as HTML + return HttpResponse(html, content_type='text/html') - else: - return HttpResponse("Record was not altered during Transformation.", content_type='text/html') + else: + return HttpResponse("Record was not altered during Transformation.", content_type='text/html') def record_side_by_side_diff_html(request, org_id, record_group_id, job_id, record_id): - - ''' + ''' Return side_by_side diff of Record against Input Record - uses sxsdiff (https://github.com/timonwong/sxsdiff) - if embed == true, strip some uncessary HTML and return ''' - # get record - record = models.Record.objects.get(id=record_id) - - # check for embed flag - embed = request.GET.get('embed', False) + # get record + record = models.Record.objects.get(id=record_id) - # get side_by_side diff as HTML - diff_dict = record.get_input_record_diff(output='side_by_side_html') + # check for embed flag + embed = request.GET.get('embed', False) - if diff_dict: + # get side_by_side diff as HTML + diff_dict = record.get_input_record_diff(output='side_by_side_html') - # get side_by_side html from output - html = diff_dict['side_by_side_html'] + if diff_dict: - # if embed flag set, alter CSS - # these are defaulted in sxsdiff library, currently - # easier to pinpoint and remove these than fork library and alter - html = html.replace('
', '
') - html = html.replace('padding-left:30px;', '/*padding-left:30px;*/') - html = html.replace('padding-right:30px;', '/*padding-right:30px;*/') + # get side_by_side html from output + html = diff_dict['side_by_side_html'] - # return document as HTML - return HttpResponse(html, content_type='text/html') + # if embed flag set, alter CSS + # these are defaulted in sxsdiff library, currently + # easier to pinpoint and remove these than fork library and alter + html = html.replace('
', '
') + html = html.replace('padding-left:30px;', '/*padding-left:30px;*/') + html = html.replace('padding-right:30px;', '/*padding-right:30px;*/') - else: - return HttpResponse("Record was not altered during Transformation.", content_type='text/html') + # return document as HTML + return HttpResponse(html, content_type='text/html') + else: + return HttpResponse("Record was not altered during Transformation.", content_type='text/html') #################################################################### @@ -2269,521 +2208,512 @@ def record_side_by_side_diff_html(request, org_id, record_group_id, job_id, reco @login_required def configuration(request): + # get all transformations + transformations = models.Transformation.objects.filter(use_as_include=False) - # get all transformations - transformations = models.Transformation.objects.filter(use_as_include=False) + # get all OAI endpoints + oai_endpoints = models.OAIEndpoint.objects.all() - # get all OAI endpoints - oai_endpoints = models.OAIEndpoint.objects.all() + # get all validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get all validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get field mappers - field_mappers = models.FieldMapper.objects.all() - - # return - return render(request, 'core/configuration.html', { - 'transformations':transformations, - 'oai_endpoints':oai_endpoints, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'bulk_downloads':bulk_downloads, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/configuration.html', { + 'transformations': transformations, + 'oai_endpoints': oai_endpoints, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'bulk_downloads': bulk_downloads, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def oai_endpoint_payload(request, oai_endpoint_id): - - ''' + ''' Return JSON of saved OAI endpoint information ''' - # retrieve OAIEndpoint - oai_endpoint = models.OAIEndpoint.objects.get(pk=oai_endpoint_id) + # retrieve OAIEndpoint + oai_endpoint = models.OAIEndpoint.objects.get(pk=oai_endpoint_id) - # pop state - oai_endpoint.__dict__.pop('_state') + # pop state + oai_endpoint.__dict__.pop('_state') - # return as json - return JsonResponse(oai_endpoint.__dict__) + # return as json + return JsonResponse(oai_endpoint.__dict__) def transformation_scenario_payload(request, trans_id): - - ''' + ''' View payload for transformation scenario ''' - # get transformation - transformation = models.Transformation.objects.get(pk=int(trans_id)) - - # return transformation as XML - if transformation.transformation_type == 'xslt': - return HttpResponse(transformation.payload, content_type='text/xml') + # get transformation + transformation = models.Transformation.objects.get(pk=int(trans_id)) - # return transformation as Python - if transformation.transformation_type == 'python': - return 
HttpResponse(transformation.payload, content_type='text/plain') + # return transformation as XML + if transformation.transformation_type == 'xslt': + return HttpResponse(transformation.payload, content_type='text/xml') - # return transformation as Python - if transformation.transformation_type == 'openrefine': - return HttpResponse(transformation.payload, content_type='text/plain') + # return transformation as Python + if transformation.transformation_type == 'python': + return HttpResponse(transformation.payload, content_type='text/plain') + # return transformation as Python + if transformation.transformation_type == 'openrefine': + return HttpResponse(transformation.payload, content_type='text/plain') def test_transformation_scenario(request): - - ''' + ''' View to live test transformation scenarios ''' - # If GET, serve transformation test screen - if request.method == 'GET': - - # get validation scenarios - transformation_scenarios = models.Transformation.objects.filter(use_as_include=False) - - # check if limiting to one, pre-existing record - q = request.GET.get('q', None) - - # check for pre-requested transformation scenario - tsid = request.GET.get('transformation_scenario', None) + # If GET, serve transformation test screen + if request.method == 'GET': + # get validation scenarios + transformation_scenarios = models.Transformation.objects.filter(use_as_include=False) - # return - return render(request, 'core/test_transformation_scenario.html', { - 'q':q, - 'tsid':tsid, - 'transformation_scenarios':transformation_scenarios, - 'breadcrumbs':breadcrumb_parser(request) - }) + # check if limiting to one, pre-existing record + q = request.GET.get('q', None) - # If POST, provide raw result of validation test - if request.method == 'POST': + # check for pre-requested transformation scenario + tsid = request.GET.get('transformation_scenario', None) - logger.debug('running test transformation and returning') + # return + return render(request, 'core/test_transformation_scenario.html', { + 'q': q, + 'tsid': tsid, + 'transformation_scenarios': transformation_scenarios, + 'breadcrumbs': breadcrumb_parser(request) + }) - # get response type - response_type = request.POST.get('response_type', False) + # If POST, provide raw result of validation test + if request.method == 'POST': - # get record - record = models.Record.objects.get(id=request.POST.get('db_id')) - record_iter = models.Record.objects.get(id=request.POST.get('db_id')) + logger.debug('running test transformation and returning') - try: + # get response type + response_type = request.POST.get('response_type', False) - # testing multiple, chained transformations - if request.POST.get('trans_test_type') == 'multiple': + # get record + record = models.Record.objects.get(id=request.POST.get('db_id')) + record_iter = models.Record.objects.get(id=request.POST.get('db_id')) - # get and rehydrate sel_trans_json - sel_trans = json.loads(request.POST.get('sel_trans_json')) + try: - # loop through transformations - for trans in sel_trans: + # testing multiple, chained transformations + if request.POST.get('trans_test_type') == 'multiple': - # init Transformation instance - trans = models.Transformation.objects.get(pk=int(trans['trans_id'])) + # get and rehydrate sel_trans_json + sel_trans = json.loads(request.POST.get('sel_trans_json')) - # transform with record - trans_results = trans.transform_record(record_iter) + # loop through transformations + for trans in sel_trans: + # init Transformation instance + trans = 
models.Transformation.objects.get(pk=int(trans['trans_id'])) - # set to record.document for next iteration - record_iter.document = trans_results + # transform with record + trans_results = trans.transform_record(record_iter) - # finally, fall in line with trans_results as record_iter document string - trans_results = record_iter.document + # set to record.document for next iteration + record_iter.document = trans_results - # testing single transformation - elif request.POST.get('trans_test_type') == 'single': + # finally, fall in line with trans_results as record_iter document string + trans_results = record_iter.document - # init new transformation scenario - trans = models.Transformation( - name='temp_trans_%s' % str(uuid.uuid4()), - payload=request.POST.get('trans_payload'), - transformation_type=request.POST.get('trans_type') - ) - trans.save() + # testing single transformation + elif request.POST.get('trans_test_type') == 'single': - # transform with record - trans_results = trans.transform_record(record) + # init new transformation scenario + trans = models.Transformation( + name='temp_trans_%s' % str(uuid.uuid4()), + payload=request.POST.get('trans_payload'), + transformation_type=request.POST.get('trans_type') + ) + trans.save() - # delete temporary trans - trans.delete() + # transform with record + trans_results = trans.transform_record(record) - # if raw transformation results - if response_type == 'transformed_doc': - return HttpResponse(trans_results, content_type="text/xml") + # delete temporary trans + trans.delete() - # get diff of original record as combined results - elif response_type == 'combined_html': + # if raw transformation results + if response_type == 'transformed_doc': + return HttpResponse(trans_results, content_type="text/xml") - # get combined diff as HTML - diff_dict = record.get_record_diff(xml_string=trans_results, output='combined_gen', combined_as_html=True, reverse_direction=True) - if diff_dict: - diff_html = diff_dict['combined_gen'] + # get diff of original record as combined results + elif response_type == 'combined_html': - return HttpResponse(diff_html, content_type="text/xml") + # get combined diff as HTML + diff_dict = record.get_record_diff(xml_string=trans_results, output='combined_gen', + combined_as_html=True, reverse_direction=True) + if diff_dict: + diff_html = diff_dict['combined_gen'] - # get diff of original record as side_by_side - elif response_type == 'side_by_side_html': + return HttpResponse(diff_html, content_type="text/xml") - # get side_by_side diff as HTML - diff_dict = record.get_record_diff(xml_string=trans_results, output='side_by_side_html', reverse_direction=True) - if diff_dict: - diff_html = diff_dict['side_by_side_html'] + # get diff of original record as side_by_side + elif response_type == 'side_by_side_html': - # strip some CSS - diff_html = diff_html.replace('
', '
') - diff_html = diff_html.replace('padding-left:30px;', '/*padding-left:30px;*/') - diff_html = diff_html.replace('padding-right:30px;', '/*padding-right:30px;*/') + # get side_by_side diff as HTML + diff_dict = record.get_record_diff(xml_string=trans_results, output='side_by_side_html', + reverse_direction=True) + if diff_dict: + diff_html = diff_dict['side_by_side_html'] + # strip some CSS + diff_html = diff_html.replace('
', '
') + diff_html = diff_html.replace('padding-left:30px;', '/*padding-left:30px;*/') + diff_html = diff_html.replace('padding-right:30px;', '/*padding-right:30px;*/') - return HttpResponse(diff_html, content_type="text/xml") + return HttpResponse(diff_html, content_type="text/xml") - except Exception as e: - logger.debug('test transformation scenario was unsucessful, deleting temporary') - try: - if request.POST.get('trans_test_type') == 'single': - trans.delete() - except: - logger.debug('could not delete temporary transformation') - return HttpResponse(str(e), content_type="text/plain") + except Exception as e: + logger.debug('test transformation scenario was unsucessful, deleting temporary') + try: + if request.POST.get('trans_test_type') == 'single': + trans.delete() + except: + logger.debug('could not delete temporary transformation') + return HttpResponse(str(e), content_type="text/plain") def validation_scenario_payload(request, vs_id): - - ''' + ''' View payload for validation scenario ''' - # get transformation - vs = models.ValidationScenario.objects.get(pk=int(vs_id)) + # get transformation + vs = models.ValidationScenario.objects.get(pk=int(vs_id)) - if vs.validation_type == 'sch': - # return document as XML - return HttpResponse(vs.payload, content_type='text/xml') + if vs.validation_type == 'sch': + # return document as XML + return HttpResponse(vs.payload, content_type='text/xml') - else: - return HttpResponse(vs.payload, content_type='text/plain') + else: + return HttpResponse(vs.payload, content_type='text/plain') def test_validation_scenario(request): - - ''' + ''' View to live test validation scenario ''' - # If GET, serve validation test screen - if request.method == 'GET': - - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + # If GET, serve validation test screen + if request.method == 'GET': + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # check if limiting to one, pre-existing record - q = request.GET.get('q', None) + # check if limiting to one, pre-existing record + q = request.GET.get('q', None) - # check for pre-requested transformation scenario - vsid = request.GET.get('validation_scenario', None) + # check for pre-requested transformation scenario + vsid = request.GET.get('validation_scenario', None) - # return - return render(request, 'core/test_validation_scenario.html', { - 'q':q, - 'vsid':vsid, - 'validation_scenarios':validation_scenarios, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/test_validation_scenario.html', { + 'q': q, + 'vsid': vsid, + 'validation_scenarios': validation_scenarios, + 'breadcrumbs': breadcrumb_parser(request) + }) - # If POST, provide raw result of validation test - if request.method == 'POST': + # If POST, provide raw result of validation test + if request.method == 'POST': - logger.debug('running test validation and returning') + logger.debug('running test validation and returning') - # get record - record = models.Record.objects.get(id=request.POST.get('db_id')) + # get record + record = models.Record.objects.get(id=request.POST.get('db_id')) - try: - # init new validation scenario - vs = models.ValidationScenario( - name='temp_vs_%s' % str(uuid.uuid4()), - payload=request.POST.get('vs_payload'), - validation_type=request.POST.get('vs_type'), - default_run=False - ) - vs.save() + try: + # init new validation scenario + vs = models.ValidationScenario( + name='temp_vs_%s' % str(uuid.uuid4()), + 
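
The POST branch being converted here follows a create/validate/delete pattern: a throwaway ValidationScenario row is saved, validate_record is run against the selected record, and the temporary row is removed again. A sketch of that pattern with try/finally, so cleanup also happens when validation raises (assumes the same core.models API used above; the helper name is illustrative):

    import uuid

    from core import models  # the models module used throughout this views file

    def run_temp_validation(record, payload, validation_type):
        # temporary scenario, never meant to persist
        vs = models.ValidationScenario(
            name='temp_vs_%s' % uuid.uuid4(),
            payload=payload,
            validation_type=validation_type,
            default_run=False
        )
        vs.save()
        try:
            # returns a dict with 'raw' and 'parsed' results, as handled below
            return vs.validate_record(record)
        finally:
            vs.delete()
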
payload=request.POST.get('vs_payload'), + validation_type=request.POST.get('vs_type'), + default_run=False + ) + vs.save() - # validate with record - vs_results = vs.validate_record(record) + # validate with record + vs_results = vs.validate_record(record) - # delete vs - vs.delete() + # delete vs + vs.delete() - if request.POST.get('vs_results_format') == 'raw': - return HttpResponse(vs_results['raw'], content_type="text/plain") - elif request.POST.get('vs_results_format') == 'parsed': - return JsonResponse(vs_results['parsed']) - else: - raise Exception('validation results format not recognized') + if request.POST.get('vs_results_format') == 'raw': + return HttpResponse(vs_results['raw'], content_type="text/plain") + elif request.POST.get('vs_results_format') == 'parsed': + return JsonResponse(vs_results['parsed']) + else: + raise Exception('validation results format not recognized') - except Exception as e: + except Exception as e: - logger.debug('test validation scenario was unsucessful, deleting temporary vs') - vs.delete() + logger.debug('test validation scenario was unsucessful, deleting temporary vs') + vs.delete() - return HttpResponse(str(e), content_type="text/plain") + return HttpResponse(str(e), content_type="text/plain") def rits_payload(request, rits_id): - - ''' + ''' View payload for record identifier transformation scenario ''' - # get transformation - rt = models.RecordIdentifierTransformationScenario.objects.get(pk=int(rits_id)) + # get transformation + rt = models.RecordIdentifierTransformationScenario.objects.get(pk=int(rits_id)) - # return as json package - return JsonResponse(model_to_dict(rt)) + # return as json package + return JsonResponse(model_to_dict(rt)) def test_rits(request): - - ''' + ''' View to live test record identifier transformation scenarios ''' - # If GET, serve validation test screen - if request.method == 'GET': - - # check if limiting to one, pre-existing record - q = request.GET.get('q', None) + # If GET, serve validation test screen + if request.method == 'GET': + # check if limiting to one, pre-existing record + q = request.GET.get('q', None) - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # return - return render(request, 'core/test_rits.html', { - 'q':q, - 'rits':rits, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/test_rits.html', { + 'q': q, + 'rits': rits, + 'breadcrumbs': breadcrumb_parser(request) + }) - # If POST, provide raw result of validation test - if request.method == 'POST': + # If POST, provide raw result of validation test + if request.method == 'POST': - logger.debug('testing record identifier transformation') - logger.debug(request.POST) + logger.debug('testing record identifier transformation') + logger.debug(request.POST) - try: + try: - # make POST data mutable - request.POST._mutable = True + # make POST data mutable + request.POST._mutable = True - # get record - if request.POST.get('db_id', False): - record = models.Record.objects.get(id=request.POST.get('db_id')) - else: - return JsonResponse({'results':'Please select a record from the table above!','success':False}) + # get record + if request.POST.get('db_id', False): + record = models.Record.objects.get(id=request.POST.get('db_id')) + else: + return JsonResponse({'results': 'Please select a record from the table above!', 'success': 
False}) - # determine testing type - if request.POST['record_id_transform_target'] == 'record_id': - logger.debug('configuring test for record_id') - request.POST['test_transform_input'] = record.record_id - elif request.POST['record_id_transform_target'] == 'document': - logger.debug('configuring test for record_id') - request.POST['test_transform_input'] = record.document + # determine testing type + if request.POST['record_id_transform_target'] == 'record_id': + logger.debug('configuring test for record_id') + request.POST['test_transform_input'] = record.record_id + elif request.POST['record_id_transform_target'] == 'document': + logger.debug('configuring test for record_id') + request.POST['test_transform_input'] = record.document - # instantiate rits and return test - rits = models.RITSClient(request.POST) - return JsonResponse(rits.test_user_input()) + # instantiate rits and return test + rits = models.RITSClient(request.POST) + return JsonResponse(rits.test_user_input()) - except Exception as e: - return JsonResponse({'results':str(e), 'success':False}) + except Exception as e: + return JsonResponse({'results': str(e), 'success': False}) def field_mapper_payload(request, fm_id): - - ''' + ''' View payload for field mapper ''' - # get transformation - fm = models.FieldMapper.objects.get(pk=int(fm_id)) + # get transformation + fm = models.FieldMapper.objects.get(pk=int(fm_id)) - # get type - doc_type = request.GET.get('type',None) + # get type + doc_type = request.GET.get('type', None) - if fm.field_mapper_type == 'xml2kvp': + if fm.field_mapper_type == 'xml2kvp': - if not doc_type: - return HttpResponse(fm.config_json, content_type='application/json') + if not doc_type: + return HttpResponse(fm.config_json, content_type='application/json') - elif doc_type and doc_type == 'config': - return HttpResponse(fm.config_json, content_type='application/json') + elif doc_type and doc_type == 'config': + return HttpResponse(fm.config_json, content_type='application/json') - elif doc_type and doc_type == 'payload': - return HttpResponse(fm.payload, content_type='application/json') + elif doc_type and doc_type == 'payload': + return HttpResponse(fm.payload, content_type='application/json') def field_mapper_update(request): - - ''' + ''' Create and save JSON to FieldMapper instance, or update pre-existing ''' - logger.debug(request.POST) - - # get update type - update_type = request.POST.get('update_type') - - # handle new FieldMapper creation - if update_type == 'new': - logger.debug('creating new FieldMapper instance') - - fm = models.FieldMapper( - name=request.POST.get('fm_name'), - config_json=request.POST.get('fm_config_json'), - field_mapper_type='xml2kvp' - ) - - # validate fm_config before creating - try: - fm.validate_config_json() - fm.save() - return JsonResponse({'results':True,'msg':'New Field Mapper configurations were saved as: %s' % request.POST.get('fm_name')}, status=201) - except jsonschema.ValidationError as e: - return JsonResponse({'results':False,'msg':'Could not create %s, the following error was had: %s' % (fm.name, str(e))}, status=409) - - # handle update - if update_type == 'update': - logger.debug('updating pre-existing FieldMapper instance') - - # get fm instance - fm = models.FieldMapper.objects.get(pk=int(request.POST.get('fm_id'))) - - # update and save - fm.config_json = request.POST.get('fm_config_json') - - # validate fm_config before updating - try: - fm.validate_config_json() - fm.save() - return JsonResponse({'results':True,'msg':'Field Mapper 
configurations for %s were updated' % fm.name}, status=200) - except jsonschema.ValidationError as e: - return JsonResponse({'results':False,'msg':'Could not update %s, the following error was had: %s' % (fm.name, str(e))}, status=409) - - # handle delete - if update_type == 'delete': - logger.debug('deleting pre-existing FieldMapper instance') - - # get fm instance - fm = models.FieldMapper.objects.get(pk=int(request.POST.get('fm_id'))) - - # delete - fm.delete() - return JsonResponse({'results':True,'msg':'Field Mapper configurations for %s were deleted' % fm.name}, status=200) + logger.debug(request.POST) + + # get update type + update_type = request.POST.get('update_type') + + # handle new FieldMapper creation + if update_type == 'new': + logger.debug('creating new FieldMapper instance') + + fm = models.FieldMapper( + name=request.POST.get('fm_name'), + config_json=request.POST.get('fm_config_json'), + field_mapper_type='xml2kvp' + ) + + # validate fm_config before creating + try: + fm.validate_config_json() + fm.save() + return JsonResponse({'results': True, + 'msg': 'New Field Mapper configurations were saved as: %s' % request.POST.get( + 'fm_name')}, status=201) + except jsonschema.ValidationError as e: + return JsonResponse({'results': False, + 'msg': 'Could not create %s, the following error was had: %s' % ( + fm.name, str(e))}, status=409) + + # handle update + if update_type == 'update': + logger.debug('updating pre-existing FieldMapper instance') + + # get fm instance + fm = models.FieldMapper.objects.get(pk=int(request.POST.get('fm_id'))) + + # update and save + fm.config_json = request.POST.get('fm_config_json') + + # validate fm_config before updating + try: + fm.validate_config_json() + fm.save() + return JsonResponse({'results': True, + 'msg': 'Field Mapper configurations for %s were updated' % fm.name}, + status=200) + except jsonschema.ValidationError as e: + return JsonResponse({'results': False, + 'msg': 'Could not update %s, the following error was had: %s' % ( + fm.name, str(e))}, status=409) + + # handle delete + if update_type == 'delete': + logger.debug('deleting pre-existing FieldMapper instance') + + # get fm instance + fm = models.FieldMapper.objects.get(pk=int(request.POST.get('fm_id'))) + + # delete + fm.delete() + return JsonResponse({'results': True, + 'msg': 'Field Mapper configurations for %s were deleted' % fm.name}, + status=200) def test_field_mapper(request): - - ''' + ''' View to live test field mapper configurations ''' - if request.method == 'GET': - - # get field mapper - field_mappers = models.FieldMapper.objects.all() + if request.method == 'GET': + # get field mapper + field_mappers = models.FieldMapper.objects.all() - # check if limiting to one, pre-existing record - q = request.GET.get('q', None) + # check if limiting to one, pre-existing record + q = request.GET.get('q', None) - # check for pre-requested transformation scenario - fmid = request.GET.get('fmid', None) + # check for pre-requested transformation scenario + fmid = request.GET.get('fmid', None) - # return - return render(request, 'core/test_field_mapper.html', { - 'q':q, - 'fmid':fmid, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/test_field_mapper.html', { + 'q': q, + 'fmid': fmid, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'breadcrumbs': breadcrumb_parser(request) + }) - # If POST, provide mapping of record - if request.method == 
'POST': + # If POST, provide mapping of record + if request.method == 'POST': - logger.debug('running test field mapping') - logger.debug(request.POST) + logger.debug('running test field mapping') + logger.debug(request.POST) - # get record - record = models.Record.objects.get(id=request.POST.get('db_id')) + # get record + record = models.Record.objects.get(id=request.POST.get('db_id')) - # get field mapper info - field_mapper = request.POST.get('field_mapper') - fm_config_json = request.POST.get('fm_config_json') + # get field mapper info + field_mapper = request.POST.get('field_mapper') + fm_config_json = request.POST.get('fm_config_json') - try: + try: - # parse record with XML2kvp - fm_config = json.loads(fm_config_json) - kvp_dict = models.XML2kvp.xml_to_kvp(record.document, **fm_config) + # parse record with XML2kvp + fm_config = json.loads(fm_config_json) + kvp_dict = models.XML2kvp.xml_to_kvp(record.document, **fm_config) - # return as JSON - return JsonResponse(kvp_dict) + # return as JSON + return JsonResponse(kvp_dict) - except Exception as e: + except Exception as e: - logger.debug('field mapper was unsucessful') - return JsonResponse({'error':str(e)}) + logger.debug('field mapper was unsucessful') + return JsonResponse({'error': str(e)}) @login_required def dpla_bulk_data_download(request): - - ''' + ''' View to support the downloading of DPLA bulk data ''' - if request.method == 'GET': - - # if S3 credentials set - if settings.AWS_ACCESS_KEY_ID and settings.AWS_SECRET_ACCESS_KEY and settings.AWS_ACCESS_KEY_ID != None and settings.AWS_SECRET_ACCESS_KEY != None: + if request.method == 'GET': - # get DPLABulkDataClient and keys from DPLA bulk download - dbdc = models.DPLABulkDataClient() - bulk_data_keys = dbdc.retrieve_keys() + # if S3 credentials set + if settings.AWS_ACCESS_KEY_ID and settings.AWS_SECRET_ACCESS_KEY and settings.AWS_ACCESS_KEY_ID != None and settings.AWS_SECRET_ACCESS_KEY != None: - else: - bulk_data_keys = False + # get DPLABulkDataClient and keys from DPLA bulk download + dbdc = models.DPLABulkDataClient() + bulk_data_keys = dbdc.retrieve_keys() - # return - return render(request, 'core/dpla_bulk_data_download.html', { - 'bulk_data_keys':bulk_data_keys, - 'breadcrumbs':breadcrumb_parser(request) - }) + else: + bulk_data_keys = False - if request.method == 'POST': + # return + return render(request, 'core/dpla_bulk_data_download.html', { + 'bulk_data_keys': bulk_data_keys, + 'breadcrumbs': breadcrumb_parser(request) + }) - # OLD ###################################################################### - logger.debug('initiating bulk data download') + if request.method == 'POST': + # OLD ###################################################################### + logger.debug('initiating bulk data download') - # get DPLABulkDataClient - dbdc = models.DPLABulkDataClient() + # get DPLABulkDataClient + dbdc = models.DPLABulkDataClient() - # initiate download - dbdc.download_and_index_bulk_data(request.POST.get('object_key', None)) - - # return to configuration screen - return redirect('configuration') + # initiate download + dbdc.download_and_index_bulk_data(request.POST.get('object_key', None)) + # return to configuration screen + return redirect('configuration') #################################################################### @@ -2792,60 +2722,58 @@ def dpla_bulk_data_download(request): @login_required def published(request, subset=None): - - ''' + ''' Published records ''' - # get instance of Published model - published = models.PublishedRecords(subset=subset) + 
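
The test_field_mapper handler above ultimately calls models.XML2kvp.xml_to_kvp(record.document, **fm_config) and returns the resulting dictionary as JSON. As a rough, standalone illustration of what mapping XML to key/value pairs produces (this is not the project's XML2kvp implementation, only the general idea, using the standard library and an arbitrary underscore delimiter):

    import xml.etree.ElementTree as ET

    def flatten(xml_string):
        '''Flatten an XML document into {element_path: [values]}.'''
        root = ET.fromstring(xml_string)
        kvp = {}

        def walk(el, path):
            key = '%s_%s' % (path, el.tag) if path else el.tag
            if el.text and el.text.strip():
                kvp.setdefault(key, []).append(el.text.strip())
            for child in el:
                walk(child, key)

        walk(root, '')
        return kvp

    print(flatten('<record><title>Example</title><subject>Maps</subject></record>'))
    # {'record_title': ['Example'], 'record_subject': ['Maps']}
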
# get instance of Published model + published = models.PublishedRecords(subset=subset) - # get field counts - if published.records.count() > 0: - # get count of fields for all published job indices - field_counts = published.count_indexed_fields() - else: - field_counts = {} + # get field counts + if published.records.count() > 0: + # get count of fields for all published job indices + field_counts = published.count_indexed_fields() + else: + field_counts = {} - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get published subsets with PublishedRecords static method - subsets = models.PublishedRecords.get_subsets() + # get published subsets with PublishedRecords static method + subsets = models.PublishedRecords.get_subsets() - # loop through subsets and enrich - for _ in subsets: + # loop through subsets and enrich + for _ in subsets: - # add counts - counts = mc_handle.combine.misc.find_one({'_id':'published_field_counts_%s' % _['name']}) + # add counts + counts = mc_handle.combine.misc.find_one({'_id': 'published_field_counts_%s' % _['name']}) - # if counts not yet calculated, do now - if counts == None: - counts = models.PublishedRecords(subset=_['name']).count_indexed_fields() - _['counts'] = counts + # if counts not yet calculated, do now + if counts == None: + counts = models.PublishedRecords(subset=_['name']).count_indexed_fields() + _['counts'] = counts - # generate hierarchy_dict - job_hierarchy = _stateio_prepare_job_hierarchy() + # generate hierarchy_dict + job_hierarchy = _stateio_prepare_job_hierarchy() - return render(request, 'core/published.html', { - 'published':published, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'field_counts':field_counts, - 'es_index_str':published.esi.es_index_str, - 'subsets':subsets, - 'job_hierarchy_json':json.dumps(job_hierarchy), - 'job_hierarchy_json_subset':json.dumps( - getattr(published,'ps_doc',{}).get('hierarchy',[]) - ), - 'breadcrumbs':breadcrumb_parser(request) - }) + return render(request, 'core/published.html', { + 'published': published, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'field_counts': field_counts, + 'es_index_str': published.esi.es_index_str, + 'subsets': subsets, + 'job_hierarchy_json': json.dumps(job_hierarchy), + 'job_hierarchy_json_subset': json.dumps( + getattr(published, 'ps_doc', {}).get('hierarchy', []) + ), + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def published_subset_create(request): - - ''' + ''' Create subset of published records - output should be a Mongo document in combine.misc called "published_subset_[SUBSET]" @@ -2858,126 +2786,124 @@ def published_subset_create(request): - also include "loose" records? 
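
The POST branch that follows writes exactly this shape into combine.misc with mc_handle.combine.misc.insert_one, and published() above caches its field counts under an '_id' of the form 'published_field_counts_<name>'. A hypothetical example of the stored document for a subset named 'geology' (all field values are illustrative only):

    published_subset_doc = {
        'name': 'geology',
        'description': 'Geology collections',
        'type': 'published_subset',
        'publish_set_ids': ['geology_maps', 'geology_theses'],   # hypothetical publish set ids
        'hierarchy': [],                                         # org / record group / job ids, if used
        'include_non_set_records': False
    }
    # cached counts, once calculated, live in a sibling document:
    #   {'_id': 'published_field_counts_geology', ...}
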
''' - if request.method == 'GET': + if request.method == 'GET': - # get all published sets - published = models.PublishedRecords() + # get all published sets + published = models.PublishedRecords() - # generate hierarchy_dict - job_hierarchy = _stateio_prepare_job_hierarchy() + # generate hierarchy_dict + job_hierarchy = _stateio_prepare_job_hierarchy() - return render(request, 'core/published_subset_create.html', { - 'published':published, - 'job_hierarchy_json':json.dumps(job_hierarchy), - 'breadcrumbs':breadcrumb_parser(request) - }) + return render(request, 'core/published_subset_create.html', { + 'published': published, + 'job_hierarchy_json': json.dumps(job_hierarchy), + 'breadcrumbs': breadcrumb_parser(request) + }) - elif request.method == 'POST': + elif request.method == 'POST': - logger.debug('creating new published subset') + logger.debug('creating new published subset') - # sanitize name - name = request.POST.get('name') - name = ''.join(c for c in name if c.isalnum()) - name = name.lower() + # sanitize name + name = request.POST.get('name') + name = ''.join(c for c in name if c.isalnum()) + name = name.lower() - # confirm sets are present - sets = request.POST.getlist('sets') + # confirm sets are present + sets = request.POST.getlist('sets') - # handle non set records - if request.POST.get('include_non_set_records',False): - include_non_set_records = True - else: - include_non_set_records = False + # handle non set records + if request.POST.get('include_non_set_records', False): + include_non_set_records = True + else: + include_non_set_records = False - # handle org / rg hierarchy - hierarchy = json.loads(request.POST.get('hierarchy', [])) + # handle org / rg hierarchy + hierarchy = json.loads(request.POST.get('hierarchy', [])) - # create new published subset - doc = mc_handle.combine.misc.insert_one( - { - 'name':name, - 'description':request.POST.get('description',None), - 'type':'published_subset', - 'publish_set_ids':sets, - 'hierarchy':hierarchy, - 'include_non_set_records':include_non_set_records - }) + # create new published subset + doc = mc_handle.combine.misc.insert_one( + { + 'name': name, + 'description': request.POST.get('description', None), + 'type': 'published_subset', + 'publish_set_ids': sets, + 'hierarchy': hierarchy, + 'include_non_set_records': include_non_set_records + }) - return redirect('published_subset', - subset=name) + return redirect('published_subset', + subset=name) @login_required def published_subset_edit(request, subset): - - ''' + ''' Edit Published Subset ''' - if request.method == 'GET': + if request.method == 'GET': - # get subset published records - published = models.PublishedRecords() - published_subset = models.PublishedRecords(subset=subset) - published_subset.ps_doc['id'] = str(published_subset.ps_doc['_id']) + # get subset published records + published = models.PublishedRecords() + published_subset = models.PublishedRecords(subset=subset) + published_subset.ps_doc['id'] = str(published_subset.ps_doc['_id']) - # generate hierarchy_dict - job_hierarchy = _stateio_prepare_job_hierarchy() + # generate hierarchy_dict + job_hierarchy = _stateio_prepare_job_hierarchy() - return render(request, 'core/published_subset_edit.html', { - 'published':published, - 'published_subset':published_subset, - 'job_hierarchy_json':json.dumps(job_hierarchy), - 'job_hierarchy_json_subset':json.dumps(published_subset.ps_doc.get('hierarchy',[])), - 'breadcrumbs':breadcrumb_parser(request) - }) + return render(request, 'core/published_subset_edit.html', { + 
'published': published, + 'published_subset': published_subset, + 'job_hierarchy_json': json.dumps(job_hierarchy), + 'job_hierarchy_json_subset': json.dumps(published_subset.ps_doc.get('hierarchy', [])), + 'breadcrumbs': breadcrumb_parser(request) + }) - elif request.method == 'POST': + elif request.method == 'POST': - logger.debug('updating published subset') + logger.debug('updating published subset') - # confirm sets are present - sets = request.POST.getlist('sets') + # confirm sets are present + sets = request.POST.getlist('sets') - # handle non set records - if request.POST.get('include_non_set_records', False): - include_non_set_records = True - else: - include_non_set_records = False + # handle non set records + if request.POST.get('include_non_set_records', False): + include_non_set_records = True + else: + include_non_set_records = False - # handle org / rg hierarchy - hierarchy = json.loads(request.POST.get('hierarchy', [])) + # handle org / rg hierarchy + hierarchy = json.loads(request.POST.get('hierarchy', [])) - # update published subset - published = models.PublishedRecords(subset=subset) - published.update_subset({ - 'description':request.POST.get('description', None), - 'type':'published_subset', - 'publish_set_ids':sets, - 'hierarchy':hierarchy, - 'include_non_set_records':include_non_set_records - }) - published.remove_subset_precounts() + # update published subset + published = models.PublishedRecords(subset=subset) + published.update_subset({ + 'description': request.POST.get('description', None), + 'type': 'published_subset', + 'publish_set_ids': sets, + 'hierarchy': hierarchy, + 'include_non_set_records': include_non_set_records + }) + published.remove_subset_precounts() - return redirect('published_subset', - subset=subset) + return redirect('published_subset', + subset=subset) @login_required def published_subset_delete(request, subset): - - ''' + ''' Delete published subset ''' - d = mc_handle.combine.misc.delete_one({'type':'published_subset','name':subset}) - logger.debug(d.raw_result) - d = mc_handle.combine.misc.delete_one({'_id':'published_field_counts_%s' % subset}) - logger.debug(d.raw_result) - return redirect('published') + d = mc_handle.combine.misc.delete_one({'type': 'published_subset', 'name': subset}) + logger.debug(d.raw_result) + d = mc_handle.combine.misc.delete_one({'_id': 'published_field_counts_%s' % subset}) + logger.debug(d.raw_result) + return redirect('published') #################################################################### @@ -2985,18 +2911,16 @@ def published_subset_delete(request, subset): #################################################################### def oai(request, subset=None): - - ''' + ''' Parse GET parameters, send to OAIProvider instance from oai.py Return XML results ''' - # get OAIProvider instance - op = OAIProvider(request.GET, subset=subset) - - # return XML - return HttpResponse(op.generate_response(), content_type='text/xml') + # get OAIProvider instance + op = OAIProvider(request.GET, subset=subset) + # return XML + return HttpResponse(op.generate_response(), content_type='text/xml') #################################################################### @@ -3004,30 +2928,28 @@ def oai(request, subset=None): #################################################################### def search(request): - - ''' + ''' Global search of Records ''' - # if search term present, use - q = request.GET.get('q', None) - if q: - search_params = json.dumps({'q':q}) - logger.debug(search_params) - else: - search_params = None - - 
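
The oai() view above passes request.GET straight through to OAIProvider and returns whatever XML it generates, so any OAI-PMH verb can be exercised with a plain GET request. A hypothetical client call (the actual mount point is defined in core/urls.py, so the URL below is only a placeholder, and requests is used purely for brevity):

    import requests

    resp = requests.get(
        'http://localhost/combine/oai',   # placeholder URL
        params={'verb': 'ListRecords', 'metadataPrefix': 'oai_dc'}
    )
    print(resp.headers.get('Content-Type'))   # expected: text/xml, per the HttpResponse above
    print(resp.text[:500])
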
# generate hierarchy_dict - job_hierarchy = _stateio_prepare_job_hierarchy() + # if search term present, use + q = request.GET.get('q', None) + if q: + search_params = json.dumps({'q': q}) + logger.debug(search_params) + else: + search_params = None - return render(request, 'core/search.html', { - 'search_string':q, - 'search_params':search_params, - 'job_hierarchy_json':json.dumps(job_hierarchy), - 'breadcrumbs':breadcrumb_parser(request), - 'page_title':' | Search' - }) + # generate hierarchy_dict + job_hierarchy = _stateio_prepare_job_hierarchy() + return render(request, 'core/search.html', { + 'search_string': q, + 'search_params': search_params, + 'job_hierarchy_json': json.dumps(job_hierarchy), + 'breadcrumbs': breadcrumb_parser(request), + 'page_title': ' | Search' + }) #################################################################### @@ -3035,310 +2957,306 @@ def search(request): #################################################################### def export_documents(request, - export_source=None, - job_id=None, - subset=None): - - # get records per file - records_per_file = request.POST.get('records_per_file', False) - if records_per_file in ['',False]: - records_per_file = 500 - - # get archive type - archive_type = request.POST.get('archive_type') - - # export for single job - if export_source == 'job': - - logger.debug('exporting documents from Job') - - # retrieve job - cjob = models.CombineJob.get_combine_job(int(job_id)) - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Export Documents for Job: %s' % cjob.job.name, - task_type = 'export_documents', - task_params_json = json.dumps({ - 'job_id':cjob.job.id, - 'records_per_file':int(records_per_file), - 'archive_type':archive_type - }) - ) - ct.save() - - # handle export output configurations - ct = _handle_export_output(request,export_source,ct) - - # run celery task - bg_task = tasks.export_documents.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gm - gmc = models.GlobalMessageClient(request.session) - target = "Job:
%s" % cjob.job.name - gmc.add_gm({ - 'html':'

Exporting Documents for %s

' % (target, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - - # export for published - if export_source == 'published': - - logger.debug('exporting documents for published records') - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Export Documents for Published Records', - task_type = 'export_documents', - task_params_json = json.dumps({ - 'published':True, - 'subset':subset, - 'records_per_file':int(records_per_file), - 'archive_type':archive_type - }) - ) - ct.save() - - # handle export output configurations - ct = _handle_export_output(request,export_source,ct) - - # run celery task - bg_task = tasks.export_documents.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gm - gmc = models.GlobalMessageClient(request.session) - target = ":
Published Records" - gmc.add_gm({ - 'html':'

Exporting Documents for %s

' % (target, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('published') + export_source=None, + job_id=None, + subset=None): + # get records per file + records_per_file = request.POST.get('records_per_file', False) + if records_per_file in ['', False]: + records_per_file = 500 + + # get archive type + archive_type = request.POST.get('archive_type') + + # export for single job + if export_source == 'job': + logger.debug('exporting documents from Job') + + # retrieve job + cjob = models.CombineJob.get_combine_job(int(job_id)) + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Export Documents for Job: %s' % cjob.job.name, + task_type='export_documents', + task_params_json=json.dumps({ + 'job_id': cjob.job.id, + 'records_per_file': int(records_per_file), + 'archive_type': archive_type + }) + ) + ct.save() + + # handle export output configurations + ct = _handle_export_output(request, export_source, ct) + + # run celery task + bg_task = tasks.export_documents.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gm + gmc = models.GlobalMessageClient(request.session) + target = "Job:
%s" % cjob.job.name + gmc.add_gm({ + 'html': '

Exporting Documents for %s

' % ( + target, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) + + # export for published + if export_source == 'published': + logger.debug('exporting documents for published records') + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Export Documents for Published Records', + task_type='export_documents', + task_params_json=json.dumps({ + 'published': True, + 'subset': subset, + 'records_per_file': int(records_per_file), + 'archive_type': archive_type + }) + ) + ct.save() + + # handle export output configurations + ct = _handle_export_output(request, export_source, ct) + + # run celery task + bg_task = tasks.export_documents.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gm + gmc = models.GlobalMessageClient(request.session) + target = ":
Published Records"
+        gmc.add_gm({
+            'html': 'Exporting Documents for %s
' % ( + target, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('published') def export_mapped_fields(request, - export_source=None, - job_id=None, - subset=None): - - # get mapped fields export type - mapped_fields_export_type = request.POST.get('mapped_fields_export_type') - - # check for Kibana check - kibana_style = request.POST.get('kibana_style', False) - if kibana_style: - kibana_style = True - - # get archive type - archive_type = request.POST.get('archive_type') - - # get selected fields if present - mapped_field_include = request.POST.getlist('mapped_field_include',False) - - # export for single job - if export_source == 'job': - - logger.debug('exporting mapped fields from Job') - - # retrieve job - cjob = models.CombineJob.get_combine_job(int(job_id)) - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Export Mapped Fields for Job: %s' % cjob.job.name, - task_type = 'export_mapped_fields', - task_params_json = json.dumps({ - 'job_id':cjob.job.id, - 'mapped_fields_export_type':mapped_fields_export_type, - 'kibana_style':kibana_style, - 'archive_type':archive_type, - 'mapped_field_include':mapped_field_include - }) - ) - ct.save() - - # handle export output configurations - ct = _handle_export_output(request,export_source,ct) - - # run celery task - bg_task = tasks.export_mapped_fields.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gm - gmc = models.GlobalMessageClient(request.session) - target = "Job:
%s" % cjob.job.name - gmc.add_gm({ - 'html':'

Exporting Mapped Fields for %s

' % (target, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - - # export for published - if export_source == 'published': - - logger.debug('exporting mapped fields from published records') - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Export Mapped Fields for Published Records', - task_type = 'export_mapped_fields', - task_params_json = json.dumps({ - 'published':True, - 'subset':subset, - 'mapped_fields_export_type':mapped_fields_export_type, - 'kibana_style':kibana_style, - 'archive_type':archive_type, - 'mapped_field_include':mapped_field_include - }) - ) - ct.save() - - # handle export output configurations - ct = _handle_export_output(request,export_source,ct) - - # run celery task - bg_task = tasks.export_mapped_fields.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gm - gmc = models.GlobalMessageClient(request.session) - target = ":
Published Records" - gmc.add_gm({ - 'html':'Exporting Mapped Fields for %s (Background Tasks: %s)
' % (target, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('published') + export_source=None, + job_id=None, + subset=None): + # get mapped fields export type + mapped_fields_export_type = request.POST.get('mapped_fields_export_type') + + # check for Kibana check + kibana_style = request.POST.get('kibana_style', False) + if kibana_style: + kibana_style = True + + # get archive type + archive_type = request.POST.get('archive_type') + + # get selected fields if present + mapped_field_include = request.POST.getlist('mapped_field_include', False) + + # export for single job + if export_source == 'job': + logger.debug('exporting mapped fields from Job') + + # retrieve job + cjob = models.CombineJob.get_combine_job(int(job_id)) + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Export Mapped Fields for Job: %s' % cjob.job.name, + task_type='export_mapped_fields', + task_params_json=json.dumps({ + 'job_id': cjob.job.id, + 'mapped_fields_export_type': mapped_fields_export_type, + 'kibana_style': kibana_style, + 'archive_type': archive_type, + 'mapped_field_include': mapped_field_include + }) + ) + ct.save() + + # handle export output configurations + ct = _handle_export_output(request, export_source, ct) + + # run celery task + bg_task = tasks.export_mapped_fields.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gm + gmc = models.GlobalMessageClient(request.session) + target = "Job:
%s" % cjob.job.name + gmc.add_gm({ + 'html': '

Exporting Mapped Fields for %s

' % ( + target, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) + + # export for published + if export_source == 'published': + logger.debug('exporting mapped fields from published records') + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Export Mapped Fields for Published Records', + task_type='export_mapped_fields', + task_params_json=json.dumps({ + 'published': True, + 'subset': subset, + 'mapped_fields_export_type': mapped_fields_export_type, + 'kibana_style': kibana_style, + 'archive_type': archive_type, + 'mapped_field_include': mapped_field_include + }) + ) + ct.save() + + # handle export output configurations + ct = _handle_export_output(request, export_source, ct) + + # run celery task + bg_task = tasks.export_mapped_fields.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gm + gmc = models.GlobalMessageClient(request.session) + target = ":
Published Records" + gmc.add_gm({ + 'html': 'Exporting Mapped Fields for %s (Background Tasks: %s)
' % ( + target, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('published') def export_tabular_data(request, - export_source=None, - job_id=None, - subset=None): - - # get records per file - records_per_file = request.POST.get('records_per_file', False) - if records_per_file in ['',False]: - records_per_file = 500 - - # get mapped fields export type - tabular_data_export_type = request.POST.get('tabular_data_export_type') - - # get archive type - archive_type = request.POST.get('archive_type') - - # get fm config json - fm_export_config_json = request.POST.get('fm_export_config_json') - - # export for single job - if export_source == 'job': - - logger.debug('exporting tabular data from Job') - - # retrieve job - cjob = models.CombineJob.get_combine_job(int(job_id)) - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Export Tabular Data for Job: %s' % cjob.job.name, - task_type = 'export_tabular_data', - task_params_json = json.dumps({ - 'job_id':cjob.job.id, - 'records_per_file':int(records_per_file), - 'tabular_data_export_type':tabular_data_export_type, - 'archive_type':archive_type, - 'fm_export_config_json':fm_export_config_json - }) - ) - ct.save() - - # handle export output configurations - ct = _handle_export_output(request,export_source,ct) - - # run celery task - bg_task = tasks.export_tabular_data.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gm - gmc = models.GlobalMessageClient(request.session) - target = "Job:
%s" % cjob.job.name - gmc.add_gm({ - 'html':'

Exporting Tabular Data for %s

' % (target, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('job_details', - org_id=cjob.job.record_group.organization.id, - record_group_id=cjob.job.record_group.id, - job_id=cjob.job.id) - - # export for published - if export_source == 'published': - - logger.debug('exporting tabular data from published records') - - # get instance of Published model - published = models.PublishedRecords() - - # initiate Combine BG Task - ct = models.CombineBackgroundTask( - name = 'Export Tabular Data for Published Records', - task_type = 'export_tabular_data', - task_params_json = json.dumps({ - 'published':True, - 'subset':subset, - 'records_per_file':int(records_per_file), - 'tabular_data_export_type':tabular_data_export_type, - 'archive_type':archive_type, - 'fm_export_config_json':fm_export_config_json - }) - ) - ct.save() - - # handle export output configurations - ct = _handle_export_output(request,export_source,ct) - - # run celery task - bg_task = tasks.export_tabular_data.delay(ct.id) - logger.debug('firing bg task: %s' % bg_task) - ct.celery_task_id = bg_task.task_id - ct.save() - - # set gm - gmc = models.GlobalMessageClient(request.session) - target = ":
Published Records" - gmc.add_gm({ - 'html':'Exporting Tabular Data for %s (Background Tasks: %s)
' % (target, reverse('bg_tasks')), - 'class':'success' - }) - - return redirect('published') + export_source=None, + job_id=None, + subset=None): + # get records per file + records_per_file = request.POST.get('records_per_file', False) + if records_per_file in ['', False]: + records_per_file = 500 + + # get mapped fields export type + tabular_data_export_type = request.POST.get('tabular_data_export_type') + + # get archive type + archive_type = request.POST.get('archive_type') + + # get fm config json + fm_export_config_json = request.POST.get('fm_export_config_json') + + # export for single job + if export_source == 'job': + logger.debug('exporting tabular data from Job') + + # retrieve job + cjob = models.CombineJob.get_combine_job(int(job_id)) + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Export Tabular Data for Job: %s' % cjob.job.name, + task_type='export_tabular_data', + task_params_json=json.dumps({ + 'job_id': cjob.job.id, + 'records_per_file': int(records_per_file), + 'tabular_data_export_type': tabular_data_export_type, + 'archive_type': archive_type, + 'fm_export_config_json': fm_export_config_json + }) + ) + ct.save() + + # handle export output configurations + ct = _handle_export_output(request, export_source, ct) + + # run celery task + bg_task = tasks.export_tabular_data.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gm + gmc = models.GlobalMessageClient(request.session) + target = "Job:
%s" % cjob.job.name + gmc.add_gm({ + 'html': '

Exporting Tabular Data for %s

' % ( + target, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('job_details', + org_id=cjob.job.record_group.organization.id, + record_group_id=cjob.job.record_group.id, + job_id=cjob.job.id) + + # export for published + if export_source == 'published': + logger.debug('exporting tabular data from published records') + + # get instance of Published model + published = models.PublishedRecords() + + # initiate Combine BG Task + ct = models.CombineBackgroundTask( + name='Export Tabular Data for Published Records', + task_type='export_tabular_data', + task_params_json=json.dumps({ + 'published': True, + 'subset': subset, + 'records_per_file': int(records_per_file), + 'tabular_data_export_type': tabular_data_export_type, + 'archive_type': archive_type, + 'fm_export_config_json': fm_export_config_json + }) + ) + ct.save() + + # handle export output configurations + ct = _handle_export_output(request, export_source, ct) + + # run celery task + bg_task = tasks.export_tabular_data.delay(ct.id) + logger.debug('firing bg task: %s' % bg_task) + ct.celery_task_id = bg_task.task_id + ct.save() + + # set gm + gmc = models.GlobalMessageClient(request.session) + target = ":
Published Records" + gmc.add_gm({ + 'html': 'Exporting Tabular Data for %s (Background Tasks: %s)
' % ( + target, reverse('bg_tasks')), + 'class': 'success' + }) + + return redirect('published') def _handle_export_output(request, export_source, ct): - - ''' + ''' Function to handle export outputs - currently only augmenting with S3 export @@ -3351,25 +3269,24 @@ def _handle_export_output(request, export_source, ct): ct (CombineBackgroundTask) ''' - # handle s3 export - s3_export = request.POST.get('s3_export',False) - if s3_export: - s3_export = True - - # if s3_export - if s3_export: + # handle s3 export + s3_export = request.POST.get('s3_export', False) + if s3_export: + s3_export = True - # udpate task params - ct.update_task_params({ - 's3_export':True, - 's3_bucket':request.POST.get('s3_bucket', None), - 's3_key':request.POST.get('s3_key', None), - 's3_export_type':request.POST.get('s3_export_type', None) - }) + # if s3_export + if s3_export: + # udpate task params + ct.update_task_params({ + 's3_export': True, + 's3_bucket': request.POST.get('s3_bucket', None), + 's3_key': request.POST.get('s3_key', None), + 's3_export_type': request.POST.get('s3_export_type', None) + }) - # save and return - ct.save() - return ct + # save and return + ct.save() + return ct #################################################################### @@ -3377,113 +3294,110 @@ def _handle_export_output(request, export_source, ct): #################################################################### def analysis(request): - - ''' + ''' Analysis home ''' - # get all jobs associated with record group - analysis_jobs = models.Job.objects.filter(job_type='AnalysisJob') + # get all jobs associated with record group + analysis_jobs = models.Job.objects.filter(job_type='AnalysisJob') - # get analysis jobs hierarchy - analysis_hierarchy = models.AnalysisJob.get_analysis_hierarchy() + # get analysis jobs hierarchy + analysis_hierarchy = models.AnalysisJob.get_analysis_hierarchy() - # get analysis jobs lineage - analysis_job_lineage = models.Job.get_all_jobs_lineage( - organization = analysis_hierarchy['organization'], - record_group = analysis_hierarchy['record_group'], - exclude_analysis_jobs = False - ) + # get analysis jobs lineage + analysis_job_lineage = models.Job.get_all_jobs_lineage( + organization=analysis_hierarchy['organization'], + record_group=analysis_hierarchy['record_group'], + exclude_analysis_jobs=False + ) - # loop through jobs - for job in analysis_jobs: - # update status - job.update_status() + # loop through jobs + for job in analysis_jobs: + # update status + job.update_status() - # render page - return render(request, 'core/analysis.html', { - 'jobs':analysis_jobs, - 'job_lineage_json':json.dumps(analysis_job_lineage), - 'for_analysis':True, - 'breadcrumbs':breadcrumb_parser(request) - }) + # render page + return render(request, 'core/analysis.html', { + 'jobs': analysis_jobs, + 'job_lineage_json': json.dumps(analysis_job_lineage), + 'for_analysis': True, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def job_analysis(request): - - ''' + ''' Run new analysis job ''' - # if GET, prepare form - if request.method == 'GET': - - # retrieve jobs (limiting if needed) - input_jobs = models.Job.objects.all() + # if GET, prepare form + if request.method == 'GET': - # limit if analysis_type set - analysis_type = request.GET.get('type', None) - subset = request.GET.get('subset', None) - if analysis_type == 'published': + # retrieve jobs (limiting if needed) + input_jobs = models.Job.objects.all() - # load PublishedRecords - published = models.PublishedRecords(subset=subset) + # limit if 
analysis_type set + analysis_type = request.GET.get('type', None) + subset = request.GET.get('subset', None) + if analysis_type == 'published': - # define input_jobs - input_jobs = published.published_jobs + # load PublishedRecords + published = models.PublishedRecords(subset=subset) - else: - published = None + # define input_jobs + input_jobs = published.published_jobs - # get validation scenarios - validation_scenarios = models.ValidationScenario.objects.all() + else: + published = None - # get field mappers - field_mappers = models.FieldMapper.objects.all() + # get validation scenarios + validation_scenarios = models.ValidationScenario.objects.all() - # get record identifier transformation scenarios - rits = models.RecordIdentifierTransformationScenario.objects.all() + # get field mappers + field_mappers = models.FieldMapper.objects.all() - # get job lineage for all jobs (filtered to input jobs scope) - ld = models.Job.get_all_jobs_lineage(jobs_query_set=input_jobs) + # get record identifier transformation scenarios + rits = models.RecordIdentifierTransformationScenario.objects.all() - # get all bulk downloads - bulk_downloads = models.DPLABulkDataDownload.objects.all() + # get job lineage for all jobs (filtered to input jobs scope) + ld = models.Job.get_all_jobs_lineage(jobs_query_set=input_jobs) - # render page - return render(request, 'core/job_analysis.html', { - 'job_select_type':'multiple', - 'input_jobs':input_jobs, - 'published':published, - 'validation_scenarios':validation_scenarios, - 'rits':rits, - 'field_mappers':field_mappers, - 'xml2kvp_handle':models.XML2kvp(), - 'analysis_type':analysis_type, - 'bulk_downloads':bulk_downloads, - 'job_lineage_json':json.dumps(ld) - }) + # get all bulk downloads + bulk_downloads = models.DPLABulkDataDownload.objects.all() - # if POST, submit job - if request.method == 'POST': + # render page + return render(request, 'core/job_analysis.html', { + 'job_select_type': 'multiple', + 'input_jobs': input_jobs, + 'published': published, + 'validation_scenarios': validation_scenarios, + 'rits': rits, + 'field_mappers': field_mappers, + 'xml2kvp_handle': models.XML2kvp(), + 'analysis_type': analysis_type, + 'bulk_downloads': bulk_downloads, + 'job_lineage_json': json.dumps(ld) + }) - cjob = models.CombineJob.init_combine_job( - user = request.user, - record_group = record_group, - job_type_class = models.AnalysisJob, - job_params = request.POST) + # if POST, submit job + if request.method == 'POST': - # start job and update status - job_status = cjob.start_job() + cjob = models.CombineJob.init_combine_job( + user=request.user, + record_group=record_group, + job_type_class=models.AnalysisJob, + job_params=request.POST) - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() + # start job and update status + job_status = cjob.start_job() - return redirect('analysis') + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() + return redirect('analysis') #################################################################### @@ -3491,72 +3405,66 @@ def job_analysis(request): #################################################################### def bg_tasks(request): + logger.debug('retrieving background tasks') - logger.debug('retrieving background tasks') - - # update all tasks not marked as complete - nc_tasks = models.CombineBackgroundTask.objects.filter(completed=False) - for task in nc_tasks: - 
task.update() + # update all tasks not marked as complete + nc_tasks = models.CombineBackgroundTask.objects.filter(completed=False) + for task in nc_tasks: + task.update() - return render(request, 'core/bg_tasks.html', { - 'breadcrumbs':breadcrumb_parser(request) - }) + return render(request, 'core/bg_tasks.html', { + 'breadcrumbs': breadcrumb_parser(request) + }) def bg_tasks_delete_all(request): + logger.debug('deleting all background tasks') - logger.debug('deleting all background tasks') + # delete all Combine Background Tasks + cts = models.CombineBackgroundTask.objects.all() + for ct in cts: + ct.delete() - # delete all Combine Background Tasks - cts = models.CombineBackgroundTask.objects.all() - for ct in cts: - ct.delete() - - return redirect('bg_tasks') + return redirect('bg_tasks') def bg_task(request, task_id): + # get task + ct = models.CombineBackgroundTask.objects.get(pk=int(task_id)) + logger.debug('retrieving task: %s' % ct) - # get task - ct = models.CombineBackgroundTask.objects.get(pk=int(task_id)) - logger.debug('retrieving task: %s' % ct) - - # include job if mentioned in task params - if 'job_id' in ct.task_params: - cjob = models.CombineJob.get_combine_job(ct.task_params['job_id']) - else: - cjob = None + # include job if mentioned in task params + if 'job_id' in ct.task_params: + cjob = models.CombineJob.get_combine_job(ct.task_params['job_id']) + else: + cjob = None - return render(request, 'core/bg_task.html', { - 'ct':ct, - 'cjob':cjob, - 'breadcrumbs':breadcrumb_parser(request) - }) + return render(request, 'core/bg_task.html', { + 'ct': ct, + 'cjob': cjob, + 'breadcrumbs': breadcrumb_parser(request) + }) def bg_task_delete(request, task_id): + # get task + ct = models.CombineBackgroundTask.objects.get(pk=int(task_id)) + logger.debug('deleting task: %s' % ct) - # get task - ct = models.CombineBackgroundTask.objects.get(pk=int(task_id)) - logger.debug('deleting task: %s' % ct) + ct.delete() - ct.delete() - - return redirect('bg_tasks') + return redirect('bg_tasks') def bg_task_cancel(request, task_id): + # get task + ct = models.CombineBackgroundTask.objects.get(pk=int(task_id)) + logger.debug('cancelling task: %s' % ct) - # get task - ct = models.CombineBackgroundTask.objects.get(pk=int(task_id)) - logger.debug('cancelling task: %s' % ct) - - # cancel - ct.cancel() - - return redirect('bg_tasks') + # cancel + ct.cancel() + return redirect('bg_tasks') #################################################################### @@ -3565,669 +3473,634 @@ def bg_task_cancel(request, task_id): #################################################################### class DTRecordsJson(BaseDatatableView): - - ''' + ''' Prepare and return Datatables JSON for Records table in Job Details ''' - # define the columns that will be returned - columns = [ - '_id', - 'record_id', - 'job_id', - 'oai_set', - 'unique', - 'document', - 'error', - 'valid' - ] - - # define column names that will be used in sorting - # order is important and should be same as order of columns - # displayed by datatables. 
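The export views earlier in this file and the background-task views above share one dispatch pattern: persist a task row, fire the Celery task, then store the Celery task id for later status polling. A minimal sketch of that pattern, assuming stand-in names BackgroundTask and export_documents in place of Combine's CombineBackgroundTask model and tasks.export_documents:

import json

def dispatch_export(job, subset=None):

    # persist a task row first so the background tasks page can list it immediately
    task_row = BackgroundTask(
        name='Export Documents for Job: %s' % job.name,
        task_type='export_documents',
        task_params_json=json.dumps({'job_id': job.id, 'subset': subset})
    )
    task_row.save()

    # fire the Celery task asynchronously and keep its id for later updates
    async_result = export_documents.delay(task_row.id)
    task_row.celery_task_id = async_result.task_id
    task_row.save()

    return task_row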
For non sortable columns use empty - # value like '' - # order_columns = ['number', 'user', 'state', '', ''] - order_columns = [ - '_id', - 'record_id', - 'job_id', - 'oai_set', - 'unique', - 'document', - 'error', - 'valid' - ] - - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 - - - def get_initial_queryset(self): - - # return queryset used as base for futher sorting/filtering - - # if job present, filter by job - if 'job_id' in self.kwargs.keys(): - - # get jobself.kwargs['job_id'] - job = models.Job.objects.get(pk=self.kwargs['job_id']) - - # return filtered queryset - if 'success_filter' in self.kwargs.keys(): - success_filter = bool(int(self.kwargs['success_filter'])) - else: - success_filter = None - return job.get_records(success=success_filter) - - # else, return all records - else: - return models.Record.objects - - - def render_column(self, row, column): - - # construct record link - record_link = reverse(record, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id, 'record_id':str(row.id) - }) - - # handle db_id - if column == '_id': - return '%s' % (record_link, str(row.id)) - - # handle record_id - if column == 'record_id': - return '%s' % (record_link, row.record_id) - - # handle document - elif column == 'document': - # attempt to parse as XML and return if valid or not - try: - xml = etree.fromstring(row.document.encode('utf-8')) - return 'Valid XML' % (reverse(record_document, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id, 'record_id':str(row.id) - })) - except: - return 'Invalid XML' - - # handle associated job - elif column == 'job': - return '%s' % (reverse(job_details, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id - }), row.job.name) - - # handle unique - elif column == 'unique': - if row.unique: - return 'Unique in Job' - else: - return 'Duplicate in Job' - - # handle validation_results - elif column == 'valid': - if row.valid: - return 'Valid' - else: - return 'Invalid' - - else: - return super(DTRecordsJson, self).render_column(row, column) - - - def filter_queryset(self, qs): - # use parameters passed in GET request to filter queryset - - # handle search - search = self.request.GET.get(u'search[value]', None) - if search: - # sniff out ObjectId if present - if len(search) == 24: - try: - oid = ObjectId(search) - qs = qs.filter(mongoengine.Q(id=oid)) - except: - logger.debug('recieved 24 chars, but not ObjectId') - else: - qs = qs.filter(mongoengine.Q(record_id=search)) - - # return - return qs - + # define the columns that will be returned + columns = [ + '_id', + 'record_id', + 'job_id', + 'oai_set', + 'unique', + 'document', + 'error', + 'valid' + ] + + # define column names that will be used in sorting + # order is important and should be same as order of columns + # displayed by datatables. 
For non sortable columns use empty + # value like '' + # order_columns = ['number', 'user', 'state', '', ''] + order_columns = [ + '_id', + 'record_id', + 'job_id', + 'oai_set', + 'unique', + 'document', + 'error', + 'valid' + ] + + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 + + def get_initial_queryset(self): + + # return queryset used as base for futher sorting/filtering + + # if job present, filter by job + if 'job_id' in self.kwargs.keys(): + + # get jobself.kwargs['job_id'] + job = models.Job.objects.get(pk=self.kwargs['job_id']) + + # return filtered queryset + if 'success_filter' in self.kwargs.keys(): + success_filter = bool(int(self.kwargs['success_filter'])) + else: + success_filter = None + return job.get_records(success=success_filter) + + # else, return all records + else: + return models.Record.objects + + def render_column(self, row, column): + + # construct record link + record_link = reverse(record, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id, 'record_id': str(row.id) + }) + + # handle db_id + if column == '_id': + return '%s' % (record_link, str(row.id)) + + # handle record_id + if column == 'record_id': + return '%s' % (record_link, row.record_id) + + # handle document + elif column == 'document': + # attempt to parse as XML and return if valid or not + try: + xml = etree.fromstring(row.document.encode('utf-8')) + return 'Valid XML' % (reverse(record_document, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id, 'record_id': str(row.id) + })) + except: + return 'Invalid XML' + + # handle associated job + elif column == 'job': + return '%s' % (reverse(job_details, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id + }), row.job.name) + + # handle unique + elif column == 'unique': + if row.unique: + return 'Unique in Job' + else: + return 'Duplicate in Job' + + # handle validation_results + elif column == 'valid': + if row.valid: + return 'Valid' + else: + return 'Invalid' + + else: + return super(DTRecordsJson, self).render_column(row, column) + + def filter_queryset(self, qs): + # use parameters passed in GET request to filter queryset + + # handle search + search = self.request.GET.get(u'search[value]', None) + if search: + # sniff out ObjectId if present + if len(search) == 24: + try: + oid = ObjectId(search) + qs = qs.filter(mongoengine.Q(id=oid)) + except: + logger.debug('recieved 24 chars, but not ObjectId') + else: + qs = qs.filter(mongoengine.Q(record_id=search)) + + # return + return qs class DTPublishedJson(BaseDatatableView): - - ''' + ''' Prepare and return Datatables JSON for Published records ''' - # define the columns that will be returned - columns = [ - '_id', - 'record_id', - 'job_id', - 'publish_set_id', - # 'oai_set', - # 'unique_published', - 'document' - ] - - # define column names that will be used in sorting - # order is important and should be same as order of columns - # displayed by datatables. 
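The render_column handlers above decide between "Valid XML" and "Invalid XML" by attempting a parse. A small, self-contained restatement of that check using lxml (the etree already imported by these views), with the bare except narrowed to the parser error:

from lxml import etree

def is_well_formed_xml(document):
    # mirrors the try/except around etree.fromstring() in render_column:
    # a record document counts as valid if it parses at all
    try:
        etree.fromstring(document.encode('utf-8'))
        return True
    except etree.XMLSyntaxError:
        return False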
For non sortable columns use empty - # value like '' - order_columns = [ - '_id', - 'record_id', - 'job_id', - 'publish_set_id', - # 'oai_set', - # 'unique_published', - 'document' - ] - - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 - - - def get_initial_queryset(self): - - # return queryset used as base for futher sorting/filtering - - # get PublishedRecords instance - pr = models.PublishedRecords(subset=self.kwargs.get('subset', None)) - - # return queryset - return pr.records - - - def render_column(self, row, column): - - # handle document metadata - - if column == '_id': - return '%s' % (reverse(record, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id, 'record_id':str(row.id) - }), str(row.id)) - - if column == 'record_id': - return '%s' % (reverse(record, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id, 'record_id':str(row.id) - }), row.record_id) - - if column == 'job_id': - return '%s' % (reverse(job_details, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id - }), row.job.name) - - if column == 'document': - # attempt to parse as XML and return if valid or not - try: - xml = etree.fromstring(row.document.encode('utf-8')) - return 'Valid XML' % (reverse(record_document, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id, 'record_id':str(row.id) - })) - except: - return 'Invalid XML' - - # # handle associated job - # if column == 'unique_published': - # if row.unique_published: - # return 'True' - # else: - # return 'False' - - else: - return super(DTPublishedJson, self).render_column(row, column) - - - def filter_queryset(self, qs): - # use parameters passed in GET request to filter queryset - - # handle search - search = self.request.GET.get(u'search[value]', None) - if search: - # sniff out ObjectId if present - if len(search) == 24: - try: - oid = ObjectId(search) - qs = qs.filter(mongoengine.Q(id=oid)) - except: - logger.debug('recieved 24 chars, but not ObjectId') - else: - qs = qs.filter(mongoengine.Q(record_id=search) | mongoengine.Q(publish_set_id=search)) - - # return - return qs - + # define the columns that will be returned + columns = [ + '_id', + 'record_id', + 'job_id', + 'publish_set_id', + # 'oai_set', + # 'unique_published', + 'document' + ] + + # define column names that will be used in sorting + # order is important and should be same as order of columns + # displayed by datatables. 
For non sortable columns use empty + # value like '' + order_columns = [ + '_id', + 'record_id', + 'job_id', + 'publish_set_id', + # 'oai_set', + # 'unique_published', + 'document' + ] + + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 + + def get_initial_queryset(self): + + # return queryset used as base for futher sorting/filtering + + # get PublishedRecords instance + pr = models.PublishedRecords(subset=self.kwargs.get('subset', None)) + + # return queryset + return pr.records + + def render_column(self, row, column): + + # handle document metadata + + if column == '_id': + return '%s' % (reverse(record, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id, 'record_id': str(row.id) + }), str(row.id)) + + if column == 'record_id': + return '%s' % (reverse(record, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id, 'record_id': str(row.id) + }), row.record_id) + + if column == 'job_id': + return '%s' % (reverse(job_details, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id + }), row.job.name) + + if column == 'document': + # attempt to parse as XML and return if valid or not + try: + xml = etree.fromstring(row.document.encode('utf-8')) + return 'Valid XML' % (reverse(record_document, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id, 'record_id': str(row.id) + })) + except: + return 'Invalid XML' + + # # handle associated job + # if column == 'unique_published': + # if row.unique_published: + # return 'True' + # else: + # return 'False' + + else: + return super(DTPublishedJson, self).render_column(row, column) + + def filter_queryset(self, qs): + # use parameters passed in GET request to filter queryset + + # handle search + search = self.request.GET.get(u'search[value]', None) + if search: + # sniff out ObjectId if present + if len(search) == 24: + try: + oid = ObjectId(search) + qs = qs.filter(mongoengine.Q(id=oid)) + except: + logger.debug('recieved 24 chars, but not ObjectId') + else: + qs = qs.filter(mongoengine.Q(record_id=search) | mongoengine.Q(publish_set_id=search)) + + # return + return qs class DTIndexingFailuresJson(BaseDatatableView): - - ''' + ''' Databales JSON response for Indexing Failures ''' - # define the columns that will be returned - columns = ['_id', 'record_id', 'mapping_error'] + # define the columns that will be returned + columns = ['_id', 'record_id', 'mapping_error'] - # define column names that will be used in sorting - order_columns = ['_id', 'record_id', 'mapping_error'] + # define column names that will be used in sorting + order_columns = ['_id', 'record_id', 'mapping_error'] - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 + def get_initial_queryset(self): - def get_initial_queryset(self): + # return queryset used as base for futher sorting/filtering - # return queryset used as base for futher sorting/filtering + # get 
job + job = models.Job.objects.get(pk=self.kwargs['job_id']) - # get job - job = models.Job.objects.get(pk=self.kwargs['job_id']) + # return filtered queryset + return models.IndexMappingFailure.objects(job_id=job.id) - # return filtered queryset - return models.IndexMappingFailure.objects(job_id=job.id) + def render_column(self, row, column): + # determine record link + target_record = row.record + record_link = reverse(record, kwargs={ + 'org_id': target_record.job.record_group.organization.id, + 'record_group_id': target_record.job.record_group.id, + 'job_id': target_record.job.id, + 'record_id': target_record.id + }) - def render_column(self, row, column): + if column == '_id': + return '%s' % (record_link, target_record.id) - # determine record link - target_record = row.record - record_link = reverse(record, kwargs={ - 'org_id':target_record.job.record_group.organization.id, - 'record_group_id':target_record.job.record_group.id, - 'job_id':target_record.job.id, - 'record_id':target_record.id - }) + if column == 'record_id': + return '%s' % (record_link, target_record.record_id) - if column == '_id': - return '%s' % (record_link, target_record.id) + # handle associated job + if column == 'job': + return row.job.name - if column == 'record_id': - return '%s' % (record_link, target_record.record_id) + else: + return super(DTIndexingFailuresJson, self).render_column(row, column) - # handle associated job - if column == 'job': - return row.job.name +# def filter_queryset(self, qs): - else: - return super(DTIndexingFailuresJson, self).render_column(row, column) - - - # def filter_queryset(self, qs): - - # # handle search - # search = self.request.GET.get(u'search[value]', None) - # if search: - # logger.debug('looking for: %s' % search) - # qs = qs.filter(Q(id = search) | Q(combine_id = search) | Q(record_id = search) | Q(mapping_error__contains = search)) - - # return qs +# # handle search +# search = self.request.GET.get(u'search[value]', None) +# if search: +# logger.debug('looking for: %s' % search) +# qs = qs.filter(Q(id = search) | Q(combine_id = search) | Q(record_id = search) | Q(mapping_error__contains = search)) +# return qs class DTJobValidationScenarioFailuresJson(BaseDatatableView): - - ''' + ''' Prepare and return Datatables JSON for RecordValidation failures from Job, per Validation Scenario ''' - # define the columns that will be returned - columns = [ - 'id', - 'record', - 'results_payload', - 'fail_count' - ] - - # define column names that will be used in sorting - # order is important and should be same as order of columns - # displayed by datatables. 
For non sortable columns use empty - # value like '' - # order_columns = ['number', 'user', 'state', '', ''] - order_columns = [ - 'id', - 'record', - 'results_payload', - 'fail_count' - ] - - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 - - - def get_initial_queryset(self): - - # return queryset used as base for futher sorting/filtering - - # get job - jv = models.JobValidation.objects.get(pk=self.kwargs['job_validation_id']) - - # return filtered queryset - return jv.get_record_validation_failures() - - - def render_column(self, row, column): - - # determine record link - target_record = row.record - record_link = "%s#validation_tab" % reverse(record, kwargs={ - 'org_id':target_record.job.record_group.organization.id, - 'record_group_id':target_record.job.record_group.id, - 'job_id':target_record.job.id, - 'record_id':target_record.id - }) - - # handle record id - if column == 'id': - # get target record from row - target_record = row.record - return '%s' % (record_link, target_record.id) - - # handle record record_id - elif column == 'record': - # get target record from row - target_record = row.record - return '%s' % (record_link, target_record.record_id) - - # handle results_payload - elif column == 'results_payload': - rp = json.loads(row.results_payload)['failed'] - return ', '.join(rp) - - # handle all else - else: - return super(DTJobValidationScenarioFailuresJson, self).render_column(row, column) - - - def filter_queryset(self, qs): - # use parameters passed in GET request to filter queryset - - # handle search - search = self.request.GET.get(u'search[value]', None) - if search: - # sniff out ObjectId if present - if len(search) == 24: - try: - oid = ObjectId(search) - qs = qs.filter(mongoengine.Q(record_id=oid)) - except: - logger.debug('recieved 24 chars, but not ObjectId') - # return - return qs - + # define the columns that will be returned + columns = [ + 'id', + 'record', + 'results_payload', + 'fail_count' + ] + + # define column names that will be used in sorting + # order is important and should be same as order of columns + # displayed by datatables. 
For non sortable columns use empty + # value like '' + # order_columns = ['number', 'user', 'state', '', ''] + order_columns = [ + 'id', + 'record', + 'results_payload', + 'fail_count' + ] + + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 + + def get_initial_queryset(self): + + # return queryset used as base for futher sorting/filtering + + # get job + jv = models.JobValidation.objects.get(pk=self.kwargs['job_validation_id']) + + # return filtered queryset + return jv.get_record_validation_failures() + + def render_column(self, row, column): + + # determine record link + target_record = row.record + record_link = "%s#validation_tab" % reverse(record, kwargs={ + 'org_id': target_record.job.record_group.organization.id, + 'record_group_id': target_record.job.record_group.id, + 'job_id': target_record.job.id, + 'record_id': target_record.id + }) + + # handle record id + if column == 'id': + # get target record from row + target_record = row.record + return '%s' % (record_link, target_record.id) + + # handle record record_id + elif column == 'record': + # get target record from row + target_record = row.record + return '%s' % (record_link, target_record.record_id) + + # handle results_payload + elif column == 'results_payload': + rp = json.loads(row.results_payload)['failed'] + return ', '.join(rp) + + # handle all else + else: + return super(DTJobValidationScenarioFailuresJson, self).render_column(row, column) + + def filter_queryset(self, qs): + # use parameters passed in GET request to filter queryset + + # handle search + search = self.request.GET.get(u'search[value]', None) + if search: + # sniff out ObjectId if present + if len(search) == 24: + try: + oid = ObjectId(search) + qs = qs.filter(mongoengine.Q(record_id=oid)) + except: + logger.debug('recieved 24 chars, but not ObjectId') + # return + return qs class DTDPLABulkDataMatches(BaseDatatableView): - - ''' + ''' Prepare and return Datatables JSON for RecordValidation failures from Job, per Validation Scenario ''' - # define the columns that will be returned - columns = [ - 'id', - 'record_id' - ] - - # define column names that will be used in sorting - # order is important and should be same as order of columns - # displayed by datatables. 
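Several filter_queryset implementations in this file use the same search heuristic: a 24-character term is probably a MongoDB ObjectId, anything else is matched against record_id. A compact version of that heuristic, assuming the bson and mongoengine packages these views already rely on:

from bson import ObjectId
from bson.errors import InvalidId
import mongoengine

def apply_record_search(qs, search):
    # 24 characters usually means an ObjectId; fall back to record_id otherwise
    if len(search) == 24:
        try:
            return qs.filter(mongoengine.Q(id=ObjectId(search)))
        except InvalidId:
            # received 24 chars, but not a valid ObjectId
            pass
    return qs.filter(mongoengine.Q(record_id=search))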
For non sortable columns use empty - # value like '' - # order_columns = ['number', 'user', 'state', '', ''] - order_columns = [ - 'id', - 'record_id' - ] - - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 - - - def get_initial_queryset(self): - - # return queryset used as base for futher sorting/filtering - - # get job and records - job = models.Job.objects.get(pk=self.kwargs['job_id']) - - # return queryset filtered for match/miss - if self.kwargs['match_type'] == 'matches': - return job.get_records().filter(dbdm=True) - elif self.kwargs['match_type'] == 'misses': - return job.get_records().filter(dbdm=False) - - - def render_column(self, row, column): - - # determine record link - target_record = row - record_link = reverse(record, kwargs={ - 'org_id':target_record.job.record_group.organization.id, - 'record_group_id':target_record.job.record_group.id, - 'job_id':target_record.job.id, - 'record_id':target_record.id - }) - - # handle record id - if column == 'id': - # get target record from row - target_record = row - return '%s' % (record_link, target_record.id) - - # handle record record_id - elif column == 'record_id': - # get target record from row - target_record = row - return '%s' % (record_link, target_record.record_id) - - # handle all else - else: - return super(DTDPLABulkDataMatches, self).render_column(row, column) - - - def filter_queryset(self, qs): - # use parameters passed in GET request to filter queryset - - # handle search - search = self.request.GET.get(u'search[value]', None) - if search: - # sniff out ObjectId if present - if len(search) == 24: - try: - oid = ObjectId(search) - qs = qs.filter(mongoengine.Q(id=oid)) - except: - logger.debug('recieved 24 chars, but not ObjectId') - else: - qs = qs.filter(mongoengine.Q(record_id=search)) - - # return - return qs - + # define the columns that will be returned + columns = [ + 'id', + 'record_id' + ] + + # define column names that will be used in sorting + # order is important and should be same as order of columns + # displayed by datatables. 
For non sortable columns use empty + # value like '' + # order_columns = ['number', 'user', 'state', '', ''] + order_columns = [ + 'id', + 'record_id' + ] + + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 + + def get_initial_queryset(self): + + # return queryset used as base for futher sorting/filtering + + # get job and records + job = models.Job.objects.get(pk=self.kwargs['job_id']) + + # return queryset filtered for match/miss + if self.kwargs['match_type'] == 'matches': + return job.get_records().filter(dbdm=True) + elif self.kwargs['match_type'] == 'misses': + return job.get_records().filter(dbdm=False) + + def render_column(self, row, column): + + # determine record link + target_record = row + record_link = reverse(record, kwargs={ + 'org_id': target_record.job.record_group.organization.id, + 'record_group_id': target_record.job.record_group.id, + 'job_id': target_record.job.id, + 'record_id': target_record.id + }) + + # handle record id + if column == 'id': + # get target record from row + target_record = row + return '%s' % (record_link, target_record.id) + + # handle record record_id + elif column == 'record_id': + # get target record from row + target_record = row + return '%s' % (record_link, target_record.record_id) + + # handle all else + else: + return super(DTDPLABulkDataMatches, self).render_column(row, column) + + def filter_queryset(self, qs): + # use parameters passed in GET request to filter queryset + + # handle search + search = self.request.GET.get(u'search[value]', None) + if search: + # sniff out ObjectId if present + if len(search) == 24: + try: + oid = ObjectId(search) + qs = qs.filter(mongoengine.Q(id=oid)) + except: + logger.debug('recieved 24 chars, but not ObjectId') + else: + qs = qs.filter(mongoengine.Q(record_id=search)) + + # return + return qs class JobRecordDiffs(BaseDatatableView): - - ''' + ''' Prepare and return Datatables JSON for Records that were transformed during a Transformation Job ''' - # define the columns that will be returned - columns = [ - 'id', - 'record_id', - ] - - # define column names that will be used in sorting - # order is important and should be same as order of columns - # displayed by datatables. For non sortable columns use empty - # value like '' - order_columns = [ - 'id', - 'record_id' - ] - - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 - - - def get_initial_queryset(self): + # define the columns that will be returned + columns = [ + 'id', + 'record_id', + ] - # return queryset used as base for futher sorting/filtering + # define column names that will be used in sorting + # order is important and should be same as order of columns + # displayed by datatables. 
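DTDPLABulkDataMatches above keys its queryset off the match_type URL kwarg and the dbdm flag set during DPLA bulk data comparison. Restated on its own, with job standing in for a models.Job instance:

def bulk_data_partition(job, match_type):
    # same record queryset, partitioned on the dbdm (bulk data match) flag
    records = job.get_records()
    if match_type == 'matches':
        return records.filter(dbdm=True)
    if match_type == 'misses':
        return records.filter(dbdm=False)
    return records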
For non sortable columns use empty + # value like '' + order_columns = [ + 'id', + 'record_id' + ] - # get job - job = models.Job.objects.get(pk=self.kwargs['job_id']) - job_records = job.get_records() + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 - # filter for records that were transformed - return job_records.filter(transformed=True) + def get_initial_queryset(self): + # return queryset used as base for futher sorting/filtering - def render_column(self, row, column): + # get job + job = models.Job.objects.get(pk=self.kwargs['job_id']) + job_records = job.get_records() - # record link - record_link = "%s#job_type_specific_tab" % reverse(record, kwargs={ - 'org_id':row.job.record_group.organization.id, - 'record_group_id':row.job.record_group.id, - 'job_id':row.job.id, 'record_id':row.id - }) + # filter for records that were transformed + return job_records.filter(transformed=True) - # handle db_id - if column == 'id': - return '%s' % (record_link, row.id) + def render_column(self, row, column): - # handle record_id - if column == 'record_id': - return '%s' % (record_link, row.record_id) + # record link + record_link = "%s#job_type_specific_tab" % reverse(record, kwargs={ + 'org_id': row.job.record_group.organization.id, + 'record_group_id': row.job.record_group.id, + 'job_id': row.job.id, 'record_id': row.id + }) - else: - return super(JobRecordDiffs, self).render_column(row, column) + # handle db_id + if column == 'id': + return '%s' % (record_link, row.id) + # handle record_id + if column == 'record_id': + return '%s' % (record_link, row.record_id) - def filter_queryset(self, qs): + else: + return super(JobRecordDiffs, self).render_column(row, column) - # use parameters passed in GET request to filter queryset + def filter_queryset(self, qs): - # handle search - search = self.request.GET.get(u'search[value]', None) - if search: - qs = qs.filter(Q(id__contains=search) | Q(record_id__contains=search) | Q(document__contains=search)) + # use parameters passed in GET request to filter queryset - # return - return qs + # handle search + search = self.request.GET.get(u'search[value]', None) + if search: + qs = qs.filter(Q(id__contains=search) | Q(record_id__contains=search) | Q(document__contains=search)) + # return + return qs class CombineBackgroundTasksDT(BaseDatatableView): - - ''' + ''' Prepare and return Datatables JSON for Records table in Job Details ''' - # define the columns that will be returned - columns = [ - 'id', - 'start_timestamp', - 'name', - 'task_type', - 'celery_task_id', - 'completed', - 'duration', - 'actions' - ] - - # define column names that will be used in sorting - # order is important and should be same as order of columns - # displayed by datatables. 
For non sortable columns use empty - # value like '' - # order_columns = ['number', 'user', 'state', '', ''] - order_columns = [ - 'id', - 'start_timestamp', - 'name', - 'task_type', - 'celery_task_id', - 'completed', - 'duration', - 'actions' - ] - - # set max limit of records returned, this is used to protect our site if someone tries to attack our site - # and make it return huge amount of data - max_display_length = 1000 - - - def get_initial_queryset(self): - - # return queryset used as base for futher sorting/filtering - return models.CombineBackgroundTask.objects - - - def render_column(self, row, column): - - if column == 'task_type': - return row.get_task_type_display() - - elif column == 'celery_task_id': - return '%s' % row.celery_task_id - - elif column == 'completed': - if row.completed: - if row.celery_status in ['STOPPED','REVOKED']: - return "%s" % row.celery_status - else: - return "%s" % row.celery_status - else: - return "%s" % row.celery_status - - elif column == 'duration': - return row.calc_elapsed_as_string() - - - elif column == 'actions': - return ' ' % ( - reverse(bg_task, kwargs={'task_id':row.id}), - reverse(bg_task_cancel, kwargs={'task_id':row.id}), - reverse(bg_task_delete, kwargs={'task_id':row.id}), - ) - - else: - return super(CombineBackgroundTasksDT, self).render_column(row, column) - - - def filter_queryset(self, qs): - # use parameters passed in GET request to filter queryset - - # handle search - search = self.request.GET.get(u'search[value]', None) - if search: - qs = qs.filter(Q(id__contains=search) | Q(name__contains=search) | Q(verbose_name__contains=search)) - - # return - return qs - + # define the columns that will be returned + columns = [ + 'id', + 'start_timestamp', + 'name', + 'task_type', + 'celery_task_id', + 'completed', + 'duration', + 'actions' + ] + + # define column names that will be used in sorting + # order is important and should be same as order of columns + # displayed by datatables. 
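CombineBackgroundTasksDT renders the task_type column through row.get_task_type_display(), Django's standard accessor for a CharField defined with choices. A hypothetical model fragment showing where that method comes from (the field name matches the real model, the labels here are illustrative):

from django.db import models

class ExampleTask(models.Model):

    # choices pair each stored value with a human-readable label
    TASK_TYPE_CHOICES = [
        ('export_documents', 'Export Documents'),
        ('export_mapped_fields', 'Export Mapped Fields'),
    ]
    task_type = models.CharField(max_length=64, choices=TASK_TYPE_CHOICES)

# for a saved instance t with task_type='export_documents',
# t.get_task_type_display() returns 'Export Documents'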
For non sortable columns use empty + # value like '' + # order_columns = ['number', 'user', 'state', '', ''] + order_columns = [ + 'id', + 'start_timestamp', + 'name', + 'task_type', + 'celery_task_id', + 'completed', + 'duration', + 'actions' + ] + + # set max limit of records returned, this is used to protect our site if someone tries to attack our site + # and make it return huge amount of data + max_display_length = 1000 + + def get_initial_queryset(self): + + # return queryset used as base for futher sorting/filtering + return models.CombineBackgroundTask.objects + + def render_column(self, row, column): + + if column == 'task_type': + return row.get_task_type_display() + + elif column == 'celery_task_id': + return '%s' % row.celery_task_id + + elif column == 'completed': + if row.completed: + if row.celery_status in ['STOPPED', 'REVOKED']: + return "%s" % row.celery_status + else: + return "%s" % row.celery_status + else: + return "%s" % row.celery_status + + elif column == 'duration': + return row.calc_elapsed_as_string() + + + elif column == 'actions': + return ' ' % ( + reverse(bg_task, kwargs={'task_id': row.id}), + reverse(bg_task_cancel, kwargs={'task_id': row.id}), + reverse(bg_task_delete, kwargs={'task_id': row.id}), + ) + + else: + return super(CombineBackgroundTasksDT, self).render_column(row, column) + + def filter_queryset(self, qs): + # use parameters passed in GET request to filter queryset + + # handle search + search = self.request.GET.get(u'search[value]', None) + if search: + qs = qs.filter(Q(id__contains=search) | Q(name__contains=search) | Q(verbose_name__contains=search)) + + # return + return qs #################################################################### @@ -4236,23 +4109,21 @@ def filter_queryset(self, qs): @login_required def gm_delete(request): + if request.method == 'POST': + # get gm_id + gm_id = request.POST.get('gm_id') - if request.method == 'POST': - - # get gm_id - gm_id = request.POST.get('gm_id') - - # init GlobalMessageClient - gmc = models.GlobalMessageClient(request.session) + # init GlobalMessageClient + gmc = models.GlobalMessageClient(request.session) - # delete by id - results = gmc.delete_gm(gm_id) + # delete by id + results = gmc.delete_gm(gm_id) - # redirect - return JsonResponse({ - 'gm_id':gm_id, - 'num_removed':results - }) + # redirect + return JsonResponse({ + 'gm_id': gm_id, + 'num_removed': results + }) #################################################################### @@ -4261,88 +4132,85 @@ def gm_delete(request): @login_required def stateio(request): - - ''' + ''' Root view for StateIO ''' - # retrieve exports and imports - stateio_exports = models.StateIO.objects.filter(stateio_type='export') - stateio_imports = models.StateIO.objects.filter(stateio_type='import') + # retrieve exports and imports + stateio_exports = models.StateIO.objects.filter(stateio_type='export') + stateio_imports = models.StateIO.objects.filter(stateio_type='import') - # return - return render(request, 'core/stateio.html', { - 'stateio_exports':stateio_exports, - 'stateio_imports':stateio_imports, - 'breadcrumbs':breadcrumb_parser(request) - }) + # return + return render(request, 'core/stateio.html', { + 'stateio_exports': stateio_exports, + 'stateio_imports': stateio_imports, + 'breadcrumbs': breadcrumb_parser(request) + }) @login_required def stateio_state(request, state_id): - - ''' + ''' Single state view ''' - # retrieve state - state = models.StateIO.objects.get(id=state_id) - - # handle export state - if state.stateio_type == 'export': - 
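gm_delete and the StateIO views below lean on models.GlobalMessageClient, which is Combine's own helper rather than a Django API. As a rough illustration only, and not Combine's actual implementation, a session-backed store exposing the add_gm/delete_gm calls these views make could look like:

import uuid

class SessionMessageStore(object):

    def __init__(self, session):
        # messages live in the Django session until explicitly deleted
        self.session = session
        self.session.setdefault('gms', [])

    def add_gm(self, gm):
        # gm is a dict such as {'html': '...', 'class': 'success'}
        gm.setdefault('id', uuid.uuid4().hex)
        self.session['gms'].append(gm)
        self.session.modified = True

    def delete_gm(self, gm_id):
        # returns the number of messages removed, as gm_delete expects
        before = len(self.session['gms'])
        self.session['gms'] = [g for g in self.session['gms'] if g.get('id') != gm_id]
        self.session.modified = True
        return before - len(self.session['gms'])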
- # retrieve imports, if any, that share this export_id - associated_imports = models.StateIO.objects.filter( - export_manifest__export_id=state.export_id, - stateio_type='import') - - # generate io results json - if 'exports' in state.export_manifest: - io_results_json = _generate_io_results_json(state.export_manifest['exports']) - else: - io_results_json = False - - # return - return render(request, 'core/stateio_state_export.html', { - 'state':state, - 'associated_imports':associated_imports, - 'io_results_json':json.dumps(io_results_json, sort_keys=True), - 'breadcrumbs':breadcrumb_parser(request) - }) - - # handle import state - if state.stateio_type == 'import': - - if state.status == 'finished': - # retrieve export used for import, if exists in same instance of Combine - associated_export_q = models.StateIO.objects.filter( - export_id=state.export_manifest['export_id'], - stateio_type='export') - if associated_export_q.count() == 1: - associated_export = associated_export_q.first() - else: - associated_export = None - else: - associated_export = None - - # generate io results json - if 'imports' in state.import_manifest: - io_results_json = _generate_io_results_json(state.import_manifest['imports']) - else: - io_results_json = False - - # return - return render(request, 'core/stateio_state_import.html', { - 'state':state, - 'associated_export':associated_export, - 'io_results_json':json.dumps(io_results_json, sort_keys=True), - 'breadcrumbs':breadcrumb_parser(request) - }) + # retrieve state + state = models.StateIO.objects.get(id=state_id) + + # handle export state + if state.stateio_type == 'export': + + # retrieve imports, if any, that share this export_id + associated_imports = models.StateIO.objects.filter( + export_manifest__export_id=state.export_id, + stateio_type='import') + + # generate io results json + if 'exports' in state.export_manifest: + io_results_json = _generate_io_results_json(state.export_manifest['exports']) + else: + io_results_json = False + + # return + return render(request, 'core/stateio_state_export.html', { + 'state': state, + 'associated_imports': associated_imports, + 'io_results_json': json.dumps(io_results_json, sort_keys=True), + 'breadcrumbs': breadcrumb_parser(request) + }) + + # handle import state + if state.stateio_type == 'import': + + if state.status == 'finished': + # retrieve export used for import, if exists in same instance of Combine + associated_export_q = models.StateIO.objects.filter( + export_id=state.export_manifest['export_id'], + stateio_type='export') + if associated_export_q.count() == 1: + associated_export = associated_export_q.first() + else: + associated_export = None + else: + associated_export = None + + # generate io results json + if 'imports' in state.import_manifest: + io_results_json = _generate_io_results_json(state.import_manifest['imports']) + else: + io_results_json = False + + # return + return render(request, 'core/stateio_state_import.html', { + 'state': state, + 'associated_export': associated_export, + 'io_results_json': json.dumps(io_results_json, sort_keys=True), + 'breadcrumbs': breadcrumb_parser(request) + }) def _generate_io_results_json(io_results): - - ''' + ''' Function to generate jstree ready JSON when provided with an export or import manifest @@ -4350,245 +4218,239 @@ def _generate_io_results_json(io_results): io_results (dict): Dictionary of IO results, either export or import ''' - # results_flag - io_results_flag = False - - # model translation for serializable strings for models - 
model_type_hash = { - 'jobs_hierarchy':{ - 'jobs':'Jobs', - 'record_groups':'Record Groups', - 'orgs':'Organizations', - }, - 'config_scenarios':{ - 'dbdd':'DPLA Bulk Data Downloads', - 'oai_endpoints':'OAI Endpoints', - 'rits':'Record Identifier Transformation Scenarios', - 'transformations':'Transformation Scenarios', - 'validations':'Validation Scenarios' - } - } - - # init dictionary - io_results_json = [] - - # loop through jobs and configs - for obj_type, obj_subsets in model_type_hash.items(): - - logger.debug('building %s' % obj_type) - - # obj_type_flag - obj_type_flag = False - - # init obj type level dict - obj_type_hash = { - 'jobs_hierarchy':{ - 'name':'Organizations, Record Groups, and Jobs', - 'icon':'la la-sitemap' - }, - 'config_scenarios':{ - 'name':'Configuration Scenarios', - 'icon':'la la-gears' - } - } - - obj_type_dict = { - # 'id':obj_type, - 'text':obj_type_hash[obj_type]['name'], - 'state':{'opened':True}, - 'children':[], - 'icon':obj_type_hash[obj_type]['icon'] - } - - # loop through model types and build dictionary - for model_key, model_name in obj_subsets.items(): - - # init model level dict - model_type_dict = { - # 'id':model_key, - 'text':model_name, - 'state':{'opened':True}, - 'children':[], - 'icon':'la la-folder-open' - } - - # loop through io results - for io_obj in io_results[model_key]: - - model_type_dict['children'].append( - { - # 'id':io_obj['id'], - 'text':io_obj['name'], - 'state':{'opened':True}, - 'icon':'la la-file', - 'children':[], - 'li_attr':{ - 'io_obj':True - } - } - ) - - # append model type dict to - if len(io_results[model_key]) > 0: - io_results_flag = True - obj_type_flag = True - obj_type_dict['children'].append(model_type_dict) - - # append obj type dict if contains children - if obj_type_flag: - io_results_json.append(obj_type_dict) - - # if results found for any type, return and imply True - if io_results_flag: - return io_results_json - else: - return False + # results_flag + io_results_flag = False + + # model translation for serializable strings for models + model_type_hash = { + 'jobs_hierarchy': { + 'jobs': 'Jobs', + 'record_groups': 'Record Groups', + 'orgs': 'Organizations', + }, + 'config_scenarios': { + 'dbdd': 'DPLA Bulk Data Downloads', + 'oai_endpoints': 'OAI Endpoints', + 'rits': 'Record Identifier Transformation Scenarios', + 'transformations': 'Transformation Scenarios', + 'validations': 'Validation Scenarios' + } + } + + # init dictionary + io_results_json = [] + + # loop through jobs and configs + for obj_type, obj_subsets in model_type_hash.items(): + + logger.debug('building %s' % obj_type) + + # obj_type_flag + obj_type_flag = False + + # init obj type level dict + obj_type_hash = { + 'jobs_hierarchy': { + 'name': 'Organizations, Record Groups, and Jobs', + 'icon': 'la la-sitemap' + }, + 'config_scenarios': { + 'name': 'Configuration Scenarios', + 'icon': 'la la-gears' + } + } + + obj_type_dict = { + # 'id':obj_type, + 'text': obj_type_hash[obj_type]['name'], + 'state': {'opened': True}, + 'children': [], + 'icon': obj_type_hash[obj_type]['icon'] + } + + # loop through model types and build dictionary + for model_key, model_name in obj_subsets.items(): + + # init model level dict + model_type_dict = { + # 'id':model_key, + 'text': model_name, + 'state': {'opened': True}, + 'children': [], + 'icon': 'la la-folder-open' + } + + # loop through io results + for io_obj in io_results[model_key]: + model_type_dict['children'].append( + { + # 'id':io_obj['id'], + 'text': io_obj['name'], + 'state': {'opened': 
True}, + 'icon': 'la la-file', + 'children': [], + 'li_attr': { + 'io_obj': True + } + } + ) + + # append model type dict to + if len(io_results[model_key]) > 0: + io_results_flag = True + obj_type_flag = True + obj_type_dict['children'].append(model_type_dict) + + # append obj type dict if contains children + if obj_type_flag: + io_results_json.append(obj_type_dict) + + # if results found for any type, return and imply True + if io_results_flag: + return io_results_json + else: + return False @login_required def stateio_state_manifest(request, state_id, manifest_type): - - ''' + ''' View export/import manifest from state ''' - # retrieve state - state = models.StateIO.objects.get(id=state_id) + # retrieve state + state = models.StateIO.objects.get(id=state_id) - # return - return JsonResponse(getattr(state, manifest_type, None)) + # return + return JsonResponse(getattr(state, manifest_type, None)) @login_required def stateio_state_delete(request, state_id): - - ''' + ''' Delete single state view ''' - # retrieve state - state = models.StateIO.objects.get(id=state_id) + # retrieve state + state = models.StateIO.objects.get(id=state_id) - # delete and redirect - state.delete() + # delete and redirect + state.delete() - # return - return redirect('stateio') + # return + return redirect('stateio') @login_required def stateio_state_download(request, state_id): - - ''' + ''' Download export state ''' - # retrieve state - state = models.StateIO.objects.get(id=state_id) + # retrieve state + state = models.StateIO.objects.get(id=state_id) - # set filepath as download location on disk - filepath = state.export_path + # set filepath as download location on disk + filepath = state.export_path - # set filename - filename = filepath.split('/')[-1] + # set filename + filename = filepath.split('/')[-1] - # generate response - response = FileResponse(open(filepath, 'rb')) - response['Content-Disposition'] = 'attachment; filename="%s"' % filename - return response + # generate response + response = FileResponse(open(filepath, 'rb')) + response['Content-Disposition'] = 'attachment; filename="%s"' % filename + return response @login_required def stateio_state_stop(request, state_id): - - ''' + ''' Attempt to stop state when running as bg task ''' - # retrieve state - state = models.StateIO.objects.get(id=state_id) + # retrieve state + state = models.StateIO.objects.get(id=state_id) - # issue cancel - if state.bg_task: - state.bg_task.cancel() + # issue cancel + if state.bg_task: + state.bg_task.cancel() - # update status - state.status = 'stopped' - state.finished = True - state.save() + # update status + state.status = 'stopped' + state.finished = True + state.save() - # return - return redirect('stateio') + # return + return redirect('stateio') @login_required def stateio_export(request): - - ''' + ''' Export state ''' - if request.method == 'GET': - - # generate hierarchy_dict - job_hierarchy = _stateio_prepare_job_hierarchy() - - # generate config scenarios - config_scenarios = _stateio_prepare_config_scenarios() - - # return - return render(request, 'core/stateio_export.html', { - 'job_hierarchy_json':json.dumps(job_hierarchy), - 'config_scenarios_json':json.dumps(config_scenarios), - 'breadcrumbs':breadcrumb_parser(request) - }) - - if request.method == 'POST': - - # capture optional export name - export_name = request.POST.get('export_name', None) - if export_name == '': - export_name = None - logger.debug('initing export: %s' % export_name) - - # capture and parse jobs_hierarchy_ids - 
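+        # note: jstree posts node ids as '<model>|<pk>' strings, e.g. 'org|1', 'record_group|2',
+        # 'job|3'; the prefix selects the model and the trailing integer is parsed as its pk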
jobs_hierarchy_ids = request.POST.getlist('jobs_hierarchy_ids[]') - jobs = [ int(obj.split('|')[-1]) for obj in jobs_hierarchy_ids if obj.startswith('job') ] - record_groups = [ int(obj.split('|')[-1]) for obj in jobs_hierarchy_ids if obj.startswith('record_group') ] - orgs = [ int(obj.split('|')[-1]) for obj in jobs_hierarchy_ids if obj.startswith('org') ] - - # capture and parse config_scenarios_ids - config_scenarios_ids = [ config_id for config_id in request.POST.getlist('config_scenarios_ids[]') if '|' in config_id ] - - # init export as bg task - ct = models.StateIOClient.export_state_bg_task( - export_name=export_name, - jobs=jobs, - record_groups=record_groups, - orgs=orgs, - config_scenarios=config_scenarios_ids # preserve prefixes through serialization - ) - - # retrieve StateIO instance, use metadata for msg - stateio = models.StateIO.objects.get(id=ct.task_params['stateio_id']) - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Exporting State:
%s

Refresh this page for updates:

' % (stateio.name), - 'class':'success' - }) - - # return - return JsonResponse({'msg':'success'}) + if request.method == 'GET': + # generate hierarchy_dict + job_hierarchy = _stateio_prepare_job_hierarchy() + + # generate config scenarios + config_scenarios = _stateio_prepare_config_scenarios() + + # return + return render(request, 'core/stateio_export.html', { + 'job_hierarchy_json': json.dumps(job_hierarchy), + 'config_scenarios_json': json.dumps(config_scenarios), + 'breadcrumbs': breadcrumb_parser(request) + }) + + if request.method == 'POST': + + # capture optional export name + export_name = request.POST.get('export_name', None) + if export_name == '': + export_name = None + logger.debug('initing export: %s' % export_name) + + # capture and parse jobs_hierarchy_ids + jobs_hierarchy_ids = request.POST.getlist('jobs_hierarchy_ids[]') + jobs = [int(obj.split('|')[-1]) for obj in jobs_hierarchy_ids if obj.startswith('job')] + record_groups = [int(obj.split('|')[-1]) for obj in jobs_hierarchy_ids if obj.startswith('record_group')] + orgs = [int(obj.split('|')[-1]) for obj in jobs_hierarchy_ids if obj.startswith('org')] + + # capture and parse config_scenarios_ids + config_scenarios_ids = [config_id for config_id in request.POST.getlist('config_scenarios_ids[]') if + '|' in config_id] + + # init export as bg task + ct = models.StateIOClient.export_state_bg_task( + export_name=export_name, + jobs=jobs, + record_groups=record_groups, + orgs=orgs, + config_scenarios=config_scenarios_ids # preserve prefixes through serialization + ) + + # retrieve StateIO instance, use metadata for msg + stateio = models.StateIO.objects.get(id=ct.task_params['stateio_id']) + + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Exporting State:
%s

Refresh this page for updates:

' % ( + stateio.name), + 'class': 'success' + }) + + # return + return JsonResponse({'msg': 'success'}) def _stateio_prepare_job_hierarchy( - include_record_groups=True, - include_jobs=True): - - # generate JSON that will be used by jstree to create Org, Record Group, Jobs selections - ''' + include_record_groups=True, + include_jobs=True): + # generate JSON that will be used by jstree to create Org, Record Group, Jobs selections + ''' Target structure: // Expected format of the node (there are no required fields) { @@ -4605,70 +4467,67 @@ def _stateio_prepare_job_hierarchy( a_attr : {} // attributes for the generated A node } ''' - # init dictionary with root node - hierarchy_dict = { - 'id':'root_jobs', - 'text':'Organizations, Record Groups, and Jobs', - 'state':{'opened':True}, - 'children':[], - 'icon':'la la-sitemap' - } - - # add Organizations --> Record Group --> Jobs - for org in models.Organization.objects.filter(for_analysis=False): - - # init org dict - org_dict = { - 'id':'org|%s' % org.id, - 'text':org.name, - 'state':{'opened':False}, - 'children':[], - 'icon':'la la-folder-open' - } - - if include_record_groups: - # loop through child Record Groups and add - for rg in org.recordgroup_set.all(): - - # init rg dict - rg_dict = { - 'id':'record_group|%s' % rg.id, - 'text':rg.name, - 'state':{'opened':False}, - 'children':[], - 'icon':'la la-folder-open' - } - - if include_jobs: - # loop through Jobs and add - for job in rg.job_set.all(): - - # init job dict - job_dict = { - 'id':'job|%s' % job.id, - 'text':job.name, - 'state':{'opened':False}, - 'icon':'la la-file' - } - - # append to rg - rg_dict['children'].append(job_dict) - - # add back to org - org_dict['children'].append(rg_dict) - - # add org to root hierarchy - hierarchy_dict['children'].append(org_dict) - - # return embedded in list - return [hierarchy_dict] - + # init dictionary with root node + hierarchy_dict = { + 'id': 'root_jobs', + 'text': 'Organizations, Record Groups, and Jobs', + 'state': {'opened': True}, + 'children': [], + 'icon': 'la la-sitemap' + } + + # add Organizations --> Record Group --> Jobs + for org in models.Organization.objects.filter(for_analysis=False): + + # init org dict + org_dict = { + 'id': 'org|%s' % org.id, + 'text': org.name, + 'state': {'opened': False}, + 'children': [], + 'icon': 'la la-folder-open' + } + + if include_record_groups: + # loop through child Record Groups and add + for rg in org.recordgroup_set.all(): + + # init rg dict + rg_dict = { + 'id': 'record_group|%s' % rg.id, + 'text': rg.name, + 'state': {'opened': False}, + 'children': [], + 'icon': 'la la-folder-open' + } + + if include_jobs: + # loop through Jobs and add + for job in rg.job_set.all(): + # init job dict + job_dict = { + 'id': 'job|%s' % job.id, + 'text': job.name, + 'state': {'opened': False}, + 'icon': 'la la-file' + } + + # append to rg + rg_dict['children'].append(job_dict) + + # add back to org + org_dict['children'].append(rg_dict) + + # add org to root hierarchy + hierarchy_dict['children'].append(org_dict) + + # return embedded in list + return [hierarchy_dict] def _stateio_prepare_config_scenarios(): - - # generate JSON that will be used by jstree to create configuration scenarios - ''' + # generate JSON that will be used by jstree to create configuration scenarios + ''' Target structure: // Expected format of the node (there are no required fields) { @@ -4685,130 +4544,121 @@ def _stateio_prepare_config_scenarios(): a_attr : {} // attributes for the generated A node } ''' - # init dictionary with 
root node - config_scenarios_dict = { - 'id':'root_config', - 'text':'Configurations and Scenarios', - 'state':{'opened':True}, - 'children':[], - 'icon':'la la-gears' - } - - def _add_config_scenarios(config_scenarios_dict, model, id_str, text_str, id_prefix): - - # set base dict - model_dict = { - 'id':id_str, - 'text':text_str, - 'state':{'opened':False}, - 'children':[], - 'icon':'la la-folder-open' - } - - # loop through instances - for obj in model.objects.all(): - model_dict['children'].append({ - 'id':'%s|%s' % (id_prefix, obj.id), - 'text':obj.name, - 'state':{'opened':False}, - 'children':[], - 'icon':'la la-file' - }) - - # append to config_scenarios_dict - config_scenarios_dict['children'].append(model_dict) - - # loop through models and append to config scenarios dict - for model_tup in [ - (config_scenarios_dict, models.ValidationScenario, 'validation_scenarios', 'Validation Scenarios', 'validations'), - (config_scenarios_dict, models.Transformation, 'transformations', 'Transformation Scenarios', 'transformations'), - (config_scenarios_dict, models.OAIEndpoint, 'oai_endpoints', 'OAI Endpoints', 'oai_endpoints'), - (config_scenarios_dict, models.RecordIdentifierTransformationScenario, 'rits', 'Record Identifier Transformation Scenarios', 'rits'), - (config_scenarios_dict, models.FieldMapper, 'field_mapper_configs', 'Field Mapper Configurations', 'field_mapper_configs'), - (config_scenarios_dict, models.DPLABulkDataDownload, 'dbdds', 'DPLA Bulk Data Downloads', 'dbdd') - ]: - - # add to config_scenarios_dict - _add_config_scenarios(*model_tup) - - # return embedded in list - return [config_scenarios_dict] + # init dictionary with root node + config_scenarios_dict = { + 'id': 'root_config', + 'text': 'Configurations and Scenarios', + 'state': {'opened': True}, + 'children': [], + 'icon': 'la la-gears' + } + + def _add_config_scenarios(config_scenarios_dict, model, id_str, text_str, id_prefix): + + # set base dict + model_dict = { + 'id': id_str, + 'text': text_str, + 'state': {'opened': False}, + 'children': [], + 'icon': 'la la-folder-open' + } + + # loop through instances + for obj in model.objects.all(): + model_dict['children'].append({ + 'id': '%s|%s' % (id_prefix, obj.id), + 'text': obj.name, + 'state': {'opened': False}, + 'children': [], + 'icon': 'la la-file' + }) + + # append to config_scenarios_dict + config_scenarios_dict['children'].append(model_dict) + + # loop through models and append to config scenarios dict + for model_tup in [ + (config_scenarios_dict, models.ValidationScenario, 'validation_scenarios', 'Validation Scenarios', + 'validations'), + ( + config_scenarios_dict, models.Transformation, 'transformations', 'Transformation Scenarios', 'transformations'), + (config_scenarios_dict, models.OAIEndpoint, 'oai_endpoints', 'OAI Endpoints', 'oai_endpoints'), + (config_scenarios_dict, models.RecordIdentifierTransformationScenario, 'rits', + 'Record Identifier Transformation Scenarios', 'rits'), + (config_scenarios_dict, models.FieldMapper, 'field_mapper_configs', 'Field Mapper Configurations', + 'field_mapper_configs'), + (config_scenarios_dict, models.DPLABulkDataDownload, 'dbdds', 'DPLA Bulk Data Downloads', 'dbdd') + ]: + # add to config_scenarios_dict + _add_config_scenarios(*model_tup) + + # return embedded in list + return [config_scenarios_dict] @login_required def stateio_import(request): - - ''' + ''' Import state ''' - if request.method == 'GET': - - # return - return render(request, 'core/stateio_import.html', { - 
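+        # an import source may be a local filesystem path, a remote URL, or an uploaded file;
+        # uploads are first written to '/tmp/<filename>' before the background import task runs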
'breadcrumbs':breadcrumb_parser(request) - }) - - elif request.method == 'POST': - - # capture optional export name - import_name = request.POST.get('import_name', None) - if import_name == '': - import_name = None - logger.debug('initing import: %s' % import_name) - - # handle filesystem location - if request.POST.get('filesystem_location', None) not in ['', None]: - export_path = request.POST.get('filesystem_location').strip() - logger.debug('importing state based on filesystem location: %s' % export_path) - - # handle URL - elif request.POST.get('url_location', None) not in ['', None]: - export_path = request.POST.get('url_location').strip() - logger.debug('importing state based on remote location: %s' % export_path) - - # handle file upload - elif type(request.FILES.get('export_upload_payload', None)) != None: - - logger.debug('handling file upload') - - # save file to disk - payload = request.FILES.get('export_upload_payload', None) - new_file = '/tmp/%s' % (payload.name) - with open(new_file, 'wb') as f: - f.write(payload.read()) - payload.close() - - # set export_path - export_path = new_file - logger.debug('saved uploaded state to %s' % export_path) - - # init export as bg task - ct = models.StateIOClient.import_state_bg_task( - import_name=import_name, - export_path=export_path - ) - - # retrieve StateIO instance, use metadata for msg - stateio = models.StateIO.objects.get(id=ct.task_params['stateio_id']) - - # set gms - gmc = models.GlobalMessageClient(request.session) - gmc.add_gm({ - 'html':'

Importing State:
%s

Refresh this page for updates:

' % (stateio.name), - 'class':'success' - }) - - return redirect('stateio') - - - - - - - - - - - - + if request.method == 'GET': + + # return + return render(request, 'core/stateio_import.html', { + 'breadcrumbs': breadcrumb_parser(request) + }) + + elif request.method == 'POST': + + # capture optional export name + import_name = request.POST.get('import_name', None) + if import_name == '': + import_name = None + logger.debug('initing import: %s' % import_name) + + # handle filesystem location + if request.POST.get('filesystem_location', None) not in ['', None]: + export_path = request.POST.get('filesystem_location').strip() + logger.debug('importing state based on filesystem location: %s' % export_path) + + # handle URL + elif request.POST.get('url_location', None) not in ['', None]: + export_path = request.POST.get('url_location').strip() + logger.debug('importing state based on remote location: %s' % export_path) + + # handle file upload + elif type(request.FILES.get('export_upload_payload', None)) != None: + + logger.debug('handling file upload') + + # save file to disk + payload = request.FILES.get('export_upload_payload', None) + new_file = '/tmp/%s' % (payload.name) + with open(new_file, 'wb') as f: + f.write(payload.read()) + payload.close() + + # set export_path + export_path = new_file + logger.debug('saved uploaded state to %s' % export_path) + + # init export as bg task + ct = models.StateIOClient.import_state_bg_task( + import_name=import_name, + export_path=export_path + ) + + # retrieve StateIO instance, use metadata for msg + stateio = models.StateIO.objects.get(id=ct.task_params['stateio_id']) + + # set gms + gmc = models.GlobalMessageClient(request.session) + gmc.add_gm({ + 'html': '

Importing State:
%s

Refresh this page for updates:

' % ( + stateio.name), + 'class': 'success' + }) + + return redirect('stateio') diff --git a/core/xml2kvp.py b/core/xml2kvp.py index ad482cb3..7a840fb4 100644 --- a/core/xml2kvp.py +++ b/core/xml2kvp.py @@ -15,27 +15,24 @@ import uuid import xmltodict - # init logger logger = logging.getLogger(__name__) - # sibling hash regex sibling_hash_regex = re.compile(r'(.+?)\(([0-9a-zA-Z]+)\)|(.+)') class XML2kvp(object): + ''' + Class to handle the parsing of XML into Key/Value Pairs - ''' - Class to handle the parsing of XML into Key/Value Pairs - - - utilizes xmltodict (https://github.com/martinblech/xmltodict) - - static methods are designed to be called without user instantiating - instance of XML2kvp - ''' + - utilizes xmltodict (https://github.com/martinblech/xmltodict) + - static methods are designed to be called without user instantiating + instance of XML2kvp + ''' - # test xml - test_xml = ''' + # test xml + test_xml = ''' 88888888888 @@ -112,1186 +109,1156 @@ class XML2kvp(object): ''' + # custom exception for delimiter collision + class DelimiterCollision(Exception): + pass + + # schema for validation + schema = { + "$id": "xml2kvp_config_schema", + "title": "XML2kvp configuration options schema", + "type": "object", + "properties": { + "add_literals": { + "description": "Key/value pairs for literals to mixin, e.g. ``foo``:``bar`` would create field ``foo`` with value ``bar`` [Default: ``{}``]", + "type": "object" + }, + "capture_attribute_values": { + "description": "Array of attributes to capture values from and set as standalone field, e.g. if [``age``] is provided and encounters ````, a field ``foo_@age@`` would be created (note the additional trailing ``@`` to indicate an attribute value) with the value ``42``. [Default: ``[]``, Before: ``copy_to``, ``copy_to_regex``]", + "type": "array" + }, + "concat_values_on_all_fields": { + "description": "Boolean or String to join all values from multivalued field on [Default: ``false``]", + "type": ["boolean", "string"] + }, + "concat_values_on_fields": { + "description": "Key/value pairs for fields to concat on provided value, e.g. ``foo_bar``:``-`` if encountering ``foo_bar``:[``goober``,``tronic``] would concatenate to ``foo_bar``:``goober-tronic`` [Default: ``{}``]", + "type": "object" + }, + "copy_to": { + "description": "Key/value pairs to copy one field to another, optionally removing original field, e.g. ``foo``:``bar`` would create field ``bar`` and copy all values when encountered for ``foo`` to ``bar``, removing ``foo``. However, the original field can be retained by setting ``remove_copied_key`` to ``true``. Note: Can also be used to remove fields by setting the target field as false, e.g. 'foo':``false``, would remove field ``foo``. [Default: ``{}``]", + "type": "object" + }, + "copy_to_regex": { + "description": "Key/value pairs to copy one field to another, optionally removing original field, based on regex match of field, e.g. ``.*foo``:``bar`` would copy create field ``bar`` and copy all values fields ``goober_foo`` and ``tronic_foo`` to ``bar``. Note: Can also be used to remove fields by setting the target field as false, e.g. ``.*bar``:``false``, would remove fields matching regex ``.*bar`` [Default: ``{}``]", + "type": "object" + }, + "copy_value_to_regex": { + "description": "Key/value pairs that match values based on regex and copy to new field if matching, e.g. 
``http.*``:``websites`` would create new field ``websites`` and copy ``http://exampl.com`` and ``https://example.org`` to new field ``websites`` [Default: ``{}``]", + "type": "object" + }, + "error_on_delims_collision": { + "description": "Boolean to raise ``DelimiterCollision`` exception if delimiter strings from either ``node_delim`` or ``ns_prefix_delim`` collide with field name or field value (``false`` by default for permissive mapping, but can be helpful if collisions are essential to detect) [Default: ``false``]", + "type": "boolean" + }, + "exclude_attributes": { + "description": "Array of attributes to skip when creating field names, e.g. [``baz``] when encountering XML ``tronic`` would create field ``foo_bar_@goober=1000``, skipping attribute ``baz`` [Default: ``[]``]", + "type": "array" + }, + "exclude_elements": { + "description": "Array of elements to skip when creating field names, e.g. [``baz``] when encountering field ``tronic`` would create field ``foo_bar``, skipping element ``baz`` [Default: ``[]``, After: ``include_all_attributes``, ``include_attributes``]", + "type": "array" + }, + "include_attributes": { + "description": "Array of attributes to include when creating field names, despite setting of ``include_all_attributes``, e.g. [``baz``] when encountering XML ``tronic`` would create field ``foo_bar_@baz=42`` [Default: ``[]``, Before: ``exclude_attributes``, After: ``include_all_attributes``]", + "type": "array" + }, + "include_all_attributes": { + "description": "Boolean to consider and include all attributes when creating field names, e.g. if ``false``, XML elements ``tronic`` would result in field name ``foo_bar`` without attributes included. Note: the use of all attributes for creating field names has the the potential to balloon rapidly, potentially encountering ElasticSearch field limit for an index, therefore ``false`` by default. [Default: ``false``, Before: ``include_attributes``, ``exclude_attributes``]", + "type": "boolean" + }, + "include_sibling_id": { + "description": "Boolean to append matching identifiers, as part of key name, to sibling nodes, e.g. ``foo_bar`` and `foo_baz`` might become ``foo(abc123)_bar(def456)`` and ``foo(abc123)_baz(def456)``", + "type": "boolean" + }, + "include_meta": { + "description": "Boolean to include ``xml2kvp_meta`` field with output that contains all these configurations [Default: ``false``]", + "type": "boolean" + }, + "node_delim": { + "description": "String to use as delimiter between XML elements and attributes when creating field name, e.g. ``___`` will convert XML ``tronic`` to field name ``foo___bar`` [Default: ``_``]", + "type": "string" + }, + "ns_prefix_delim": { + "description": "String to use as delimiter between XML namespace prefixes and elements, e.g. ``|`` for the XML ``tronic`` will create field name ``ns|foo_ns:bar``. Note: a ``|`` pipe character is used to avoid using a colon in ElasticSearch fields, which can be problematic. [Default: ``|``]", + "type": "string" + }, + "remove_copied_key": { + "description": "Boolean to determine if originating field will be removed from output if that field is copied to another field [Default: ``true``]", + "type": "boolean" + }, + "remove_copied_value": { + "description": "Boolean to determine if value will be removed from originating field if that value is copied to another field [Default: ``false``]", + "type": "boolean" + }, + "remove_ns_prefix": { + "description": "Boolean to determine if XML namespace prefixes are removed from field names, e.g. 
if ``false``, the XML ``tronic`` will result in field name ``foo_bar`` without ``ns`` prefix [Default: ``true``]", + "type": "boolean" + }, + "self_describing": { + "description": "Boolean to include machine parsable information about delimeters used (reading right-to-left, delimeter and its length in characters) as suffix to field name, e.g. if ``true``, and ``node_delim`` is ``___`` and ``ns_prefix_delim`` is ``|``, suffix will be ``___3|1``. Can be useful to reverse engineer field name when not re-parsed by XML2kvp. [Default: ``false``]", + "type": "boolean" + }, + "split_values_on_all_fields": { + "description": "If present, string to use for splitting values from all fields, e.g. `` `` will convert single value ``a foo bar please`` into the array of values [``a``,``foo``,``bar``,``please``] for that field [Default: ``false``]", + "type": ["boolean", "string"] + }, + "split_values_on_fields": { + "description": "Key/value pairs of field names to split, and the string to split on, e.g. ``foo_bar``:``,`` will split all values on field ``foo_bar`` on comma ``,`` [Default: ``{}``]", + "type": "object" + }, + "skip_attribute_ns_declarations": { + "description": "Boolean to remove namespace declarations as considered attributes when creating field names [Default: ``true``]", + "type": "boolean" + }, + "skip_repeating_values": { + "description": "Boolean to determine if a field is multivalued, if those values are allowed to repeat, e.g. if set to ``false``, XML ``4242`` would map to ``foo_bar``:``42``, removing the repeating instance of that value. [Default: ``true``]", + "type": "boolean" + }, + "skip_root": { + "description": "Boolean to determine if the XML root element will be included in output field names [Default: ``false``]", + "type": "boolean" + }, + "repeating_element_suffix_count": { + "description": "Boolean to suffix field name with incrementing integer (after first instance, which does not receieve a suffix), e.g. 
XML ``42109`` would map to ``foo_bar``:``42``, ``foo_bar_#1``:``109`` [Default: ``false``, Overrides: ``skip_repeating_values``]", + "type": "boolean" + } + } + } + + def __init__(self, **kwargs): + + ''' + Args + kwargs (dict): Accepts named args from static methods + ''' + + # defaults, overwritten by methods + self.add_literals = {} + self.as_tuples = True + self.capture_attribute_values = [] + self.concat_values_on_all_fields = False + self.concat_values_on_fields = {} + self.copy_to = {} + self.copy_to_regex = {} + self.copy_value_to_regex = {} + self.error_on_delims_collision = False + self.exclude_attributes = [] + self.exclude_elements = [] + self.include_attributes = [] + self.include_all_attributes = False + self.include_meta = False + self.include_sibling_id = False + self.include_xml_prop = False + self.multivalue_delim = '|' + self.node_delim = '_' + self.ns_prefix_delim = '|' + self.remove_copied_key = True + self.remove_copied_value = False + self.remove_ns_prefix = True + self.self_describing = False + self.split_values_on_all_fields = False + self.split_values_on_fields = {} + self.skip_attribute_ns_declarations = True + self.skip_repeating_values = True + self.skip_root = False + self.repeating_element_suffix_count = False + + # list of properties that are allowed to be overwritten with None + arg_none_allowed = [] + + # overwite with attributes from static methods + for k, v in kwargs.items(): + if v is not None or k in arg_none_allowed: + setattr(self, k, v) + + # set non-overwritable class attributes + self.kvp_dict = {} + self.k_xpath_dict = {} + + # sibling hash counter + self.sibling_hash_counter = {} + + @property + def schema_json(self): + return json.dumps(self.schema) + + @property + def config_json(self): + + config_dict = {k: v for k, v in self.__dict__.items() if k in [ + 'add_literals', + 'capture_attribute_values', + 'concat_values_on_all_fields', + 'concat_values_on_fields', + 'copy_to', + 'copy_to_regex', + 'copy_value_to_regex', + 'error_on_delims_collision', + 'exclude_attributes', + 'exclude_elements', + 'include_attributes', + 'include_all_attributes', + 'include_sibling_id', + 'multivalue_delim', + 'node_delim', + 'ns_prefix_delim', + 'remove_copied_key', + 'remove_copied_value', + 'remove_ns_prefix', + 'self_describing', + 'split_values_on_all_fields', + 'split_values_on_fields', + 'skip_attribute_ns_declarations', + 'skip_repeating_values', + 'skip_root', + 'repeating_element_suffix_count', + ]} + + return json.dumps(config_dict, indent=2, sort_keys=True) + + def _xml_dict_parser(self, in_k, in_v, hops=[]): + + # handle Dictionary + if type(in_v) == OrderedDict: + + # set sibling hash + if in_k != None: + hash_val = in_k + else: + hash_val = hash(frozenset(in_v.keys())) + if hash_val not in self.sibling_hash_counter.keys(): + self.sibling_hash_counter[hash_val] = 1 + else: + self.sibling_hash_counter[hash_val] += 1 + sibling_hash = '%s%s' % (hashlib.md5(str(hash_val).encode('utf-8')).hexdigest()[:4], + str(self.sibling_hash_counter[hash_val]).zfill(2)) + + # handle all attributes for node first + for k, v in in_v.items(): + if k.startswith('@'): + + # handle capture_attribute_values + if len(self.capture_attribute_values) > 0 and k.lstrip('@') in self.capture_attribute_values: + temp_hops = hops.copy() + temp_hops.append("%s@" % k) + self._process_kvp(temp_hops, v) + + # format and append if including + if self.include_all_attributes or ( + len(self.include_attributes) > 0 and k.lstrip('@') in self.include_attributes): + hops = 
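+                        # illustrative example: with include_all_attributes=True, an element such as
+                        # <bar baz="42">tronic</bar> nested under <foo> contributes the hop '@baz=42',
+                        # yielding the field name 'foo_bar_@baz=42' with value 'tronic'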
self._format_and_append_hop(hops, 'attribute', k, v) + + # set hop length that will be returned to + hop_len = len(hops) + + # loop through remaining element and/or text nodes + for k, v in in_v.items(): + + # add key to hops + if k == '#text': + self._process_kvp(hops, v) + + else: + + # recurse with non attribute nodes (element or text) + if not k.startswith('@'): + hops = self._format_and_append_hop(hops, 'element', k, None, sibling_hash=sibling_hash) + + # recurse + self._xml_dict_parser(k, v, hops=hops) + + # reset hops + hops = hops[:hop_len] + + # handle list + elif type(in_v) == list: + + hop_len = len(hops) + for d in in_v: + # recurse + self._xml_dict_parser(None, d, hops=hops) + + # drop hops back one + hops = hops[:hop_len] + + # handle str or int, a value + elif type(in_v) in [str, int]: + + if in_k != '#text': + self._process_kvp(hops, in_v) + + def _format_and_append_hop(self, hops, hop_type, k, v, sibling_hash=None): + + # handle elements + if hop_type == 'element': + + # if erroring on collision + if self.error_on_delims_collision: + self._check_delims_collision(k) + + # if skipping elements + if len(self.exclude_elements) > 0: + if k in self.exclude_elements: + return hops + + # apply namespace delimiter + if not self.remove_ns_prefix: + hop = k.replace(':', self.ns_prefix_delim) + else: + if ':' in k: + hop = k.split(':')[1] + else: + hop = k + + # if include_sibling_id, append + if self.include_sibling_id: + # if not first entry, but repeating + if int(sibling_hash[-2:]) >= 1: + hop = '%s(%s)' % (hop, sibling_hash) + + # handle elements + if hop_type == 'attribute': + + # skip attribute namespace declarations + if self.skip_attribute_ns_declarations: + if k.startswith(('@xmlns', '@xsi')): + return hops + + # if excluded attributes + if len(self.exclude_attributes) > 0: + if k.lstrip('@') in self.exclude_attributes: + return hops + + # if erroring on collision + if self.error_on_delims_collision: + self._check_delims_collision(k) + self._check_delims_collision(v) + + # apply namespace delimiter + k = k.replace(':', self.ns_prefix_delim) + + # combine + hop = '%s=%s' % (k, v) + + # append and return + hops.append(hop) + return hops + + def _check_delims_collision(self, value): + + if any(delim in value for delim in [self.node_delim, self.ns_prefix_delim]): + raise self.DelimiterCollision('collision for key value: "%s", collides with a configured delimiter: %s' % + (value, + {'node_delim': self.node_delim, 'ns_prefix_delim': self.ns_prefix_delim})) + + def _process_kvp(self, hops, value): + + ''' + method to add key/value pairs to saved dictionary, + appending new values to pre-existing keys + ''' + + # sanitize value + value = self._sanitize_value(value) + + # join on node delimiter + k = self.node_delim.join(hops) + + # add delims suffix + if self.self_describing: + k = "%(k)s%(node_delim)s%(node_delim_len)s%(ns_prefix_delim)s%(ns_prefix_delim_len)s" % { + 'k': k, + 'node_delim': self.node_delim, + 'node_delim_len': len(self.node_delim), + 'ns_prefix_delim': self.ns_prefix_delim, + 'ns_prefix_delim_len': len(self.ns_prefix_delim) + } + + # init k_list + k_list = [k] + + # handle copy_to mixins + if len(self.copy_to) > 0: + slen = len(k_list) + k_list.extend([cv for ck, cv in self.copy_to.items() if ck == k]) + if self.remove_copied_key: + if slen != len(k_list) and k in k_list: + k_list.remove(k) + + # handle copy_to_regex mixins + if len(self.copy_to_regex) > 0: + + # key list prior to copies + slen = len(k_list) + + # loop through copy_to_regex + for rk, rv in 
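+            # a target value of False flags the matching key for removal: False is appended
+            # to k_list and treated like /dev/null when keys are written out below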
self.copy_to_regex.items(): + + # if False, check for match and remove + if rv == False: + if re.match(rk, k): + k_list.append(False) + + # attempt sub + else: + try: + sub = re.sub(rk, rv, k) + if sub != k: + k_list.append(sub) + except: + pass + + if self.remove_copied_key: + if slen != len(k_list) and k in k_list: + k_list.remove(k) + + # handle copy_value_to_regex mixins + if len(self.copy_value_to_regex) > 0: + + # key list prior to copies + slen = len(k_list) + + # loop through copy_value_to_regex + for rk, rv in self.copy_value_to_regex.items(): + + # attempt sub + try: + if re.match(r'%s' % rk, value): + k_list.append(rv) + except: + pass + + if self.remove_copied_value: + if slen != len(k_list) and k in k_list: + k_list.remove(k) + + # loop through keys + for k in k_list: + + # if k is false, treat like /dev/null + if k == False: + pass + + # new key, new value + elif k not in self.kvp_dict.keys(): + self.kvp_dict[k] = value + + # pre-existing, but not yet list, convert + elif not self.repeating_element_suffix_count and k in self.kvp_dict.keys() and type( + self.kvp_dict[k]) != list: + + if self.skip_repeating_values and value == self.kvp_dict[k]: + pass + else: + tval = self.kvp_dict[k] + self.kvp_dict[k] = [tval, value] + + # suffix key with incrementing int + elif self.repeating_element_suffix_count and k in self.kvp_dict.keys(): + + # check for other numbers + suffix_count = 1 + while True: + if '%s%s#%s' % (k, self.node_delim, suffix_count) in self.kvp_dict.keys(): + suffix_count += 1 + else: + break + self.kvp_dict['%s%s#%s' % (k, self.node_delim, suffix_count)] = value + + # already list, append + else: + if not self.skip_repeating_values or value not in self.kvp_dict[k]: + self.kvp_dict[k].append(value) + + def _split_and_concat_fields(self): + + ''' + Method to group actions related to splitting and concatenating field values + ''' + + # concat values on all fields + if self.concat_values_on_all_fields: + for k, v in self.kvp_dict.items(): + if type(v) == list: + self.kvp_dict[k] = self.concat_values_on_all_fields.join(v) + + # concat values on select fields + if not self.concat_values_on_all_fields and len(self.concat_values_on_fields) > 0: + for k, v in self.concat_values_on_fields.items(): + if k in self.kvp_dict.keys() and type(self.kvp_dict[k]) == list: + self.kvp_dict[k] = v.join(self.kvp_dict[k]) + + # split values on all fields + if self.split_values_on_all_fields: + for k, v in self.kvp_dict.items(): + if type(v) == str: + self.kvp_dict[k] = v.split(self.split_values_on_all_fields) + + # split values on select fields + if not self.split_values_on_all_fields and len(self.split_values_on_fields) > 0: + for k, v in self.split_values_on_fields.items(): + if k in self.kvp_dict.keys() and type(self.kvp_dict[k]) == str: + self.kvp_dict[k] = self.kvp_dict[k].split(v) + + def _parse_xml_input(self, xml_input): + + ''' + Note: self may be handler instance passsed + ''' + + # if string, save + if type(xml_input) == str: + if self.include_xml_prop: + try: + self.xml = etree.fromstring(xml_input) + except: + self.xml = etree.fromstring(xml_input.encode('utf-8')) + self._parse_nsmap() + return (xml_input) + + # if etree object, to string and save + if type(xml_input) in [etree._Element, etree._ElementTree]: + if self.include_xml_prop: + self.xml = xml_input + self._parse_nsmap() + return (etree.tostring(xml_input).decode('utf-8')) + + def _parse_nsmap(self): + + ''' + Note: self may be handler instance passsed + ''' + + # get namespace map, popping None values + _nsmap = 
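+        # the default (None-keyed) namespace, if present, is popped and re-keyed as 'global_ns';
+        # note that 'ns0' does not appear to be defined in this module, so the re-keying raises
+        # NameError and the bare except silently drops the default namespace ('global_ns' was
+        # presumably meant to receive the popped value)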
self.xml.nsmap.copy() + try: + global_ns = _nsmap.pop(None) + _nsmap['global_ns'] = ns0 + except: + pass + self.nsmap = _nsmap + + def _sanitize_value(self, value): + + ''' + Method to sanitize value before storage in ElasticSearch + + Current sanitations: + - length: Lucene index limited to 32,766, limiting to 32,000 + ''' + + # limit length + if len(value) > 32000: + value = value[:32000] + + # return + return value + + @staticmethod + def xml_to_kvp(xml_input, handler=None, return_handler=False, **kwargs): + + ''' + Static method to create key/value pairs (kvp) from XML string input - # custom exception for delimiter collision - class DelimiterCollision(Exception): - pass - - - # schema for validation - schema = { - "$id": "xml2kvp_config_schema", - "title": "XML2kvp configuration options schema", - "type": "object", - "properties": { - "add_literals": { - "description":"Key/value pairs for literals to mixin, e.g. ``foo``:``bar`` would create field ``foo`` with value ``bar`` [Default: ``{}``]", - "type": "object" - }, - "capture_attribute_values": { - "description": "Array of attributes to capture values from and set as standalone field, e.g. if [``age``] is provided and encounters ````, a field ``foo_@age@`` would be created (note the additional trailing ``@`` to indicate an attribute value) with the value ``42``. [Default: ``[]``, Before: ``copy_to``, ``copy_to_regex``]", - "type": "array" - }, - "concat_values_on_all_fields": { - "description": "Boolean or String to join all values from multivalued field on [Default: ``false``]", - "type": ["boolean","string"] - }, - "concat_values_on_fields": { - "description": "Key/value pairs for fields to concat on provided value, e.g. ``foo_bar``:``-`` if encountering ``foo_bar``:[``goober``,``tronic``] would concatenate to ``foo_bar``:``goober-tronic`` [Default: ``{}``]", - "type": "object" - }, - "copy_to": { - "description": "Key/value pairs to copy one field to another, optionally removing original field, e.g. ``foo``:``bar`` would create field ``bar`` and copy all values when encountered for ``foo`` to ``bar``, removing ``foo``. However, the original field can be retained by setting ``remove_copied_key`` to ``true``. Note: Can also be used to remove fields by setting the target field as false, e.g. 'foo':``false``, would remove field ``foo``. [Default: ``{}``]", - "type": "object" - }, - "copy_to_regex": { - "description": "Key/value pairs to copy one field to another, optionally removing original field, based on regex match of field, e.g. ``.*foo``:``bar`` would copy create field ``bar`` and copy all values fields ``goober_foo`` and ``tronic_foo`` to ``bar``. Note: Can also be used to remove fields by setting the target field as false, e.g. ``.*bar``:``false``, would remove fields matching regex ``.*bar`` [Default: ``{}``]", - "type": "object" - }, - "copy_value_to_regex": { - "description": "Key/value pairs that match values based on regex and copy to new field if matching, e.g. 
``http.*``:``websites`` would create new field ``websites`` and copy ``http://exampl.com`` and ``https://example.org`` to new field ``websites`` [Default: ``{}``]", - "type": "object" - }, - "error_on_delims_collision": { - "description": "Boolean to raise ``DelimiterCollision`` exception if delimiter strings from either ``node_delim`` or ``ns_prefix_delim`` collide with field name or field value (``false`` by default for permissive mapping, but can be helpful if collisions are essential to detect) [Default: ``false``]", - "type": "boolean" - }, - "exclude_attributes": { - "description": "Array of attributes to skip when creating field names, e.g. [``baz``] when encountering XML ``tronic`` would create field ``foo_bar_@goober=1000``, skipping attribute ``baz`` [Default: ``[]``]", - "type": "array" - }, - "exclude_elements": { - "description": "Array of elements to skip when creating field names, e.g. [``baz``] when encountering field ``tronic`` would create field ``foo_bar``, skipping element ``baz`` [Default: ``[]``, After: ``include_all_attributes``, ``include_attributes``]", - "type": "array" - }, - "include_attributes": { - "description": "Array of attributes to include when creating field names, despite setting of ``include_all_attributes``, e.g. [``baz``] when encountering XML ``tronic`` would create field ``foo_bar_@baz=42`` [Default: ``[]``, Before: ``exclude_attributes``, After: ``include_all_attributes``]", - "type": "array" - }, - "include_all_attributes": { - "description": "Boolean to consider and include all attributes when creating field names, e.g. if ``false``, XML elements ``tronic`` would result in field name ``foo_bar`` without attributes included. Note: the use of all attributes for creating field names has the the potential to balloon rapidly, potentially encountering ElasticSearch field limit for an index, therefore ``false`` by default. [Default: ``false``, Before: ``include_attributes``, ``exclude_attributes``]", - "type": "boolean" - }, - "include_sibling_id": { - "description": "Boolean to append matching identifiers, as part of key name, to sibling nodes, e.g. ``foo_bar`` and `foo_baz`` might become ``foo(abc123)_bar(def456)`` and ``foo(abc123)_baz(def456)``", - "type": "boolean" - }, - "include_meta": { - "description": "Boolean to include ``xml2kvp_meta`` field with output that contains all these configurations [Default: ``false``]", - "type": "boolean" - }, - "node_delim": { - "description": "String to use as delimiter between XML elements and attributes when creating field name, e.g. ``___`` will convert XML ``tronic`` to field name ``foo___bar`` [Default: ``_``]", - "type": "string" - }, - "ns_prefix_delim": { - "description": "String to use as delimiter between XML namespace prefixes and elements, e.g. ``|`` for the XML ``tronic`` will create field name ``ns|foo_ns:bar``. Note: a ``|`` pipe character is used to avoid using a colon in ElasticSearch fields, which can be problematic. [Default: ``|``]", - "type": "string" - }, - "remove_copied_key": { - "description": "Boolean to determine if originating field will be removed from output if that field is copied to another field [Default: ``true``]", - "type": "boolean" - }, - "remove_copied_value": { - "description": "Boolean to determine if value will be removed from originating field if that value is copied to another field [Default: ``false``]", - "type": "boolean" - }, - "remove_ns_prefix": { - "description": "Boolean to determine if XML namespace prefixes are removed from field names, e.g. 
if ``false``, the XML ``tronic`` will result in field name ``foo_bar`` without ``ns`` prefix [Default: ``true``]", - "type": "boolean" - }, - "self_describing": { - "description": "Boolean to include machine parsable information about delimeters used (reading right-to-left, delimeter and its length in characters) as suffix to field name, e.g. if ``true``, and ``node_delim`` is ``___`` and ``ns_prefix_delim`` is ``|``, suffix will be ``___3|1``. Can be useful to reverse engineer field name when not re-parsed by XML2kvp. [Default: ``false``]", - "type": "boolean" - }, - "split_values_on_all_fields": { - "description": "If present, string to use for splitting values from all fields, e.g. `` `` will convert single value ``a foo bar please`` into the array of values [``a``,``foo``,``bar``,``please``] for that field [Default: ``false``]", - "type": ["boolean","string"] - }, - "split_values_on_fields": { - "description": "Key/value pairs of field names to split, and the string to split on, e.g. ``foo_bar``:``,`` will split all values on field ``foo_bar`` on comma ``,`` [Default: ``{}``]", - "type": "object" - }, - "skip_attribute_ns_declarations": { - "description": "Boolean to remove namespace declarations as considered attributes when creating field names [Default: ``true``]", - "type": "boolean" - }, - "skip_repeating_values": { - "description": "Boolean to determine if a field is multivalued, if those values are allowed to repeat, e.g. if set to ``false``, XML ``4242`` would map to ``foo_bar``:``42``, removing the repeating instance of that value. [Default: ``true``]", - "type": "boolean" - }, - "skip_root": { - "description": "Boolean to determine if the XML root element will be included in output field names [Default: ``false``]", - "type": "boolean" - }, - "repeating_element_suffix_count": { - "description": "Boolean to suffix field name with incrementing integer (after first instance, which does not receieve a suffix), e.g. 
XML ``42109`` would map to ``foo_bar``:``42``, ``foo_bar_#1``:``109`` [Default: ``false``, Overrides: ``skip_repeating_values``]", - "type": "boolean" - } - } - } - - - def __init__(self, **kwargs): - - ''' - Args - kwargs (dict): Accepts named args from static methods - ''' - - # defaults, overwritten by methods - self.add_literals={} - self.as_tuples=True - self.capture_attribute_values=[] - self.concat_values_on_all_fields=False - self.concat_values_on_fields={} - self.copy_to={} - self.copy_to_regex={} - self.copy_value_to_regex={} - self.error_on_delims_collision=False - self.exclude_attributes=[] - self.exclude_elements=[] - self.include_attributes=[] - self.include_all_attributes=False - self.include_meta=False - self.include_sibling_id=False - self.include_xml_prop=False - self.multivalue_delim='|' - self.node_delim='_' - self.ns_prefix_delim='|' - self.remove_copied_key=True - self.remove_copied_value=False - self.remove_ns_prefix=True - self.self_describing=False - self.split_values_on_all_fields=False - self.split_values_on_fields={} - self.skip_attribute_ns_declarations=True - self.skip_repeating_values=True - self.skip_root=False - self.repeating_element_suffix_count=False - - # list of properties that are allowed to be overwritten with None - arg_none_allowed = [] - - # overwite with attributes from static methods - for k,v in kwargs.items(): - if v is not None or k in arg_none_allowed: - setattr(self, k, v) - - # set non-overwritable class attributes - self.kvp_dict = {} - self.k_xpath_dict = {} - - # sibling hash counter - self.sibling_hash_counter = {} - - - @property - def schema_json(self): - return json.dumps(self.schema) - - - @property - def config_json(self): - - config_dict = { k:v for k,v in self.__dict__.items() if k in [ - 'add_literals', - 'capture_attribute_values', - 'concat_values_on_all_fields', - 'concat_values_on_fields', - 'copy_to', - 'copy_to_regex', - 'copy_value_to_regex', - 'error_on_delims_collision', - 'exclude_attributes', - 'exclude_elements', - 'include_attributes', - 'include_all_attributes', - 'include_sibling_id', - 'multivalue_delim', - 'node_delim', - 'ns_prefix_delim', - 'remove_copied_key', - 'remove_copied_value', - 'remove_ns_prefix', - 'self_describing', - 'split_values_on_all_fields', - 'split_values_on_fields', - 'skip_attribute_ns_declarations', - 'skip_repeating_values', - 'skip_root', - 'repeating_element_suffix_count', - ] } - - return json.dumps(config_dict, indent=2, sort_keys=True) - - - def _xml_dict_parser(self, in_k, in_v, hops=[]): - - # handle Dictionary - if type(in_v) == OrderedDict: - - # set sibling hash - if in_k != None: - hash_val = in_k - else: - hash_val = hash(frozenset(in_v.keys())) - if hash_val not in self.sibling_hash_counter.keys(): - self.sibling_hash_counter[hash_val] = 1 - else: - self.sibling_hash_counter[hash_val] += 1 - sibling_hash = '%s%s' % (hashlib.md5(str(hash_val).encode('utf-8')).hexdigest()[:4], str(self.sibling_hash_counter[hash_val]).zfill(2)) - - # handle all attributes for node first - for k, v in in_v.items(): - if k.startswith('@'): - - # handle capture_attribute_values - if len(self.capture_attribute_values) > 0 and k.lstrip('@') in self.capture_attribute_values: - temp_hops = hops.copy() - temp_hops.append("%s@" % k) - self._process_kvp(temp_hops, v) - - # format and append if including - if self.include_all_attributes or (len(self.include_attributes) > 0 and k.lstrip('@') in self.include_attributes): - hops = self._format_and_append_hop(hops, 'attribute', k, v) - - # set hop length 
that will be returned to - hop_len = len(hops) - - # loop through remaining element and/or text nodes - for k, v in in_v.items(): - - # add key to hops - if k == '#text': - self._process_kvp(hops, v) - - else: - - # recurse with non attribute nodes (element or text) - if not k.startswith('@'): - - hops = self._format_and_append_hop(hops, 'element', k, None, sibling_hash=sibling_hash) - - # recurse - self._xml_dict_parser(k, v, hops=hops) - - # reset hops - hops = hops[:hop_len] - - # handle list - elif type(in_v) == list: - - hop_len = len(hops) - for d in in_v: - - # recurse - self._xml_dict_parser(None, d, hops=hops) - - # drop hops back one - hops = hops[:hop_len] - - # handle str or int, a value - elif type(in_v) in [str,int]: - - if in_k != '#text': - self._process_kvp(hops, in_v) - - - def _format_and_append_hop(self, hops, hop_type, k, v, sibling_hash=None): - - # handle elements - if hop_type == 'element': - - # if erroring on collision - if self.error_on_delims_collision: - self._check_delims_collision(k) - - # if skipping elements - if len(self.exclude_elements) > 0: - if k in self.exclude_elements: - return hops - - # apply namespace delimiter - if not self.remove_ns_prefix: - hop = k.replace(':', self.ns_prefix_delim) - else: - if ':' in k: - hop = k.split(':')[1] - else: - hop = k - - # if include_sibling_id, append - if self.include_sibling_id: - # if not first entry, but repeating - if int(sibling_hash[-2:]) >= 1: - hop = '%s(%s)' % (hop, sibling_hash) - - # handle elements - if hop_type == 'attribute': - - # skip attribute namespace declarations - if self.skip_attribute_ns_declarations: - if k.startswith(('@xmlns', '@xsi')): - return hops - - # if excluded attributes - if len(self.exclude_attributes) > 0: - if k.lstrip('@') in self.exclude_attributes: - return hops - - # if erroring on collision - if self.error_on_delims_collision: - self._check_delims_collision(k) - self._check_delims_collision(v) - - # apply namespace delimiter - k = k.replace(':', self.ns_prefix_delim) - - # combine - hop = '%s=%s' % (k, v) - - # append and return - hops.append(hop) - return hops - - - def _check_delims_collision(self, value): - - if any(delim in value for delim in [self.node_delim, self.ns_prefix_delim]): - raise self.DelimiterCollision('collision for key value: "%s", collides with a configured delimiter: %s' % - (value, {'node_delim':self.node_delim, 'ns_prefix_delim':self.ns_prefix_delim})) - - - def _process_kvp(self, hops, value): - - ''' - method to add key/value pairs to saved dictionary, - appending new values to pre-existing keys - ''' - - # sanitize value - value = self._sanitize_value(value) - - # join on node delimiter - k = self.node_delim.join(hops) - - # add delims suffix - if self.self_describing: - k = "%(k)s%(node_delim)s%(node_delim_len)s%(ns_prefix_delim)s%(ns_prefix_delim_len)s" % { - 'k':k, - 'node_delim':self.node_delim, - 'node_delim_len':len(self.node_delim), - 'ns_prefix_delim':self.ns_prefix_delim, - 'ns_prefix_delim_len':len(self.ns_prefix_delim) - } - - # init k_list - k_list = [k] - - # handle copy_to mixins - if len(self.copy_to) > 0: - slen = len(k_list) - k_list.extend([ cv for ck, cv in self.copy_to.items() if ck == k ]) - if self.remove_copied_key: - if slen != len(k_list) and k in k_list: - k_list.remove(k) - - # handle copy_to_regex mixins - if len(self.copy_to_regex) > 0: - - # key list prior to copies - slen = len(k_list) - - # loop through copy_to_regex - for rk, rv in self.copy_to_regex.items(): - - # if False, check for match and remove - if rv 
== False: - if re.match(rk, k): - k_list.append(False) - - # attempt sub - else: - try: - sub = re.sub(rk, rv, k) - if sub != k: - k_list.append(sub) - except: - pass - - if self.remove_copied_key: - if slen != len(k_list) and k in k_list: - k_list.remove(k) - - # handle copy_value_to_regex mixins - if len(self.copy_value_to_regex) > 0: - - # key list prior to copies - slen = len(k_list) - - # loop through copy_value_to_regex - for rk, rv in self.copy_value_to_regex.items(): - - # attempt sub - try: - if re.match(r'%s' % rk, value): - k_list.append(rv) - except: - pass - - if self.remove_copied_value: - if slen != len(k_list) and k in k_list: - k_list.remove(k) - - # loop through keys - for k in k_list: - - # if k is false, treat like /dev/null - if k == False: - pass - - # new key, new value - elif k not in self.kvp_dict.keys(): - self.kvp_dict[k] = value - - # pre-existing, but not yet list, convert - elif not self.repeating_element_suffix_count and k in self.kvp_dict.keys() and type(self.kvp_dict[k]) != list: - - if self.skip_repeating_values and value == self.kvp_dict[k]: - pass - else: - tval = self.kvp_dict[k] - self.kvp_dict[k] = [tval, value] - - # suffix key with incrementing int - elif self.repeating_element_suffix_count and k in self.kvp_dict.keys(): - - # check for other numbers - suffix_count = 1 - while True: - if '%s%s#%s' % (k, self.node_delim, suffix_count) in self.kvp_dict.keys(): - suffix_count += 1 - else: - break - self.kvp_dict['%s%s#%s' % (k, self.node_delim, suffix_count)] = value - - # already list, append - else: - if not self.skip_repeating_values or value not in self.kvp_dict[k]: - self.kvp_dict[k].append(value) - - - def _split_and_concat_fields(self): - - ''' - Method to group actions related to splitting and concatenating field values - ''' - - # concat values on all fields - if self.concat_values_on_all_fields: - for k,v in self.kvp_dict.items(): - if type(v) == list: - self.kvp_dict[k] = self.concat_values_on_all_fields.join(v) - - # concat values on select fields - if not self.concat_values_on_all_fields and len(self.concat_values_on_fields) > 0: - for k,v in self.concat_values_on_fields.items(): - if k in self.kvp_dict.keys() and type(self.kvp_dict[k]) == list: - self.kvp_dict[k] = v.join(self.kvp_dict[k]) - - # split values on all fields - if self.split_values_on_all_fields: - for k,v in self.kvp_dict.items(): - if type(v) == str: - self.kvp_dict[k] = v.split(self.split_values_on_all_fields) - - # split values on select fields - if not self.split_values_on_all_fields and len(self.split_values_on_fields) > 0: - for k,v in self.split_values_on_fields.items(): - if k in self.kvp_dict.keys() and type(self.kvp_dict[k]) == str: - self.kvp_dict[k] = self.kvp_dict[k].split(v) - - - def _parse_xml_input(self, xml_input): - - ''' - Note: self may be handler instance passsed - ''' - - # if string, save - if type(xml_input) == str: - if self.include_xml_prop: - try: - self.xml = etree.fromstring(xml_input) - except: - self.xml = etree.fromstring(xml_input.encode('utf-8')) - self._parse_nsmap() - return (xml_input) - - # if etree object, to string and save - if type(xml_input) in [etree._Element, etree._ElementTree]: - if self.include_xml_prop: - self.xml = xml_input - self._parse_nsmap() - return (etree.tostring(xml_input).decode('utf-8')) - - - def _parse_nsmap(self): - - ''' - Note: self may be handler instance passsed - ''' - - # get namespace map, popping None values - _nsmap = self.xml.nsmap.copy() - try: - global_ns = _nsmap.pop(None) - _nsmap['global_ns'] = 
ns0 - except: - pass - self.nsmap = _nsmap - - - def _sanitize_value(self, value): - - ''' - Method to sanitize value before storage in ElasticSearch - - Current sanitations: - - length: Lucene index limited to 32,766, limiting to 32,000 - ''' - - # limit length - if len(value) > 32000: - value = value[:32000] - - # return - return value - - - @staticmethod - def xml_to_kvp(xml_input, handler=None, return_handler=False, **kwargs): - - ''' - Static method to create key/value pairs (kvp) from XML string input - - Args: - - Returns: - - ''' + Args: - # init handler, overwriting defaults if not None - if not handler: - handler = XML2kvp(**kwargs) + Returns: - # clean kvp_dict - handler.kvp_dict = OrderedDict() + ''' - # parse xml input - handler.xml_string = handler._parse_xml_input(xml_input) + # init handler, overwriting defaults if not None + if not handler: + handler = XML2kvp(**kwargs) - # parse as dictionary - handler.xml_dict = xmltodict.parse(handler.xml_string, xml_attribs=True) + # clean kvp_dict + handler.kvp_dict = OrderedDict() - # walk xmltodict parsed dictionary - handler._xml_dict_parser(None, handler.xml_dict, hops=[]) + # parse xml input + handler.xml_string = handler._parse_xml_input(xml_input) - # handle literal mixins - if len(handler.add_literals) > 0: - for k,v in handler.add_literals.items(): - handler.kvp_dict[k] = v + # parse as dictionary + handler.xml_dict = xmltodict.parse(handler.xml_string, xml_attribs=True) - # handle split and concatenations - handler._split_and_concat_fields() + # walk xmltodict parsed dictionary + handler._xml_dict_parser(None, handler.xml_dict, hops=[]) - # convert list to tuples if flagged - if handler.as_tuples: - # convert all lists to tuples - for k,v in handler.kvp_dict.items(): - if type(v) == list: - handler.kvp_dict[k] = tuple(v) + # handle literal mixins + if len(handler.add_literals) > 0: + for k, v in handler.add_literals.items(): + handler.kvp_dict[k] = v - # include metadata about delimeters - if handler.include_meta: + # handle split and concatenations + handler._split_and_concat_fields() - # set delimiters - meta_dict = { - 'node_delim':handler.node_delim, - 'ns_prefix_delim':handler.ns_prefix_delim - } + # convert list to tuples if flagged + if handler.as_tuples: + # convert all lists to tuples + for k, v in handler.kvp_dict.items(): + if type(v) == list: + handler.kvp_dict[k] = tuple(v) - # if nsmap exists, include - if handler.nsmap: - meta_dict['nsmap'] = handler.nsmap + # include metadata about delimeters + if handler.include_meta: - # set as json - handler.kvp_dict['xml2kvp_meta'] = json.dumps(meta_dict) + # set delimiters + meta_dict = { + 'node_delim': handler.node_delim, + 'ns_prefix_delim': handler.ns_prefix_delim + } - # return - if return_handler: - return handler - else: - return handler.kvp_dict + # if nsmap exists, include + if handler.nsmap: + meta_dict['nsmap'] = handler.nsmap + # set as json + handler.kvp_dict['xml2kvp_meta'] = json.dumps(meta_dict) - @staticmethod - def kvp_to_xml(kvp, handler=None, return_handler=False, serialize_xml=False, **kwargs): + # return + if return_handler: + return handler + else: + return handler.kvp_dict - ''' - Method to generate XML from KVP + @staticmethod + def kvp_to_xml(kvp, handler=None, return_handler=False, serialize_xml=False, **kwargs): - Args: - kvp (dict): Dictionary of key value pairs - handler (XML2kvp): Instance of XML2kvp client - return_handler (boolean): Return XML if False, handler if True - ''' + ''' + Method to generate XML from KVP - # DEBUG - stime = 
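+        # usage sketch (illustrative, default settings): XML2kvp.xml_to_kvp() maps
+        #   <root><foo><bar>tronic</bar></foo></root>
+        # to roughly {'root_foo_bar': 'tronic'}; kvp_to_xml() reverses the process by splitting
+        # each key on node_delim ('_' by default) and turning '@name=value' hops back into attributes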
time.time() + Args: + kvp (dict): Dictionary of key value pairs + handler (XML2kvp): Instance of XML2kvp client + return_handler (boolean): Return XML if False, handler if True + ''' - # init handler, overwriting defaults if not None - if not handler: - handler = XML2kvp(**kwargs) + # DEBUG + stime = time.time() - # init XMLRecord - xml_record = XMLRecord() + # init handler, overwriting defaults if not None + if not handler: + handler = XML2kvp(**kwargs) - # loop through items - for k,v in kvp.items(): + # init XMLRecord + xml_record = XMLRecord() - # split on delim - nodes = k.split(handler.node_delim) + # loop through items + for k, v in kvp.items(): - # loop through nodes and create XML element nodes - hops = [] - for i, node in enumerate(nodes): + # split on delim + nodes = k.split(handler.node_delim) - # write hops - if not node.startswith('@'): + # loop through nodes and create XML element nodes + hops = [] + for i, node in enumerate(nodes): - # init attributes - attribs = {} + # write hops + if not node.startswith('@'): - # handle namespaces for tag name - if handler.ns_prefix_delim in node: + # init attributes + attribs = {} - # get prefix and tag name - prefix, tag_name = node.split(handler.ns_prefix_delim) + # handle namespaces for tag name + if handler.ns_prefix_delim in node: - # write - tag_name = '{%s}%s' % (handler.nsmap[prefix], tag_name) + # get prefix and tag name + prefix, tag_name = node.split(handler.ns_prefix_delim) - # else, handle non-namespaced - else: - tag_name = node + # write + tag_name = '{%s}%s' % (handler.nsmap[prefix], tag_name) - # handle sibling hashes - if handler.include_sibling_id: + # else, handle non-namespaced + else: + tag_name = node - # run tag_name through sibling_hash_regex - matches = re.match(sibling_hash_regex, tag_name) - if matches != None: - groups = matches.groups() + # handle sibling hashes + if handler.include_sibling_id: - # if tag_name and sibling hash, append to attribs - if groups[0] and groups[1]: - tag_name = groups[0] - sibling_hash = groups[1] - attribs['sibling_hash_id'] = sibling_hash + # run tag_name through sibling_hash_regex + matches = re.match(sibling_hash_regex, tag_name) + if matches != None: + groups = matches.groups() - # else, assume sibling hash not present, get tag name - elif groups[2]: - tag_name = groups[2] + # if tag_name and sibling hash, append to attribs + if groups[0] and groups[1]: + tag_name = groups[0] + sibling_hash = groups[1] + attribs['sibling_hash_id'] = sibling_hash - # init element - node_ele = etree.Element(tag_name, nsmap=handler.nsmap) + # else, assume sibling hash not present, get tag name + elif groups[2]: + tag_name = groups[2] - # check for attributes - if i+1 < len(nodes) and nodes[i+1].startswith('@'): - while True: - for attrib in nodes[i+1:]: - if attrib.startswith('@'): - attrib_name, attrib_value = attrib.split('=') - attribs[attrib_name.lstrip('@')] = attrib_value - else: - break - break + # init element + node_ele = etree.Element(tag_name, nsmap=handler.nsmap) - # write to element - node_ele.attrib.update(attribs) + # check for attributes + if i + 1 < len(nodes) and nodes[i + 1].startswith('@'): + while True: + for attrib in nodes[i + 1:]: + if attrib.startswith('@'): + attrib_name, attrib_value = attrib.split('=') + attribs[attrib_name.lstrip('@')] = attrib_value + else: + break + break - # append to hops - hops.append(node_ele) + # write to element + node_ele.attrib.update(attribs) - # write values and number of nodes - # # convert with ast.literal_eval to circumvent lists/tuples 
record as strings in pyspark - # # https://github.com/WSULib/combine/issues/361#issuecomment-442510950 - if type(v) == str: + # append to hops + hops.append(node_ele) - # evaluate to expose lists or tuples - try: - v_eval = ast.literal_eval(v) - if type(v_eval) in [list,tuple]: - v = v_eval - except: - pass + # write values and number of nodes + # # convert with ast.literal_eval to circumvent lists/tuples record as strings in pyspark + # # https://github.com/WSULib/combine/issues/361#issuecomment-442510950 + if type(v) == str: - # split based on handler.multivalue_delim - if handler.multivalue_delim != None and type(v) == str and handler.multivalue_delim in v: - v = [ val.strip() for val in v.split(handler.multivalue_delim) ] + # evaluate to expose lists or tuples + try: + v_eval = ast.literal_eval(v) + if type(v_eval) in [list, tuple]: + v = v_eval + except: + pass - # handle single value - if type(v) == str: + # split based on handler.multivalue_delim + if handler.multivalue_delim != None and type(v) == str and handler.multivalue_delim in v: + v = [val.strip() for val in v.split(handler.multivalue_delim)] - # write value - hops[-1].text = str(v) + # handle single value + if type(v) == str: - # append single list of nodes to xml_record - xml_record.node_lists.append(hops) + # write value + hops[-1].text = str(v) - # handle multiple values - elif type(v) in [list,tuple]: + # append single list of nodes to xml_record + xml_record.node_lists.append(hops) - # loop through values - for value in v: + # handle multiple values + elif type(v) in [list, tuple]: - # copy hops - hops_copy = deepcopy(hops) + # loop through values + for value in v: + # copy hops + hops_copy = deepcopy(hops) - # write value - hops_copy[-1].text = str(value) + # write value + hops_copy[-1].text = str(value) - # append single list of nodes to xml_record - xml_record.node_lists.append(hops_copy) + # append single list of nodes to xml_record + xml_record.node_lists.append(hops_copy) - # tether parent and child nodes - xml_record.tether_node_lists() + # tether parent and child nodes + xml_record.tether_node_lists() - # merge all root nodes - xml_record.merge_root_nodes() + # merge all root nodes + xml_record.merge_root_nodes() - # if sibling hashes included, attempt to merge - if handler.include_sibling_id: - xml_record.merge_siblings() - - # return - if serialize_xml: - return xml_record.serialize() - else: - return xml_record - - - @staticmethod - def k_to_xpath(k, handler=None, return_handler=False, **kwargs): - - ''' - Method to derive xpath from kvp key - ''' - - # init handler - if not handler: - handler = XML2kvp(**kwargs) - - # for each column, reconstitue columnName --> XPath - k_parts = k.split(handler.node_delim) - - # if skip root - if handler.skip_root: - k_parts = k_parts[1:] - - # if include_sibling_id, strip 6 char id from end - if handler.include_sibling_id: - k_parts = [ part[:-8] if not part.startswith('@') else part for part in k_parts ] - - # set initial on_attrib flag - on_attrib = False - - # init path string - if not handler.skip_root: - xpath = '' - else: - xpath = '/' # begin with single slash, will get appended to - - # determine if mixing of namespaced and non-namespaced elements - ns_used = False - for part in k_parts: - if handler.ns_prefix_delim in part: - ns_used = True - - # loop through pieces and build xpath - for i, part in enumerate(k_parts): - - # if not attribute, assume node hop - if not part.startswith('@'): - - # handle closing attrib if present - if on_attrib: - xpath += ']/' - 
on_attrib = False - - # close previous element - else: - xpath += '/' - - # handle parts without namespace, mingled among namespaced elements - if ns_used and handler.ns_prefix_delim not in part: - part = '*[local-name() = "%s"]' % part - else: - # replace delimiter with colon for prefix - part = part.replace(handler.ns_prefix_delim,':') - - # if part not followed by attribute, append no attribute qualifier - if ((i+1) < len(k_parts) and not k_parts[(i+1)].startswith('@')) or ((i+1) == len(k_parts) and not part.startswith('@')): - part += '[not(@*)]' - - # append to xpath - xpath += part - - # if attribute, assume part of previous element and build - else: - - # handle attribute - attrib, value = part.split('=') - - # if not on_attrib, open xpath for attribute inclusion - if not on_attrib: - xpath += "[%s='%s'" % (attrib, value) - - # else, currently in attribute write block, continue - else: - xpath += " and %s='%s'" % (attrib, value) - - # set on_attrib flag for followup - on_attrib = True - - # cleanup after loop - if on_attrib: - - # close attrib brackets - xpath += ']' - - # finally, avoid matching descandants - xpath += '[not(*)]' - - # save to handler - handler.k_xpath_dict[k] = xpath - - # return - if return_handler: - return handler - else: - return xpath - - - @staticmethod - def kvp_to_xpath( - kvp, - node_delim=None, - ns_prefix_delim=None, - skip_root=None, - handler=None, - return_handler=False): - - # init handler - if not handler: - handler = XML2kvp( - node_delim=node_delim, - ns_prefix_delim=ns_prefix_delim, - skip_root=skip_root) - - # handle forms of kvp - if type(kvp) == str: - handler.kvp_dict = json.loads(kvp) - if type(kvp) == dict: - handler.kvp_dict = kvp - - # loop through and append to handler - for k,v in handler.kvp_dict.items(): - XML2kvp.k_to_xpath(k, handler=handler) - - # return - if return_handler: - return handler - else: - return handler.k_xpath_dict - - - def test_kvp_to_xpath_roundtrip(self): - - # check for self.xml and self.nsmap - if not hasattr(self, 'xml'): - try: - self.xml = etree.fromstring(self.xml_string) - except: - self.xml = etree.fromstring(self.xml_string.encode('utf-8')) - if not hasattr(self, 'nsmap'): - self._parse_nsmap() - - # generate xpaths values - self = XML2kvp.kvp_to_xpath(self.kvp_dict, handler=self, return_handler=True) - - # check instances and report - for k,v in self.k_xpath_dict.items(): - try: - matched_elements = self.xml.xpath(v, namespaces=self.nsmap) - values = self.kvp_dict[k] - if type(values) == str: - values_len = 1 - elif type(values) in [tuple,list]: - values_len = len(values) - if len(matched_elements) != values_len: - logger.debug('mistmatch on %s --> %s, matched elements:values --> %s:%s' % (k, v, values_len, len(matched_elements))) - except etree.XPathEvalError: - logger.debug('problem with xpath statement: %s' % v) - logger.debug('could not calculate %s --> %s' % (k, v)) - - - @staticmethod - def test_xml_to_kvp_speed(iterations, kwargs): - - stime=time.time() - for x in range(0, iterations): - XML2kvp.xml_to_kvp(XML2kvp.test_xml, **kwargs) - print("avg for %s iterations: %s" % (iterations, (time.time()-stime) / float(iterations))) - - - def schema_as_table(self, table_format='rst'): - - ''' - Method to export schema as tabular table - - converts list of lists into ASCII table - - Args: - table_format (str) ['rst','md'] - ''' - - # init table - table = [] - - # set headers - table.append(['Parameter','Type','Description']) - - # loop through schema properties and add - props = self.schema['properties'] - 
for k,v in props.items(): - table.append([ - "``%s``" % k, - self._table_format_type(v['type']), - self._table_format_desc(v['description']) - ]) - - # sort by property name - table.sort(key=lambda x: x[0]) - - # return as table based on table_format - if table_format == 'rst': - return dashtable.data2rst(table, use_headers=True) - elif table_format == 'md': - return dashtable.data2md(table, use_headers=True) - elif table_format == 'html': - return None - - - def _table_format_type(self, prop_type): - - ''' - Method to format XML2kvp configuration property type for table - ''' - - # handle single - if type(prop_type) == str: - return "``%s``" % prop_type - - # handle list - elif type(prop_type) == list: - return "[" + ",".join([ "``%s``" % t for t in prop_type ]) + "]" - - - def _table_format_desc(self, desc): - - ''' - Method to format XML2kvp configuration property description for table - ''' - - return desc - - - @staticmethod - def k_to_human(k, handler=None, return_handler=False, **kwargs): - - ''' - Method to humanize k's with sibling hashes and attributes - ''' - - # remove sibling hash - if handler.include_sibling_id: - k = re.sub(r'\(.+?\)','',k) - - # rewrite namespace - k = re.sub(r'\%s' % handler.ns_prefix_delim,':',k) - - # return - return k + # if sibling hashes included, attempt to merge + if handler.include_sibling_id: + xml_record.merge_siblings() + + # return + if serialize_xml: + return xml_record.serialize() + else: + return xml_record + + @staticmethod + def k_to_xpath(k, handler=None, return_handler=False, **kwargs): + + ''' + Method to derive xpath from kvp key + ''' + + # init handler + if not handler: + handler = XML2kvp(**kwargs) + + # for each column, reconstitue columnName --> XPath + k_parts = k.split(handler.node_delim) + + # if skip root + if handler.skip_root: + k_parts = k_parts[1:] + + # if include_sibling_id, strip 6 char id from end + if handler.include_sibling_id: + k_parts = [part[:-8] if not part.startswith('@') else part for part in k_parts] + + # set initial on_attrib flag + on_attrib = False + + # init path string + if not handler.skip_root: + xpath = '' + else: + xpath = '/' # begin with single slash, will get appended to + + # determine if mixing of namespaced and non-namespaced elements + ns_used = False + for part in k_parts: + if handler.ns_prefix_delim in part: + ns_used = True + + # loop through pieces and build xpath + for i, part in enumerate(k_parts): + + # if not attribute, assume node hop + if not part.startswith('@'): + + # handle closing attrib if present + if on_attrib: + xpath += ']/' + on_attrib = False + + # close previous element + else: + xpath += '/' + + # handle parts without namespace, mingled among namespaced elements + if ns_used and handler.ns_prefix_delim not in part: + part = '*[local-name() = "%s"]' % part + else: + # replace delimiter with colon for prefix + part = part.replace(handler.ns_prefix_delim, ':') + + # if part not followed by attribute, append no attribute qualifier + if ((i + 1) < len(k_parts) and not k_parts[(i + 1)].startswith('@')) or ( + (i + 1) == len(k_parts) and not part.startswith('@')): + part += '[not(@*)]' + + # append to xpath + xpath += part + + # if attribute, assume part of previous element and build + else: + + # handle attribute + attrib, value = part.split('=') + + # if not on_attrib, open xpath for attribute inclusion + if not on_attrib: + xpath += "[%s='%s'" % (attrib, value) + + # else, currently in attribute write block, continue + else: + xpath += " and %s='%s'" % (attrib, value) + + 
# set on_attrib flag for followup + on_attrib = True + + # cleanup after loop + if on_attrib: + # close attrib brackets + xpath += ']' + + # finally, avoid matching descandants + xpath += '[not(*)]' + + # save to handler + handler.k_xpath_dict[k] = xpath + + # return + if return_handler: + return handler + else: + return xpath + + @staticmethod + def kvp_to_xpath( + kvp, + node_delim=None, + ns_prefix_delim=None, + skip_root=None, + handler=None, + return_handler=False): + + # init handler + if not handler: + handler = XML2kvp( + node_delim=node_delim, + ns_prefix_delim=ns_prefix_delim, + skip_root=skip_root) + + # handle forms of kvp + if type(kvp) == str: + handler.kvp_dict = json.loads(kvp) + if type(kvp) == dict: + handler.kvp_dict = kvp + + # loop through and append to handler + for k, v in handler.kvp_dict.items(): + XML2kvp.k_to_xpath(k, handler=handler) + + # return + if return_handler: + return handler + else: + return handler.k_xpath_dict + + def test_kvp_to_xpath_roundtrip(self): + + # check for self.xml and self.nsmap + if not hasattr(self, 'xml'): + try: + self.xml = etree.fromstring(self.xml_string) + except: + self.xml = etree.fromstring(self.xml_string.encode('utf-8')) + if not hasattr(self, 'nsmap'): + self._parse_nsmap() + + # generate xpaths values + self = XML2kvp.kvp_to_xpath(self.kvp_dict, handler=self, return_handler=True) + + # check instances and report + for k, v in self.k_xpath_dict.items(): + try: + matched_elements = self.xml.xpath(v, namespaces=self.nsmap) + values = self.kvp_dict[k] + if type(values) == str: + values_len = 1 + elif type(values) in [tuple, list]: + values_len = len(values) + if len(matched_elements) != values_len: + logger.debug('mistmatch on %s --> %s, matched elements:values --> %s:%s' % ( + k, v, values_len, len(matched_elements))) + except etree.XPathEvalError: + logger.debug('problem with xpath statement: %s' % v) + logger.debug('could not calculate %s --> %s' % (k, v)) + + @staticmethod + def test_xml_to_kvp_speed(iterations, kwargs): + + stime = time.time() + for x in range(0, iterations): + XML2kvp.xml_to_kvp(XML2kvp.test_xml, **kwargs) + print("avg for %s iterations: %s" % (iterations, (time.time() - stime) / float(iterations))) + + def schema_as_table(self, table_format='rst'): + + ''' + Method to export schema as tabular table + - converts list of lists into ASCII table + + Args: + table_format (str) ['rst','md'] + ''' + + # init table + table = [] + + # set headers + table.append(['Parameter', 'Type', 'Description']) + + # loop through schema properties and add + props = self.schema['properties'] + for k, v in props.items(): + table.append([ + "``%s``" % k, + self._table_format_type(v['type']), + self._table_format_desc(v['description']) + ]) + + # sort by property name + table.sort(key=lambda x: x[0]) + + # return as table based on table_format + if table_format == 'rst': + return dashtable.data2rst(table, use_headers=True) + elif table_format == 'md': + return dashtable.data2md(table, use_headers=True) + elif table_format == 'html': + return None + + def _table_format_type(self, prop_type): + + ''' + Method to format XML2kvp configuration property type for table + ''' + + # handle single + if type(prop_type) == str: + return "``%s``" % prop_type + + # handle list + elif type(prop_type) == list: + return "[" + ",".join(["``%s``" % t for t in prop_type]) + "]" + + def _table_format_desc(self, desc): + + ''' + Method to format XML2kvp configuration property description for table + ''' + + return desc + + @staticmethod + def 
k_to_human(k, handler=None, return_handler=False, **kwargs): + + ''' + Method to humanize k's with sibling hashes and attributes + ''' + + # remove sibling hash + if handler.include_sibling_id: + k = re.sub(r'\(.+?\)', '', k) + + # rewrite namespace + k = re.sub(r'\%s' % handler.ns_prefix_delim, ':', k) + # return + return k class XMLRecord(object): + ''' + Class to scaffold and create XML records from XML2kvp kvp + ''' - ''' - Class to scaffold and create XML records from XML2kvp kvp - ''' - - def __init__(self): - - self.root_node = None - self.node_lists = [] - self.nodes = [] - self.merge_metrics = {} - - - def tether_node_lists(self): - - ''' - Method to tether nodes from node_lists as parent/child - - Returns: - writes parent node to self.nodes - ''' - - for node_list in self.node_lists: - - # loop through nodes - parent_node = None - for i,node in enumerate(node_list): - - # append to parent - if i > 0: - parent_node.append(node) + def __init__(self): - # set as new parent and continue - parent_node = node + self.root_node = None + self.node_lists = [] + self.nodes = [] + self.merge_metrics = {} - # add root node from each list to self.nodes - self.nodes.append(node_list[0]) + def tether_node_lists(self): + ''' + Method to tether nodes from node_lists as parent/child - def merge_root_nodes(self): + Returns: + writes parent node to self.nodes + ''' - ''' - Method to merge all nodes from self.nodes - ''' + for node_list in self.node_lists: - # set root with arbitrary first node - self.root_node = self.nodes[0] + # loop through nodes + parent_node = None + for i, node in enumerate(node_list): - # loop through others, add children to root node - for node in self.nodes[1:]: + # append to parent + if i > 0: + parent_node.append(node) - # get children - children = node.getchildren() + # set as new parent and continue + parent_node = node - # loop through and add to root node - for child in children: - self.root_node.append(child) + # add root node from each list to self.nodes + self.nodes.append(node_list[0]) + def merge_root_nodes(self): - def merge_siblings(self, remove_empty_nodes=True, remove_sibling_hash_attrib=True): + ''' + Method to merge all nodes from self.nodes + ''' - ''' - Method to merge all siblings if sibling_hash provided - ''' + # set root with arbitrary first node + self.root_node = self.nodes[0] - # init list of finished hashes - finished_sibling_hashes = [] + # loop through others, add children to root node + for node in self.nodes[1:]: - # loop through root children - for node_path in self.root_node.getchildren(): + # get children + children = node.getchildren() - # get all descendents (should be simple hierarchy) - nodes = list(node_path.iterdescendants()) + # loop through and add to root node + for child in children: + self.root_node.append(child) - # reverse, to deal with most granular first - nodes.reverse() + def merge_siblings(self, remove_empty_nodes=True, remove_sibling_hash_attrib=True): - # loop through nodes - for node in nodes: + ''' + Method to merge all siblings if sibling_hash provided + ''' - # check if sibling hash present as attribute, and not already completed - if 'sibling_hash_id' in node.attrib and node.attrib.get('sibling_hash_id') not in finished_sibling_hashes: + # init list of finished hashes + finished_sibling_hashes = [] - # get hash - sibling_hash = node.attrib.get('sibling_hash_id') + # loop through root children + for node_path in self.root_node.getchildren(): - # group siblings - self.merge_metrics[sibling_hash] = 
self._siblings_xpath_merge(sibling_hash, - remove_empty_nodes=remove_empty_nodes) + # get all descendents (should be simple hierarchy) + nodes = list(node_path.iterdescendants()) - # remove sibling_hash_id - if remove_sibling_hash_attrib: - all_siblings = self.root_node.xpath('//*[@sibling_hash_id]', namespaces=self.root_node.nsmap) - for sibling in all_siblings: - sibling.attrib.pop('sibling_hash_id') + # reverse, to deal with most granular first + nodes.reverse() + # loop through nodes + for node in nodes: - def _siblings_xpath_merge(self, sibling_hash, remove_empty_nodes=True): + # check if sibling hash present as attribute, and not already completed + if 'sibling_hash_id' in node.attrib and node.attrib.get( + 'sibling_hash_id') not in finished_sibling_hashes: + # get hash + sibling_hash = node.attrib.get('sibling_hash_id') - ''' - Internal method to handle the actual movement of sibling nodes - - performs XPath query - - moves siblings to parent of 0th result + # group siblings + self.merge_metrics[sibling_hash] = self._siblings_xpath_merge(sibling_hash, + remove_empty_nodes=remove_empty_nodes) - Args: - sibling_hash (str): Sibling has to perform Xpath query with - remove_empty_nodes (bool): If True, remove nodes that no longer contain children + # remove sibling_hash_id + if remove_sibling_hash_attrib: + all_siblings = self.root_node.xpath('//*[@sibling_hash_id]', namespaces=self.root_node.nsmap) + for sibling in all_siblings: + sibling.attrib.pop('sibling_hash_id') - Returns: + def _siblings_xpath_merge(self, sibling_hash, remove_empty_nodes=True): - ''' + ''' + Internal method to handle the actual movement of sibling nodes + - performs XPath query + - moves siblings to parent of 0th result - # xpath query to find all siblings in tree - siblings = self.root_node.xpath('//*[@sibling_hash_id="%s"]' % sibling_hash, namespaces=self.root_node.nsmap) + Args: + sibling_hash (str): Sibling has to perform Xpath query with + remove_empty_nodes (bool): If True, remove nodes that no longer contain children - # metrics - removed = 0 - moved = 0 + Returns: - # if results - if len(siblings) > 0: + ''' - # establish arbitrary target parent node as 0th parent - target_parent = siblings[0].getparent() + # xpath query to find all siblings in tree + siblings = self.root_node.xpath('//*[@sibling_hash_id="%s"]' % sibling_hash, namespaces=self.root_node.nsmap) - # loop through remainders and move there - for sibling in siblings[1:]: + # metrics + removed = 0 + moved = 0 - # get parent - parent = sibling.getparent() + # if results + if len(siblings) > 0: - # move to target parent - target_parent.append(sibling) + # establish arbitrary target parent node as 0th parent + target_parent = siblings[0].getparent() - # if flagged, remove parent if now empty - if remove_empty_nodes: - if len(parent.getchildren()) == 0: - parent.getparent().remove(parent) - removed += 1 + # loop through remainders and move there + for sibling in siblings[1:]: - # bump counter - moved += 1 + # get parent + parent = sibling.getparent() - # return metrics - metrics = {'sibling_hash':sibling_hash, 'removed':removed, 'moved':moved} - return metrics + # move to target parent + target_parent.append(sibling) + # if flagged, remove parent if now empty + if remove_empty_nodes: + if len(parent.getchildren()) == 0: + parent.getparent().remove(parent) + removed += 1 - def serialize(self, pretty_print=True): + # bump counter + moved += 1 - ''' - Method to serialize self.root_node to XML - ''' + # return metrics + metrics = {'sibling_hash': 
sibling_hash, 'removed': removed, 'moved': moved} + return metrics - return etree.tostring(self.root_node, pretty_print=pretty_print).decode('utf-8') + def serialize(self, pretty_print=True): + ''' + Method to serialize self.root_node to XML + ''' + return etree.tostring(self.root_node, pretty_print=pretty_print).decode('utf-8') diff --git a/docs/conf.py b/docs/conf.py index 17f2e890..9118e2d1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,6 @@ # The full version, including alpha/beta/rc tags release = '' - # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. @@ -68,7 +67,6 @@ # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'manni' - # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -103,7 +101,6 @@ # Output file base name for HTML help builder. htmlhelp_basename = 'Combinedoc' - # -- Options for LaTeX output ------------------------------------------------ latex_elements = { @@ -132,7 +129,6 @@ 'Michigan State DPLA Service Hub', 'manual'), ] - # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples @@ -142,7 +138,6 @@ [author], 1) ] - # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples @@ -154,10 +149,8 @@ 'Miscellaneous'), ] + # -- Custom Setup for CSS ---------------------------------------------- def setup(app): app.add_stylesheet('css/custom.css') - - - \ No newline at end of file diff --git a/inc/console.py b/inc/console.py index 8bcf2bf8..701b7183 100644 --- a/inc/console.py +++ b/inc/console.py @@ -4,42 +4,41 @@ from core.models import * - # get Record instance def get_r(id): - return Record.objects.get(id=id) + return Record.objects.get(id=id) # get Job instance def get_j(id): - return Job.objects.get(pk=int(id)) + return Job.objects.get(pk=int(id)) # get CombineJob instance def get_cj(id): - return CombineJob.get_combine_job(int(id)) + return CombineJob.get_combine_job(int(id)) # get RecordGroup instance def get_rg(id): - return RecordGroup.objects.get(pk=int(id)) + return RecordGroup.objects.get(pk=int(id)) # get Organization instance def get_o(id): - return Organization.objects.get(pk=int(id)) + return Organization.objects.get(pk=int(id)) # tail livy def tail_livy(): - os.system('tail -f /var/log/livy/livy.stderr') + os.system('tail -f /var/log/livy/livy.stderr') # tail django def tail_celery(): - os.system('tail -f /var/log/celery.stdout') + os.system('tail -f /var/log/celery.stdout') # get StateIO instance def get_sio(id): - return StateIO.objects.get(id=id) \ No newline at end of file + return StateIO.objects.get(id=id) diff --git a/tests/conftest.py b/tests/conftest.py index 3531fe34..864d17a9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,3 @@ - import django from lxml import etree import os @@ -10,6 +9,7 @@ # logging import logging + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -25,31 +25,31 @@ # use active livy def pytest_addoption(parser): - parser.addoption('--keep_records', action="store_true") + parser.addoption('--keep_records', action="store_true") @pytest.fixture def keep_records(request): - return request.config.getoption("--keep_records") + return request.config.getoption("--keep_records") # global variables object "VO" 
class Vars(object): + ''' + Object to capture and store variables used across tests + ''' - ''' - Object to capture and store variables used across tests - ''' + def __init__(self): + # debug + self.ping = 'pong' - def __init__(self): + # combine user + self.user = User.objects.filter(username='combine').first() - # debug - self.ping = 'pong' - # combine user - self.user = User.objects.filter(username='combine').first() _VO = Vars() + @pytest.fixture def VO(request): - - return _VO \ No newline at end of file + return _VO diff --git a/tests/data/python_validation.py b/tests/data/python_validation.py index 2e20b1c9..bdb6770c 100644 --- a/tests/data/python_validation.py +++ b/tests/data/python_validation.py @@ -1,43 +1,44 @@ import re + def test_check_for_mods_titleInfo(record, test_message="check for mods:titleInfo element"): + titleInfo_elements = record.xml.xpath('//mods:titleInfo', namespaces=record.nsmap) + if len(titleInfo_elements) > 0: + return True + else: + return False - titleInfo_elements = record.xml.xpath('//mods:titleInfo', namespaces=record.nsmap) - if len(titleInfo_elements) > 0: - return True - else: - return False def test_check_dateIssued_format(record, test_message="check mods:dateIssued is YYYY-MM-DD or YYYY or YYYY-YYYY"): + # get dateIssued elements + dateIssued_elements = record.xml.xpath('//mods:dateIssued', namespaces=record.nsmap) + + # if found, check format + if len(dateIssued_elements) > 0: - # get dateIssued elements - dateIssued_elements = record.xml.xpath('//mods:dateIssued', namespaces=record.nsmap) + # loop through values and check + for dateIssued in dateIssued_elements: - # if found, check format - if len(dateIssued_elements) > 0: + # check format + if dateIssued.text is not None: + match = re.match(r'^([0-9]{4}-[0-9]{2}-[0-9]{2})|([0-9]{4})|([0-9]{4}-[0-9]{4})$', dateIssued.text) + else: + # allow None values to pass test + return True - # loop through values and check - for dateIssued in dateIssued_elements: - - # check format - if dateIssued.text is not None: - match = re.match(r'^([0-9]{4}-[0-9]{2}-[0-9]{2})|([0-9]{4})|([0-9]{4}-[0-9]{4})$', dateIssued.text) - else: - # allow None values to pass test - return True + # match found, continue + if match: + continue + else: + return False - # match found, continue - if match: - continue - else: - return False + # if all matches, return True + return True - # if all matches, return True - return True + # if none found, return True indicating passed test due to omission + else: + return True - # if none found, return True indicating passed test due to omission - else: - return True def test_will_fail(record, test_message="Failure test confirmed fail"): - return False \ No newline at end of file + return False diff --git a/tests/data/qs_python_validation.py b/tests/data/qs_python_validation.py index c4062da3..1912b6c6 100644 --- a/tests/data/qs_python_validation.py +++ b/tests/data/qs_python_validation.py @@ -1,40 +1,40 @@ import re + def test_check_for_mods_titleInfo(record, test_message="check for mods:titleInfo element"): + titleInfo_elements = record.xml.xpath('//mods:titleInfo', namespaces=record.nsmap) + if len(titleInfo_elements) > 0: + return True + else: + return False - titleInfo_elements = record.xml.xpath('//mods:titleInfo', namespaces=record.nsmap) - if len(titleInfo_elements) > 0: - return True - else: - return False def test_check_dateIssued_format(record, test_message="check mods:dateIssued is YYYY-MM-DD or YYYY or YYYY-YYYY"): - - # get dateIssued elements - dateIssued_elements = 
record.xml.xpath('//mods:dateIssued', namespaces=record.nsmap) - - # if found, check format - if len(dateIssued_elements) > 0: - - # loop through values and check - for dateIssued in dateIssued_elements: - - # check format - if dateIssued.text is not None: - match = re.match(r'^([0-9]{4}-[0-9]{2}-[0-9]{2})|([0-9]{4})|([0-9]{4}-[0-9]{4})$', dateIssued.text) - else: - # allow None values to pass test - return True - - # match found, continue - if match: - continue - else: - return False - - # if all matches, return True - return True - - # if none found, return True indicating passed test due to omission - else: - return True \ No newline at end of file + # get dateIssued elements + dateIssued_elements = record.xml.xpath('//mods:dateIssued', namespaces=record.nsmap) + + # if found, check format + if len(dateIssued_elements) > 0: + + # loop through values and check + for dateIssued in dateIssued_elements: + + # check format + if dateIssued.text is not None: + match = re.match(r'^([0-9]{4}-[0-9]{2}-[0-9]{2})|([0-9]{4})|([0-9]{4}-[0-9]{4})$', dateIssued.text) + else: + # allow None values to pass test + return True + + # match found, continue + if match: + continue + else: + return False + + # if all matches, return True + return True + + # if none found, return True indicating passed test due to omission + else: + return True diff --git a/tests/test_basic.py b/tests/test_basic.py index a2db980d..e39d3bb0 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,4 +1,3 @@ - import django from lxml import etree import os @@ -10,6 +9,7 @@ # logging import logging + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -29,36 +29,33 @@ @pytest.mark.run(order=1) def test_organization_create(VO): + ''' + Test creation of organization + ''' - ''' - Test creation of organization - ''' - - # instantiate and save - VO.org = Organization( - name='test_org_%s' % uuid.uuid4().hex, - description='' - ) - VO.org.save() - assert type(VO.org.id) == int + # instantiate and save + VO.org = Organization( + name='test_org_%s' % uuid.uuid4().hex, + description='' + ) + VO.org.save() + assert type(VO.org.id) == int @pytest.mark.run(order=2) def test_record_group_create(VO): + ''' + Test creation of record group + ''' - ''' - Test creation of record group - ''' - - # instantiate and save - VO.rg = RecordGroup( - organization=VO.org, - name='test_record_group_%s' % uuid.uuid4().hex, - description='' - ) - VO.rg.save() - assert type(VO.rg.id) == int - + # instantiate and save + VO.rg = RecordGroup( + organization=VO.org, + name='test_record_group_%s' % uuid.uuid4().hex, + description='' + ) + VO.rg.save() + assert type(VO.rg.id) == int ############################################################################# @@ -67,80 +64,78 @@ def test_record_group_create(VO): @pytest.mark.run(order=3) def test_static_harvest(VO): - - ''' - Test static harvest of XML records from disk - ''' - - # copy test data to /tmp - payload_dir = '/tmp/%s' % uuid.uuid4().hex - shutil.copytree('/opt/combine/tests/data/static_harvest_data', payload_dir) - - # emulate request.POST - request_dict = { - 'dbdd': '', - 'job_note': '', - 'xpath_record_id': '', - 'static_filepath': payload_dir, - 'fm_config_json': 
'{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}', - 'static_payload': '', - 'job_name': '', - 'field_mapper': 'default', - 'rits': '', - 'additional_namespace_decs': 'xmlns:mods="http://www.loc.gov/mods/v3"', - 'document_element_root': 'mods:mods' - } - query_dict = QueryDict('', mutable=True) - query_dict.update(request_dict) - - # init job, using Variable Object (VO) - cjob = CombineJob.init_combine_job( - user = VO.user, - record_group = VO.rg, - job_type_class = HarvestStaticXMLJob, - job_params = query_dict, - files = {}, - hash_payload_filename = False - ) - - # start job and update status - job_status = cjob.start_job() - - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() - - # poll until complete - for x in range(0, 480): - - # pause - time.sleep(1) - - # refresh session - cjob.job.update_status() - - # check status - if cjob.job.status != 'available': - continue - else: - break - - # save static harvest job to VO - VO.static_harvest_cjob = cjob - - # remove payload_dir - shutil.rmtree(payload_dir) - - # assert job is done and available via livy - assert VO.static_harvest_cjob.job.status == 'available' - - # assert record count is 250 - assert VO.static_harvest_cjob.job.record_count == 250 - - # assert no indexing failures - assert len(VO.static_harvest_cjob.get_indexing_failures()) == 0 - + ''' + Test static harvest of XML records from disk + ''' + + # copy test data to /tmp + payload_dir = '/tmp/%s' % uuid.uuid4().hex + shutil.copytree('/opt/combine/tests/data/static_harvest_data', payload_dir) + + # emulate request.POST + request_dict = { + 'dbdd': '', + 'job_note': '', + 'xpath_record_id': '', + 'static_filepath': payload_dir, + 'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}', + 'static_payload': '', + 'job_name': '', + 'field_mapper': 'default', + 'rits': '', + 'additional_namespace_decs': 'xmlns:mods="http://www.loc.gov/mods/v3"', + 'document_element_root': 'mods:mods' + } + query_dict = QueryDict('', mutable=True) + query_dict.update(request_dict) + + # init job, using Variable Object (VO) + cjob = CombineJob.init_combine_job( + user=VO.user, + record_group=VO.rg, + job_type_class=HarvestStaticXMLJob, + job_params=query_dict, + 
files={}, + hash_payload_filename=False + ) + + # start job and update status + job_status = cjob.start_job() + + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() + + # poll until complete + for x in range(0, 480): + + # pause + time.sleep(1) + + # refresh session + cjob.job.update_status() + + # check status + if cjob.job.status != 'available': + continue + else: + break + + # save static harvest job to VO + VO.static_harvest_cjob = cjob + + # remove payload_dir + shutil.rmtree(payload_dir) + + # assert job is done and available via livy + assert VO.static_harvest_cjob.job.status == 'available' + + # assert record count is 250 + assert VO.static_harvest_cjob.job.record_count == 250 + + # assert no indexing failures + assert len(VO.static_harvest_cjob.get_indexing_failures()) == 0 # ############################################################################# @@ -148,98 +143,95 @@ def test_static_harvest(VO): # ############################################################################# def prepare_transform(): + ''' + Create temporary transformation scenario based on tests/data/mods_transform.xsl + ''' - ''' - Create temporary transformation scenario based on tests/data/mods_transform.xsl - ''' - - with open('tests/data/mods_transform.xsl','r') as f: - xsl_string = f.read() - trans = Transformation( - name='temp_mods_transformation', - payload=xsl_string, - transformation_type='xslt', - filepath='will_be_updated' - ) - trans.save() + with open('tests/data/mods_transform.xsl', 'r') as f: + xsl_string = f.read() + trans = Transformation( + name='temp_mods_transformation', + payload=xsl_string, + transformation_type='xslt', + filepath='will_be_updated' + ) + trans.save() - # return transformation - return trans + # return transformation + return trans @pytest.mark.run(order=4) def test_static_transform(VO): - - ''' - Test static harvest of XML records from disk - ''' - - # prepare and capture temporary transformation scenario - VO.transformation_scenario = prepare_transform() - - # emulate request.POST - request_dict = { - 'dbdd': '', - 'field_mapper': 'default', - 'filter_dupe_record_ids': 'true', - 'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}', - 'input_es_query_valve': '', - 'input_job_id': VO.static_harvest_cjob.job.id, - 'input_numerical_valve': '', - 'input_validity_valve': 'all', - 'job_name': '', - 'job_note': '', - 'rits': '', - 'sel_trans_json': '[{"index":0,"trans_id":%s}]' % VO.transformation_scenario.id - } - query_dict = QueryDict('', mutable=True) - query_dict.update(request_dict) - - # init job - cjob = CombineJob.init_combine_job( - user = VO.user, - record_group = VO.rg, - job_type_class = TransformJob, - job_params = query_dict) - - # start job and update status - job_status = cjob.start_job() - - # if job_status is absent, report job status as 
failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() - - # poll until complete - for x in range(0, 480): - - # pause - time.sleep(1) - - # refresh session - cjob.job.update_status() - - # check status - if cjob.job.status != 'available': - continue - else: - break - - # save static harvest job to VO - VO.static_transform_cjob = cjob - - # assert job is done and available via livy - assert VO.static_transform_cjob.job.status == 'available' - - # assert record count is 250 - assert VO.static_transform_cjob.job.record_count == 250 - - # assert no indexing failures - assert len(VO.static_transform_cjob.get_indexing_failures()) == 0 - - # remove transformation - assert VO.transformation_scenario.delete()[0] > 0 - + ''' + Test static harvest of XML records from disk + ''' + + # prepare and capture temporary transformation scenario + VO.transformation_scenario = prepare_transform() + + # emulate request.POST + request_dict = { + 'dbdd': '', + 'field_mapper': 'default', + 'filter_dupe_record_ids': 'true', + 'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}', + 'input_es_query_valve': '', + 'input_job_id': VO.static_harvest_cjob.job.id, + 'input_numerical_valve': '', + 'input_validity_valve': 'all', + 'job_name': '', + 'job_note': '', + 'rits': '', + 'sel_trans_json': '[{"index":0,"trans_id":%s}]' % VO.transformation_scenario.id + } + query_dict = QueryDict('', mutable=True) + query_dict.update(request_dict) + + # init job + cjob = CombineJob.init_combine_job( + user=VO.user, + record_group=VO.rg, + job_type_class=TransformJob, + job_params=query_dict) + + # start job and update status + job_status = cjob.start_job() + + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() + + # poll until complete + for x in range(0, 480): + + # pause + time.sleep(1) + + # refresh session + cjob.job.update_status() + + # check status + if cjob.job.status != 'available': + continue + else: + break + + # save static harvest job to VO + VO.static_transform_cjob = cjob + + # assert job is done and available via livy + assert VO.static_transform_cjob.job.status == 'available' + + # assert record count is 250 + assert VO.static_transform_cjob.job.record_count == 250 + + # assert no indexing failures + assert len(VO.static_transform_cjob.get_indexing_failures()) == 0 + + # remove transformation + assert VO.transformation_scenario.delete()[0] > 0 # ############################################################################# @@ -248,99 +240,94 @@ def test_static_transform(VO): @pytest.mark.run(order=5) def test_add_schematron_validation_scenario(VO): + ''' + Add schematron validation + ''' - ''' - Add schematron validation - ''' + # get schematron validation from test data + with open('tests/data/schematron_validation.sch', 'r') as f: + sch_payload = f.read() - # get schematron 
validation from test data - with open('tests/data/schematron_validation.sch','r') as f: - sch_payload = f.read() + # init new validation scenario + schematron_validation_scenario = ValidationScenario( + name='temp_vs_%s' % str(uuid.uuid4()), + payload=sch_payload, + validation_type='sch', + default_run=False + ) + schematron_validation_scenario.save() - # init new validation scenario - schematron_validation_scenario = ValidationScenario( - name='temp_vs_%s' % str(uuid.uuid4()), - payload=sch_payload, - validation_type='sch', - default_run=False - ) - schematron_validation_scenario.save() + # pin to VO + VO.schematron_validation_scenario = schematron_validation_scenario - # pin to VO - VO.schematron_validation_scenario = schematron_validation_scenario - - # assert creation - assert type(VO.schematron_validation_scenario.id) == int + # assert creation + assert type(VO.schematron_validation_scenario.id) == int @pytest.mark.run(order=6) def test_add_python_validation_scenario(VO): + ''' + Add python code snippet validation + ''' - ''' - Add python code snippet validation - ''' - - # get python validation from test data - with open('tests/data/python_validation.py','r') as f: - py_payload = f.read() + # get python validation from test data + with open('tests/data/python_validation.py', 'r') as f: + py_payload = f.read() - # init new validation scenario - python_validation_scenario = ValidationScenario( - name='temp_vs_%s' % str(uuid.uuid4()), - payload=py_payload, - validation_type='python', - default_run=False - ) - python_validation_scenario.save() + # init new validation scenario + python_validation_scenario = ValidationScenario( + name='temp_vs_%s' % str(uuid.uuid4()), + payload=py_payload, + validation_type='python', + default_run=False + ) + python_validation_scenario.save() - # pin to VO - VO.python_validation_scenario = python_validation_scenario + # pin to VO + VO.python_validation_scenario = python_validation_scenario - # assert creation - assert type(VO.python_validation_scenario.id) == int + # assert creation + assert type(VO.python_validation_scenario.id) == int @pytest.mark.run(order=7) def test_schematron_validation(VO): + # get target records + VO.harvest_record = VO.static_harvest_cjob.job.get_records().first() + VO.transform_record = VO.static_transform_cjob.job.get_records().first() - # get target records - VO.harvest_record = VO.static_harvest_cjob.job.get_records().first() - VO.transform_record = VO.static_transform_cjob.job.get_records().first() - - # validate harvest record with schematron - ''' - expecting failure count of 2 - ''' - vs_results = VO.schematron_validation_scenario.validate_record(VO.harvest_record) - assert vs_results['parsed']['fail_count'] == 2 + # validate harvest record with schematron + ''' + expecting failure count of 2 + ''' + vs_results = VO.schematron_validation_scenario.validate_record(VO.harvest_record) + assert vs_results['parsed']['fail_count'] == 2 - # validate transform record with schematron - ''' - expecting failure count of 1 - ''' - vs_results = VO.schematron_validation_scenario.validate_record(VO.transform_record) - assert vs_results['parsed']['fail_count'] == 1 + # validate transform record with schematron + ''' + expecting failure count of 1 + ''' + vs_results = VO.schematron_validation_scenario.validate_record(VO.transform_record) + assert vs_results['parsed']['fail_count'] == 1 @pytest.mark.run(order=8) def test_python_validation(VO): - - # validate harvest record with python - ''' - expecting failure count of 1 - ''' - vs_results = 
VO.python_validation_scenario.validate_record(VO.harvest_record) - print(vs_results) - assert vs_results['parsed']['fail_count'] == 1 - - # validate transform record with python - ''' - expecting failure count of 1 - ''' - vs_results = VO.python_validation_scenario.validate_record(VO.transform_record) - print(vs_results) - assert vs_results['parsed']['fail_count'] == 1 - + # validate harvest record with python + ''' + expecting failure count of 1 + ''' + vs_results = VO.python_validation_scenario.validate_record(VO.harvest_record) + print(vs_results) + assert vs_results['parsed']['fail_count'] == 1 + + # validate transform record with python + ''' + expecting failure count of 1 + ''' + vs_results = VO.python_validation_scenario.validate_record(VO.transform_record) + print(vs_results) + assert vs_results['parsed']['fail_count'] == 1 # ############################################################################# @@ -349,88 +336,86 @@ def test_python_validation(VO): @pytest.mark.run(order=9) def test_merge_duplicate(VO): - - ''' - Duplicate Transform job, applying newly created validation scenarios - ''' - - # emulate request.POST - request_dict = { - 'dbdd': '', - 'field_mapper': 'default', - 'filter_dupe_record_ids': 'true', - 'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}', - 'input_es_query_valve': '', - 'input_numerical_valve': '', - 'input_validity_valve': 'all', - 'job_name': '', - 'job_note': '', - 'rits': '' - } - query_dict = QueryDict('', mutable=True) - query_dict.update(request_dict) - - # set input jobs with QueryDict.setlist - query_dict.setlist('input_job_id', [ - VO.static_harvest_cjob.job.id, - VO.static_transform_cjob.job.id - ]) - # set validation scenarios with QueryDict.setlist - query_dict.setlist('validation_scenario', [ - VO.schematron_validation_scenario.id, - VO.python_validation_scenario.id - ]) - - # init job - cjob = CombineJob.init_combine_job( - user = VO.user, - record_group = VO.rg, - job_type_class = MergeJob, - job_params = query_dict) - - # start job and update status - job_status = cjob.start_job() - - # if job_status is absent, report job status as failed - if job_status == False: - cjob.job.status = 'failed' - cjob.job.save() - - # poll until complete - for x in range(0, 480): - - # pause - time.sleep(1) - - # refresh session - cjob.job.update_status() - - # check status - if cjob.job.status != 'available': - continue - else: - break - - # save static harvest job to VO - VO.merge_cjob = cjob - - # assert job is done and available via livy - assert VO.merge_cjob.job.status == 'available' - - # assert record count is 250 - assert VO.merge_cjob.job.record_count == 250 - - # assert validation scenarios applied - job_validation_scenarios = VO.merge_cjob.job.jobvalidation_set.all() - assert job_validation_scenarios.count() == 2 - - # loop through validation scenarios and confirm that both show 250 failures - for jv 
in job_validation_scenarios: - assert jv.get_record_validation_failures().count() == 232 - - # assert no indexing failures - assert len(VO.merge_cjob.get_indexing_failures()) == 0 - + ''' + Duplicate Transform job, applying newly created validation scenarios + ''' + + # emulate request.POST + request_dict = { + 'dbdd': '', + 'field_mapper': 'default', + 'filter_dupe_record_ids': 'true', + 'fm_config_json': '{"add_literals":{},"capture_attribute_values":[],"concat_values_on_all_fields":false,"concat_values_on_fields":{},"copy_to":{},"copy_to_regex":{},"copy_value_to_regex":{},"error_on_delims_collision":false,"exclude_attributes":[],"exclude_elements":[],"include_all_attributes":false,"include_attributes":[],"include_sibling_id":false,"multivalue_delim":"|","node_delim":"_","ns_prefix_delim":"|","remove_copied_key":true,"remove_copied_value":false,"remove_ns_prefix":true,"repeating_element_suffix_count":false,"self_describing":false,"skip_attribute_ns_declarations":true,"skip_repeating_values":true,"skip_root":false,"split_values_on_all_fields":false,"split_values_on_fields":{}}', + 'input_es_query_valve': '', + 'input_numerical_valve': '', + 'input_validity_valve': 'all', + 'job_name': '', + 'job_note': '', + 'rits': '' + } + query_dict = QueryDict('', mutable=True) + query_dict.update(request_dict) + + # set input jobs with QueryDict.setlist + query_dict.setlist('input_job_id', [ + VO.static_harvest_cjob.job.id, + VO.static_transform_cjob.job.id + ]) + # set validation scenarios with QueryDict.setlist + query_dict.setlist('validation_scenario', [ + VO.schematron_validation_scenario.id, + VO.python_validation_scenario.id + ]) + + # init job + cjob = CombineJob.init_combine_job( + user=VO.user, + record_group=VO.rg, + job_type_class=MergeJob, + job_params=query_dict) + + # start job and update status + job_status = cjob.start_job() + + # if job_status is absent, report job status as failed + if job_status == False: + cjob.job.status = 'failed' + cjob.job.save() + + # poll until complete + for x in range(0, 480): + + # pause + time.sleep(1) + + # refresh session + cjob.job.update_status() + + # check status + if cjob.job.status != 'available': + continue + else: + break + + # save static harvest job to VO + VO.merge_cjob = cjob + + # assert job is done and available via livy + assert VO.merge_cjob.job.status == 'available' + + # assert record count is 250 + assert VO.merge_cjob.job.record_count == 250 + + # assert validation scenarios applied + job_validation_scenarios = VO.merge_cjob.job.jobvalidation_set.all() + assert job_validation_scenarios.count() == 2 + + # loop through validation scenarios and confirm that both show 250 failures + for jv in job_validation_scenarios: + assert jv.get_record_validation_failures().count() == 232 + + # assert no indexing failures + assert len(VO.merge_cjob.get_indexing_failures()) == 0 ############################################################################# @@ -439,29 +424,15 @@ def test_merge_duplicate(VO): @pytest.mark.last def test_teardown(keep_records, VO): - - ''' - Test teardown - ''' - - # assert delete of org and children - if not keep_records: - assert VO.org.delete()[0] > 0 - else: - assert True - - assert VO.schematron_validation_scenario.delete()[0] > 0 - assert VO.python_validation_scenario.delete()[0] > 0 - - - - - - - - - - - - - + ''' + Test teardown + ''' + + # assert delete of org and children + if not keep_records: + assert VO.org.delete()[0] > 0 + else: + assert True + + assert VO.schematron_validation_scenario.delete()[0] > 0 + 
assert VO.python_validation_scenario.delete()[0] > 0 diff --git a/tests/test_bg_tasks.py b/tests/test_bg_tasks.py index c03bacdf..0f4a922d 100644 --- a/tests/test_bg_tasks.py +++ b/tests/test_bg_tasks.py @@ -1,4 +1,3 @@ - import django from lxml import etree import os @@ -10,6 +9,7 @@ # logging import logging + logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) @@ -30,41 +30,31 @@ # test re-indexing @pytest.mark.run(order=10) def test_static_harvest_reindex(VO): + # refresh job + VO.static_harvest_cjob = CombineJob.get_combine_job(VO.static_harvest_cjob.job.id) - # refresh job - VO.static_harvest_cjob = CombineJob.get_combine_job(VO.static_harvest_cjob.job.id) - - # fm config json, adding literal foo:bar - fm_config_json = '{"concat_values_on_all_fields": false, "capture_attribute_values": [], "remove_ns_prefix": true, "skip_attribute_ns_declarations": true, "remove_copied_key": true, "node_delim": "_", "copy_to": {}, "copy_value_to_regex": {}, "copy_to_regex": {}, "split_values_on_all_fields": false, "add_literals": {"foo":"bar"}, "exclude_attributes": [], "ns_prefix_delim": "|", "self_describing": false, "split_values_on_fields": {}, "include_attributes": [], "include_sibling_id": false, "multivalue_delim": "|", "skip_repeating_values": true, "repeating_element_suffix_count": false, "exclude_elements": [], "concat_values_on_fields": {}, "remove_copied_value": false, "error_on_delims_collision": false, "include_all_attributes": false, "skip_root": false}' - - # reindex static harvest - bg_task = VO.static_harvest_cjob.reindex_bg_task(fm_config_json=fm_config_json) - - # poll until complete - for x in range(0, 480): - - # pause - time.sleep(1) - logger.debug('polling for reindexing %s seconds...' % (x)) - - # refresh session - bg_task.update() - - # check status - if bg_task.celery_status not in ['SUCCESS','FAILURE']: - continue - else: - break - - # assert 250 records have foo:bar, indicating successful reindexing - results = VO.static_harvest_cjob.field_analysis('foo') - assert results['metrics']['doc_instances'] == 250 - - - + # fm config json, adding literal foo:bar + fm_config_json = '{"concat_values_on_all_fields": false, "capture_attribute_values": [], "remove_ns_prefix": true, "skip_attribute_ns_declarations": true, "remove_copied_key": true, "node_delim": "_", "copy_to": {}, "copy_value_to_regex": {}, "copy_to_regex": {}, "split_values_on_all_fields": false, "add_literals": {"foo":"bar"}, "exclude_attributes": [], "ns_prefix_delim": "|", "self_describing": false, "split_values_on_fields": {}, "include_attributes": [], "include_sibling_id": false, "multivalue_delim": "|", "skip_repeating_values": true, "repeating_element_suffix_count": false, "exclude_elements": [], "concat_values_on_fields": {}, "remove_copied_value": false, "error_on_delims_collision": false, "include_all_attributes": false, "skip_root": false}' + # reindex static harvest + bg_task = VO.static_harvest_cjob.reindex_bg_task(fm_config_json=fm_config_json) + # poll until complete + for x in range(0, 480): + # pause + time.sleep(1) + logger.debug('polling for reindexing %s seconds...' % (x)) + # refresh session + bg_task.update() + # check status + if bg_task.celery_status not in ['SUCCESS', 'FAILURE']: + continue + else: + break + # assert 250 records have foo:bar, indicating successful reindexing + results = VO.static_harvest_cjob.field_analysis('foo') + assert results['metrics']['doc_instances'] == 250