diff --git a/apps/cli/utilities/mr_books/database/__init__.py b/apps/cli/utilities/datafinder/README.md similarity index 100% rename from apps/cli/utilities/mr_books/database/__init__.py rename to apps/cli/utilities/datafinder/README.md diff --git a/apps/cli/utilities/datafinder/setup.py b/apps/cli/utilities/datafinder/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..cb31133e0edeac0a127a72de129c9008bfea0fb9 --- /dev/null +++ b/apps/cli/utilities/datafinder/setup.py @@ -0,0 +1,33 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/datafinder/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive Data Finding Tools', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pandas', 'requests', 'schema', 'sqlalchemy', 'tqdm'], + keywords=[], + packages=['datafinder'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': [ + 'datafinder = datafinder.datafinder:main', + 'reconciler = datafinder.reconciler:main', + 'missingbdfs = datafinder.missingbdfs:main' + ] + }, +) diff --git a/apps/cli/utilities/datafinder/src/datafinder/__init__.py b/apps/cli/utilities/datafinder/src/datafinder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/datafinder/src/datafinder/_version.py b/apps/cli/utilities/datafinder/src/datafinder/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/datafinder/src/datafinder/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/datafinder/src/datafinder/datafinder.py b/apps/cli/utilities/datafinder/src/datafinder/datafinder.py new file mode 100644 index 0000000000000000000000000000000000000000..261ac31f279b29e40eb91d4ce8b22cbc2f27f9cc --- /dev/null +++ b/apps/cli/utilities/datafinder/src/datafinder/datafinder.py @@ -0,0 +1,396 @@ +""" +Finding missing data in the archive. + +This tool generates reports. At the moment it generates these reports: + + - Products whose NGAS IDs are not known by NGAS + +In the future it should also generate these reports: + + - Discrepancies between the legacy archive and the new archive + +In "summary mode", the tool produces a count of the problems. In "verbose mode", the tool produces a report of exactly +which products are missing. +""" +from abc import ABC, abstractmethod +import argparse +from functools import reduce +import math +from typing import List +import sys + +import pandas +from pandas import DataFrame +from sqlalchemy.orm.exc import NoResultFound +from tqdm import tqdm + +import schema +from .util import groups_of +from schema import ScienceProduct, AncillaryProduct, Project +from schema.ngasmodel import NGASFile + + +# These are the columns of the general DataFrame we use for looking up product files +COLUMNS = ['locator', 'ngas_id', 'ngas_cluster', 'ngas_location'] + +# This is the number of files to try at once +GROUP_SIZE = 10000 + + +class ReporterMode(ABC): + """ + Abstracts "details mode" and "summary mode". 
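+    Implementations are handed product_found_missing() calls for each chunk of results
+    and produce their output in final_report().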
+ """ + @abstractmethod + def product_found_missing(self, locator: str, found: List[str], missing: List[str]): + """ + Report on each locator's found or missing files + :param locator: the locator under which we have these files + :param found: the files that were found + :param missing: the files that were missing + :return: + """ + pass + + @abstractmethod + def final_report(self) -> bool: + """ + Produce the final report. + :return: true if all the files are present; false if some are missing. + """ + pass + + +class SummaryReporterMode(ReporterMode): + """ + Summary mode just produces a final report of how many products were found or missing. + """ + def __init__(self): + self.products, self.found, self.missing = 0, 0, 0 + + def product_found_missing(self, locator, found, missing): + self.products += 1 + self.found += len(found) + self.missing += len(missing) + + def final_report(self): + print(f'{self.products} products\n{self.found} files found\n{self.missing} files missing') + return self.missing == 0 + + +class DetailsReporterMode(ReporterMode): + """ + Details mode tells you something about each missing file. + If verbose is True, it also tells you something about every product that is not missing files. + """ + def __init__(self): + self.summary = SummaryReporterMode() + self.verbose = False + + def product_found_missing(self, locator, found, missing): + # defer to our delegate for adding things up + self.summary.product_found_missing(locator, found, missing) + + # now print some extra stuff + if len(missing) > 0: + print(f'{locator} is missing {len(missing)} files') + if self.verbose: + for file in found: + print(f' {file}') + elif self.verbose: + print(f'{locator} is intact') + + def final_report(self): + return self.summary.final_report() + + +class ProductDegrouper(ReporterMode): + """ + Since we chunk up the output by some number of results at a time, all of the files found and missing + per product are not going to be sent at the same time. This "mode" just wraps another mode and + groups together all the notices for each file by waiting for the locator to change. + """ + def __init__(self, underlying: ReporterMode): + self.underlying = underlying + self.last_locator = None + self.last_found = [] + self.last_missing = [] + + def product_found_missing(self, locator: str, found: List[str], missing: List[str]): + # new call for the previous locator? add things up + if self.last_locator is not None and locator == self.last_locator: + self.last_found.extend(found) + self.last_missing.extend(missing) + else: + # report the previous batch + self.underlying.product_found_missing(self.last_locator, self.last_found, self.last_missing) + + # reset things + self.last_locator = locator + self.last_found = found + self.last_missing = missing + + def final_report(self): + # send out the last batch + self.product_found_missing(None, [], []) + # now propagate the final report + return self.underlying.final_report() + + +def find_science_product_files(session): + """ + Find all science products. 
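+    Walks each science product's filegroup tree with a recursive CTE and returns a
+    DataFrame whose columns are COLUMNS (locator, ngas_id, ngas_cluster, ngas_location).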
+ """ + query = """ + with recursive + rgroups as ( + select sp.filegroup_id as top_fg, sp.filegroup_id as child_fg + from science_products sp + union all + select rgroups.top_fg as top_fg, fg.filegroup_id + from rgroups + join filegroups fg on rgroups.child_fg = fg.parent_filegroup_id) + select sp.science_product_locator as locator, f.ngas_id, f.ngas_cluster, f.ngas_location from science_products sp + join rgroups on sp.filegroup_id = rgroups.top_fg + join files f on f.filegroup = rgroups.child_fg""" + return pandas.read_sql(query, session.connection()) + + +def find_ancillary_product_files(session): + """ + Find all ancillary products. + """ + query = """ + with recursive + rgroups as ( + select ap.filegroup_id as top_fg, ap.filegroup_id as child_fg + from ancillary_products ap + union all + select rgroups.top_fg as top_fg, fg.filegroup_id + from rgroups + join filegroups fg on rgroups.child_fg = fg.parent_filegroup_id) + select ap.ancillary_product_locator as locator, f.ngas_id, f.ngas_cluster, f.ngas_location from ancillary_products ap + join rgroups on ap.filegroup_id = rgroups.top_fg + join files f on f.filegroup = rgroups.child_fg""" + return pandas.read_sql(query, session.connection()) + + +def find_all_products(session): + """ + Find all products, period + """ + return find_science_product_files(session).append(find_ancillary_product_files(session)) + + +def find_product_files(product_locator): + """ + Find all the files under this product locator + """ + def find_files(session): + try: + # technically this kind of comparison should only happen in the locator service, but here we are + if 'ancillary_product' in product_locator: + product = session.query(AncillaryProduct)\ + .filter(AncillaryProduct.ancillary_product_locator == product_locator).one() + return get_ancillary_product_files(product) + else: + product = session.query(ScienceProduct)\ + .filter(ScienceProduct.science_product_locator == product_locator).one() + return get_science_product_files(product) + except NoResultFound: + print(f'Unable to locate product {product_locator}', file=sys.stderr) + return find_nothing(session) + + return find_files + + +def get_science_product_files(product: ScienceProduct) -> DataFrame: + """ + Find all the files under this science product + """ + result = DataFrame(((product.locator, f.ngas_id, f.ngas_cluster, f.ngas_location) + for f in product.filegroup.all_files), + columns=COLUMNS) + for ap in product.ancillary_products: + result = result.append(get_ancillary_product_files(ap)) + return result + + +def get_ancillary_product_files(product: AncillaryProduct) -> DataFrame: + """ + Find all of the files under this ancillary product + """ + return DataFrame(((product.locator, f.ngas_id, f.ngas_cluster, f.ngas_location) + for f in product.filegroup.all_files), + columns=COLUMNS) + + +def find_project_files(project_code): + """ + Find all the files under this project + """ + def find_files(session): + try: + project = session.query(Project).filter(Project.project_code == project_code).one() + result = DataFrame([], columns=COLUMNS) + for sp in project.science_products: + result = result.append(get_science_product_files(sp)) + return result + except NoResultFound: + print(f'Unable to locate project {project_code}', file=sys.stderr) + return find_nothing(session) + return find_files + + +def combine_finders(finder1, finder2): + """ + Combine two finder functions together into a single one + """ + def find_products(session): + return finder1(session).append(finder2(session)) + return 
find_products + + +def find_nothing(session): + """ + Find nothing. This is needed as a base case for the recursion of joining together a list of finders. + """ + return DataFrame([], columns=COLUMNS) + + +class DataFinder: + def __init__(self): + self.reporter = SummaryReporterMode() + self.product_finder = find_all_products + self.quiet = False + self.verbose = False + + def main(self): + # parse the command line + self.parse_commandline() + + # run the process + return self.process() + + def parse_commandline(self): + parser = argparse.ArgumentParser(description='Find missing data in the archive') + parser.add_argument('--science', dest='product_finder', action='append', const=find_science_product_files, + help='check all science products', nargs='?', default=[]) + parser.add_argument('--ancillary', dest='product_finder', action='append', const=find_ancillary_product_files, + help='check all ancillary products', nargs='?', default=[]) + parser.add_argument('--all', dest='product_finder', action='append', const=find_all_products, + help='check all products', nargs='?', default=[]) + parser.add_argument('--project', dest='projects', action='append', default=[], + help='check all products associated with this project') + parser.add_argument('--product', dest='products', action='append', default=[], + help='check this product specifically') + parser.add_argument('-s', dest='reporter', action='store_const', const=SummaryReporterMode(), + default=SummaryReporterMode(), + help='summary mode (just numbers please)') + parser.add_argument('-d', dest='reporter', action='store_const', const=DetailsReporterMode(), + help='details mode (report for every product)') + parser.add_argument('-q', dest='quiet', action='store_true', default=False, help='suppress progress bar') + parser.add_argument('-v', dest='verbose', action='store_true', default=False, help='show intact files') + + args = parser.parse_args() + + # now let's get some things from the namespace + self.quiet = args.quiet + self.build_reporter(args) + self.build_file_finder(args) + + def build_file_finder(self, args): + # We have two levels of abstraction here. the lower level one is that we have a few functions that, + # when given a SQL Alchemy session, returns a DataFrame with COLUMNS for columns. These functions are + # find_science_product_files and find_ancillary_product_files, and find_all_product_files. + # + # These can be composed with each other by calling one, calling the second, and appending the output of the + # second to the first, which is exactly what combine_finders does. You can generalize this to a list of finders + # by using reduce, which calls combine_finders on each pair until there is just one. find_nothing acts as a + # default if none are supplied. + # + # The higher layer of abstraction is the find_product_files and find_project_files functions. These need to be + # composed with the functions discussed above, so they can only take the SQL Alchemy session as a parameter, + # but they also need to know the name of their product or project. The solution is to make these into functions + # that return functions of the above type, so find_product_files('foo') returns a function find_product(session) + # that takes a session and returns the files it finds for 'foo'. + # + # This method first converts the higher-level finders for specific projects and products into their reduced + # form, and then composes those with whatever low-level finders exist. 
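+        #
+        # A sketch of that composition (the locator and project code here are made up):
+        #
+        #     finder = reduce(combine_finders,
+        #                     [find_product_files('uid://evla/execblock/xyz'),
+        #                      find_project_files('20A-123')],
+        #                     find_nothing)
+        #     files = finder(session)   # DataFrame with COLUMNS for columns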
+ + # prepare the specific products finder + # so the idea here is first to create a finder that gets each specific products that were asked for + single_product_finders = reduce(combine_finders, [find_product_files(p) for p in args.products], find_nothing) + + # do the same thing, for projects + project_product_finders = reduce(combine_finders, [find_project_files(p) for p in args.projects], + find_nothing) + + # now we add that to the list of product finders from the arguments + # that list contains finders for finding all science, all ancillary, etc. + args.product_finder.insert(0, project_product_finders) + args.product_finder.insert(0, single_product_finders) + + # now we combine all of the set finders into a single function that gets everything for us + self.product_finder = reduce(combine_finders, args.product_finder, find_nothing) + + def build_reporter(self, args): + # the reporter is a bit gross, because we have the instance above, + # but we need to propagate the verbose setting into it + self.reporter = args.reporter + self.reporter.verbose = args.verbose # yes, this is gross + # and now we have to degroupify the file results, which we can do with a small wrapper here + self.reporter = ProductDegrouper(self.reporter) + + def process(self): + # step 1: open a database and get all the products + session = schema.create_session('SDM') + + # step 2: get the products + files = self.product_finder(session) + + # step 3: group things up so that we don't overly stress postgres remotely or memory locally + ngas = schema.create_session('NGAS') + grouping = self.create_groupings(files) + + # repeat by each group: + for chunk in grouping: + # get the NGAS IDs for this group + ngas_ids = pandas.read_sql(ngas.query(NGASFile.file_id) + .filter(NGASFile.file_id.in_(chunk.ngas_id.values)).statement, + ngas.connection()) + + # they are here, we just found them + ngas_ids['present'] = True + + # join up using Pandas + joined = ngas_ids.set_index('file_id').join(chunk.set_index('ngas_id'), how='right') + + # look through the joined up stuff, grouping it by locator per the mode API + for spl, group in joined.groupby('locator'): + self.reporter.product_found_missing(spl, + list(group[group.present.notna()].index), # found + list(group[group.present.isna()].index)) # missing + return self.reporter.final_report() + + def create_groupings(self, files): + """ + Possibly wrap the argument iterator with a tqdm progress bar, if warranted + :param files: + :return: + """ + # this is the main trick, making the grouping and knowing how many chunks there are + grouping = groups_of(files, GROUP_SIZE) + group_count = math.ceil(len(files) / GROUP_SIZE) + + # make the wrapping, if we're not in quiet mode and going to have more than ten chunks + # (otherwise, the progress bar hardly shows) + if not self.quiet and group_count >= 10: + grouping = tqdm(grouping, total=group_count) + + return grouping + + +def main(): + x = DataFinder() + sys.exit(0 if x.main() else 1) diff --git a/apps/cli/utilities/datafinder/src/datafinder/missingbdfs.py b/apps/cli/utilities/datafinder/src/datafinder/missingbdfs.py new file mode 100644 index 0000000000000000000000000000000000000000..523ab9a2dee3b4351bb323b72b1a591896e71f57 --- /dev/null +++ b/apps/cli/utilities/datafinder/src/datafinder/missingbdfs.py @@ -0,0 +1,139 @@ +""" +Figure out which BDF files mentioned in Main.xml is missing from NGAS. 
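+
+Given a Main.xml (read from a file or fetched by science product locator), the tool
+checks each referenced BDF against NGAS, the legacy archive, and the new archive
+database, and reports which ones are missing from each.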
+""" +import io +import sys +import xml.etree.ElementTree as ET + +import pandas +import requests +from pandas import DataFrame + +import schema +from .util import groups_of +from schema import File +from schema.legacy_model import t_ngas_file_sets +from schema.ngasmodel import NGASFile +from sqlalchemy.sql import * +import argparse + +def main(): + bdfs = get_bdfs() + + # 2. Join that to the NGAS BDF files + missing_from_ngas(bdfs) + + # 3. Find the files that are in NGAS, but not in the legacy database + missing_from_legacy(bdfs) + + # 4. Find the files that are in NGAS, but not in the new archive database + missing_from_newarchive(bdfs) + + +def get_bdfs(): + parser = argparse.ArgumentParser(description='Check for BDF ingestion') + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('-f', metavar='FILE', dest='file', type=argparse.FileType('r'), + help='parses the given Main.xml file') + group.add_argument('-p', metavar='PRODUCT', dest='product', type=str, + help='fetch the main table for the execblock with this product locator') + args = parser.parse_args() + + # 1. Enumerate the BDF files in the Main.xml here + if args.file: + bdfs = get_bdf_list_from_file(args.file) + else: + bdfs = get_bdf_list_from_locator(args.product) + + return bdfs + + +def get_bdf_list_from_locator(locator): + # this query gets us the Main.xml's ngas_id + session = schema.create_session('SDM') + query = """ + select f.ngas_id from science_products + join filegroups f1 on science_products.filegroup_id = f1.filegroup_id + join filegroups f2 on f1.filegroup_id = f2.parent_filegroup_id + join files f on f2.filegroup_id = f.filegroup + where + science_product_locator = %s and + filename = 'Main.xml' + """ + ngas_id, = next(session.connection().execute(query, (locator,))) + + # now let's figure out where this is in NGAS + query = """ + select nf.file_id, + nf.file_version, nh.host_id || '.' 
|| nh.domain || ':' || nh.srv_port as host + from ngas_files nf + join ngas_disks nd on nf.disk_id = nd.disk_id + join ngas_hosts nh on nd.host_id = nh.host_id + join (select max(file_version) as latest_version from ngas_files + where ngas_files.file_id = %s) lv + on lv.latest_version = nf.file_version + where nf.file_id = %s""" + ngas = schema.create_session('NGAS') + file_id, file_version, host = next(ngas.connection().execute(query, (ngas_id, ngas_id))) + + # now let's run the request in stream mode and pass it to the other method + r = requests.get('http://' + host + '/RETRIEVE', params={'file_id': file_id, 'file_version': file_version}) + bytes_expected = int(r.headers.get('Content-length')) + if len(r.content) != bytes_expected: + print(f'Error streaming file from NGAS: expected {bytes_expected} but obtained {len(r.content)}' + f' (server probably needs to be rebooted)') + sys.exit(1) + else: + return get_bdf_list_from_file(io.BytesIO(r.content)) + +def get_bdf_list_from_file(file): + main = ET.parse(file) + bdfs = DataFrame((i.attrib['entityId'] for i in main.getroot().iter('EntityRef')), + columns=['bdf_id']) + bdfs['mangled_id'] = bdfs['bdf_id'].apply(lambda x: x.replace(':', '_').replace('/', '_') + '.bdf') + bdfs.set_index('bdf_id') + return bdfs + + +def missing_from_legacy(bdfs): + legacy = schema.create_session('LEGACY') + found = DataFrame([], columns=['file_id']) + for bdf_chunk in groups_of(bdfs, 1000): + query = select([t_ngas_file_sets.c.file_id]) \ + .where(t_ngas_file_sets.c.file_id.in_(bdf_chunk['mangled_id'])) + found = found.append(pandas.read_sql(query, legacy.connection())) + found['found'] = True + # 4.1 join up + missing = bdfs.set_index('mangled_id').join(found.set_index('file_id')) + missing = missing[missing.found.isna()] + if missing.empty: + print('Legacy archive is not missing any BDFs') + for row in missing.itertuples(): + print(f'Legacy archive is missing {row.bdf_id}') + + +def missing_from_newarchive(bdfs): + archive = schema.create_session('SDM') + query = select([File.ngas_id, literal_column('true').label('found')])\ + .where(File.ngas_id.in_(bdfs['mangled_id'])) + found = pandas.read_sql(query, archive.connection()) + result = bdfs.set_index('mangled_id').join(found.set_index('ngas_id')) + notfound = result[result.found.isna()] + if notfound.empty: + print('New archive is not missing any BDFs') + for row in notfound.itertuples(): + print(f"New archive is missing {row.bdf_id}") + + +def missing_from_ngas(bdfs): + ngas = schema.create_session('NGAS', profile='nmprod') + query = select([NGASFile.file_id, literal_column('true').label('found')]) \ + .where(NGASFile.file_id.in_(bdfs['mangled_id'])) + found = pandas.read_sql(query, ngas.connection()) + # 3. 
Find the files that are on disk but not in NGAS + result = bdfs.set_index('mangled_id').join(found.set_index('file_id')) + notfound = result[result.found.isna()] + if notfound.empty: + print('NGAS is not missing any BDFs') + for row in notfound.itertuples(): + print(f"BDF {row.bdf_id} is not in NGAS") diff --git a/apps/cli/utilities/datafinder/src/datafinder/reconciler.py b/apps/cli/utilities/datafinder/src/datafinder/reconciler.py new file mode 100644 index 0000000000000000000000000000000000000000..9e5a8aac9958157923d58e065c8e478bbf77aa80 --- /dev/null +++ b/apps/cli/utilities/datafinder/src/datafinder/reconciler.py @@ -0,0 +1,63 @@ +import pandas +from pandas import DataFrame +from tqdm.auto import tqdm +import schema +from .util import groups_of +from schema.model import Project, ScienceProduct, AncillaryProduct, File +from sqlalchemy.sql import * +from schema.legacy_model import * +from schema.ngasmodel import NGASFile + + +def main(): + session = pyat.schema.create_session('SDM', profile='local') + ngas = pyat.schema.create_session('NGAS', profile='local') + legacy = pyat.schema.create_session('LEGACY', profile='local') + + print('loading SDMs from NGAS') + sdm_query = select([NGASFile.file_id.label('ngas_id')])\ + .where(NGASFile.format == 'application/x-sdm')\ + .order_by(NGASFile.file_id) + ngas_sdms = pandas.read_sql(sdm_query, ngas.connection()) + + print('loading SDMs from new archive') + ngas_query = select([File.ngas_id]).where(File.format == 'sdm').order_by(File.ngas_id) + archive_sdms = pandas.read_sql(ngas_query, session.connection()) + + # mark them as seen + archive_sdms['present'] = True + + # left join to find the missing files + result = ngas_sdms.set_index('ngas_id').join(archive_sdms.set_index('ngas_id')) + missing = result[result.present.isna()] + + # with each of the missing files, we need to find their metadata in the legacy archive + # unfortunately, Oracle will only let us have up to 1000 items in an IN() clause, so + # we must do this in batches + count = 1000 + matching = DataFrame([], columns=['file_id', 'project_code', 'sb_id', 'eb_id']) + print('matching missing SDMs with legacy archive metadata') + + # regrettably, this is about the fastest thing the script does, + # and it's the only part we can get a progress bar on + for group in tqdm(groups_of(missing, count), total=len(missing)/count): + matching = matching.append(pandas.read_sql( + # I do not find this to be especially more readable than SQL, but there is + select([t_ngas_file_sets.c.file_id.distinct(), + t_ngas_file_sets.c.file_set_id, + t_archive.c.project_code, + t_archive.c.sb_id, + t_archive.c.eb_id, + # this case expression gets us the filename that John would write the file to + case([(t_ngas_file_sets.c.file_id.like('%.sdm'), + func.replace(t_ngas_file_sets.c.entity_type_name, 'Table', '') + '.xml'), + (t_ngas_file_sets.c.file_id.like('%.bin'), + func.replace(t_ngas_file_sets.c.entity_type_name, 'Table', '') + '.bin'), + (t_ngas_file_sets.c.file_id.like('%.bdf'), + func.replace(t_ngas_file_sets.c.file_id, '.bdf', ''))]).label('filename')]) + .where(and_(t_archive.c.arch_file == t_ngas_file_sets.c.file_set_id, + t_ngas_file_sets.c.file_id.in_(group.index))) + .order_by(t_ngas_file_sets.c.file_id), + legacy.connection())) + + matching.to_csv('output3.csv') diff --git a/apps/cli/utilities/datafinder/src/datafinder/util.py b/apps/cli/utilities/datafinder/src/datafinder/util.py new file mode 100644 index 0000000000000000000000000000000000000000..751ecb51bee23cb5b12116479646660a5bdac425 --- 
/dev/null +++ b/apps/cli/utilities/datafinder/src/datafinder/util.py @@ -0,0 +1,8 @@ +def groups_of(df, n): + """Return sub-dataframes for every N rows in the data frame.""" + + # we're just going to repeat, peeling off the top N items, as a generator + next_chunk, remainder = df.head(n), df.tail(-n) + while not next_chunk.empty: + yield next_chunk + next_chunk, remainder = remainder.head(n), remainder.tail(-n) diff --git a/apps/cli/utilities/dumplogs/README.md b/apps/cli/utilities/dumplogs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/dumplogs/setup.py b/apps/cli/utilities/dumplogs/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..74663608b7e7404a62484f422fb2f21ea0bdeee1 --- /dev/null +++ b/apps/cli/utilities/dumplogs/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/dumplogs/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive Dumplogs', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['blessings', 'pycapo', 'pytz', 'pika'], + keywords=[], + packages=['dumplogs'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': ['dumplogs = dumplogs.commands:dumplogs'] + }, +) diff --git a/apps/cli/utilities/dumplogs/src/dumplogs/__init__.py b/apps/cli/utilities/dumplogs/src/dumplogs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/dumplogs/src/dumplogs/_version.py b/apps/cli/utilities/dumplogs/src/dumplogs/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/dumplogs/src/dumplogs/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/dumplogs/src/dumplogs/commands.py b/apps/cli/utilities/dumplogs/src/dumplogs/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..72476086073f52b3e2b401702787bd7ecce35928 --- /dev/null +++ b/apps/cli/utilities/dumplogs/src/dumplogs/commands.py @@ -0,0 +1,177 @@ +from __future__ import print_function + +import argparse as ap +import json +import re +import sys + +import dateutil.parser +import pika +from blessings import Terminal +from pycapo import CapoConfig +from pytz import timezone + +from _version import ___version___ as version + +# Description of this widget. +_DESCRIPTION = """Command line tool for dumping out AMQP logs for the AAT/PPI, version {}.""" + +# Printed at the end of the help for the tool, describes return values. +_EPILOG = """Return values: +1: can't deduce CAPO_PROFILE value +2: missing CAPO property +""" + +# Error messages. +_MISSING_PROFILE = """ERROR: dumplogs can't deduce the 'profile', give it the -P argument or set the CAPO_PROFILE environment variable! 
Geeze!\n\n""" +_MISSING_PROPERTY = "ERROR: missing required CAPO property {}\n\n" + + +t = Terminal() + + +def get_parser(): + """Build and return an argument parser for this tool.""" + p = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter, + epilog=_EPILOG) + p.add_argument('--outfile', '-O', action='store', + help='write output to file, - for STDOUT') + p.add_argument('-P', '--profile', action='store', + help='profile name to use, e.g. test, production') + p.add_argument('-p', '--show-properties', action='store_true', default=False, + help='Show MDC properties') + p.add_argument('-R', '--raw', action='store_true', help='Raw mode. No pretty formatting!') + p.add_argument('--exclude', action='store', help='Exclude logs matching this regular expression, e.g. ".*DataFetcher.*"') + + return p + + +def die_message(message, parser, return_value): + """Given an error message, argument parser and return_value, print the + message, dump the parser's help and return with the return value.""" + sys.stderr.write(message) + parser.print_help() + sys.exit(return_value) + + +def get_settings(): + """Get all the settings this widget needs to run, some command line args, + some are CAPO properties: returns a tuple (hostname, username, password, + outfile).""" + parser = get_parser() + args = parser.parse_args() + + try: + config = CapoConfig(profile=args.profile).settings('edu.nrao.archive.configuration.AmqpServer') + + args.hostname = config.hostname + args.username = config.username + args.password = config.password + except ValueError: + die_message(_MISSING_PROFILE, parser, 1) + except KeyError as e: + die_message(_MISSING_PROPERTY.format(e.args[0]), parser, 2) + + return args + +class LogDumper: + def __init__(self, settings): + self.settings = settings + self.credentials = pika.PlainCredentials(settings.username, settings.password) + self.connection = pika.BlockingConnection( + parameters=pika.ConnectionParameters(host=settings.hostname, + credentials=self.credentials)) + self.channel = self.connection.channel() + self.queue = self.channel.queue_declare('', exclusive=True, auto_delete=True, + durable=False).method.queue + self.channel.queue_bind(self.queue, 'archive.logs') + callback = self.raw_callback if self.settings.raw else self.callback + self.channel.basic_consume(self.queue, on_message_callback=callback, auto_ack=True) + if self.settings.outfile and self.settings.outfile != '-': + self.fh = open(self.settings.outfile, 'a') + else: + self.fh = sys.stdout + + + def raw_callback(self, ch, method, properties, body): + parsed = json.loads(body.decode('UTF-8')) + self.fh.write('{}: {}\n'.format(parsed['class'], parsed['formattedMessage'])) + + + def callback(self, ch, method, properties, body): + + try: + parsed = json.loads(body.decode('UTF-8')) + parsed['hostname'] = parsed['properties']['HOSTNAME'] \ + if 'HOSTNAME' in parsed['properties'] else '' + parsed['idValue'] = parsed['properties']['track_id'] \ + if 'track_id' in parsed['properties'] else '' + + if 'formattedMessage' in parsed: + parsed['date'] = dateutil.parser.parse(parsed['timestamp']) + + self.write(parsed) + + if 'stackTrace' in parsed: + # self.fh.write("\n\n ***** BEGIN STACK TRACE ******\n") + for stack in parsed['stackTrace']: + for line in stack: + self.fh.write('{}\n'.format(line)) + # self.fh.write("\n\n ***** END STACK TRACE ******\n\n") + else: + self.fh.write('no formatted message: {}\n'.format(body)) + + except Exception as e: + self.fh.write('{}\n'.format(e)) + 
self.fh.write('unparseable: {}\n'.format(body)) + + + def dump(self): + self.channel.start_consuming() + + + def write(self, parsed): + # self.fh.write('{} {} {} {}'.format(LEVEL['DEBUG'], LEVEL['INFO'], LEVEL['WARN'], LEVEL['ERROR'])) + parsed['hostname'] = parsed['properties']['user'] + '@' + parsed['hostname'] if 'user' in parsed['properties'] else parsed['hostname'] + parsed['hostname'] = t.blue(parsed['hostname']) + parsed['loggerName'] = t.cyan('{0}#{1}:{2}' + .format(declasse(parsed['loggerName']), + parsed['method'], parsed['line'])) + parsed['level'] = LEVEL[parsed['level']] + parsed['date'] = t.magenta(parsed['date'].astimezone(timezone('MST7MDT')).strftime('%m/%d %X.%f')[:-3]) + parsed['idValue'] = t.magenta(parsed['idValue']) + parsed['properties'] = t.yellow_on_blue(str(parsed['properties'])) if self.settings.show_properties else '' + + log = "{hostname: <40} - {date} {level: <21} {loggerName} - {idValue} - {formattedMessage} {properties}\n".format(**parsed) + try: + if not re.match(self.settings.exclude, log): + self.fh.write(log) + else: + log = "{hostname: <40} - {date} >>> EXCLUDED: {level: <21} {loggerName} - {formattedMessage} {properties}\n".format(**parsed) + self.fh.write(log) + except: + self.fh.write(log) + + +LEVEL = { + 'DEBUG': t.bright_white_on_yellow('DEBUG'), + 'INFO' : t.bright_white_on_blue ('INFO'), + 'WARN' : t.yellow_on_black ('WARN'), + 'ERROR': t.bright_white_on_red ('ERROR') +} + + +def declasse(logger): + depath = re.sub('([a-z])[^/]+(?=/)', '\\1', logger) + return re.sub('([a-z])[a-z]+(?=\.)', '\\1', depath) + + +def dumplogs(): + args = get_settings() + dumper = LogDumper(args) + dumper.dump() + + +if __name__ == '__main__': + dumplogs() diff --git a/apps/cli/utilities/faultchecker/README.md b/apps/cli/utilities/faultchecker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/faultchecker/setup.py b/apps/cli/utilities/faultchecker/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ff21842083992bc9151677adb8d3b0c1285243 --- /dev/null +++ b/apps/cli/utilities/faultchecker/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/faultchecker/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive Fault Checker', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['blessings', 'pendulum', 'pika', 'psycopg2', 'pycapo', 'requests'], + keywords=[], + packages=['faultchecker'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': ['faultchecker = faultchecker.commands:check_faults'] + }, +) diff --git a/apps/cli/utilities/faultchecker/src/faultchecker/__init__.py b/apps/cli/utilities/faultchecker/src/faultchecker/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ed09ba1c57251751702901465f47f4cdd4869021 --- /dev/null +++ b/apps/cli/utilities/faultchecker/src/faultchecker/__init__.py @@ -0,0 +1 @@ +from _version import ___version___ as version \ No newline at end of file diff --git a/apps/cli/utilities/faultchecker/src/faultchecker/_version.py b/apps/cli/utilities/faultchecker/src/faultchecker/_version.py new file mode 100644 
index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/faultchecker/src/faultchecker/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/faultchecker/src/faultchecker/commands.py b/apps/cli/utilities/faultchecker/src/faultchecker/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..9622865f0a3f19d53e4baa455ab8c3972fe74fe1 --- /dev/null +++ b/apps/cli/utilities/faultchecker/src/faultchecker/commands.py @@ -0,0 +1,196 @@ +r""" +This module is an interface for the NRAO faultchecker. + +This file is meant to be run as a script from the command line. +""" +from __future__ import print_function + +import argparse as ap +import logging +import os + +import sys + +from . import version +from .faultreporter import ConsoleReporter, BulkReporter +from .faultchecker import ServicesChecker, QueuesChecker, LastProcessedChecker, \ + DirChecker + +logger = logging.getLogger(__name__) + +# Description of this widget. +_DESCRIPTION = """Command line tool for checking failed workflow services on the AAT/PPI, version {}.""" + +# Printed at the end of the help for the tool, describes return values. +_EPILOG = """Return values: +1: can't deduce CAPO_PROFILE value +2: missing CAPO property +3: Application Exception +""" + +# Error messages. +_MISSING_PROFILE = """ERROR: faultchecker can't deduce the 'profile', +give it the -P argument or set the CAPO_PROFILE environment variable.\n\n""" +_MISSING_PROPERTY = "ERROR: missing required CAPO property {}\n\n" + + +def get_parser(): + r""" Build and return an argument parser for this tool. + + :return: Return an argument parser specific to faultchecker. + """ + p = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter, + epilog=_EPILOG) + p.add_argument('-P', '--profile', action='store', + help='profile name to use, e.g. test, production') + return p + + +def die_message(message, parser, return_value): + r""" Given an error message, argument parser and return_value, print the + message, dump the parser's help and return with the return value. + + :param message: The error message to present to the user on application failure. + :param parser: The parser from which to pull the help/usage from. + :param return_value: The return value we're about to send to the OS. + :return: None... we're dying. + """ + """""" + sys.stderr.write(message) + parser.print_help() + sys.exit(return_value) + + +def get_settings(): + r""" Get the settings necessary to run faultchecker. 
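+    At present this is just the parsed command-line arguments; the individual checkers
+    read their own CAPO properties from the selected profile.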
+ + :return: The faultchecker configuration settings + """ + parser = get_parser() + args = parser.parse_args() + return args + + +def check_services(agg, profile): + try: + sc = ServicesChecker(profile) + except Exception: + raise Exception('Unable to construct ServicesChecker({0})'.format(profile)) + + if sc.check_servlet("amygdala"): + agg.ok('{} servlet is running'.format("amygdala")) + else: + agg.error('{} servlet is NOT RUNNING!'.format("amygdala")) + + if sc.check_servlet("workflow"): + agg.ok('{} servlet is running'.format("workflow")) + else: + agg.error('{} servlet is NOT RUNNING!'.format("workflow")) + + if sc.check_amqp_online(): + agg.ok('AMQP is online') + else: + agg.error('AMQP is NOT ONLINE!') + + +def check_queues(agg, profile): + try: + qc = QueuesChecker(profile) + except Exception: + raise Exception('Unable to construct QueuesChecker({0})'.format(profile)) + + q_count = qc.get_queue_msg_count(qc.workflow.amqpPersistentQueueName) + if q_count == 0: + agg.ok('{} is empty'.format(qc.workflow.amqpPersistentQueueName)) + else: + agg.error('{} is NOT EMPTY ({} messages in queue)'.format(qc.workflow.amqpPersistentQueueName, q_count)) + + +def check_last_xxx(agg, profile): + try: + last_x = LastProcessedChecker(profile) + except Exception: + raise Exception('Unable to construct LastProcessedChecker({0})'.format(profile)) + + last_bdf_ingest = last_x.get_last_ingested('bdf') + if last_bdf_ingest.diff().in_hours() > 3: + agg.error("last {} ingested {}".format('bdf'.upper(), last_bdf_ingest.diff_for_humans())) + else: + agg.ok("last {} ingested {}".format('bdf'.upper(), last_bdf_ingest.diff_for_humans())) + + last_sdm_ingest = last_x.get_last_ingested('sdm') + if last_sdm_ingest.diff().in_hours() > 3: + agg.error("last {} ingested {}".format('sdm'.upper(), last_sdm_ingest.diff_for_humans())) + else: + agg.ok("last {} ingested {}".format('sdm'.upper(), last_sdm_ingest.diff_for_humans())) + + last_eb = last_x.get_last_eb() + if last_eb.diff().in_days() > 3: + agg.error("last execution block ended {}".format(last_eb.diff_for_humans())) + else: + agg.ok("last execution block ended {}".format(last_eb.diff_for_humans())) + + count, last_alma_reingestion = last_x.get_last_alma_reingestion() + if count > 0 and last_alma_reingestion.diff().in_minutes() > 5: + agg.error("last ALMA SDM reingested {}".format(last_alma_reingestion.diff_for_humans())) + else: + agg.ok("last ALMA SDM reingested {}".format(last_alma_reingestion.diff_for_humans())) + + +def check_directories(agg, profile): + try: + dc = DirChecker(profile) + except Exception: + raise Exception('Unable to construct DirChecker({0})'.format(profile)) + + if not os.path.exists(dc.sdmpath): + agg.warn('Skipping SDM check: {} does not exist'.format(dc.sdmpath)) + else: + missing_sdms = dc.get_missing_sdms() + if len(missing_sdms) > 0: + agg.error("Missing SDMs for {0}: ".format(dc.sdmpath) + + ', '.join(list(missing_sdms))) + else: + agg.ok("No missing recent SDMs") + + if not os.path.exists(dc.mchammer): + agg.warn('Skipping SDM check: {} does not exist'.format(dc.mchammer)) + else: + missing_sdms = dc.get_mchammer_workspace() + if len(missing_sdms) > 0: + agg.error("Missing SDMs for {0}: ".format(dc.mchammer) + + ', '.join(list(missing_sdms))) + else: + agg.ok("No missing recent SDMs") + + +def check_faults(): + r""" The functional called when this file is run as a script. 
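+    Builds a BulkReporter with a ConsoleReporter (Slack reporting is currently commented
+    out) and runs the service, queue, last-processed, and directory checks, exiting with
+    return value 3 if any of them raises.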
+ + :return: + """ + args = get_settings() + + # print('checking faults on {0}:'.format(args.profile)) + + aggregator = BulkReporter() + # slack = SlackReporter() + aggregator.add_aggregator(ConsoleReporter()) + # aggregator.add_aggregator(slack) + + try: + check_services(aggregator, args.profile) + check_queues(aggregator, args.profile) + check_last_xxx(aggregator, args.profile) + check_directories(aggregator, args.profile) + + # slack.publish() + + except Exception as cmd_ex: + die_message('Exception caught running faultchecker: {0} - {1}\n' + .format(type(cmd_ex).__name__, cmd_ex), get_parser(), 3) + + +if __name__ == '__main__': + check_faults() diff --git a/apps/cli/utilities/faultchecker/src/faultchecker/faultchecker.py b/apps/cli/utilities/faultchecker/src/faultchecker/faultchecker.py new file mode 100644 index 0000000000000000000000000000000000000000..94ffde2bd685c8a75f5940b8eb22697296413185 --- /dev/null +++ b/apps/cli/utilities/faultchecker/src/faultchecker/faultchecker.py @@ -0,0 +1,405 @@ +""" +Trace through common causes of failure in the new archive system. +Prints diagnostics that can be used to help address problems. + +Author: Daniel K Lyons <dlyons@nrao.edu> +Since: 2017-09-29 + +Some other stuff to consider checking: + +BDF/SDM completion unnoticed + +Multicast messaging trouble +REALfast does not send multicast messages! +BDF/SDM completion signal not received by Amygdala +Multicast disconnected + +Amygdala not sending workflow start message + +Workflow not responding to workflow start message + +Workflow is out of worker threads +Ingest job not sent to cluster +Workflow bug + +Cluster not launching ingestion job + +Cluster is under too much load, no free processing resources +Cluster offline + +Ingestion job hung +Ingestion job failed + +Required files not present +NGAS not responding +NGAS full +NGAS server offline or unreachable +""" +import json + +from collections import defaultdict +import os +import pendulum +import subprocess +import time +import pika +import requests +from pycapo import CapoConfig +import psycopg2 as pg +import logging + +logger = logging.getLogger(__name__) + +SECONDS_IN_ONE_DAY = 86400 + +class FaultChecker: + def __init__(self, profile): + self.capo = CapoConfig(profile=profile) + + def __repr__(self): + repr_str_list = ['{c}('] + temp = ['{0}={{s.{0}}}'.format(key) for key in vars(self)] + self_list = ', '.join(temp) + repr_str_list.append(self_list) + repr_str_list.append(')') + + return ''.join(repr_str_list).format(c=self.__class__.__name__, s=self) + + +class ServicesChecker(FaultChecker): + def __init__(self, profile): + super().__init__(profile) + self.tomcat = self.capo.settings('tomcat.deployment') + self.amqp = self.capo.settings('edu.nrao.archive.configuration.AmqpServer') + + self.running_servlets = self.update_running_servlets() + + def update_running_servlets(self): + r = requests.get(self.tomcat.uri + '/list', + auth=(self.tomcat.username, + self.tomcat.password)) + return set(x.split(':')[3] + for x + in r.text.split('\n')[1:-1] + if x.split(':')[1] == 'running') + + def check_servlet(self, servlet): + r""" Check a given servlet to see if it's running. + + :param servlet: The name of the servlet to check on tomcat. + :return: True if the servlet is running. + """ + return servlet in self.running_servlets + + def check_amqp_online(self): + r""" Check to see if we can connect to AMQP and send a message to ourselves. + + :return: True if a loopback message was send and received on amqp. 
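+            (The check declares a temporary 'faulttrace' fanout exchange, publishes
+            b'hello, world' to it, and reads the message back before tearing it all down.)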
+ """ + connection = pika.BlockingConnection( + pika.ConnectionParameters(self.amqp.hostname, + credentials=pika.PlainCredentials( + self.amqp.username, + self.amqp.password))) + channel = connection.channel() + channel.exchange_declare('faulttrace', exchange_type='fanout', + durable=False, auto_delete=True) + queue = channel.queue_declare('', durable=False, auto_delete=True, exclusive=True).method.queue + channel.queue_bind(queue, 'faulttrace') + channel.basic_publish('faulttrace', '', b'hello, world') + _, _, message = channel.basic_get(queue) + channel.queue_delete(queue) + channel.exchange_delete('faulttrace') + channel.close() + connection.close() + return message == b'hello, world' + + +class QueuesChecker(FaultChecker): + def __init__(self, profile): + super().__init__(profile) + self.amqp = self.capo.settings('edu.nrao.archive.configuration.AmqpServer') + self.workflow = self.capo.settings('edu.nrao.archive.workflow.config.WorkflowManagerSettings') + + def get_queue_msg_count(self, queue_name): + r""" Get the number of messages in the given queue + + :param queue_name: The amqp queue to check. + :return: The number of messages + """ + r = requests.get('http://{}:15672/api/queues/%2F/{}'.format(self.amqp.hostname, queue_name), + auth=(self.amqp.username, self.amqp.password)) + result = json.loads(r.text) + return result['messages'] + + +class LastProcessedChecker(FaultChecker): + def __init__(self, profile): + super().__init__(profile) + # let's connect to the database out here + self.databaseCreds = self.capo.settings('metadataDatabase') + self.host, self.dbname = self.databaseCreds.jdbcUrl.split(':')[2][2:].split('/') + self.conn = pg.connect(dbname=self.dbname, host=self.host, + user=self.databaseCreds.jdbcUsername, + password=self.databaseCreds.jdbcPassword) + + # TODO: Will we need to manage a cursor if we put it in the object? + self.cursor = self.conn.cursor() + + def get_last_ingested(self, product_type): + r""" Check when we last ingested a <product_type> file + + :param product_type: The type of file we want to check the date on. + :return: A pendulum instance of the ingest time + """ + sql = "select max(ingestion_time) from files where format = %s" + self.cursor.execute(sql, (product_type,)) + ingest_time, = self.cursor.fetchone() + return pendulum.instance(ingest_time) + + # Check out the newest execution block we have + + def get_last_eb(self): + r""" Check when the last eb ran. + + :return: A pendulum instance of the time the last eb ran. + """ + sql = "select mjd_to_timestamp(max(endtime)) from execution_blocks" + self.cursor.execute(sql) + eb_time, = self.cursor.fetchone() + return pendulum.instance(eb_time) + + def get_last_alma_reingestion(self): + r""" Check when the last ALMA reingestion happened. 
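+        The timestamp comes from the alma_reingestion_queue table, considering only
+        COMPLETE/FAILED rows and defaulting to 2018-01-01 when there are none.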
+ + :return: A tuple of (remaining, last_ingested) where remaining + is the number of ALMA SDMs remaining to be processed + """ + sql = """select coalesce(max(last_updated), '2018-01-01') + from alma_reingestion_queue where state IN ('COMPLETE', 'FAILED')""" + self.cursor.execute(sql) + last_updated, = self.cursor.fetchone() + + sql = "select count(*) from alma_reingestion_queue where state IN ('QUEUED', 'WAITING')" + self.cursor.execute(sql) + remaining, = self.cursor.fetchone() + + return remaining, pendulum.instance(last_updated) + + +class DirChecker(FaultChecker): + def __init__(self, profile): + super().__init__(profile) + self.sdmpath = self.capo['archive-ingestion.SDMPath'] + self.mchammer = '/home/mchammer/evla/mcaf/workspace/' + # let's connect to the database out here + self.databaseCreds = self.capo.settings('metadataDatabase') + self.host, self.dbname = self.databaseCreds.jdbcUrl.split(':')[2][2:].split('/') + self.conn = pg.connect(dbname=self.dbname, host=self.host, + user=self.databaseCreds.jdbcUsername, + password=self.databaseCreds.jdbcPassword) + + # TODO: Will we need to manage a cursor if we put it in the object? + self.cursor = self.conn.cursor() + + def get_missing_sdms(self): + r""" Return a list of missing sdms. + + :return: A list of missing sdms, or an empty list. + """ + now = time.time() + + sdms = [f.name for f in os.scandir(self.sdmpath) + if (now - os.stat(os.path.join(self.sdmpath, f.name)).st_mtime < 7 * SECONDS_IN_ONE_DAY) and (f.is_dir())] + self.cursor.execute('SELECT ngas_fileset_id FROM execution_blocks WHERE ngas_fileset_id IN %s', [tuple(sdms)]) + missing_sdms = set(sdms) - set(r[0] for r in self.cursor.fetchall()) + + return [f for f in missing_sdms] + + def get_mchammer_workspace(self): + r""" A test stub for returning the contents of a directory + + :return: Return the contents of self.ls_test_dir + """ + # dir_contents = [f for f in os.listdir(self.mchammer)] + # return dir_contents + + now = time.time() + sdms = [f for f in os.listdir(self.mchammer) + if now - os.stat(os.path.join(self.mchammer, f)).st_mtime < 7 * SECONDS_IN_ONE_DAY] + self.cursor.execute('SELECT ngas_fileset_id FROM execution_blocks WHERE ngas_fileset_id IN %s', [tuple(sdms)]) + missing_sdms = set(sdms) - set(r[0] for r in self.cursor.fetchall()) + return [f for f in missing_sdms] + + +class ClusterChecker(FaultChecker): + def __init__(self, profile): + super().__init__(profile) + # Username and password for accessing lustre & cluster information (use the deployment info) + self.wf_deploy = self.capo.settings('workflow.deployment') + self.uname = self.wf_deploy.username + self.rhost = self.wf_deploy.hostname + + self.usage_connector = self.uname+'@'+self.rhost + + def check_cluster_usage(self): + """A utility wrapping the qstat command which will take the list + of what is running on the cluster in the 'batch' queue and + provide a relevant subset of the data. + + Potential extensions: Some larger scale statistics (after we agree + on a set of keys + % of jobs that are vlapipe vs others + % of queued jobs (>9% means potential over subscription) + + multiple queues (test, vlass) + highlight distinctions between VLASS, Old Calibration, and + AAT/PPI jobs by name. + + :returns A dictionary of jobID:dictionary_of_values + where dictionary_of_values has the keys: + name, status, node, timeUsed + + or None if an error is encountered. 
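+
+        For example (the job id and values below are hypothetical):
+            {'1234567': {'name': 'vlass_ql', 'status': 'R',
+                         'nodes': 'nmpost042/0', 'timeUsed': '03:12:45'}}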
+ """ + # First, grab the data we need and do basic transformations: + try: + capture = subprocess.check_output(['ssh', + self.usage_connector, + '/opt/services/torque/bin/qstat', + '-1n', + 'batch']) + except subprocess.CalledProcessError: + # print("Cannot execute qstat command: " + ex.stderr) + # something went off the rails. + return None + # all is well + text = capture.decode() + output_lines = text.splitlines() + + # Loop over the output lines. Accumulate for overall statistics and + # populate the individual fields with desirable data. + cluster_jobs = defaultdict() + + # our usage % + vlapipe_job_count = 0 + others_job_count = 0 + + # % queued, an indication of over subscription + jobs_queued_count = 0 + jobs_total_count = 0 + + for data_line in output_lines[11:]: # First 6 lines are the NRAO header, then 5 lines of qstat header + current_job = defaultdict() + fields = data_line.split() # default splits on whitespace + # + # 0 - ID + # 1 - uname (statistics, but not forwarded) + # 2 - queue (discard) + # 3 - jobname + # 4 - SessionID (discard) + # 5 - Nodes requested (discard) + # 6 - Tasks (discard) + # 7 - Memory requested (discard) + # 8 - Time requested (discard, or provide a %) + # 9 - State: (R,Q,W,etc) + # 10 - Time Elapsed + # 11 - Node(s) used + # + current_job['name'] = fields[3] + current_job['status'] = fields[9] + if 'Q' == fields[9]: + jobs_queued_count += 1 + current_job['nodes'] = fields[11] + current_job['timeUsed'] = fields[10] + + # add this job to the overall result: + job_id_number = fields[0].split('.')[0] + cluster_jobs[job_id_number] = current_job + + if 'vlapipe' == fields[1]: + vlapipe_job_count += 1 + else: + others_job_count += 1 + # endif + + jobs_total_count += 1 + # endfor + + # + # A few overall metrics in case we're interested. + # + + # percent_current_usage = vlapipe_job_count/(others_job_count+vlapipe_job_count)*100.0 + # clusterJobs['usagePercent'] = percent_current_usage + + # percent_queued_jobs = jobs_queued_count/jobs_total_count*100.0 + # clusterJobs['queuedPercent'] = percent_queued_jobs + + return cluster_jobs + + +class LustreChecker(FaultChecker): + def __init__(self, profile): + super().__init__(profile) + # Username and password for accessing lustre & cluster information (use the deployment info) + self.wf_deploy = self.capo.settings('workflow.deployment') + self.uname = self.wf_deploy.username + self.rhost = self.wf_deploy.hostname + + self.usage_connector = self.uname+'@'+self.rhost + + def lustre_usage(self): + """ A wrapper of the lustre df and quota utilities to + provide an overall view of processing space usage. + This will grab the total size of lustre, the amount + which is used, and the amount used by the vlapipe + group, and provide them via a dictionary. + + If everything goes to plan, it will return a dictionary of + disk space values, all in terms of 1k blocks. With the keys: + + total: + used: + vlapipe: + + On error, returns None + """ + try: + capture = subprocess.check_output(['ssh', self.usage_connector, 'df', '/lustre/aoc']) + except subprocess.CalledProcessError: + # something went off the rails. 
+ return None + # all is well + text = capture.decode() + output_lines = text.splitlines() + data_line = output_lines[7] # headers (nrao and command) is the first 7 lines + fields = data_line.split() + lustre_total = fields[1] + lustre_used = fields[2] + + try: + # grab the group quota without header information + capture = subprocess.check_output(['ssh', + self.usage_connector, + 'lfs', + 'quota', + '-qg', + 'vlapipe', + '/lustre/aoc']) + except subprocess.CalledProcessError: + # print("Cannot execute qstat command: " + ex.stderr) + # something went off the rails. + return None + # all is well + text = capture.decode() + output_lines = text.splitlines()[6] # skip the NRAO header + vlapipe_space = output_lines.split()[1] # used space is the 2nd field + + # populate the output dictionary + result = defaultdict() + result["total"] = lustre_total + result["used"] = lustre_used + result["vlapipe"] = vlapipe_space + + return result diff --git a/apps/cli/utilities/faultchecker/src/faultchecker/faultreporter.py b/apps/cli/utilities/faultchecker/src/faultchecker/faultreporter.py new file mode 100644 index 0000000000000000000000000000000000000000..835f0ae02775b167cf6c41ab594d6bd26e2e04a0 --- /dev/null +++ b/apps/cli/utilities/faultchecker/src/faultchecker/faultreporter.py @@ -0,0 +1,139 @@ +import json + +import requests +from blessings import Terminal +from pycapo import CapoConfig + + +class FaultReporter: + """ + Interface for reporting faults + """ + + def ok(self, message): + """ + Call to signal that your test passed + :param message: a message about your test + """ + pass + + def warn(self, message): + """ + Call to signal your test produced a warning + :param message: the message to display about your test + """ + pass + + def error(self, message): + """ + Call to signal that your test produced an error + :param message: the message to display about your test + """ + pass + + +class ConsoleReporter(FaultReporter): + """ + Fault reporter that displays messages on the command line + """ + def __init__(self): + self.t = Terminal() + + def ok(self, message): + r""" Print an '[OK]' message to the terminal. + + :param message: The message to print. + :return: None + """ + print('{} {}'.format(self.t.green('[OK] '), message)) + + def warn(self, message): + """ Print a '[WARN]' message to the terminal. + + :param message: The message to print. + :return: None + """ + print('{} {}'.format(self.t.yellow('[WARN] '), message)) + + def error(self, message): + r""" Print an '[ERROR]' message to the terminal. + + :param message: The message to print. + :return: None + """ + print('{} {}'.format(self.t.red('[ERROR] '), message)) + + +class SlackReporter(FaultReporter): + """ + Fault reporter to send the report back to Slack + """ + + def __init__(self): + self.results = [] + self.errors = 0 + self.successes = 0 + self.warnings = 0 + + def add_message(self, message, color): + """ + Add a note to the list of notes we're sending to Slack. 
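+        Each note becomes one Slack attachment dict (fallback, text, color) that
+        publish() later posts to the configured webhook along with the summary line.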
+ :param message: the message + :param color: what to color this message + """ + self.results.append({'fallback': message, 'text': message, 'color': color}) + + def ok(self, message): + self.successes += 1 + self.add_message(message, "#36a64f") + + def warn(self, message): + self.warnings += 1 + self.add_message(message, "#f28500") + + def error(self, message): + self.errors += 1 + self.add_message(message, "#D00") + + def make_summary(self): + """ + Produces a textual summary of the messages we're tracking for the + first line of the report + :return: this summary + """ + if self.errors == 0 and self.warnings == 0: + return 'All archive systems are fault-free right now (or someone needs to write a new test)' + elif self.errors == 0 and self.warnings > 0: + return 'A few archive systems are generating warnings, but nothing is known to be broken' + elif self.successes == 0: + return 'Things are *very* on fire right now!' + else: + return 'Several archive systems need attention now!' + + def publish(self): + """ + Publish the message to Slack using the events destination + """ + config = CapoConfig().settings('archive.events.slack') + requests.post(config.webhook, json.dumps({'text': self.make_summary(), + 'attachments': self.results})) + + +class BulkReporter(FaultReporter): + def __init__(self): + self.aggregators = [] + + def add_aggregator(self, agg): + self.aggregators.append(agg) + + def ok(self, message): + for a in self.aggregators: + a.ok(message) + + def warn(self, message): + for a in self.aggregators: + a.warn(message) + + def error(self, message): + for a in self.aggregators: + a.error(message) diff --git a/apps/cli/utilities/mr_books/README.md b/apps/cli/utilities/mr_books/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/mr_books/database/aat_models.py b/apps/cli/utilities/mr_books/database/aat_models.py deleted file mode 100644 index ac86e8f6028d5ad0bffb39348816f6f5e544f4cb..0000000000000000000000000000000000000000 --- a/apps/cli/utilities/mr_books/database/aat_models.py +++ /dev/null @@ -1,425 +0,0 @@ -# coding: utf-8 -from sqlalchemy import ARRAY, BigInteger, Boolean, CheckConstraint, Column, Date, DateTime, Float, ForeignKey, Integer, JSON, LargeBinary, Numeric, String, Table, Text, text -from sqlalchemy.orm import relationship -from sqlalchemy.ext.declarative import declarative_base - - -Base = declarative_base() -metadata = Base.metadata - - -class Author(Base): - __tablename__ = 'authors' - - author_id = Column(Integer, primary_key=True, server_default=text("nextval('authors_author_id_seq'::regclass)")) - project_code = Column(ForeignKey('projects.project_code'), nullable=False) - username = Column(String, nullable=False) - firstname = Column(String, nullable=False) - lastname = Column(String, nullable=False) - pst_person_id = Column(String) - is_pi = Column(Boolean, nullable=False) - - project = relationship('Project') - - def __repr__(self): - return "<Author#{author_id} {username} '{firstname} {lastname}'>"\ - .format(**self.__dict__) - - -class CalibrationStatusValue(Base): - __tablename__ = 'calibration_status_values' - - status = Column(String, primary_key=True) - description = Column(String, nullable=False) - - - def __repr__(self): - return "<CalibrationStatusValue {status}>".format(**self.__dict__) - - -class CalibrationTable(Base): - __tablename__ = 'calibration_tables' - - calibration_table_id = Column(Integer, primary_key=True, 
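# --- Illustrative sketch, not part of the change above: the fan-out pattern these
# --- reporters are built for. BulkReporter forwards every result to each aggregator,
# --- and SlackReporter.publish() posts the accumulated attachments at the end.
# --- Assumes the faultchecker package is installed like its siblings and that the
# --- archive.events.slack webhook is configured in CAPO.
from faultchecker.faultreporter import BulkReporter, ConsoleReporter, SlackReporter

def run_checks(checks):
    """checks is an iterable of callables, each taking a reporter."""
    slack = SlackReporter()
    bulk = BulkReporter()
    bulk.add_aggregator(ConsoleReporter())
    bulk.add_aggregator(slack)
    for check in checks:
        check(bulk)
    slack.publish()  # one summary line plus one colored attachment per result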
server_default=text("nextval('calibration_tables_calibration_table_id_seq'::regclass)")) - file_id = Column(ForeignKey('files.file_id'), nullable=False) - metadata_tbd = Column(String, nullable=False) - - file = relationship('File') - - def __repr__(self): - return "<CalibrationTable#{calibration_table_id}>".format(**self.__dict__) - - -class DataDescription(Base): - __tablename__ = 'data_descriptions' - - data_description_id = Column(Integer, primary_key=True, server_default=text("nextval('data_descriptions_data_description_id_seq'::regclass)")) - bandwidth = Column(Float(53), nullable=False) - frequency = Column(Float(53), nullable=False) - polarization_id = Column(ForeignKey('polarizations.polarization_id'), nullable=False) - - polarization = relationship('Polarization') - subscans = relationship('Subscan', secondary='subscan_data_descriptions') - - def __repr__(self): - return "<DataDescription#{data_description_id} Freq={frequency} BW={bandwidth}>"\ - .format(**self.__dict__) - - -t_datadesc = Table( - 'datadesc', metadata, - Column('frequency', Float(53)), - Column('bandwidth', Float(53)), - Column('pol_id', Integer), - Column('receiver', String), - Column('datadesc_id', Integer), - Column('spectral_window_id', Integer), - Column('receiver_id', Integer) -) - - -class Event(Base): - __tablename__ = 'events' - - id = Column(Integer, primary_key=True, server_default=text("nextval('events_id_seq'::regclass)")) - application = Column(String, nullable=False) - user = Column(String) - request = Column(String) - message = Column(String, nullable=False) - log_data = Column(JSON) - version = Column(Float(53), nullable=False) - timestamp = Column(DateTime(True), nullable=False) - - def __repr__(self): - return "<Event#{id} app={application} user={user} request={request} timestamp={timestamp}>"\ - .format(**self.__dict__) - - -t_execblock_start_stop = Table( - 'execblock_start_stop', metadata, - Column('execution_block_id', Integer), - Column('starttime', Float(53)), - Column('endtime', Float(53)) -) - - -class ExecutionBlock(Base): - __tablename__ = 'execution_blocks' - - execution_block_id = Column(Integer, primary_key=True, server_default=text("nextval('execution_blocks_execution_block_id_seq'::regclass)")) - ost_exec_block_id = Column(Integer) - filegroup_id = Column(ForeignKey('filegroups.filegroup_id'), nullable=False) - calibration_level = Column(String, nullable=False) - telescope = Column(String, nullable=False) - configuration = Column(String, nullable=False) - ingestion_complete = Column(Boolean, nullable=False) - scheduling_block_id = Column(Integer) - ngas_fileset_id = Column(String, nullable=False) - project_code = Column(ForeignKey('projects.project_code'), nullable=False) - starttime = Column(Float(53)) - endtime = Column(Float(53)) - calibration_status = Column(ForeignKey('calibration_status_values.status'), nullable=False, server_default=text("'Unknown'::character varying")) - scheduling_block_type = Column(String) - - calibration_status_value = relationship('CalibrationStatusValue') - filegroup = relationship('Filegroup') - project = relationship('Project') - scans = relationship('Scan') - - def __repr__(self): - return "<ExecutionBlock#{execution_block_id} project={project_code} start={starttime} end={endtime}>".format(**self.__dict__) - - - -class Filegroup(Base): - __tablename__ = 'filegroups' - __table_args__ = ( - CheckConstraint('(CASE WHEN (project_code IS NOT NULL) THEN 1 ELSE 0 END + CASE WHEN (parent_filegroup_id IS NOT NULL) THEN 1 ELSE 0 END) = 1'), - ) - - 
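# --- Illustrative sketch, not part of the change above: a typical query against these
# --- declarative models. This file is being deleted, but the same tables are exposed
# --- by schema.model, which other tools in this change import; the assumption here is
# --- that ExecutionBlock carries the same project_code and starttime columns there.
from schema.model import ExecutionBlock

def execution_blocks_for_project(session, project_code):
    """Return a project's execution blocks in observation order."""
    return (session.query(ExecutionBlock)
                   .filter(ExecutionBlock.project_code == project_code)
                   .order_by(ExecutionBlock.starttime)
                   .all())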
filegroup_id = Column(Integer, primary_key=True, server_default=text("nextval('filegroups_filegroup_id_seq'::regclass)")) - project_code = Column(ForeignKey('projects.project_code')) - groupname = Column(String, nullable=False) - parent_filegroup_id = Column(ForeignKey('filegroups.filegroup_id')) - datasize = Column(BigInteger) - - parent_filegroup = relationship('Filegroup', remote_side=[filegroup_id], backref='children_filegroups') - project = relationship('Project') - - execution_blocks = relationship('ExecutionBlock') - files = relationship('File') - scans = relationship('Scan') - - def __repr__(self): - return "<Filegroup#{filegroup_id} project={project_code} groupname={groupname}>".format(**self.__dict__) - - -class File(Base): - __tablename__ = 'files' - - file_id = Column(Integer, primary_key=True, server_default=text("nextval('files_file_id_seq'::regclass)")) - file_path = Column(String) - ngas_id = Column(String) - filegroup = Column(ForeignKey('filegroups.filegroup_id'), nullable=False, index=True) - filename = Column(String, nullable=False) - filesize = Column(BigInteger, nullable=False) - format = Column(String, nullable=False) - type = Column(String, nullable=False) - checksum = Column(String) - checksum_type = Column(String) - ingestion_time = Column(DateTime(True)) - - filegroup1 = relationship('Filegroup') - workflows = relationship('Workflow', secondary='workflow_input_files') - calibration_tables = relationship('CalibrationTable') - images = relationship('Image') - subscans = relationship('Subscan') - - def __repr__(self): - return "<File#{file_id} {file_path}/{filename} ngas_id={ngas_id}>".format(**self.__dict__) - - -t_flatfiles = Table( - 'flatfiles', metadata, - Column('project_code', String), - Column('telescope', String), - Column('config', String), - Column('sb_id', BigInteger), - Column('eb_id', BigInteger), - Column('calibration', String), - Column('file_path', String), - Column('ngas_file_id', String), - Column('ngas_fileset_id', String), - Column('filename', String), - Column('filesize', BigInteger), - Column('format', String), - Column('type', String), - Column('starttime', Numeric), - Column('endtime', Numeric), - Column('sourcename', String), - Column('sourcetype', String), - Column('ra', Numeric), - Column('dec', Numeric), - Column('exposure_time', Numeric), - Column('integration_time', Numeric), - Column('observation_type', String), - Column('scan', Numeric), - Column('subscan', Numeric), - Column('intent', String), - Column('data_desc_id', BigInteger), - Column('generated_eb_id', BigInteger), - Column('generated_scan_id', BigInteger) -) - - -class Image(Base): - __tablename__ = 'images' - - image_id = Column(Integer, primary_key=True, server_default=text("nextval('images_image_id_seq'::regclass)")) - file_id = Column(ForeignKey('files.file_id'), nullable=False) - target_name = Column(String, nullable=False) - center_position = Column(String, nullable=False) - observing_band = Column(String, nullable=False) - project_code = Column(String, nullable=False) - telescope = Column(String, nullable=False) - configuration = Column(String, nullable=False) - collection_name = Column(String) - thumbnail = Column(LargeBinary) - spatial_resolution = Column(Float(53), nullable=False) - field_of_view = Column(Float(53), nullable=False) - max_intensity = Column(Float(53), nullable=False) - min_intensity = Column(Float(53), nullable=False) - rms_noise = Column(Float(53), nullable=False) - polarization_id = Column(ForeignKey('polarizations.polarization_id'), 
nullable=False) - - file = relationship('File') - polarization = relationship('Polarization') - - def __repr__(self): - return "<Image#{image_id} target={target_name} telescope={telescope}>".format(**self.__dict__) - - -class Intent(Base): - __tablename__ = 'intents' - - intent_name = Column(String, primary_key=True) - - subscans = relationship('Subscan', secondary='subscan_intents') - - def __repr__(self): - return "<Intent {0}>".format(self.intent_name) - - -t_logs = Table( - 'logs', metadata, - Column('filename', String), - Column('class', String), - Column('method', String), - Column('line', Integer), - Column('arguments', ARRAY(String)), - Column('timestamp', DateTime(True)), - Column('formatted_message', String), - Column('logger_name', String), - Column('level', String), - Column('thread_name', String), - Column('reference_mask', Integer), - Column('properties', JSON), - Column('stacktrace', Text) -) - - -class Polarization(Base): - __tablename__ = 'polarizations' - - polarization_id = Column(Integer, primary_key=True) - name = Column(String, nullable=False) - description = Column(String, nullable=False) - - def __repr__(self): - return "<Polarization {0}>".format(self.name) - - -class Project(Base): - __tablename__ = 'projects' - - project_code = Column(String, primary_key=True) - legacy_id = Column(String) - total_observation_time = Column(Float(53)) - opt_project_id = Column(Integer) - title = Column(String) - abstract = Column(Text) - proprietary_expiration = Column(DateTime) - last_addition = Column(Date) - starttime = Column(Float(53)) - endtime = Column(Float(53)) - proprietary_duration = Column(Float(53)) - - authors = relationship('Author') - execution_blocks = relationship('ExecutionBlock') - file_groups = relationship('Filegroup') - - def __repr__(self): - return '<Project#{project_code} "{title}" start={starttime} end={endtime}>'.format(**self.__dict__) - - -class Receiver(Base): - __tablename__ = 'receivers' - - receiver_id = Column(Integer, primary_key=True, server_default=text("nextval('receivers_receiver_id_seq'::regclass)")) - description = Column(String, nullable=False) - - def __repr__(self): - return '<Receiver {0}>'.format(self.receiver_id) - - -class Scan(Base): - __tablename__ = 'scans' - - scan_id = Column(Integer, primary_key=True, server_default=text("nextval('scans_scan_id_seq'::regclass)")) - ost_scan_id = Column(Integer) - execution_block_id = Column(ForeignKey('execution_blocks.execution_block_id'), nullable=False) - filegroup_id = Column(ForeignKey('filegroups.filegroup_id'), nullable=False) - max_bandwidth = Column(Float(53), nullable=False) - min_bandwidth = Column(Float(53), nullable=False) - polarization_code = Column(Integer, nullable=False) - max_frequency = Column(Float(53), nullable=False) - min_frequency = Column(Float(53), nullable=False) - - execution_block = relationship('ExecutionBlock') - filegroup = relationship('Filegroup') - subscans = relationship('Subscan') - - def __repr__(self): - return '<Scan#{0}>'.format(self.scan_id) - - -t_subscan_data_descriptions = Table( - 'subscan_data_descriptions', metadata, - Column('subscan_id', ForeignKey('subscans.subscan_id'), primary_key=True, nullable=False), - Column('data_description_id', ForeignKey('data_descriptions.data_description_id'), primary_key=True, nullable=False, index=True) -) - - -t_subscan_intents = Table( - 'subscan_intents', metadata, - Column('intent_name', ForeignKey('intents.intent_name'), primary_key=True, nullable=False), - Column('subscan_id', 
ForeignKey('subscans.subscan_id'), primary_key=True, nullable=False) -) - - -class Subscan(Base): - __tablename__ = 'subscans' - - subscan_id = Column(Integer, primary_key=True, server_default=text("nextval('subscans_subscan_id_seq'::regclass)")) - scan_id = Column(ForeignKey('scans.scan_id'), nullable=False) - file_id = Column(ForeignKey('files.file_id'), nullable=False) - ost_subscan_id = Column(Integer) - obstype = Column(String, nullable=False) - starttime = Column(Float(53), nullable=False) - endtime = Column(Float(53), nullable=False) - sourcename = Column(String, nullable=False) - sourcetype = Column(String, nullable=False) - ra = Column(Float(53), nullable=False) - dec = Column(Float(53), nullable=False) - exposure_time = Column(Float(53), nullable=False) - integration_time = Column(Float(53), nullable=False) - receiver_id = Column(ForeignKey('receivers.receiver_id'), nullable=False) - backend = Column(String, nullable=False) - intent = Column(String) - datadesc = Column(Integer) - - file = relationship('File') - receiver = relationship('Receiver') - scan = relationship('Scan') - data_descriptions = relationship('DataDescription', secondary='subscan_data_descriptions') - intents = relationship('Intent', secondary='subscan_intents') - - def __repr__(self): - return '<Subscan#{subscan_id} {obstype} start={starttime} end={endtime} ra={ra} dec={dec} backend={backend} intent={intent}>'.format(**self.__dict__) - - -t_workflow_input_files = Table( - 'workflow_input_files', metadata, - Column('workflow_id', ForeignKey('workflows.workflow_id'), primary_key=True, nullable=False), - Column('file_id', ForeignKey('files.file_id'), primary_key=True, nullable=False) -) - - -class WorkflowParameter(Base): - __tablename__ = 'workflow_parameters' - - workflow_id = Column(ForeignKey('workflows.workflow_id'), primary_key=True, nullable=False) - parameter_name = Column(String, primary_key=True, nullable=False) - parameter_value = Column(String, nullable=False) - - workflow = relationship('Workflow') - - def __repr__(self): - return '<WorkflowParameter {parameter_name}={parameter_value}>'.format(**self.__dict__) - - -class Workflow(Base): - __tablename__ = 'workflows' - - workflow_id = Column(Integer, primary_key=True, server_default=text("nextval('workflows_workflow_id_seq'::regclass)")) - workflow_name = Column(String, nullable=False) - status = Column(String, nullable=False) - submitted_by = Column(String, nullable=False) - submitted = Column(DateTime) - started = Column(DateTime) - completed = Column(DateTime) - workflow_qa_analyst = Column(String) - workflow_qa_comments = Column(String) - workflow_qa_result = Column(String) - result_qa_analyst = Column(String) - result_qa_comments = Column(String) - result_qa_results = Column(String) - software_version = Column(String, nullable=False) - output_filegroup_id = Column(ForeignKey('filegroups.filegroup_id'), nullable=False) - - output_filegroup = relationship('Filegroup') - workflow_parameters = relationship('WorkflowParameter') - input_files = relationship('File', secondary='workflow_input_files') - - def __repr__(self): - return '<Workflow#{workflow_id} {workflow_name} status={status} start={started} end={completed}>'.format(**self.__dict__) diff --git a/apps/cli/utilities/mr_books/setup.py b/apps/cli/utilities/mr_books/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..e6550e31c21b5724d9ecfdf745f7208988b5f1ec --- /dev/null +++ b/apps/cli/utilities/mr_books/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 
-*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/mr_books/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive mr_books', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pendulum', 'pid', 'pycapo', 'pymygdala', 'schema', 'sqlalchemy'], + keywords=[], + packages=['mr_books'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': ['mr_books = mr_books.commands:mr_books'] + }, +) diff --git a/apps/cli/utilities/mr_books/src/mr_books/__init__.py b/apps/cli/utilities/mr_books/src/mr_books/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc18fa59bebc781b7d467412e7c6e0a328a53c4 --- /dev/null +++ b/apps/cli/utilities/mr_books/src/mr_books/__init__.py @@ -0,0 +1 @@ +from _version import ___version___ as version diff --git a/apps/cli/utilities/mr_books/src/mr_books/_version.py b/apps/cli/utilities/mr_books/src/mr_books/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/mr_books/src/mr_books/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/mr_books/src/mr_books/commands.py b/apps/cli/utilities/mr_books/src/mr_books/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..aabe9484684526135bdbd778719b62161914181c --- /dev/null +++ b/apps/cli/utilities/mr_books/src/mr_books/commands.py @@ -0,0 +1,554 @@ +# -*- coding: utf-8 -*- + +import argparse as ap +import collections +import datetime +import json +import logging, logging.handlers +import os +import sys +import time +from pathlib import Path + +import dateutil.parser +import pendulum +from pid import PidFile, PidFileError +from pycapo import CapoConfig +from sqlalchemy import create_engine, or_ +from sqlalchemy.orm import sessionmaker + +from pymygdala import SendNRAOEvent, LogHandler +from _version import ___version___ as version +from schema.model import ExecutionBlock +from schema.ngasmodel import NGASFile, NGASDisk, NGASHost + +_STATE_DIR = os.path.abspath(os.path.join(Path.home(), '.mr_books')) +_CACHE_FILE = 'cache.json' +_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%f' +_EPOCH = dateutil.parser.parse('1970-01-01T00:00.000') + +_DESCRIPTION = """AAT/PPI calibration bookkeper, version {}.""" + +_EPILOG = """Return values: +0: everything worked, +1: can't deduce which profile to use, +2: unknown required property, +3: missing CAPO property, +4: permissions issue of some kind""" + +_MISSING_PROFILE = """ERROR: mr_books can't deduce the 'profile', give it the -P argument or set the CAPO_PROFILE environment variable! 
Geeze!\n\n""" + +_MISSING_SETTING = """ERROR: missing CAPO property {}.\n""" + +_UNKNOWN_PROPERTY = """ERROR: unknown required CAPO property {}.\n""" + +_RUN_STATE = 'edu.nrao.archive.workflow.config.CiplWorkflowSettings.ciplRunState' + +_REQUIRED_PROPERTIES = [ + 'ngasDatabase.jdbcUrl', + 'ngasDatabase.jdbcUsername', + 'ngasDatabase.jdbcPassword', + 'metadataDatabase.jdbcUrl', + 'metadataDatabase.jdbcUsername', + 'metadataDatabase.jdbcPassword', + _RUN_STATE +] + +# the magic name of a placeholder BDF +_STUB_BDFS = ['uid:__evla_bdf_X1.bdf', 'uid___evla_bdf_X1.bdf'] + +_NOW = time.time() + +# set up logging: +LOG = logging.getLogger("mr_books") +LOG.setLevel(logging.WARN) + +# Trim down the external logs we don't care about +logging.getLogger('pika').setLevel(logging.WARN) +logging.getLogger('PidFile').setLevel(logging.WARN) + + +def jdbc_to_python(jdbcurl, username, password): + """ does not work for oracle """ + return jdbcurl.replace('jdbc:', '').replace('://', '://' + username + ':' + password + '@') + + +def get_rawdata_filegroup(eb): + """ Return an EB's rawdata filegroup, if it can't find it throw a LookupError. """ + + # + # VLBA doesn't have the same structure, so we + # refer to the main filegroup. + # + if eb.telescope == 'VLBA': + return eb.filegroup + + if eb.telescope == 'EVLA' or eb.telescope == 'ALMA': + for fg in eb.filegroup.children_filegroups: + if fg.type == 'rawdata': + return fg + + # For now, we default with an exception to catch attention + raise LookupError("No such filegroup 'rawdata'") + + +def format_dt(t): + s = t.strftime(_TIMESTAMP_FORMAT) + f = round(float(s[-7:]), 3) + temp = "%.3f" % f + temp = "%s%s" % (s[:-7], temp[1:]) + return temp + + +class CapoSettings(): + r""" A class that pulls out the CAPO properties we care about and fails with reasonable error messages if the + property we asked for doesn't exist. """ + + def __init__(self, parser, args): + self._parser = parser + self._args = args + self.profile = self._get_profile() + self._config = self._get_capo_config() + self.properties = dict() + for s in _REQUIRED_PROPERTIES: + p = self._get_capo_property(s, self._config.getstring) + self.properties[s] = p + + def get(self, name): + r""" Get a property by short name (the full name minus the prefix). """ + return self.properties[name].replace('\"', '') + + def _get_profile(self): + result = None + if self._args.profile: + result = self._args.profile + elif 'CAPO_PROFILE' in os.environ: + result = os.environ['CAPO_PROFILE'] + else: + LOG.error(_MISSING_PROFILE) + self._parser.print_help() + sys.exit(1) + return result + + def _get_capo_config(self): + r"""" Get a CapoConfig object. """ + try: + result = CapoConfig(profile=self._args.profile) + except ValueError: + LOG.error(_MISSING_PROFILE) + self._parser.print_help() + sys.exit(1) + return result + + def _get_capo_property(self, name, fn): + r""" Get a CAPO property with a given short name, using a given function. 
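# --- Illustrative sketch, not part of the change above: what jdbc_to_python() (defined
# --- earlier in this module) produces. The host and credentials are placeholders.
jdbc = 'jdbc:postgresql://archdb.example.org:5432/archive'
url = jdbc_to_python(jdbc, 'archive_user', 'secret')
# url == 'postgresql://archive_user:secret@archdb.example.org:5432/archive'
# (as its docstring notes, this simple rewrite does not work for oracle URLs)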
""" + try: + result = fn(name) + except KeyError: + LOG.error(_MISSING_SETTING.format(name)) + self._parser.print_help() + sys.exit(2) + return result + + +class NGASFileInfoCache(): + def __init__(self, capo_settings, parser, args): + self._parser = parser + self._args = args + self._cs = capo_settings + self.ngasdb_session = self._get_ngasdb_session() + self.last_update = '2000-01-01T00:00.000' + self._state_dir = args.state_dir + self.cache_file = os.path.join(self._state_dir, _CACHE_FILE) + self.cache = dict() + + def _get_ngasdb_session(self): + try: + # Old Oracle Connection setup + # ngas_dsn = cx_Oracle.makedsn('quigon.aoc.nrao.edu', '1521', 'nraongas.nrao.edu') + # ngas_dsn = ngas_dsn.replace('SID', 'SERVICE_NAME') + # ngas_url = 'oracle+cx_oracle://ngas:XXXXXXXX@' + ngas_dsn + # return sessionmaker(bind=create_engine(ngas_url))() + # + #Get the NGAS credentials from capo. + ngas_url = jdbc_to_python(self._cs.get('ngasDatabase.jdbcUrl'), + self._cs.get('ngasDatabase.jdbcUsername'), + self._cs.get('ngasDatabase.jdbcPassword')) + return sessionmaker(bind=create_engine(ngas_url))() + except: + LOG.exception('Exception while connecting to NGAS!') + sys.exit(2) + + def read_cache_file(self): + LOG.debug('reading cache file') + result = dict() + try: + with open(self.cache_file, 'r') as f: + result = json.load(f) + dt = os.path.getmtime(self.cache_file) + dt = datetime.datetime.utcfromtimestamp(dt) + dt = dt - datetime.timedelta(days=2) + self.last_update = dt + except: + self.last_update = _EPOCH + return result + + def write_cache_file(self): + LOG.debug('writing cache file') + try: + if not os.path.exists(self._state_dir): + os.makedirs(self._state_dir) + with open(self.cache_file, 'w') as f: + json.dump(self.cache, f) + except: + LOG.exception('caught something while writing') + sys.exit(2) + + def get_fileinfo_from_database(self): + LOG.info('Updating file info from NGAS database') + query = self.ngasdb_session.query(NGASFile).join(NGASFile.disk).join(NGASDisk.host) + query = query.filter(or_(NGASFile.file_id.like('%.sdm'), + NGASFile.file_id.like('%.bin'), + NGASFile.file_id.like('%.bdf'))) + query = query.filter(NGASDisk.mounted != 0) + query = query.filter(NGASHost.domain == 'aoc.nrao.edu') + query = query.filter(NGASFile.ingestion_date > + format_dt(self.last_update)) + query = query.order_by(NGASFile.version.desc()) + + result = dict() + if query.count(): + for q in query.all(): + if q.file_id not in result: + result[q.file_id] = 1 + return result + + def update(self): + LOG.debug('starting cache update') + self.cache = self.read_cache_file() + LOG.info('last update {}'.format(self.last_update.strftime(_TIMESTAMP_FORMAT))) + file_info = self.get_fileinfo_from_database() + self.cache.update(file_info) + self.write_cache_file() + + def exists(self, file_id): + return file_id in self.cache + + +class MrBooks(): + r""" MrBooks. 
""" + + def __init__(self): + self._parser = _make_parser() + self._args = self._parser.parse_args() + self._cs = CapoSettings(self._parser, self._args) + self.profile = self._cs.profile + self._ngas_cache = NGASFileInfoCache(self._cs, self._parser, self._args) + self.run_state = self._cs.get(_RUN_STATE) + self.verbose = self._args.verbose + self._state_dir = self._args.state_dir + self._filesets = self._args.filesets if hasattr(self._args, 'filesets') else None + if self._filesets is not None: + self._filesets = self._filesets.split(",") + self.aatdb_session = self._get_aatdb_session() + self._send_event = SendNRAOEvent(profile=self.profile, application='ingestor', + routing_key='ingestion-complete.rawdata') + try: + if not os.path.exists(self._state_dir): + os.makedirs(self._state_dir) + except: + LOG.exception('problem creating state directory') + sys.exit(2) + + def _get_aatdb_session(self): + try: + aat_url = jdbc_to_python(self._cs.get('metadataDatabase.jdbcUrl'), + self._cs.get('metadataDatabase.jdbcUsername'), + self._cs.get('metadataDatabase.jdbcPassword')) + return sessionmaker(bind=create_engine(aat_url))() + except: + LOG.exception('exception while connecting to Metadata Database') + sys.exit(2) + + def is_calibratable(self, eb): + r""" Is this EB one that *can* be calibrated? """ + if eb.telescope != 'EVLA': + return False + if eb.starttime < 56316.0: + return False + return True + + def should_calibrate(self, eb): + r""" Stub function: should we calibrate this? Herein will be the heuristic that calibrates only proposed science + and skips over tests, plus exceptions. """ + if eb.scheduling_block_type is not None and \ + (eb.scheduling_block_type == 'OBSERVER' or + eb.project_code.lower().startswith('vlass') or + eb.project_code.lower() == 'tsky0001'): + return True + return False + + def _unknown_ebs(self): + r""" Returns a list of the EBs with an unknown calibration status. """ + query = self.aatdb_session.query(ExecutionBlock) + query = query.filter(ExecutionBlock.calibration_status == 'Unknown') + return query.all() + + def _not_ready_ebs(self): + r""" Returns a list of the EBs with a not ready status. """ + query = self.aatdb_session.query(ExecutionBlock) + query = query.filter(ExecutionBlock.calibration_status == 'Not Ready') + return query.all() + + def set_initial_status(self): + r""" Set the initial calibration status: ingestion leaves it 'Unknown', mr_books figures out whether that should + be 'not ready' or 'do not'. Checking each 'Not Ready' is done in a different stage. """ + ebs = self._unknown_ebs() + + # For the EVLA, check for whether it is science-y (set to Not Ready) or operations-y (Set to Do Not Calibrate) + # + # For ALMA, assume it can be calibrated, and set it to Not Ready. + # + # Anything else, assume 'Do Not Calibrate'. + # When setting the 'Do Not Calibrate' value, send an event to trigger a re-index, since + # this changes the options available via the archiveIface + # + for eb in ebs: + if "EVLA" == eb.telescope: + if self.is_calibratable(eb) and self.should_calibrate(eb): + # This is Observer Science (or a special case). 
+ # Mark it to be tracked for ingestion completion + eb.calibration_status = 'Not Ready' + else: + # This is unsuitable for CIPL, mark it as such and move on + eb.calibration_status = 'Do Not Calibrate' + LOG.info('{} complete: Do Not Calibrate'.format(eb.ngas_fileset_id)) + self.fire_off_event(eb) + + elif "ALMA" == eb.telescope: + # update to Not Ready status, the rest is handled later + eb.calibration_status = 'Not Ready' + else: + # + # GBT, VLBA, and anything else all are marked Do Not Calibrate + # in our scheme (subject to change later). + # + eb.calibration_status = 'Do Not Calibrate' + LOG.debug('{} is not EVLA/ALMA: Do Not Calibrate'.format(eb.ngas_fileset_id)) + self.fire_off_event(eb) + + # Update the execution_blocks table: + try: + self.aatdb_session.add_all(ebs) + self.aatdb_session.commit() + except: + self.aatdb_session.rollback() + sys.exit(5) + + def _get_eb(self, fileset_id): + query = self.aatdb_session.query(ExecutionBlock) + query = query.filter(ExecutionBlock.ngas_fileset_id == fileset_id) + return query.first() + + def get_ebs(self, fileset_ids): + r""" Get an EB by a specific ebs: this is mostly for testing right now. """ + result = list() + if isinstance(fileset_ids, collections.Iterable): + for fileset_id in fileset_ids: + result.append(self._get_eb(fileset_id)) + else: + result.append(self._get_eb(fileset_ids)) + return result + + def is_eb_complete(self, eb): + r""" Check to see if an EB is complete, meaning all the files in the filegroup are in NGAS. """ + + files = get_rawdata_filegroup(eb).files + LOG.debug('checking {}, {} files'.format(eb.ngas_fileset_id, len(files))) + found = 0 + for f in files: + # _STUB_BDF is a magic BDF name, a placeholder: don't bother checking to make sure it exists + if f.ngas_id not in _STUB_BDFS: + if not self._ngas_cache.exists(f.ngas_id): + LOG.debug('{} is missing'.format(f.ngas_id)) + else: + found += 1 + if found != len(files): + LOG.info('{} not complete, found {} of {} files'.format(eb.ngas_fileset_id, found, len(files))) + return False + else: + LOG.info('{} complete, found {} of {} files'.format(eb.ngas_fileset_id, found, len(files))) + return True + + def fire_off_event(self, eb): + r""" Fire off an ingestion complete event for this EB. """ + LOG.debug('firing off event') + try: + event = self.ingestion_event(eb) + LOG.debug(json.dumps(event)) + self._send_event.send(event=event) + except: + LOG.exception('Exception occurred while sending completion event') + sys.exit(5) + LOG.debug('done') + + def update_to_ready(self, e): + r""" Mark an EB (or list of EBs) as ready to calibrate. """ + LOG.debug('updating to ready') + if isinstance(e, collections.Iterable): + for eb in e: + self.update_to_ready(eb) + else: + try: + e.calibration_status = 'Ready' + self.aatdb_session.add(e) + self.aatdb_session.commit() + except: + LOG.exception('Exception while updating status of {}', e.ngas_fileset_id) + self.aatdb_session.rollback() + sys.exit(5) + LOG.debug('done') + + def check_not_ready(self, e): + r""" Check an EB (or a list of EBs) to see if they are ready to calibrate. 
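# --- Illustrative sketch, not part of the change above: the completeness test that
# --- is_eb_complete() performs, reduced to plain data. 'expected_ngas_ids' stands in
# --- for the rawdata filegroup's file ids and 'known_to_ngas' for the NGAS cache.
def all_files_ingested(expected_ngas_ids, known_to_ngas,
                       stub_ids=('uid:__evla_bdf_X1.bdf', 'uid___evla_bdf_X1.bdf')):
    """Mirror of is_eb_complete(): placeholder BDFs are skipped rather than counted,
    so an EB only reports complete once every non-stub file is present in NGAS and
    no stub placeholders remain in the filegroup."""
    found = sum(1 for ngas_id in expected_ngas_ids
                if ngas_id not in stub_ids and ngas_id in known_to_ngas)
    return found == len(expected_ngas_ids)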
""" + if isinstance(e, collections.Iterable): + for eb in e: + self.check_not_ready(eb) + else: + LOG.debug('checking {}'.format(e.ngas_fileset_id)) + if e.telescope == 'EVLA': + # Check to see if all the BDFs have been ingested + if self.is_eb_complete(e): + LOG.debug('{} complete: Ready'.format(e.ngas_fileset_id)) + self.update_to_ready(e) + self.fire_off_event(e) + else: # Incomplete, wait for next go around + LOG.debug('{} not ready'.format(e.ngas_fileset_id)) + elif e.telescope == 'ALMA': + # ALMA is the easy case, just set it to Ready + LOG.info('{} set to Ready automatically'.format(e.ngas_fileset_id)) + self.update_to_ready(e) + self.fire_off_event(e) + else: + # Everything else, just print a message + LOG.info('Unexpected Telescope for {}'.format(e.ngas_fileset_id)) + + def ingestion_event(self, eb): + result = dict() + result['logData'] = { + 'profile': self.profile, + 'fileset_id': eb.ngas_fileset_id, + 'ingestion_type': 'observation', + 'telescope': eb.telescope, + 'execblock_id': eb.ost_exec_block_id, + 'schedblock_id': eb.scheduling_block_id, + 'schedblock_type': eb.scheduling_block_type, + 'project_code': eb.project_code, + 'override_db_screen': 'false', + 'file_count': len(get_rawdata_filegroup(eb).files) + } + result['message'] = 'ingestion complete' + result['request'] = 'de nada' + return result + + def update_to_dnc(self, e): + r""" Update an EB (or list of EBs) to Do Not Calibrate. """ + if isinstance(e, collections.Iterable): + for eb in e: + self.update_to_dnc(eb) + else: + try: + e.calibration_status = 'Do Not Calibrate' + self.aatdb_session.add(e) + self.aatdb_session.commit() + except: + LOG.exception('Exception while updating calibration status of {}', e.ngas_fileset_id) + self.aatdb_session.rollback() + sys.exit(5) + + def set_ingestion_complete(self): + pass + + def run(self): + r""" Run the actual clean process. """ + start_time = pendulum.now() + + try: + with PidFile(piddir=self._state_dir, pidname='mr_books.lock') as p: + self._ngas_cache.update() + + LOG.debug('mr_books called with arguments:\n{}'.format(self._args)) + + if self._filesets: + ebs = self.get_ebs(self._filesets) + self.check_not_ready(ebs) + else: + self.set_ingestion_complete() + self.set_initial_status() + self.check_not_ready(self._not_ready_ebs()) + time_diff = pendulum.now() - start_time + LOG.info('finished, took {}'.format(time_diff.in_words())) + except PidFileError: + LOG.error("mr_books already running, aborting this instance") + + +def _make_parser(): + r""" Build a command line parser for this app: this is external to MrBooks and/or CapoSettings because both need it, + and Sphinx will want a look at ot to build docs. """ + result = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter, + epilog=_EPILOG) + result.add_argument('-P', '--profile', action='store', + help='profile name to use, e.g. test, production') + result.add_argument('-v', '--verbose', action='store_true', + help='enable verbose-i-tude') + result.add_argument('-f', '--filesets', action='store', dest='filesets', + help='run on comma separated list of filesets') + result.add_argument('-s', '--state_dir', action='store', dest='state_dir', + default=_STATE_DIR, + help='state directory to use ({})'.format(_STATE_DIR)) + return result + + +def mr_books(): + r""" Wrapper around the MrBooks class, console script entry point. """ + + + mb = MrBooks() + + # Configure the logging: + # Log specifically to a file in the state directory, + # rather than to stdout. 
Rotate weekly, on Friday. + logFileName = mb.profile + '_direct.log' + fullLogPath = os.path.abspath( os.path.join(mb._state_dir, logFileName) ) + ch = logging.handlers.TimedRotatingFileHandler(fullLogPath, when='W4') + if mb.verbose: + # Given the the -v flag, be extra chatty + ch.setLevel(logging.DEBUG) + # and open up the limits for most of our subsidiaries + LOG.setLevel(logging.DEBUG) + else: + # default to WARN level to avoid some spam. + ch.setLevel(logging.INFO) + LOG.setLevel(logging.INFO) + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + ch.setFormatter(formatter) + LOG.addHandler(ch) + + # integrate the logs with pymygdala's logging system + handler = LogHandler(profile=mb.profile, application='mr_books') + if mb.verbose: + # Given the the -v flag, be extra chatty + handler.setLevel(logging.DEBUG) + else: + # default to WARN level to avoid some spam. + handler.setLevel(logging.WARN) + LOG.addHandler(handler) + + LOG.debug('starting, profile is {}'.format(mb.profile)) + + mb.run() + + +if __name__ == "__main__": + mr_books() diff --git a/apps/cli/utilities/mr_clean/README.md b/apps/cli/utilities/mr_clean/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/mr_clean/setup.py b/apps/cli/utilities/mr_clean/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..bd0f57bd6c0471eecaf4b25e0134ab5fbc6d72b5 --- /dev/null +++ b/apps/cli/utilities/mr_clean/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/mr_clean/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive mr_clean', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pendulum', 'pycapo', 'pymygdala'], + keywords=[], + packages=['mr_clean'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': ['mr_clean = mr_clean.commands:mr_clean'] + }, +) diff --git a/apps/cli/utilities/mr_clean/src/mr_clean/__init__.py b/apps/cli/utilities/mr_clean/src/mr_clean/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3688fc465f5538f20d7077732f69fdaa1949b5df --- /dev/null +++ b/apps/cli/utilities/mr_clean/src/mr_clean/__init__.py @@ -0,0 +1,18 @@ +r""" mr_clean: AAT/PPI utility for periodically cleaning things up + +mr_clean cleans up: +* the download directory +* the spool directory +* the temp directory + +This is configured through CAPO settings: +* edu.nrao.archive.configuration.JanitorSettings.spoolDirectory, string +* edu.nrao.archive.configuration.JanitorSettings.spoolDaysToKeep, int +* edu.nrao.archive.configuration.JanitorSettings.tempDirectory, string +* edu.nrao.archive.configuration.JanitorSettings.tempDaysToKeep, int +* edu.nrao.archive.configuration.JanitorSettings.downloadDirectory, string +* edu.nrao.archive.configuration.JanitorSettings.downloadDaysToKeep, int +* edu.nrao.archive.configuration.JanitorSettings.enabled, (true, false) + +'enabled' is a boolean that controls whether mr_clean actually does anything or not, if it is set to false then the best mr_clean will do for you is parse the command line arguments, look at CAPO settings and tell you what settings it would 
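# --- Illustrative sketch, not part of the change above: reading the JanitorSettings
# --- block listed in this docstring with pycapo, using the same getstring/getint/
# --- getboolean accessors the commands module relies on. The profile name is a
# --- placeholder.
from pycapo import CapoConfig

config = CapoConfig(profile='test')
prefix = 'edu.nrao.archive.configuration.JanitorSettings'
spool_dir = config.getstring(prefix + '.spoolDirectory')
spool_days = config.getint(prefix + '.spoolDaysToKeep')
enabled = config.getboolean(prefix + '.enabled')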
have used. +""" \ No newline at end of file diff --git a/apps/cli/utilities/mr_clean/src/mr_clean/_version.py b/apps/cli/utilities/mr_clean/src/mr_clean/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/mr_clean/src/mr_clean/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/mr_clean/src/mr_clean/commands.py b/apps/cli/utilities/mr_clean/src/mr_clean/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..6c387e3db3314037cee2896810f61211e5e922b3 --- /dev/null +++ b/apps/cli/utilities/mr_clean/src/mr_clean/commands.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import argparse as ap +import logging +import os +import shutil +import sys + +import pendulum +import time +from pycapo import CapoConfig + +from _version import ___version___ as version +from pymygdala import LogHandler, SendNRAOEvent + +_DESCRIPTION = """AAT/PPI request file/directory cleaner, version {}. Cleans up the spool directory and downloads area.""" + +_EPILOG = """Return values: +0: everything worked, +1: can't deduce which profile to use, +2: unknown required property, +3: missing CAPO property, +4: permissions issue of some kind""" + +_MISSING_PROFILE = """ERROR: mr_clean can't deduce the 'profile', give it the -P argument or set the CAPO_PROFILE environment variable! Geeze!\n\n""" + +_MISSING_SETTING = """ERROR: missing CAPO property {}.\n""" + +_UNKNOWN_PROPERTY = """ERROR: unknown required CAPO property {}.\n""" + +_REQUIRED_PROPERTIES = ['spoolDirectory', 'spoolDaysToKeep', + 'stageDirectory', 'stageDaysToKeep', + 'tempDirectory', 'tempDaysToKeep', + 'downloadDirectory', 'downloadDaysToKeep', + 'cacheDirectory', 'cacheDaysToKeep', + 'parallelIngestionDirectory', 'parallelIngestionDaysToKeep', # No parallel in va, so these point at tmp there + 'enabled'] + +_PROPERTY_PREFIX = "edu.nrao.archive.configuration.JanitorSettings" + +_NOW = time.time() + +_LOG = logging.getLogger('mr_clean') +_LOG.setLevel(logging.DEBUG) +ch = logging.StreamHandler(sys.stdout) +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +ch.setFormatter(formatter) +_LOG.addHandler(ch) + + +def _make_parser(): + r""" Build a command line parser for this app: this is external to MrClean and/or CapoSettings because both need + it, and Sphinx will want a look at ot to build docs. """ + result = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter, + epilog=_EPILOG) + result.add_argument('-P', '--profile', action='store', + help='profile name to use, e.g. test, production') + result.add_argument('-v', '--verbose', action='store_true', + help='enable verbose-i-tude') + return result + + +def contained_dirs(dir): + r""" Get a list of the subdirectories of a given directory. """ + return filter(os.path.isdir, + [os.path.join(dir, f) for f in os.listdir(dir)]) + + +def older_than(f, n): + r""" Is a file (or directory) older than N days. """ + if os.path.islink(f): + fn = os.lstat + else: + fn = os.stat + return True if fn(f).st_mtime <= _NOW - (n * 86400) else False + + +class CapoSettings(): + r""" A class that pulls out the CAPO properties we care about and fails with reasonable error messages if the + property we asked for doesn't exist. 
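# --- Illustrative sketch, not part of the change above: what older_than() is testing.
# --- A path's modification time (lstat for symlinks, so dangling links still resolve)
# --- is compared against a cutoff of now minus N days.
import os
import time

def is_older_than(path, days, now=None):
    now = time.time() if now is None else now
    stat = os.lstat if os.path.islink(path) else os.stat
    return stat(path).st_mtime <= now - days * 86400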
""" + + def __init__(self, capo_config, parser): + self._config = capo_config + self._parser = parser + self.properties = dict() + + for s in _REQUIRED_PROPERTIES: + if s.endswith('Directory'): + p = self._get_capo_property(_PROPERTY_PREFIX + '.' + s, self._config.getstring) + elif s.endswith('DaysToKeep'): + p = self._get_capo_property(_PROPERTY_PREFIX + '.' + s, self._config.getint) + elif s == 'enabled': + p = self._get_capo_property(_PROPERTY_PREFIX + '.' + s, self._config.getboolean) + else: + _LOG.error(_UNKNOWN_PROPERTY.format(_PROPERTY_PREFIX + "." + s)) + self._parser.print_help() + sys.exit(2) + self.properties[s] = p + + def get(self, name): + r""" Get a property by short name (the full name minus the prefix). """ + return self.properties[name] + + def _get_capo_property(self, name, fn): + r""" Get a CAPO property with a given short name, using a given function. """ + try: + result = fn(name) + except KeyError: + _LOG.error(_MISSING_SETTING.format(name)) + self._parser.print_help() + sys.exit(2) + return result + + +class MrClean(): + r""" MrClean just keeps things clean. """ + + def __init__(self): + self._parser = _make_parser() + self._args = self._parser.parse_args() + self._config = self._get_capo_config() + self._setup_logging() + self._cs = CapoSettings(self._config, self._parser) + self.enabled = self._cs.get('enabled') + self.verbose = self._args.verbose + self._send_event = SendNRAOEvent(profile=self.profile, application='janitor') + + def _setup_logging(self): + handler = LogHandler(profile=self.profile, application='mr_clean', + level=logging.DEBUG) + _LOG.addHandler(handler) + + def _get_capo_config(self): + r"""" Get a CapoConfig object. """ + if self._args.profile: + self.profile = self._args.profile + elif 'CAPO_PROFILE' in os.environ: + self.profile = os.environ['CAPO_PROFILE'] + else: + _LOG.error(_MISSING_PROFILE) + self._parser.print_help() + sys.exit(1) + return CapoConfig(profile=self.profile) + + def clean_flat_directory(self, directory, days): + r""" Clean a 'flat' directory, this is a directory with sub-directories, we only don't descend into them, we + only look at the modification date of each subfolder to determine whether to keep it or not. The spool folder + and the temp folder fit this use case. For the case of long-running cluster processes (casa in particular) + we need to determine if there are newer subdirectories underneath the to-be-removed target.""" + if not os.path.exists(directory): + _LOG.info('directory {} does not exist, skipping'.format(directory)) + return + + dirs = contained_dirs(directory) + for d in dirs: + if older_than(d, days): + # d is a candidate for deletion, but check for newer subdirectories: + subdirs = contained_dirs(d) + if subdirs: + # non-empty set of subdirectories: + for sd in subdirs: + if not older_than(sd, days): + # One of the sub directories is newer, so we + # set the parent's modified time to now. 
+ if self.verbose: + _LOG.info('updated modified time of directory {}'.format(d)) + os.utime(d, None) # The None causes the time to be set to Now + break + else: + # If all the subdirectories are older, remove d + # Note: A break above will not execute this portion + if self.verbose: + _LOG.info('removing directory {}'.format(d)) + shutil.rmtree(d, ignore_errors=True) + else: + # If there are no subdirectories, we're ok to remove d + if self.verbose: + _LOG.info('removing directory {}'.format(d)) + shutil.rmtree(d, ignore_errors=True) + + def clean_download_directory(self, directory, days): + r""" Clean a download directory: first step through it and nuke all the old files, then walk it back to front + removing empty directories as we go. """ + if not os.path.exists(directory): + _LOG.info('directory {} does not exist, skipping'.format(directory)) + return + + # Nuke old files + for root, dirs, files in os.walk(directory, topdown=False): + for name in files: + fp = os.path.join(root, name) + if older_than(fp, days): + if self.verbose: + _LOG.info('removing file {}'.format(fp)) + os.remove(fp) + + # Nuke empty directories + for root, dirs, files in os.walk(directory, topdown=False): + for name in dirs: + dp = os.path.join(root, name) + if not os.listdir(dp): + if self.verbose: + _LOG.info('removing directory {}'.format(dp)) + os.rmdir(dp) + + def send_event(self, event): + self._send_event.send(event=event) + + def clean(self): + r""" Run the actual clean process. """ + self.send_event({'message': 'starting up', + 'request': 'cleanup', + 'logData': {} + }) + start_time = pendulum.now() + if self.verbose: + _LOG.debug('CLI arguments:\n{}'.format(self._args)) + _LOG.debug('required CAPO properties:\n{}'.format(self._cs.properties)) + if self.enabled: + self.clean_flat_directory(self._cs.get('spoolDirectory'), self._cs.get('spoolDaysToKeep')) + self.clean_flat_directory(self._cs.get('tempDirectory'), self._cs.get('tempDaysToKeep')) + self.clean_flat_directory(self._cs.get('parallelIngestionDirectory'), self._cs.get('parallelIngestionDaysToKeep')) + self.clean_flat_directory(self._cs.get('stageDirectory'), self._cs.get('stageDaysToKeep')) + + try: + sd = contained_dirs(self._cs.get('downloadDirectory')) + for s in sd: + self.clean_download_directory(s, self._cs.get('downloadDaysToKeep')) + except FileNotFoundError: + _LOG.exception('caught an exception cleaning the download directory') + _LOG.info('directory {} does not exist, skipping'.format(self._cs.get('downloadDirectory'))) + + # the cache is complicated, much like downloads. Use the same functionality: + try: + sd = contained_dirs(self._cs.get('cacheDirectory')) + for s in sd: + self.clean_download_directory(s, self._cs.get('cacheDaysToKeep')) + except FileNotFoundError: + _LOG.exception('caught an exception cleaning the cache directory') + _LOG.info('directory {} does not exist, skipping'.format(self._cs.get('cacheDirectory'))) + + time_diff = pendulum.now() - start_time + self.send_event({'message': 'finishing', + 'request': 'cleanup', + 'logData': {'elapsed_time': time_diff.in_words()} + }) + + +def mr_clean(): + r""" Wrapper around the MrClean class, console script entry point. 
""" + mc = MrClean() + mc.clean() + + +if __name__ == "__main__": + mr_clean() diff --git a/apps/cli/utilities/proprietary_setter/README.md b/apps/cli/utilities/proprietary_setter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/proprietary_setter/setup.py b/apps/cli/utilities/proprietary_setter/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..749807d46507f68438e2e24951489626c953bade --- /dev/null +++ b/apps/cli/utilities/proprietary_setter/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/proprietary_setter/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive Proprietary Time Setter', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pymygdala', 'schema', 'support'], + keywords=[], + packages=['proprietary_setter'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': ['proj_prop_period = proprietary_setter.commands:main'] + }, +) diff --git a/apps/cli/utilities/proprietary_setter/src/proprietary_setter/__init__.py b/apps/cli/utilities/proprietary_setter/src/proprietary_setter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/proprietary_setter/src/proprietary_setter/_version.py b/apps/cli/utilities/proprietary_setter/src/proprietary_setter/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/proprietary_setter/src/proprietary_setter/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/proprietary_setter/src/proprietary_setter/commands.py b/apps/cli/utilities/proprietary_setter/src/proprietary_setter/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..afd893a0cb60b02b9077f8e4943a7503aaaf275d --- /dev/null +++ b/apps/cli/utilities/proprietary_setter/src/proprietary_setter/commands.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- +""" +A module for updating the proprietary period of a project, whether that be to set it as +proprietary or to make it public. 
+ +Author: Richard Falardeau <rfalarde@nrao.edu> +""" +# pylint: disable=logging-format-interpolation + +import argparse as ap +import sys +import warnings +import logging +from astropy.time import Time +from sqlalchemy import exc as sa_exc + +from _version import ___version___ as version +from support.logging import get_console_logger, LOG_MESSAGE_FORMATTER +from support.capo import get_my_capo_config +from schema import create_session +from schema.model import Project +from schema.legacy_model import LegacyProject +from pymygdala import LogHandler, SendNRAOEvent + + +_APPLICATION_NAME = 'proprietary_setter' +_LOG = get_console_logger(_APPLICATION_NAME) +_MISSING_PROFILE = """unable to derive the capo profile from the environment, """ +\ + """provide the capo profile through the -P argument or set the """ +\ + """CAPO_PROFILE environment variable.""" +_DISALLOWED_DURATION = """invalid proprietary duration, only integer values """ +\ + """between 0 and 730 are allowed.""" +_DESCRIPTION = """A tool for setting the proprietary duration of a project""" +_EPILOG = """Return values: +0: everything worked, +1: can't deduce which profile to use +2: invalid proprietary duration +3: DB update failed""" + + +class UpdateException(Exception): + r"""A wrapper for exceptions caught during our attempt to update the proprietary period. + Since we aren't passing the argparse object to helper functions, we want to catch some + obviously library exceptions in sqlalchemy, but also use it for some error trapping and + 'handling'""" + + +def _make_parser(): + r""" + Build a command line parser to take the parameters for setting a projects + proprietary period. + + :return result: an argparse object with the profile, project and + duration in its namespace""" + result = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter, + epilog=_EPILOG) + result.add_argument('-P', '--profile', action='store', + help='profile name to use, e.g. nmtest, dsoc-test, or nmprod') + result.add_argument('project', action='store', + help='project_code to update proprietary duration') + result.add_argument('duration', action='store', + type=int, + help='an integer duration to apply to the project; ' + '0 (immediately public) to 730 (private for two years from today)') + return result + + +def set_project_proprietary_state(capo_config, project_code, proprietary_duration): + r""" + Set the proprietary period on the project code to the proprietary period provide on both the + archive and the legacy archive. + Note: Since the capo_config will only really set the db parameters for the archive, + all updates to the legacy archive are actually live on production. 
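# --- Illustrative sketch, not part of the change above: the timestamp arithmetic used
# --- below. Proprietary times are stored as Modified Julian Dates via astropy.
from astropy.time import Time

now_mjd = Time.now().mjd  # float MJD for the current instant
# set_project_proprietary_state() stores this value as Project.endtime and the
# requested number of days as Project.proprietary_duration; a duration of 0 leaves
# endtime unchanged and simply marks the project public.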
+ :param capo_config: the capo_config we're running under, which determines which db we update + :param project_code: the project code to update + :param proprietary_duration: an integer value for the new proprietary period (in days) + """ + try: + with warnings.catch_warnings(): + # This is solely to suppress the SQLAlchemy warning messages + warnings.simplefilter("ignore", category=sa_exc.SAWarning) + a_session = create_session('SDM', profile=capo_config.profile) + legacy_session = create_session('LEGACY', profile=capo_config.profile) + new_mjd_endtime = Time.now().mjd + + # In the archive, the project code is a PK, we should only ever get one + project = a_session.query(Project) \ + .filter(Project.project_code == project_code) \ + .first() + # And if we don't, throw an exception before we commit anything + if project is not None: + project.proprietary_duration = proprietary_duration + if proprietary_duration != 0: + project.endtime = new_mjd_endtime + else: + raise UpdateException(f'Project {project_code} was not found in the archive') + + # We're not so lucky in the legacy archive, so we need to get all instances + leg_project = legacy_session.query(LegacyProject) \ + .filter(LegacyProject.project_code == project_code) \ + .all() + # Loop over each to set the period, or throw an exception if not found + if len(leg_project) > 0: + for project_instance in leg_project: + project_instance.proprietary_duration = proprietary_duration + project_instance.unlock_expire = proprietary_duration + if proprietary_duration != 0: + project_instance.proprietary = new_mjd_endtime + project_instance.project_lock = 'LTIME' + else: + project_instance.project_lock = 'PUBLIC' + else: + raise UpdateException(f'Project {project_code} was not found in the legacy archive') + + a_session.commit() + a_session.close() + legacy_session.commit() + legacy_session.close() + + except Exception as update_exception: + raise UpdateException(f'DB update failed for the following reason: {update_exception}') + + +def main(**kwargs): + r""" + The main entry point for this script. Builds the parser, checks params, gets a profile and then + attempts to update the db. If that succeeds, we kick off an amygdala even to tell the system + to re-index the project so the archive will reflect the new status. + :param kwargs: command line arguments to be passed to our parser builder + :return: nothing, if we complete the system will exit normally, if not we'll set a system + exit code. + """ + parser = _make_parser() + args = parser.parse_args(**kwargs) + + capo_config = get_my_capo_config(profile=args.profile) + + if args.duration not in range(0, 730): + _LOG.error(_DISALLOWED_DURATION) + parser.print_help() + sys.exit(2) + + try: + set_project_proprietary_state(capo_config, args.project, args.duration) + except UpdateException as update_exception: + _LOG.error(update_exception) + parser.print_help() + sys.exit(3) + + # Set up a LogHandler to record the fact we just made a change to this project. + # We're adding it here, instead of earlier, because nothing we log earlier should be presented + # to anyone but the command line user and would only add useless clutter to our system logging. + # We only really want the completed task to make a record in our system. 
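# --- Illustrative sketch, not part of the change above: the duration guard earlier in
# --- main() uses range(0, 730), which rejects 730 even though the help text and the
# --- error message describe 0 through 730 as allowed. If the upper bound is meant to
# --- be inclusive, an explicit comparison states that directly:
if not 0 <= args.duration <= 730:
    _LOG.error(_DISALLOWED_DURATION)
    parser.print_help()
    sys.exit(2)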
+ broadcast = LogHandler(profile=capo_config.profile, application=_APPLICATION_NAME) + broadcast.setLevel(logging.DEBUG) + broadcast.setFormatter(LOG_MESSAGE_FORMATTER) + _LOG.addHandler(broadcast) + _LOG.info(f'Attempting to update proprietary period for {args.project}.') + if args.duration != 0: + _LOG.info(f'Locking for {args.duration} days from today') + else: + _LOG.info('Unlocking') + + event = {'logData': {'project_code': args.project, + 'proprietary_duration': args.duration, + 'ingestion_type': 'evla_sdm' + }, + 'message': 'proprietary period updated', + 'request': 're-index please'} + SendNRAOEvent(profile=capo_config.profile, application=_APPLICATION_NAME) \ + .send(routing_key='ingestion-complete.metadata', event=event) + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/apps/cli/utilities/qa_results/README.md b/apps/cli/utilities/qa_results/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/qa_results/setup.py b/apps/cli/utilities/qa_results/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..d5ecaf926f5f04a3652f8bb5e90d206095d1d757 --- /dev/null +++ b/apps/cli/utilities/qa_results/setup.py @@ -0,0 +1,32 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/qa_results/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive Quality Assurance Pass/Fail Scripts', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pycapo', 'pymygdala'], + keywords=[], + packages=['qa_results'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': [ + 'qaPass = qa_results.commands:qa_pass', + 'qaFail = qa_results.commands:qa_fail' + ] + }, +) diff --git a/apps/cli/utilities/qa_results/src/qa_results/__init__.py b/apps/cli/utilities/qa_results/src/qa_results/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a594901dfb5933483cf41031ff501aa654a1a249 --- /dev/null +++ b/apps/cli/utilities/qa_results/src/qa_results/__init__.py @@ -0,0 +1,18 @@ +r""" + A nice, clean command-line interface for the Data Analysts to + notify the newArchive of the PASS/FAIL status of CIPL run + results. + + On a PASS: send an ArchiveEvent signifying that we + should run the calibration ingestion workflow + + On a FAIL: send an ArchiveEvent signifying that the + data is no good, and should be marked as + DoNotCalibrate + + + In either case, the script needs to identify a fileSetId from the + subdirectory name in the QA2 directory (this is used by Amygdala to + uniquely identify the observation). + +""" \ No newline at end of file diff --git a/apps/cli/utilities/qa_results/src/qa_results/_version.py b/apps/cli/utilities/qa_results/src/qa_results/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/qa_results/src/qa_results/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. 
""" +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/qa_results/src/qa_results/commands.py b/apps/cli/utilities/qa_results/src/qa_results/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..a1fe2538b57ed0c6c2d24492a31563426de742bc --- /dev/null +++ b/apps/cli/utilities/qa_results/src/qa_results/commands.py @@ -0,0 +1,267 @@ + +from __future__ import print_function + +import argparse as ap +import getpass +import os +import xml.etree.ElementTree as ET +from pathlib import Path + +import sys +from ._version import ___version___ as version +from pymygdala import RPCEvent +from pycapo import CapoConfig + +# Help Messages +_PASS_DESCRIPTION = """AAT/PPI QA Pass {}: The calibration jobs listed are considered acceptable. This tool initiates + the ingestion of calibration products for the indicated directories and updates the list of calibrations for the + associated execution blocks.""" + +_FAIL_DESCRIPTION = """AAT/PPI QA Fail {}: This tool marks the execution blocks related to the indicated directories +as not appropriate for automated calibration, and initiates a clean up procedure to remove the directories from the QA +area. """ + +# Pipeline Processing Request +PPR = "PPR" +PPR_FILENAME = PPR + ".xml" + + +def _make_parser(status): + r""" Build a command line parser for this app: this is external to MrClean and/or CapoSettings because both need + it, and Sphinx will want a look at ot to build docs. """ + + if status == "PASS": + _DESCRIPTION = _PASS_DESCRIPTION + else: + _DESCRIPTION = _FAIL_DESCRIPTION + + result = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter) + result.add_argument('-P', '--profile', action='store', default='', + help='profile name to use, e.g. test, production') + result.add_argument('-f', '--fileSetId', action='store', default='', help='provide a fileSetId for the directory instead of searching for it') + # For capturing DA comments, eventually. + # result.add_argument('-c', '--comment', type=str, action='store', default='No Comment Provided', help='Comment on the EB.') + result.add_argument('-e', '--email', type=str, action='store', default='', help='Email for completion notification') + result.add_argument('directoryName', type = str, action='store', nargs='+', help='name of the qa2 subdirectory to which the status applies') + return result + + +def rawdata_check(path): + #TODO: Integrate this with both searches + # If we arrived here, then we need another route: + # + # the rawdata is stored in a directory named for + # fileSetId + raw_path = Path(path) + fileList = list(raw_path.glob('**/*')) + if(1 < len(fileList)): + # This lists: [fsid, ASDMBinary] if successful + fileSetId = fileList[0].name.replace("/", "") # make sure no trailing slash + return fileSetId + + +def qa_search(path, subdir): + r"""Seach the /qa2/ subdirectory for the listed directory and extract the fileSetId from the + name of the flagversions file. return an empty string if this search fails""" + fileSetId = '' + + + # Where we're looking + qaPath = Path(path + '/' + subdir + '/products/') + + # There's quite a bit of junk in the directory, and we only need the one tgz file + # so screen out the rest of them + # + # Loosened the glob to allow non NN[ABCD]-NNN.sb... 
identifiers through + fileList = list(qaPath.glob('**/*.ms.flagversions.tgz')) + if(0 < len(fileList)): + flagTableFile = fileList[0].name + fileSetId = flagTableFile.split(".ms")[0] + return fileSetId + + # + # If we arrived here, then we need another route: + # + # the rawdata is stored in a directory named for + # fileSetId + raw_path = Path(path + '/' + subdir + '/rawdata/') + fileList = list(raw_path.glob('**/')) + if(1 < len(fileList)): + # This lists: [rawdata, fsid, ASDMBinary] if successful + fileSetId = fileList[1].name.replace("/", "") # make sure no trailing slash + return fileSetId + + # + # Ok, now things are getting weird. + # + return fileSetId + + +def spool_search(path, subdir): + r"""Search the /spool/ for the desired directory, check for + a rawdata subdirectory and parse the PPR to discover the fileSetId. + return an empty string if you can't acquire the fileSetId""" + fileSetId = '' + + # If we're here, it's because there's nothing useful in the qa2 + # directory, which has most of the data in it.... + # + + # Try the rawdata directory again + # + # + # the rawdata is stored in a directory named for + # fileSetId + raw_path = Path(path + '/' + subdir + '/rawdata/') + fileList = list(raw_path.glob('**/')) + if(1 < len(fileList)): + # This lists: [rawdata, fsid, ASDMBinary] if successful + fileSetId = fileList[1].name.replace("/", "") # make sure no trailing slash + return fileSetId + + # No luck with the raw data directory here in spool... + # + # So that leaves us with one last thing to try: PPR.xml + + pprFile = path+'/'+subdir+'/working/' + PPR_FILENAME + try: + fileTree = ET.parse(pprFile) + # + # Now walk the structure of the PPR file down to the piece we need + # + procRequests = fileTree.find('ProcessingRequests') + procReq = procRequests.find('ProcessingRequest') + dataSet = procReq.find('DataSet') + sdmElement = dataSet.find('SdmIdentifier').text + if sdmElement is not None: + fileSetId = sdmElement + return fileSetId + + except ET.ParseError as pe: + print("Error parsing PPR file.") + except StopIteration as si: + print('No SdmIdentifier in the PPR file.') + except FileNotFoundError as fnf: + print('No PPR file found.') + + # We haven't managed to find the fileSetId: indicate that + return '' + + +def qa_list(status): + r"""From the given directory(s) find the fileSetId(s) required for either workflow or database update + purposes (either from some filenames in the directory, or potentially parsing the PPR if that is deemed + necessary). """ + # Get the command line arguments + args = _make_parser(status).parse_args() + + # Shamelessly stolen from epilogue with a twist: allow for explict profile setting via the CL + if 'CAPO_PROFILE' not in os.environ and '' == args.profile: + # try to synthesize a profile from our installation root + profile = os.path.abspath(sys.argv[0]).split(os.path.sep)[-3] + os.environ['CAPO_PROFILE'] = profile + print('No CAPO_PROFILE explicitly set, synthesizing {0} by path'.format(str(profile))) + elif '' != args.profile: + os.environ['CAPO_PROFILE'] = args.profile + + capo = CapoConfig() + + qaDirectory = capo.getstring("edu.nrao.archive.workflow.config.CiplWorkflowSettings.qaDirectory") + spoolDirectory = capo.getstring("edu.nrao.archive.workflow.config.CiplWorkflowSettings.spoolDirectory") + + + # + # So, the fileSetId is both in the name of a secondary tar file in the QA directory, + # and in the casa PPR file (which lives in spoolDirectory/$subDirName/working/) + # The former is by far the easier to deal with. 
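As a small illustration of the fileSetId extraction performed by qa_search() above, a sketch using a typical flagversions tarball name (the same example filename appears in the comment that follows):

# Illustrative sketch only: everything before '.ms' in the flagversions tarball name is the fileSetId.
name = '13B-326.sb29065738.eb29113188.56764.86226100694.ms.flagversions.tgz'
file_set_id = name.split('.ms')[0]
assert file_set_id == '13B-326.sb29065738.eb29113188.56764.86226100694'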
+ # + # A QA products directory has the following tgz files + # + # 13B-326.sb29065738.eb29113188.56764.86226100694.ms.flagversions.tgz + # unknown.session_1.caltables.tgz + # weblog.tgz + # + # The first of which is the fileSetId, so for each directory we were given, + # parse the fileSetId and send an event for each. + # + for resultsDirectory in args.directoryName: + # consistency: we'll place the /s + resultsDirectory = resultsDirectory.replace("/", "") + + # If we haven't been provided with the fileSetId, go find it + if('' == args.fileSetId): + fileSetId = qa_search(qaDirectory, resultsDirectory) + if('' == fileSetId): + fileSetId = spool_search(spoolDirectory, resultsDirectory) + if ('' == fileSetId): + print("Failed to obtain a fileSetId for directory " + resultsDirectory) + return + else: + fileSetId = args.fileSetId + + # We've got the information we need, go ahead and send the event + print("Sending "+status+" for "+resultsDirectory+" ("+fileSetId+")") + qa_single(fileSetId, resultsDirectory, status, args.email) + + +def qa_single(fileset_id, results_directory, status, notify_email): + r"""Takes two strings: fileSetId (parsed from a file in the directory we're interested in) + and status (Pass or Fail), and builds string to execute to create the appropriate event + for updating the database and/or launching a workflow.""" + + # A little parsing work: The project code is the first part of the fileSetId + project_code = fileset_id.split('.')[0] + + # We'll want to know who sent the message: + analyst_name = getpass.getuser() + + # now prep the event, use an RPC event to get data back from the system: + broadcaster = RPCEvent(profile=os.environ['CAPO_PROFILE'], exchange='archive.events', application='qa-script') + + event_data = { + "user": analyst_name, + "message": "{}={}".format(fileset_id, status), + "logData": { + "projectCode": project_code, + "status": status, + "type": "calibration", + "fileset_id": fileset_id, + "markedDirectory": results_directory + } + } + + if '' != notify_email: + event_data["logData"]["email"] = notify_email + + broadcaster.send(event_data) + + # Now get the reply: (uncomment when that part is done) + ingestion_data = broadcaster.get_reply() + + + directory = None + log_id = None + + if "working_directory" in ingestion_data: + directory = ingestion_data["working_directory"] + + if "logging_id" in ingestion_data: + log_id = ingestion_data["logging_id"] + + if "PASS" == status: + workflow_name = 'Calibration Ingestion' + else: + workflow_name = 'Qa Cleanup' + + print("{} ({}) is running in {}".format(workflow_name, log_id, directory)) + + return + + +def qa_pass(): + qa_list("PASS") + + +def qa_fail(): + qa_list("FAIL") diff --git a/apps/cli/utilities/s_code_project_updater/README.md b/apps/cli/utilities/s_code_project_updater/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/s_code_project_updater/setup.py b/apps/cli/utilities/s_code_project_updater/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..36ee1407fe690184083a1b1eeb8d63cc6fa2a9be --- /dev/null +++ b/apps/cli/utilities/s_code_project_updater/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/s_code_project_updater/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + 
description='NRAO Archive S-Code Project Updater', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pycapo', 'pymygdala', 'schema', 'sqlalchemy', 'support'], + keywords=[], + packages=['s_code_project_updater'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], + entry_points={ + 'console_scripts': ['update_sproj = s_code_project_updater.commands:main'] + }, +) diff --git a/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/__init__.py b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/_version.py b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1 --- /dev/null +++ b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/_version.py @@ -0,0 +1,2 @@ +""" Version information for this package, don't put anything else here. """ +___version___ = '4.0a1.dev1' diff --git a/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/commands.py b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..576b3a55f471d58c0091ca1fc1d43c18c013cd08 --- /dev/null +++ b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/commands.py @@ -0,0 +1,458 @@ +""" +A module for updating properties (title, abstract, PI and coI) of a provided project. +""" +import argparse as ap +import logging +import sys, os +import warnings +from typing import List + +from sqlalchemy import exc as sa_exc, asc, desc + +from ._version import ___version___ as version +from support.capo import get_my_capo_config +from support.logging import get_console_logger, LOG_MESSAGE_FORMATTER +from pymygdala import LogHandler, SendNRAOEvent +from .project_fetcher import ArchiveProjectFetcher +from schema import ArchiveDBSession, ScienceProduct +from schema.model import Author, Project +from schema.pstmodel import Person, UserAuthentication + +_APPLICATION_NAME = 's_code_project_updater' +_LOG = get_console_logger(_APPLICATION_NAME, logging.DEBUG) +_MISSING_PROFILE = """ERROR: unknown 'profile', provide the -P argument or set the CAPO_PROFILE + environment variable.""" +_DESCRIPTION = """A tool for updating the investigators, title and/or abstract of a project.""" +_EPILOG = """Return values: +0: everything worked, +1: error with capo configuration +2: Error with input parameters +3: Project not found +4: Investigator not found +5: Update failed""" + + +def scode_project_from_args(namespace): + + ns_dict = namespace.__dict__ + project_code = ns_dict['project'] + title = ns_dict['title'] + abstract = ns_dict['abstract'] + investigators = ns_dict['investigators'] + + new_project = ArchiveProject(project_code=project_code, title=title, abstract=abstract, author_pst_ids=investigators) + return new_project + + +class ScodeProjectUpdater: + """ + A class to bundle the operations involved with updating a project in the archive. 
+ """ + def __init__(self, **kwargs): + + """ + Build an instance of the class with the cmd line parser, the capo_config, a context + manager for the archive and pst, and do some rudamentary tests of the input arguments + so we fail early. + :param kwargs: the command line arguments or namespace with the arguments to the parser + """ + self._make_parser() + self.args = self.parser.parse_args(**kwargs) + args_dict = self.args.__dict__ + + if args_dict['dry']: + self.is_dry = args_dict['dry'] + else: + self.is_dry = False + + if not args_dict['investigators'] and not args_dict['title'] and not args_dict['abstract']: + self.get_minimal_args(args_dict) + return + + self.stored_project = None + _LOG.debug(f'{self.args}') + + self.sc_project = scode_project_from_args(self.args) + + self.capo_config = get_my_capo_config(profile=self.args.profile) + try: + self.archive_context = ArchiveDBSession('SDM', profile=self.capo_config.profile) + self.pst_context = ArchiveDBSession('PST', profile=self.capo_config.profile) + except KeyError as k_ex: + self.exit_with_error(f'An error occurred while creating a db context: {k_ex}', 1) + + if self.args.investigators and \ + [inv for inv in self.args.investigators if self.args.investigators.count(inv) > 1]: + self.exit_with_error('You appear to be trying to add an investigator more than once, ' + 'which could cause issues with the presentation of investigators ' + 'in the archive. There should be only one PI and any number of ' + 'unique CoIs on a project.', 2) + + def get_minimal_args(self, args): + self.project_code = args['project'] + self.profile = args['profile'] + self.is_dry = True + self.fetch_only = True + + def _make_parser(self): + + r""" Build a command line parser for this app. """ + result = ap.ArgumentParser(description=_DESCRIPTION.format(version), + formatter_class=ap.RawTextHelpFormatter, + epilog=_EPILOG) + result.add_argument('-C', '--project', action='store', + help='project_code to update') + result.add_argument('-P', '--profile', action='store', + help='profile name to use, e.g. test, production') + result.add_argument('-T', '--title', action='store', + help='a quoted string for the new title for the project') + result.add_argument('-A', '--abstract', action='store', + help='a quoted string for the new abstract for the project') + result.add_argument('-I', '--investigators', action='store', type=int, nargs='+', + help='a PST ID, or list of PST IDs, of investigators for the project, ' + 'as an unquoted integer or space seperated integer list. The ' + 'first ID in the list will be added as the PI and all subsequenct ' + 'IDs will be added as CoIs.') + result.add_argument('-d', '--dry', action='store_true', + help='perform a dry run, going through the motions, but not committing ' + 'changes and not performing a re-index of the project. This may ' + 'be useful because it will print the current state of the project ' + 'and what the project would look like after the changes.') + self.parser = result + + def capture_error(self, msg, code): + self.error_message = msg + self.code = code + _LOG.error(f'error message received: {self.error_message}; code = {self.code}') + return self.code, self.error_message + + def exit_with_error(self, msg, code): + """ + On discovering we have an unresolvable condition the prevents us from proceeding with the + update of this project, print an error and exit with the code provided. 
+ :param msg: an error message to the user + :param code: the exit code to accompany the error message + :return: None + """ + self.capture_error(msg, code) + _LOG.error(msg) + self.parser.print_help() + sys.exit(code) + + def get_stored_project(self): + """ + Return the project specified by the input arguments, if it exists. + :return: the first Project we found with the project code passed in + """ + project = self.archive_context.session.query(Project) \ + .filter(Project.project_code == self.args.project) \ + .first() + return project + + def get_pst_users(self, investigators): + """ + Get the Person(s) associated with the investigators (a list of PST person ID(s)). + :return: the Person(s) mapped to the person ID(s) passed in + """ + users = self.pst_context.session.query(Person.person_id, Person.firstName, + Person.lastName, UserAuthentication.personName)\ + .join(UserAuthentication, + Person.personAuthentication_id == UserAuthentication.userAuthentication_id)\ + .filter(Person.person_id.in_(investigators)).all() + return users + + def get_projects_current_investigators(self): + """ + Get a list of investigators associated with this project, ordered such that the PI(s) + is/are the first element(s) of the list + :return: a list of investigators associated with the project code passed in, ordered with + the PI(s) first + """ + investigators_list = self.archive_context.session.query(Author) \ + .filter(Author.project_code == self.args.project) \ + .order_by(desc(Author.is_pi), asc(Author.pst_person_id)) \ + .all() + return investigators_list + + def clear_projects_current_investigators(self): + """ + Clear the investigators currently attached to this project + :return: None + """ + investigators_list = self.archive_context.session.query(Author) \ + .filter(Author.project_code == self.args.project) \ + .all() + for inv in investigators_list: + self.archive_context.session.delete(inv) + + def set_new_project_investigators(self, new_investigators): + """ + Add a list of new project investigators + :param new_investigators: a list of investigators with PST personName, firstName, lastName + and person_id + :return: None + """ + old_investigators = self.get_projects_current_investigators() + # if any of the new investigators already exists, use the old author_id rather than making a new author + is_pi = True + num_expected = len(new_investigators) + num_changed = 0 + # Loop through our cmd line investigator list and map them to investigator list passed in. + # On the first pass through, we set the Author as the PI. + for in_inv in self.args.investigators: + for pst_user in new_investigators: + if in_inv == int(pst_user.person_id): + # The Author table has the project as a foreign key, so we use the whole + # project here, rather than just a string, sqlalchemy wires the rest. If you + # just give the table a project_code as a string, you will get an error. 
+ # noinspection PyTypeChecker + auth = Author(author_id=None, + project=self.stored_project, + username=pst_user.personName, + firstname=pst_user.firstName, + lastname=pst_user.lastName, + pst_person_id=str(pst_user.person_id), + is_pi=is_pi) + self.archive_context.session.add(auth) + num_changed += 1 + is_pi = False + break + + if num_changed < num_expected: + _LOG.error(f'{num_changed} of {num_expected} investigators were NOT set') + raise Exception('incomplete investigator update') + + def print_project(self): + """ + Print the project's current investigators + :return: None + """ + output = self.get_project_info() + [_LOG.info(line) for line in output] + + def get_project_info(self): + output = [] + output.append(f'Title: {self.stored_project.title}') + output.append(f'Abstract: {self.stored_project.abstract}') + investigator_list = self.get_projects_current_investigators() + + # we want the PI's pst_person_id followed by the CoIs' pst_person_ids in numeric order + pi = investigator_list[0] + if pi.pst_person_id is not None: + self.is_alma = False + coi_pst_ids = [int(coi.pst_person_id) for coi in investigator_list[1:]] + coi_pst_ids = sorted(coi_pst_ids) + author_pst_ids = [int(pi.pst_person_id)] + [author_pst_ids.append(id) for id in coi_pst_ids] + authors_to_print = [str(id) for id in author_pst_ids] + id_list = ' '.join(authors_to_print) + output.append(f'Authors: {id_list}') + else: + self.is_alma = True + + return output + + def is_fetch_only(self): + try: + return self.fetch_only + except AttributeError: + return False + + def update_project(self): + if self.is_fetch_only(): + fetcher = ArchiveProjectFetcher(self.profile) + self.project = fetcher.fetch_project(self.project_code) + output = fetcher.build_project_info() + try: + [_LOG.info(line) for line in output] + except TypeError: + _LOG.error('Cannot display project info; is this an ALMA project?') + return self.project + + """ + The main function responsible for updating the project. It makes sure the project exists, + and, if the user is updating the investigators, that they have valid PST mappings. If there + aren't errors with those two checks it clears the projects current archive authors and + replaces them with the investigators found from the PST mapping to users. And, of course, + if the title and abstract are being updated, it adds those to the project + :return: None + """ + with warnings.catch_warnings(), self.archive_context, self.pst_context: + # Suppress SQLAlchemy warnings + warnings.simplefilter("ignore", category=sa_exc.SAWarning) + + # Get the project or fail + self.stored_project = self.get_stored_project() + if self.stored_project is None: + self.exit_with_error('No project found for the project_code provided', 3) + + # is this an ALMA project? 
+ self.product = self.archive_context.session.query(ScienceProduct) \ + .filter(ScienceProduct.project == self.stored_project) \ + .first() + external_system = self.product.external_system + if str(external_system).startswith("ALMA"): + raise ValueError(f'{self.stored_project.project_code} is an ALMA project; update not permitted') + + if self.args.investigators: + proposed_investigators = self.get_pst_users(self.args.investigators) + self.sc_project.investigators = proposed_investigators + if len(proposed_investigators) == 0 or \ + not len(self.args.investigators) == len(proposed_investigators): + self.exit_with_error('One or more of the investigators you entered was not ' + 'found in the PST.', 4) + self.clear_projects_current_investigators() + self.set_new_project_investigators(proposed_investigators) + + if self.args.title: + self.stored_project.title = self.args.title + self.sc_project.title = self.args.title + if self.args.abstract: + self.stored_project.abstract = self.args.abstract + self.sc_project.abstract = self.args.abstract + + if not self.args.dry: + if not self.is_fetch_only(): + self.archive_context.session.commit() + _LOG.info(f'Changes committed') + elif not self.is_fetch_only(): + _LOG.info(f'Successful dry run; this would have updated the project') + + self.print_project() + return self.stored_project + + def reindex_project(self): + """ + If we are not performing a dry run, and have made it this far without error, then we + re-index the project so the updates will show up in the profile-mapped archive. + :return: None + """ + if not self.args.dry: + _LOG.info(f'Re-indexing project {self.args.project} to make changes available....') + # Set up a LogHandler to record the fact we just made a change to this project. + # We're adding it here, instead of earlier, because nothing we log earlier should be + # presented to anyone but the command line user and would only add useless clutter to + # our system logging. We only really want the completed task to make a record in our + # system. + broadcast = LogHandler(profile=self.capo_config.profile, application=_APPLICATION_NAME) + broadcast.setLevel(logging.DEBUG) + broadcast.setFormatter(LOG_MESSAGE_FORMATTER) + _LOG.addHandler(broadcast) + _LOG.info(f'Project {self.args.project} has been updated.') + + event = {'logData': {'project_code': self.args.project, + 'title_updated': self.args.title is not None, + 'abstract_updated': self.args.abstract is not None, + 'investigators_updated': self.args.investigators is not None, + 'ingestion_type': 'evla_sdm' + }, + 'message': 's-code project updated', + 'request': 're-index please'} + SendNRAOEvent(profile=self.capo_config.profile, application=_APPLICATION_NAME) \ + .send(routing_key='ingestion-complete.metadata', event=event) + sys.exit(0) + +class ArchiveProject: + + """ + A class to encapsulate the attributes of a project as stored in the archive + for the purpose of updating and getting current information about the project. + """ + + def __init__(self, project_code: str, title: str, abstract: str, author_pst_ids: List, ): + """ + Represents the state of a project. 
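For illustration, a hedged sketch of constructing this value object by hand, reusing identifiers from the SK0442 test fixture added later in this change; the abstract is elided:

# Illustrative sketch only: building an ArchiveProject directly.
project = ArchiveProject(project_code='SK0442',
                         title='Cool Sky Stuff',
                         abstract='...',                       # elided
                         author_pst_ids=['4686', '317', '341', '11991'])
assert project.is_alma is None    # unknown until set_alma() is called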
+ :param project_code: + :param title: + :param abstract: + :param author_pst_ids: + """ + self.project_code = project_code + self.title = title + self.abstract = abstract + self.investigators = author_pst_ids + self.is_alma = None + + options = [] + options.append('-C') + options.append('--project') + options.append('-P') + options.append('--profile') + options.append('-T') + options.append('--title') + options.append('-A') + options.append('--abstract') + options.append('-I') + options.append('--investigators') + self.options = options + + def set_alma(self, is_alma): + self.is_alma = is_alma + + def make_args(self, isDry): + args = [] + if isDry: + args.append('-d') + + args.append('-C') + args.append(self.project_code) + args.append('-P') + self.profile = os.environ['CAPO_PROFILE'] + args.append(self.profile) + args.append('-T') + args.append(self.title) + args.append('-A') + args.append(self.abstract) + + if self.investigators: + args.append('-I') + for pst_id in self.investigators: + args.append(str(pst_id)) + + return args + + @staticmethod + def from_schema_project(project: Project, is_alma: bool): + to_return = ArchiveProject(project.project_code, project.title, project.abstract, project.authors) + to_return.set_alma(is_alma) + return to_return + + def is_arg(self, arg): + return arg in self.options + + def add_parameter(self, new_project, key, value): + if '-C' == key or '--project' == key: + new_project.project_code = value + elif '-T' == key or '--title' == key: + new_project.title = value + elif '-A' == key or '--abstract' == key: + new_project.abstract = value + elif '-P' == key or '--profile' == key: + self.profile = value + + def add_investigators(self, new_project, args, start_position): + value = args[start_position] + + while value not in self.options: + new_project.investigators.append(value) + + return new_project.investigators + + +def main(**kwargs): + """ + The script's main entry point. 
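setup.py wires this function to the update_sproj console script; as a sketch (assuming the nmtest profile and a reachable archive database), the same code path can be driven programmatically by passing an explicit argument list, which parse_args accepts via the args keyword:

# Illustrative sketch only: a dry-run invocation, equivalent to
#   update_sproj -P nmtest -C SK0442 -T "Cool Sky Stuff" -d
main(args=['-P', 'nmtest', '-C', 'SK0442', '-T', 'Cool Sky Stuff', '-d'])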
+ :param kwargs: the command line arguments + :return: None + """ + updater = ScodeProjectUpdater(**kwargs) + updater.update_project() + # reindex only if there's been an update + if not updater.is_dry: + updater.reindex_project() + + +if __name__ == '__main__': + main() + + diff --git a/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/project_fetcher.py b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/project_fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..b66d24241c1ca3af45214782ff968dcfeb512357 --- /dev/null +++ b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/project_fetcher.py @@ -0,0 +1,128 @@ +import logging +import sys +import warnings + +from support.capo import get_my_capo_config +from support.logging import get_console_logger +from schema import ArchiveDBSession, Project, Author, ScienceProduct +from sqlalchemy import exc as sa_exc, asc, desc + +_APPLICATION_NAME = 'project_fetcher' +_LOG = get_console_logger(_APPLICATION_NAME, logging.DEBUG) +_MISSING_PROFILE = """ERROR: unknown 'profile'; provide the -P argument +or set the CAPO_PROFILE environment variable.""" +_DESCRIPTION = """A tool for retrieving project metadata from the archive.""" + +class ArchiveProjectFetcher: + + def __init__(self, profile): + self.capo_config = get_my_capo_config(profile=profile) + try: + self.archive_context = ArchiveDBSession('SDM', profile=self.capo_config.profile) + self.pst_context = ArchiveDBSession('PST', profile=self.capo_config.profile) + except KeyError as k_ex: + _LOG.error(f'An error occurred while creating a db context: {k_ex}') + sys.exit(1) + + def fetch_project(self, project_code): + with warnings.catch_warnings(), self.archive_context, self.pst_context: + # Suppress SQLAlchemy warnings + warnings.simplefilter("ignore", category=sa_exc.SAWarning) + """ + Return the project specified by the input arguments, if it exists. + :return: the first Project we found with the project code passed in + """ + self.project = self.archive_context.session.query(Project) \ + .filter(Project.project_code == project_code) \ + .first() + + self.abstract = self.project.abstract + + self.is_alma = self._is_alma() + self.authors = self._get_investigators() + self.detachable_author_list = self._get_detached_authors() + + self.project_info = self.build_project_info() + return self.project + + def build_project_info(self): + output = [] + output.append(f'Title: {self.project.title}') + output.append(f'Abstract: {self.project.abstract}') + if self._is_alma(): + return + + investigator_list = self.authors + # investigator_list = self._get_investigators() + # We want the PI's pst_person_id followed by the CoIs' pst_person_ids in numeric order. 
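A small worked example of the ordering rule described in the surrounding comments, using the PST ids of the SK0442 test fixture defined later in this change:

# Illustrative sketch only: the PI's id first, then the CoI ids sorted numerically.
pi_pst_id = 4686
coi_pst_ids = sorted([317, 11991, 341])
print('Authors: ' + ' '.join(str(i) for i in [pi_pst_id] + coi_pst_ids))
# prints: Authors: 4686 317 341 11991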
+ # ALMA authors, however, do not have pst_person_ids + pi = investigator_list[0] + coi_pst_ids = [int(coi.pst_person_id) for coi in investigator_list[1:]] + # TODO: should not need to sort; query does that + coi_pst_ids = sorted(coi_pst_ids) + + author_pst_ids = [int(pi.pst_person_id)] + [author_pst_ids.append(id) for id in coi_pst_ids] + authors_to_print = [str(id) for id in author_pst_ids] + id_list = ' '.join(authors_to_print) + output.append(f'Authors: {id_list}') + + return output + + def _get_investigators(self): + """ + Get a list of investigators associated with this project, with PI(s) as first element(s) of the list + + :return: a list of investigators associated with the project code passed in, ordered with + the PI(s) first + """ + investigators_list = self.archive_context.session.query(Author) \ + .filter(Author.project == self.project) \ + .order_by(desc(Author.is_pi), asc(Author.pst_person_id)) \ + .all() + return investigators_list + + def _is_alma(self): + with warnings.catch_warnings(), self.archive_context, self.pst_context: + # Suppress SQLAlchemy warnings + warnings.simplefilter("ignore", category=sa_exc.SAWarning) + self.product = self.archive_context.session.query(ScienceProduct)\ + .filter(ScienceProduct.project == self.project)\ + .first() + external_system = self.product.external_system + return str(external_system).startswith("ALMA") + + def _get_detached_authors(self): + return [DetachedAuthor(author) for author in self.authors] + +if __name__ == '__main__': + fetched = ArchiveProjectFetcher("nmtest").fetch_project('SK0442') + authors = fetched.authors + assert 4 == len(fetched.authors) + _LOG.debug("looks ok") + + +class DetachedAuthor: + + def __init__(self, author:Author): + self.author_id = author.author_id + self.username = author.username + self.firstname = author.firstname + self.lastname = author.lastname + self.pst_person_id = author.pst_person_id + self.is_pi = author.is_pi + # self.project_code = author.project_code + + def __eq__(self, other): + if type(other) is not type(self): + return False + + if other.author_id == self.author_id: + if other.pst_person_id == self.pst_person_id: + if other.lastname == self.lastname: + return other.firstname == self.firstname + + return False + + def __repr__(self): + return f'{self.firstname} {self.lastname}: {self.author_id}, {self.pst_person_id}' \ No newline at end of file diff --git a/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/test_projects.py b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/test_projects.py new file mode 100644 index 0000000000000000000000000000000000000000..27b8f2014cfe9c56c450ab0c9bc125312dcb9ca8 --- /dev/null +++ b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/test_projects.py @@ -0,0 +1,205 @@ +from abc import ABC +from typing import List + +from .commands import ArchiveProject +from schema import Author + + +class AbstractTestProject(ABC, ArchiveProject): + + def __init__(self, project_code: str, title: str, abstract: str, authors: List, is_alma: bool): + self.project_code = project_code + self.title = title + self.abstract = abstract + self.authors = authors + self.is_alma = is_alma + # self.author_pst_ids = self._get_author_pst_ids() + + def _get_author_pst_ids(self): + # we want the PI's pst_person_id followed by the CoIs' pst_person_ids in numeric order + pi = self.authors[0] + coi_pst_ids = [int(coi.pst_person_id) for coi in self.authors[1:]] + coi_pst_ids = sorted(coi_pst_ids) + + author_pst_ids = 
[int(pi.pst_person_id)] + [author_pst_ids.append(id) for id in coi_pst_ids] + return [str(id) for id in author_pst_ids] + + def as_sc_project(self): + author_pst_ids = [str(author.pst_person_id) for author in self.authors] + return ArchiveProject(self.project_code, self.title, self.abstract, author_pst_ids) + +class ScodeTestProject(AbstractTestProject): + + def __init__(self): + self.project_code = 'SK0442' + self.title = 'Cool Sky Stuff' + self.abstract = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ' \ + 'ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco ' \ + 'laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in ' \ + 'voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non ' \ + 'proident, sunt in culpa qui officia deserunt mollit anim id est laborum.' + self.authors = [ + Author(author_id=65409, + project=self.project_code, + username='srandall', + firstname='Scott', + lastname='Randall', + pst_person_id='4686', + is_pi=True), + + Author(author_id=65410, + project=self.project_code, + username='s.giacintucci', + firstname='Simona', + lastname='Giacintucci', + pst_person_id='317', + is_pi=False), + + Author(author_id=65411, + project=self.project_code, + username='esch44', + firstname='Emma', + lastname='Schwartzman', + pst_person_id='11991', + is_pi=False), + + Author(author_id=65412, + project=self.project_code, + username='tclarke', + firstname='Tracy', + lastname='Clarke', + pst_person_id='341', + is_pi=False), + + ] + + self.is_alma = False + self.author_pst_ids = self._get_author_pst_ids() + + super(AbstractTestProject, self).__init__(self.project_code, self.title, self.abstract, self.author_pst_ids) + +class ScienceTestProject(AbstractTestProject): + + def __init__(self): + self.project_code = '13B-014' + self.title = 'The Comprehensive VLA Survey for Black Holes in Globular Clusters' + self.abstract = 'Spurred by our surprising VLA discovery of the first black holes in Milky Way ' \ + 'globular clusters, we propose an ambitious survey for both stellar-mass and ' \ + 'intermediate-mass black holes in globular clusters. ' \ + 'With well-defined selection criteria, our sample will allow the first statistical ' \ + 'determination of the presence of black holes in clusters. This survey will make an ' \ + 'immediate impact in a number of fields, including black hole demographics, ' \ + 'accretion physics, gravitational wave predictions, and globular cluster evolution.' 
+ self.authors = [ + + Author(author_id=8749, + project=self.project_code, + username='jstrader', + firstname='Jay', + lastname='Strader', + pst_person_id='4064', + is_pi=True), + + Author(author_id=8743, + project=self.project_code, + username='jcamj', + firstname='James', + lastname='Miller-Jones', + pst_person_id='490', + is_pi=False), + + Author(author_id=8744, + project=self.project_code, + username='chomiuk', + firstname='Laura', + lastname='Chomiuk', + pst_person_id='701', + is_pi=False), + + Author(author_id=8745, + project=self.project_code, + username='gsivakoff', + firstname='Gregory', + lastname='Sivakoff', + pst_person_id='834', + is_pi=False), + + Author(author_id=8746, + project=self.project_code, + username='tjmaccarone', + firstname='Thomas', + lastname='Maccarone', + pst_person_id='887', + is_pi=False), + + Author(author_id=8747, + project=self.project_code, + username='anilseth', + firstname='Anil', + lastname='Setn', + pst_person_id='1197', + is_pi=False), + + Author(author_id=8748, + project=self.project_code, + username='Craig Heinke', + firstname='Craig', + lastname='Heinke', + pst_person_id='3729', + is_pi=False), + + Author(author_id=8750, + project=self.project_code, + username='evanoyola', + firstname='Eva', + lastname='Noyola', + pst_person_id='5532', + is_pi=False), + + ] + + self.is_alma = False + self.author_pst_ids = self._get_author_pst_ids() + + super(AbstractTestProject, self).__init__(self.project_code, self.title, self.abstract, self.author_pst_ids) + + +class AlmaTestProject(AbstractTestProject): + + def __init__(self): + self.project_code = '2012.1.00060.S' + self.title = "Testing Schmidt's Conjecture in NGC 300: Bridging the Gap between Galactic and Extragalactic Star Formation" + self.abstract = "Understanding the physical factors that control the conversion of interstellar gas into stars " \ + "is of fundamental importance for both developing a predictive physical theory of star formation and understanding the evolution of galaxies from the earliest epochs of cosmic history to the present time. An important aspect of this question is the study of empirical relations that connect the star formation rate in a given region to local properties of the interstellar medium. An important example is the Schmidt-Kennicutt (KS) law for galaxies that relates the surface densities of the star formation rate and the surface densities of interstellar gas in a non-linear fashion. However, it is also known that there is a linear correlation between the total SFR in galaxies and the mass of dense molecular gas as traced by the high excitation HCN molecule. Contrary to the KS relation, this scaling relation suggests that the total SFR depends simply on the total amount of dense molecular gas in a star forming system. Recently, we have begun to test these scaling relations in the Galactic neighborhood where star formation rates can be much better constrained. We found that for local clouds the total SFR scales most directly, and linearly, with the total mass of high extinction (and dense) molecular gas. Furthermore, we found this linear scaling law between SFR and dense gas to extend and extrapolate directly and smoothly to external galaxies. Moreover, our observations also demonstrate that a KS type relation does not exist for molecular clouds in the Galactic neighborhood. This is a direct consequence of a well known scaling law between the mass and size of molecular clouds, Larson's third law. 
Overall, our results indicate that a linear scaling law, in which the total amount of dense gas controls the SFR, is the fundamental physical relation that connects star formation across the vast scales from individual GMCs to entire galaxies. Critical testing of these ideas require resolved observations of GMCs in external galaxies. Here we propose to use ALMA to evaluate star formation scaling laws in a nearby galaxy where we can obtain resolved observations of individual GMCs. This allows us to obtain observations of a larger sample of GMCs than is accessible in the Galactic neighborhood. An extensive APEX survey of HII regions in the nearby galaxy NGC 300 has provided us with a sample of 36 star-forming regions with CO(2-1) detections and 42 upper limits. We are currently working on obtaining star formation rates for these regions from multi-wavelength ancillary data including our Herschel observations. We propose to use ALMA's unequalled capabilities to obtain snapshot observations of 40 selected regions in CO(2-1) in order to make resolved measurements of cloud structure to obtain sizes and virial masses. As a pilot project, we also propose to observe the brightest subsample in HCN(1-0) as a dense-gas tracer. Our proposed ALMA CO observations will enable us to to test Larson's scaling laws in an external galaxy and to evaluate which formulation of the Schmidt law is the most meaningful and appropriate to apply to spiral galaxies, and in doing so refine Schmidt's original conjecture of a scaling relation between the rate of star formation and gas density." + self.authors = [ + + Author(author_id=37200, + project=self.project_code, + username='clada', + firstname='Charles', + lastname='Lada', + pst_person_id=None, + is_pi=True), + + Author(author_id=37201, + project=self.project_code, + username='jforbrich', + firstname='Jan', + lastname='Forbrich', + pst_person_id=None, + is_pi=False), + + Author(author_id=37202, + project=self.project_code, + username='cfaesi', + firstname='Christopher', + lastname='Faesi', + pst_person_id=None, + is_pi=False), + + ] + self.is_alma = True + super(AbstractTestProject, self).__init__(self.project_code, self.title, self.abstract, None) + + diff --git a/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/updater_tests.py b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/updater_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..4bbe4cba361c9ff61fc49cdf0f82aaca7636c38c --- /dev/null +++ b/apps/cli/utilities/s_code_project_updater/src/s_code_project_updater/updater_tests.py @@ -0,0 +1,272 @@ +import logging +import os +import unittest +import warnings + +from support.logging import get_console_logger +from .commands import ArchiveProject, ScodeProjectUpdater +from .project_fetcher import ArchiveProjectFetcher +from .test_projects import ScienceTestProject, ScodeTestProject, AlmaTestProject +from schema import Author, ArchiveDBSession +from sqlalchemy import exc as sa_exc + +_LOG = get_console_logger("scode_project_updater_tests", logging.DEBUG) + +class UpdaterTestCase(unittest.TestCase): + + @classmethod + def setUp(self) -> None: + self.profile = os.environ['CAPO_PROFILE'] + + def restore_scode_project_to_original_state(self): + self.archive_context = ArchiveDBSession('SDM', profile=self.profile) + self.pst_context = ArchiveDBSession('PST', profile=self.profile) + + scode_project = ScodeTestProject() + # start by restoring the title and abstract; authors will need special treatment + # args 
= scp.make_args(False) + args = ['-C', scode_project.project_code, '-P', self.profile, '-T', scode_project.title, '-A', scode_project.abstract] + project = ScodeProjectUpdater(args=args).update_project() + + with warnings.catch_warnings(), self.archive_context, self.pst_context: + # Suppress SQLAlchemy warnings + warnings.simplefilter("ignore", category=sa_exc.SAWarning) + + # clear existing investigators + investigators_list = self.archive_context.session.query(Author) \ + .filter(Author.project_code == scode_project.project_code) \ + .all() + for inv in investigators_list: + self.archive_context.session.delete(inv) + self.archive_context.session.commit() + + # insert the canonical ones + canonical_authors = scode_project.authors + for author in canonical_authors: + author.project = project + self.archive_context.session.add(author) + self.archive_context.session.commit() + + # confirm restoration + fetcher = ArchiveProjectFetcher(self.profile) + restored_project = fetcher.fetch_project(scode_project.project_code) + self.assertEqual(scode_project.title, restored_project.title) + self.assertEqual(scode_project.abstract, restored_project.abstract) + + restored_authors = fetcher.detachable_author_list + self.assertEqual(len(scode_project.authors), len(restored_authors)) + + pi_found = False + for author in restored_authors: + if author.username == 'srandall': + self.assertTrue(author.is_pi, 'author is pi') + self.assertEqual(65409, author.author_id, 'expecting author_id 65409') + self.assertEqual('4686', author.pst_person_id, 'expecting pst_person_id 4686') + pi_found = True + else: + self.assertFalse(author.is_pi, 'author is pi') + self.assertNotEqual(65409, author.author_id, 'expecting author_id not 65409') + self.assertTrue(author.pst_person_id in ('317','341','11991'), "expecting pst_person_ids 317, 341, 11991") + + self.assertTrue(pi_found) + + def restore_science_project_to_original_state(self): + science_project = ScienceTestProject() + self.archive_context = ArchiveDBSession('SDM', profile=self.profile) + self.pst_context = ArchiveDBSession('PST', profile=self.profile) + + # start by restoring the title and abstract; authors will need special treatment + # args = scp.make_args(False) + args = ['-C', science_project.project_code, '-P', self.profile, '-T', science_project.title, '-A', science_project.abstract] + project = ScodeProjectUpdater(args=args).update_project() + + with warnings.catch_warnings(), self.archive_context, self.pst_context: + # Suppress SQLAlchemy warnings + warnings.simplefilter("ignore", category=sa_exc.SAWarning) + # clear existing investigators + investigators_list = self.archive_context.session.query(Author) \ + .filter(Author.project_code == science_project.project_code) \ + .all() + for inv in investigators_list: + self.archive_context.session.delete(inv) + self.archive_context.session.commit() + + # insert the canonical ones + canonical_authors = science_project.authors + for author in canonical_authors: + author.project = project + self.archive_context.session.add(author) + self.archive_context.session.commit() + + # confirm restoration + fetcher = ArchiveProjectFetcher(self.profile) + restored_project = fetcher.fetch_project(science_project.project_code) + self.assertEqual(science_project.title, restored_project.title) + self.assertEqual(science_project.abstract, restored_project.abstract) + restored_authors = fetcher.detachable_author_list + self.assertEqual(len(science_project.authors), len(restored_authors)) + + pi_found = False + for author in 
restored_authors: + if author.username == 'jstrader': + self.assertTrue(author.is_pi, 'author is pi') + self.assertEqual(8749, author.author_id, 'expecting author_id 8749') + self.assertEqual('4064', author.pst_person_id, 'expecting pst_person_id 4064') + pi_found = True + else: + self.assertFalse(author.is_pi, 'author is pi') + self.assertNotEqual(8749, author.pst_person_id, 'expecting author_id not 8749') + self.assertTrue(8742 < int(author.author_id) < 8751, 'expecting pst_person_id between 8743 and 8750') + + self.assertTrue(pi_found) + + def test_alma_project_has_not_changed(self): + alma_test_project = AlmaTestProject() + fetcher = ArchiveProjectFetcher(self.profile) + fetched = fetcher.fetch_project(alma_test_project.project_code) + self.assertEqual(alma_test_project.title, fetched.title) + self.assertEqual(alma_test_project.abstract, fetched.abstract) + authors_list = fetcher.detachable_author_list + self.assertEqual(len(alma_test_project.authors), len(authors_list)) + + pi_found = False + for author in authors_list: + self.assertIsNone(author.pst_person_id, 'expecting no pst_person_id') + if author.username == 'clada': + self.assertTrue(author.is_pi, 'author is pi') + self.assertEqual(37200, author.author_id, 'expecting author_id 37200') + pi_found = True + else: + self.assertFalse(author.is_pi, 'author is pi') + self.assertNotEqual(37200, author.pst_person_id, 'expecting author_id not 8749') + + self.assertTrue(pi_found) + + def test_restores_scode_project_correctly(self): + self.restore_scode_project_to_original_state() + + def test_restores_science_project_correctly(self): + self.restore_science_project_to_original_state() + + def test_can_fetch_from_project_code(self): + scode_project = ScodeTestProject() + args = ['-C', scode_project.project_code, '-P', self.profile] + fetched = ScodeProjectUpdater(args=args).update_project() + self.assertEqual(scode_project.title, fetched.title) + self.assertEqual(scode_project.abstract, fetched.abstract) + + def test_can_fetch_non_scode_project(self): + args = ['-C', '13B-014', '-P', self.profile] + fetched = ScodeProjectUpdater(args=args).update_project() + self.assertIsNotNone(fetched) + + def test_can_modify_non_scode_project(self): + to_modify = ScienceTestProject() + authors = to_modify.authors.copy() + pst_ids = [author.pst_person_id for author in authors] + scp = ArchiveProject(to_modify.project_code, 'foo', to_modify.abstract, pst_ids) + scp_args = scp.make_args(False) + + updater = ScodeProjectUpdater(args=scp_args) + updated_project = updater.update_project() + + fetcher = ArchiveProjectFetcher(self.profile) + retrieved_project = fetcher.fetch_project(updated_project.project_code) + # title should have changed + self.assertEqual('foo', retrieved_project.title) + + self.restore_science_project_to_original_state() + + def test_no_update_with_dry_run(self): + + self.restore_scode_project_to_original_state() + scode_project = ScodeTestProject() + authors = scode_project.authors.copy() + pst_ids = [author.pst_person_id for author in authors] + pst_ids.append(5654) + + scp = ArchiveProject(scode_project.project_code, scode_project.title, scode_project.abstract, pst_ids) + scp_args = scp.make_args(True) + + updater = ScodeProjectUpdater(args=scp_args) + updated_project = updater.update_project() + + fetcher = ArchiveProjectFetcher(self.profile) + retrieved_project = fetcher.fetch_project(updated_project.project_code) + authors = retrieved_project.authors + + self.assertEqual(len(scode_project.authors), len(authors)) + for author in 
authors: + if author.username == 'srandall': + assert author.is_pi + else: + assert not author.is_pi + + self.assertEqual(scode_project.title, retrieved_project.title) + self.assertEqual(scode_project.abstract, retrieved_project.abstract) + + def test_setting_investigators_preserves_title_and_abstract(self): + scode_project = ScodeTestProject() + authors = scode_project.authors.copy() + pst_ids = [author.pst_person_id for author in authors] + pst_ids.append(5654) + + scp = ArchiveProject(scode_project.project_code, scode_project.title, scode_project.abstract, pst_ids) + scp_args = scp.make_args(False) + + updater = ScodeProjectUpdater(args=scp_args) + updater.update_project() + + fetcher = ArchiveProjectFetcher(self.profile) + retrieved_project = fetcher.fetch_project(scp.project_code) + + authors = fetcher.detachable_author_list + self.assertEqual(len(scode_project.authors) + 1, len(authors)) + for author in authors: + if author.username == 'srandall': + assert author.is_pi + else: + assert not author.is_pi + + self.assertEqual(scode_project.title, retrieved_project.title) + self.assertEqual(scode_project.abstract, fetcher.abstract) + self.restore_scode_project_to_original_state() + + def test_alma_project_not_updated(self): + alma_project = AlmaTestProject() + args = alma_project.make_args(False) + try: + updater = ScodeProjectUpdater(args=args) + updater.update_project() + except Exception as exc: + _LOG.info(f'attempt to update ALMA project failed, as expected: {exc}') + self.test_alma_project_has_not_changed() + + def test_output_is_desired_format(self): + scode_project = ScodeTestProject() + authors = scode_project.authors.copy() + pi = authors[0] + coi_pst_ids = [int(coi.pst_person_id) for coi in authors[1:]] + coi_pst_ids = sorted(coi_pst_ids) + + author_pst_ids = [int(pi.pst_person_id)] + [author_pst_ids.append(id) for id in coi_pst_ids] + authors_to_print = [str(id) for id in author_pst_ids] + id_list = ' '.join(authors_to_print) + + scp = ArchiveProject(scode_project.project_code, scode_project.title, scode_project.abstract, author_pst_ids) + scp_args = scp.make_args(True) + ScodeProjectUpdater(args=scp_args).update_project() + + fetcher = ArchiveProjectFetcher(self.profile) + fetcher.fetch_project(scp.project_code) + output = fetcher.build_project_info() + self.assertEqual(3, len(output)) + authors_line = output[2] + self.assertEqual(f'Authors: {id_list}', authors_line ) + + +UpdaterTestCase() + +if __name__ == '__main__': + unittest.main() diff --git a/shared/support/README.md b/shared/support/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/shared/support/setup.py b/shared/support/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..c0604ed3f00da344b7cbdec44e69d9ee7db5711e --- /dev/null +++ b/shared/support/setup.py @@ -0,0 +1,26 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +from pathlib import Path +from setuptools import setup + +VERSION = open('src/support/_version.py').readlines()[-1].split()[-1].strip("\"'") +README = Path('README.md').read_text() + +setup( + name=Path().absolute().name, + version=VERSION, + description='NRAO Archive Python Support Library', + long_description=README, + author='NRAO SSA Team', + author_email='dms-ssa@nrao.edu', + url='TBD', + license="GPL", + install_requires=['pycapo'], + keywords=[], + packages=['support'], + package_dir={'':'src'}, + classifiers=[ + 'Programming Language :: Python :: 3.8' + ], +) diff --git 
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/shared/support/src/support/_version.py b/shared/support/src/support/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63ec3a147fc65f8e390ff413a349d66169f8ec1
--- /dev/null
+++ b/shared/support/src/support/_version.py
@@ -0,0 +1,2 @@
+""" Version information for this package, don't put anything else here. """
+__version__ = '4.0a1.dev1'
diff --git a/shared/support/src/support/capo.py b/shared/support/src/support/capo.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9a3361cb4e1c80d444fbe59c4c0e407df4c4a4c
--- /dev/null
+++ b/shared/support/src/support/capo.py
@@ -0,0 +1,75 @@
+import os
+from os import path
+from pathlib import Path
+import sys
+import pycapo
+
+from .logging import _LOG
+
+_CAPO_PROPERTIES_ROOT = '/home/casa/capo'
+
+
+def profile_file_in_path(profile, properties_path):
+    """
+    Check whether the profile name provided is a valid name for a capo property file, by checking
+    to see if a <profile>.properties file can be found in the path provided.
+    :param profile: the profile name we want to look for
+    :param properties_path: a path to a directory where we expect to find capo property files
+    :return: true if the profile given has a corresponding <profile>.properties file in the
+             path provided
+    """
+    valid_profile = False
+    profile_names = [prop.split('.properties')[0] for prop in os.listdir(properties_path)
+                     if prop.endswith('.properties')]
+    if profile in profile_names:
+        valid_profile = True
+    return valid_profile
+
+
+def get_my_capo_config(**kwargs):
+    r"""
+    A function to return a CapoConfig for the profile either passed in, found in the environment,
+    or derived from the location where pyat is installed. We do a rudimentary test of the profile
+    against a series of known capo properties directories (CAPO_PATH, /home/casa/capo, or
+    $HOME/.capo) and print a warning if the profile is not found as a valid properties file in any
+    of those locations. This is solely to augment the cryptic exception messages that might be
+    thrown when trying to access a capo key from a file that doesn't exist.
+    :param kwargs: may or may not contain a 'profile'
+    :return: a CapoConfig object for the profile derived from the kwargs, env, or deployment
+             location
+    """
+    if 'profile' in kwargs and kwargs['profile']:
+        profile = kwargs['profile']
+    elif 'CAPO_PROFILE' in os.environ:
+        profile = os.environ['CAPO_PROFILE']
+    else:
+        # Try to synthesize a profile from our installation root. If you don't set the profile
+        # and end up here, but you're running locally, this might result in a profile of
+        # "<your virtualenv>"; it's best to actually set it.
+        path_breakdown = os.path.abspath(sys.argv[0]).split(os.path.sep)
+        # CV sticks an extra subdirectory into the installation
+        if path_breakdown[-3] == 'current':
+            profile = path_breakdown[-4]
+        else:
+            profile = path_breakdown[-3]
+
+    # We should have a profile at this point, but it might not be valid (if the user didn't provide
+    # one), so now we'll try checking to see if that profile matches a <profile>.properties file
+    # in one of our known properties directories. We'll check CAPO_PATH first, then
+    # /home/casa/capo, then $HOME/.capo, and, if all that fails, we'll print a warning to the
+    # console to augment any exception that might be thrown by trying to access a property from a
+    # properties file that doesn't exist.
+    profile_exists = False
+    if 'CAPO_PATH' in os.environ:
+        profile_exists = profile_file_in_path(profile, os.environ['CAPO_PATH'])
+    elif path.exists(_CAPO_PROPERTIES_ROOT):
+        profile_exists = profile_file_in_path(profile, _CAPO_PROPERTIES_ROOT)
+    elif path.exists(str(Path.home()) + '/.capo'):
+        profile_exists = profile_file_in_path(profile, str(Path.home()) + '/.capo')
+
+    if not profile_exists:
+        _LOG.error(f'The capo profile "{profile}" does not appear to match any known profile '
+                   f'names. This might result in unexpected behavior if the application attempts '
+                   f'to access a property from a file that does not exist.')
+
+    return pycapo.CapoConfig(profile=profile)
diff --git a/shared/support/src/support/logging.py b/shared/support/src/support/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..e17780a6b6829f71c943c93d6085f7c2566f57db
--- /dev/null
+++ b/shared/support/src/support/logging.py
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+"""
+Console logging support for the NRAO archive workflow system.
+
+Authors: Daniel K Lyons <dlyons@nrao.edu>
+         Rick Lively <rlively@nrao.edu>
+         James Sheckar <jsheckar@nrao.edu>
+         Richard Falardeau <rfalarde@nrao.edu>
+"""
+import sys
+import logging
+
+LOG_MESSAGE_FORMATTER = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+def get_console_logger(application_name, level=logging.INFO):
+    r"""
+    A function to build a console logger for the caller.
+    :param application_name: the caller's name
+    :param level: the log level to set, defaulting to INFO if not set
+    :return: a console logger
+    """
+    console_log = logging.getLogger(application_name)
+    console_log.setLevel(level)
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setLevel(level)
+    stdout_handler.setFormatter(LOG_MESSAGE_FORMATTER)
+    console_log.addHandler(stdout_handler)
+    return console_log
+
+
+_LOG = get_console_logger(__name__, logging.DEBUG)
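
Reviewer note: the sketch below is not part of the diff; it is a minimal example of how a command-line tool might wire together the new shared/support helpers added above. It assumes the package has been installed (for example with "pip install -e shared/support"); the logger name 'example_tool' and the profile name 'local' are hypothetical placeholders.

    # Minimal usage sketch for the support package introduced in this diff.
    import logging

    from support.capo import get_my_capo_config
    from support.logging import get_console_logger

    # Build a console logger that writes formatted records to stdout.
    log = get_console_logger('example_tool', logging.INFO)

    # Resolve a CapoConfig: the explicit profile wins; otherwise CAPO_PROFILE or the
    # installation path is used. A warning is logged (but a CapoConfig is still returned)
    # if no matching <profile>.properties file is found in the known capo directories.
    capo_config = get_my_capo_config(profile='local')
    log.info('loaded capo configuration for profile "local"')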