Files
python-store/store/eval.py
2022-04-11 15:38:09 +02:00

404 lines
14 KiB
Python
Executable File

import os
import filecmp
import re
from tempfile import NamedTemporaryFile
import argparse
from psutil import virtual_memory, cpu_count
import inspect
import yaml
from glob import glob
from multiprocessing.pool import Pool
import traceback
from datetime import datetime, timedelta
import functools
from enum import Enum
import time
from pathlib import Path
from sqlalchemy.exc import OperationalError
import pydoc
import numpy as np
import mdevaluate as md
from mdevaluate.logging import logger
from . import store
from . import config
def locate(fname, namespace=''):
    """Locate an evaluation function by dotted name.

    The name is first looked up below *namespace*; if nothing (truthy) is
    found there, the default ``store.analyse`` module is searched.
    """
    candidates = (namespace + '.' + fname, 'store.analyse.' + fname)
    found = None
    for path in candidates:
        found = pydoc.locate(path)
        if found:
            break
    return found
def open_sim(directory, maxcache=None):
    """Open the trajectory of a simulation directory.

    A pre-computed ``nojump.xtc`` in *directory* is preferred; otherwise the
    raw ``out/*.xtc`` trajectory is opened with on-the-fly nojump treatment.

    Raises:
        FileNotFoundError: If no trajectory could be opened.
    """
    has_nojump = len(glob(os.path.join(directory, 'nojump.xtc'))) == 1
    if has_nojump:
        trajectory = md.open(directory, trajectory='nojump.xtc', cached=maxcache, reindex=True)
    else:
        trajectory = md.open(directory, trajectory='out/*.xtc', cached=maxcache, reindex=True, nojump=True)
    if trajectory is None:
        raise FileNotFoundError('Can not open trajectory.')
    return trajectory
def open_energy(directory):
    """Open the energy files (``out/*.edr``) of a simulation directory."""
    edr_pattern = os.path.join(directory, 'out/*.edr')
    return md.open_energy(edr_pattern)
def dataframe_to_txt(fname, df):
    """Save a dataframe to textfile.

    The column names are written as a single space-separated header line,
    followed by the raw values of the dataframe.
    """
    np.savetxt(fname, df.values, header=' '.join(df.columns))
class RunState:
    """State of a single evaluation run, rendered as a colored terminal glyph."""

    # State constants; RunState('OK') resolves the name via getattr.
    OK = 0
    ERROR = 1
    UNKNOWN = 2

    def __init__(self, st, timing=None):
        """
        Args:
            st: State name ('OK', 'ERROR', 'UNKNOWN') or one of the
                numeric state constants.
            timing: Optional duration displayed next to a successful run.

        Raises:
            AttributeError: If *st* is a string that is not a state name.
        """
        if isinstance(st, str):
            self.state = getattr(self, st)
        else:
            self.state = st
        self.timing = timing

    def __str__(self):
        if self.state == self.OK:
            s = '\x1b[0;32m\u2713\x1b[0m'  # green check mark
            if self.timing is not None:
                s += '({})'.format(self.timing)
            return s
        elif self.state == self.ERROR:
            return '\x1b[0;31m\u2717\x1b[0m'  # red cross
        # BUG FIX: unrecognized states previously fell through and returned
        # None, which made str() raise TypeError. Treat anything that is not
        # OK/ERROR (including UNKNOWN) as unknown.
        return '\x1b[0;33m?\x1b[0m'  # yellow question mark
@functools.total_ordering
class Report:
    """Collects run states and errors per analysed system.

    Reports are compared by their error count (``total_ordering`` derives the
    remaining comparisons from ``__lt__`` and ``__eq__``).
    """

    def __init__(self):
        # Maps system name -> {'runs': [(name, RunState), ...], 'errors': [...]}.
        self.systems = {}
        self.current_system = None
        self.err_count = 0

    def system(self, sys):
        """Select *sys* as the current system, creating an empty entry if new."""
        self.systems.setdefault(sys, {'runs': [], 'errors': []})
        self.current_system = sys

    @property
    def current(self):
        """The entry of the currently selected system."""
        return self.systems[self.current_system]

    @property
    def nerrors(self):
        """Total number of collected errors over all systems."""
        return sum(len(entry['errors']) for entry in self.systems.values())

    def apply(self, func, *args, err_count=1, **kwargs):
        """Call *func* and record its outcome.

        Returns the function's result, or None when it raised (the exception
        is recorded, not re-raised).
        """
        if isinstance(func, functools.partial):
            fname = func.func.__name__
        else:
            fname = func.__name__
        try:
            start = datetime.now()
            result = func(*args, **kwargs)
            elapsed = timedelta(seconds=round((datetime.now() - start).total_seconds()))
        except Exception as exc:
            self.runerror(fname)
            self.error(exc, err_count=err_count)
        else:
            self.runok(fname, timing=elapsed)
            return result

    def runok(self, name, timing=None):
        """Record a successful run, optionally with its duration."""
        self.current['runs'].append((name, RunState('OK', timing)))

    def runerror(self, name):
        """Record a failed run."""
        self.current['runs'].append((name, RunState('ERROR')))

    def unknown(self, name):
        """Record a run whose function could not be located."""
        self.current['runs'].append((name, RunState('UNKNOWN')))

    def error(self, e, err_count=1):
        """Record an exception; print the traceback unless err_count is 0."""
        if err_count > 0:
            traceback.print_exc()
        self.current['errors'].append(e)
        self.err_count += err_count

    def __repr__(self):
        parts = []
        for name, entry in self.systems.items():
            parts.append('--= {} =--\n'.format(name))
            for run, state in entry['runs']:
                parts.append('{}:{} '.format(run, state))
            parts.append('\n')
            for err in entry['errors']:
                parts.append(str(err) + '\n')
            parts.append('\n')
        return ''.join(parts)

    def __eq__(self, other):
        return self.systems == other.systems

    def __lt__(self, other):
        return self.err_count < other.err_count
class Loader(yaml.Loader):
    """YAML loader with two custom tags: ``!include`` and ``!calc``."""
    def include(self, node):
        # Resolve the included path relative to the current working directory
        # (NOT relative to the including file) and load it with this Loader,
        # so nested !include/!calc tags keep working.
        fname = os.path.abspath(self.construct_scalar(node))
        with open(fname, 'r') as f:
            return yaml.load(f, Loader)
    def calc(self, node):
        # NOTE(security): evaluates arbitrary Python from the YAML file.
        # Only use with trusted eval.yaml files.
        return eval(self.construct_scalar(node))
# Register the custom tags on the Loader class.
Loader.add_constructor('!include', Loader.include)
Loader.add_constructor('!calc', Loader.calc)
def find_user():
    """Return the login name from the USER or LOGNAME environment variable."""
    user = os.environ.get('USER')
    if not user:
        user = os.environ.get('LOGNAME')
    return user
def run_eval(yamlfile, debug=False, txtout=None, autosavedir=None):
    """
    Read an eval.yaml file and run the specified functions and store the result.

    Args:
        yamlfile: Path to the eval.yaml file describing the evaluations.
        debug: If True, print what would be evaluated instead of running the
            evaluation functions (dry run).
        txtout: Optional output directory for human readable ``*.dat`` files.
        autosavedir: Optional directory for mdevaluate autosave files.

    Returns:
        A Report collecting the run states and errors of all evaluations.
    """
    report = Report()
    with open(yamlfile) as f:
        yaml_dict = yaml.load(f.read(), Loader)
    directory = os.path.dirname(yamlfile)
    user = find_user()
    namespace = yaml_dict.pop('namespace', '')
    report.system(directory)
    # Per-file overrides of the global evaluation configuration.
    for k, v in yaml_dict.pop('config', {}).items():
        config['eval'][k] = str(v)
    temperature = yaml_dict['simulation_params'].pop('T', None)
    if temperature is None:
        # Fall back to a temperature encoded in the directory name, e.g. "…/240K/…".
        m = re.match(r'.*[^\d](\d+)K.*', directory)
        if m is not None:
            temperature = int(m.group(1))
    if debug:
        print('Evaluation for', directory)
        print('User:', user)
        print('T:', temperature)
    # dont overdo caching, this allows several parallel analysis with caching
    MAXCACHE = int(min(3000, (virtual_memory().total / 1024**2 / 5 / cpu_count() / 0.4) // 10 * 10))
    if autosavedir is not None:
        md.autosave.enable(autosavedir, verbose=True)
    if 'trajectory-open' in yaml_dict:
        fopen = locate(yaml_dict['trajectory-open'], namespace)
        if fopen is None:
            raise ValueError("Trajectory loader couldn't be located: {}".format(yaml_dict['trajectory-open']))
    else:
        fopen = open_sim
    traj = report.apply(fopen, directory, maxcache=config['eval'].getint('maxcache', MAXCACHE))
    if 'energy-open' in yaml_dict:
        eopen = locate(yaml_dict['energy-open'], namespace)
        eopen_err_count = 1
        # BUG FIX: this previously tested `fopen` instead of `eopen`, so a
        # misspelled energy loader was silently accepted.
        if eopen is None:
            raise ValueError("Energy loader couldn't be located: {}".format(yaml_dict['energy-open']))
    else:
        eopen = open_energy
        # Missing energy files are not an error for the default loader.
        eopen_err_count = 0
    energyfile = report.apply(eopen, directory, err_count=eopen_err_count)
    if config.getboolean('eval', 'require_trajectory') and traj is None:
        return report
    if traj is None and energyfile is None:
        return report
    logger.info('Running evaluations for: %s', directory)
    for evaluation in yaml_dict['evaluations']:
        subset = evaluation['subset']
        selection = subset.pop('selection')
        if 'coordinates-map' in subset:
            cmap = subset.pop('coordinates-map')
            crds_map = locate(cmap, namespace)
            if crds_map is None:
                report.unknown(cmap)
                continue
            subtraj = crds_map(traj.subset(**subset)) if traj is not None else None
        else:
            subtraj = traj.subset(**subset) if traj is not None else None
        # BUG FIX: subtraj is None for energy-only evaluations (traj is None
        # but an energy file was found); setting the selection then crashed.
        if subtraj is not None:
            subtraj.selection = selection
        other = evaluation.pop('other', None)
        if other is not None and traj is not None:
            selection += '-' + other.pop('selection')
            if 'coordinates-map' in other:
                cmap = other.pop('coordinates-map')
                crds_map = locate(cmap, namespace)
                if crds_map is None:
                    report.unknown(cmap)
                    continue
                othertraj = crds_map(traj.subset(**other))
            else:
                othertraj = traj.subset(**other)
        else:
            othertraj = None
        functions = evaluation.pop('functions')
        report.system('{}@{}'.format(selection, directory))
        if debug:
            print('*****')
            print('subset:', subset)
            print('selection:', selection)
            print('subtraj:', subtraj)
        for func in functions:
            # Entries are either plain names or single-item dicts {name: params}.
            if isinstance(func, dict):
                if len(func) == 1:
                    (func, params), = func.items()
                else:
                    logger.info('Function definition is unclear: %s', str(func))
                    # BUG FIX: previously fell through with `func` still a dict
                    # and `params` unbound, crashing below; skip the entry.
                    continue
            else:
                params = {}
            if othertraj is not None:
                params['other'] = othertraj
            # Locate the function: First under a specified namespace, then in the default module
            f = locate(namespace + '.' + func) or locate('store.analyse.' + func)
            if f is not None:
                if debug:
                    # Dry run: report the function as OK without executing it.
                    print('---')
                    print('function:', f, func)
                    print('params:', params)
                    report.runok(func)
                else:
                    logger.info('Run function %s', func)
                    # Only pass the data the function actually accepts.
                    func_args = inspect.signature(f).parameters
                    if 'trajectory' in func_args:
                        params['trajectory'] = subtraj
                    if 'energyfile' in func_args:
                        params['energyfile'] = energyfile
                    res = report.apply(f, **params)
                    if not isinstance(res, dict):
                        res = {func: res}
                    # Strip the bulky inputs before storing the parameters.
                    params.pop('trajectory', None)
                    params.pop('other', None)
                    params.pop('energyfile', None)
                    for obs, data in res.items():
                        if txtout is not None:
                            dataframe_to_txt(os.path.join(txtout, '{}.dat'.format(obs)), data)
                        store.update(
                            obs, data, directory=directory, user=user, selection=selection, T=temperature,
                            simulation_params=yaml_dict.get('simulation_params', {}),
                            evaluation_params=params
                        )
            else:
                report.unknown(func)
                logger.info('Function %s was not found. Namespace was: %s', func, namespace)
    return report
def recursive_analysis(basedir, processes=None, debug=False, txtout=None, autosavedir=None):
    """
    Run analysis functions recursively for basedir on several processes.

    Args:
        basedir: Glob pattern of base directories searched for eval.yaml files.
        processes: Number of worker processes. ``False`` runs synchronously in
            this process; ``None`` uses the multiprocessing default.
        debug, txtout, autosavedir: Passed through to :func:`run_eval`.
    """
    logger.info('Starting recursive analysis in directory: {}'.format(basedir))
    reports = []

    def collect_reports(rep):
        # apply_async callback: collect the Report of a finished task.
        reports.append(rep)

    def catch_error(err):
        # apply_async error callback: print the worker's traceback.
        traceback.print_exception(type(err), err, err.__traceback__)

    yaml_files = [str(y) for p in glob(basedir) for y in Path(p).glob('**/eval.yaml')]
    logger.info('Finished walking directories:')
    for y in yaml_files:
        logger.info(y)
    # Evaluation is synchronous only if processes=False.
    if processes is False:
        for y in yaml_files:
            print(y)
            try:
                reports.append(run_eval(y, debug=debug, txtout=txtout, autosavedir=autosavedir))
            except FileNotFoundError:
                print('Skipping evaluation...')
    else:
        with Pool(processes=processes) as pool:
            for y in yaml_files:
                # Stagger submissions so the workers do not all open their
                # trajectories at the same time.
                time.sleep(5)
                pool.apply_async(run_eval, args=(y,), kwds={'debug': debug, 'txtout': txtout, 'autosavedir': autosavedir},
                                 callback=collect_reports, error_callback=catch_error)
            # close() + join() must happen inside the with-block: the context
            # exit calls terminate(), which would kill unfinished workers.
            pool.close()
            pool.join()
    print('#*' * 22)
    print('Finished analysis!: {}'.format(datetime.strftime(datetime.now(), '%c')))
    print('#*' * 22)
    for rep in sorted(reports):
        print(rep)
    if len(yaml_files) != len(reports):
        print('#### Error: {} / {} tasks were reported.'.format(len(reports), len(yaml_files)))
        for y in yaml_files:
            root = os.path.dirname(y)
            if not any(root in str(rep) for rep in reports):
                print('Task not reported: {}'.format(root))
# ===========================================
def cli(args=None):
    """Command line interface: analyse one simulation or a directory tree.

    Args:
        args: Optional list of command line arguments; defaults to sys.argv.
    """
    parser = argparse.ArgumentParser(description='Analyse a certain simulation')
    parser.add_argument('--recursive', '-r', action='store_true', default=False,
                        help='Perform a recursive evaluation of the directory.')
    parser.add_argument('-d', default=None, help='simulation directory; default cwd')
    parser.add_argument('-o', default=None,
                        help='default None; output directory for human readable *.dat, if this is not a path ending with "/" then the last part will be a common prefix')
    parser.add_argument('--autosave', '-a', default=None, help='Autosave directory')
    parser.add_argument('--verbose', '-v', default=False, action='store_true',
                        help='Be verbose, i.e. set logging level to DEBUG, default is INFO')
    parser.add_argument('--processes', '-np', default=None, type=int,
                        help='Number of sub-processes for the recursive evaluation.')
    # BUG FIX: the args parameter was previously ignored (parse_args() always
    # read sys.argv), so cli() could not be driven programmatically.
    args = parser.parse_args(args)
    if args.verbose:
        md.logging.setlevel('DEBUG')
    simdir = args.d if args.d is not None else os.getcwd()
    simdir = os.path.abspath(simdir)
    outdir = args.o
    if outdir is not None:
        outdir = os.path.abspath(outdir)
        # abspath strips a trailing slash; restore it, since it marks
        # "directory" (vs. common-prefix) semantics for -o.
        # (endswith also avoids an IndexError on an empty -o argument.)
        if args.o.endswith('/'):
            outdir = outdir + '/'
    if args.recursive:
        recursive_analysis(simdir, txtout=outdir, autosavedir=args.autosave, processes=args.processes)
    else:
        yamlfile = os.path.join(simdir, 'eval.yaml')
        if os.path.exists(yamlfile):
            rep = run_eval(yamlfile, txtout=outdir, autosavedir=args.autosave)
            print(rep)
        else:
            print('eval.yaml not found, exiting')
            # quit() is only defined when the site module is loaded;
            # raising SystemExit directly has the same effect and always works.
            raise SystemExit