Files
python-store/store/store.py
2022-04-11 10:59:06 +02:00

501 lines
16 KiB
Python
Executable File

import os
from collections.abc import Iterable
from logging import info
import argparse
import pickle
import pandas as pd
import sqlalchemy as sql
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.pool import NullPool
from .utils import numlike, number_shorthand, lazy_eq
from . import config
Base = declarative_base()
Engine = sql.create_engine(config['store']['db_file'], poolclass=NullPool)
Session = sql.orm.sessionmaker(bind=Engine)
def init_db():
pass
class FloatAttribute(Base):
__tablename__ = 'floatattributes'
id = sql.Column(sql.Integer, primary_key=True)
name = sql.Column(sql.String)
value = sql.Column(sql.Float)
simulation_id = sql.Column(sql.Integer, sql.ForeignKey('simulations.id'))
def __init__(self, name, value):
self.name = name
self.value = value
def __repr__(self):
return '<{classname}({name}={value})>'.format(
classname=self.__class__.__name__, name=self.name, value=self.value
)
class StringAttribute(Base):
__tablename__ = 'stringattributes'
id = sql.Column(sql.Integer, primary_key=True)
name = sql.Column(sql.String)
value = sql.Column(sql.String)
simulation_id = sql.Column(sql.Integer, sql.ForeignKey('simulations.id'))
def __init__(self, name, value):
self.name = name
self.value = value
def __repr__(self):
return '<{classname}({name}={value})>'.format(
classname=self.__class__.__name__, name=self.name, value=self.value
)
class Parameter(Base):
__tablename__ = 'parameters'
id = sql.Column(sql.Integer, primary_key=True)
name = sql.Column(sql.String)
value = sql.Column(sql.Float)
evaluation_id = sql.Column(sql.Integer, sql.ForeignKey('evaluations.id'))
def __init__(self, name, value):
self.name = name
self.value = value
def __repr__(self):
return '<{classname}({name}={value})>'.format(
classname=self.__class__.__name__, name=self.name, value=self.value
)
class Simulation(Base):
__tablename__ = 'simulations'
id = sql.Column(sql.Integer, primary_key=True)
directory = sql.Column(sql.String)
user = sql.Column(sql.String)
temperature = sql.Column(sql.Float)
float_params = sql.orm.relationship(FloatAttribute)
string_params = sql.orm.relationship(StringAttribute)
evaluations = sql.orm.relationship('Evaluation', order_by='Evaluation.id', back_populates='simulation')
def add_attribute(self, name, value):
self.attributes.append(FloatAttribute(name, value))
def add_parameter(self, name, value):
if numlike(value):
self.float_params.append(FloatAttribute(name, value))
else:
self.string_params.append(StringAttribute(name, value))
def copy(self):
params = {p.name: p.value for p in self.float_params + self.string_params}
return Simulation(self.directory, self.user, self.temperature, **params)
def delete(self, session=None):
if session is None:
session = Session()
for e in self.evaluations:
e.delete(session=session)
for p in self.float_params + self.string_params:
session.delete(p)
session.delete(self)
def __init__(self, directory, user, temperature, **parameters):
self.directory = directory or ''
self.user = user or ''
self.temperature = temperature
for name, value in parameters.items():
self.add_parameter(name, value)
def __repr__(self):
attrs = []
for attr, val in self.__dict__.items():
if attr[0] is not '_' and attr is not 'type':
attrs.append('{}={}'.format(attr, val))
return '<{classname}({attributes})>'.format(classname=self.__class__.__name__, attributes=', '.join(attrs))
# Dataframes can only be unpickled with the same version of Pandas used to store them
class PandasPickler:
@staticmethod
def dumps(obj, protocol=None):
if isinstance(obj, pd.DataFrame):
obj = obj.to_dict()
return pickle.dumps(obj, protocol=protocol)
@staticmethod
def loads(buf):
obj = pickle.loads(buf)
if isinstance(obj, dict):
obj = pd.DataFrame(obj)
return obj
class Evaluation(Base):
__tablename__ = 'evaluations'
id = sql.Column(sql.Integer, primary_key=True)
simulation_id = sql.Column(sql.Integer, sql.ForeignKey('simulations.id'))
observable = sql.Column(sql.String)
selection = sql.Column(sql.String)
data = sql.Column(sql.PickleType(comparator=lazy_eq, pickler=PandasPickler))
parameters = sql.orm.relationship('Parameter')
simulation = sql.orm.relationship('Simulation', back_populates='evaluations')
@property
def dataframe(self):
"""
Return a dataframe representing the evaluated data and meta data of the evaluation.
"""
if self.data is None:
df = pd.DataFrame()
elif numlike(self.data):
df = pd.DataFrame({self.observable: self.data}, index=[0])
else:
df = self.data.copy()
df['selection'] = self.selection
for p in self.parameters:
df[p.name] = p.value
df['directory'] = self.simulation.directory
df['user'] = self.simulation.user
df['T'] = self.simulation.temperature
for a in self.simulation.float_params:
df[a.name] = a.value
for a in self.simulation.string_params:
df[a.name] = a.value
return df
def add_parameter(self, name, value):
self.parameters.append(Parameter(name, value))
def copy(self, simulation=None):
return Evaluation(self.observable, simulation or self.simulation, self.data,
parameters={p.name: p.value for p in self.parameters})
def delete(self, session=None):
if session is None:
session = Session()
for p in self.parameters:
session.delete(p)
session.delete(self)
def __init__(self, observable, simulation, data, selection='', parameters={}):
self.observable = observable
self.simulation = simulation
self.data = data
self.selection = selection
for name, value in parameters.items():
self.add_parameter(name, value)
def __repr__(self):
attrs = []
for attr, val in self.__dict__.items():
if attr[0] is not '_' and attr not in ['type', 'data']:
attrs.append('{}={}'.format(attr, val))
return '<{classname}({attributes})>'.format(classname=self.__class__.__name__, attributes=', '.join(attrs))
Base.metadata.create_all(Engine)
def list_query(args, comparator):
if isinstance(args, str):
args = [args]
return sql.or_(*[comparator(a) for a in args])
def float_query(value, dbattr):
if numlike(value):
crit = dbattr == float(value)
elif isinstance(value, str):
limits = value.split('-')
if len(limits) is 2:
try:
limits = [float(x) for x in limits]
except:
raise ValueError('Could not parse value={}'.format(value))
crit = sql.and_(float(limits[0]) <= dbattr, dbattr <= float(limits[1]))
else:
raise ValueError('Could not parse value={}'.format(value))
elif isinstance(value, Iterable):
crit = dbattr.in_(value)
else:
raise ValueError('Could not parse {} as float value.'.format(value))
return crit
def query_simulation(directory=None, user=None, T=None, session=None, **attributes):
if session is None:
session = Session()
query = session.query(Simulation)
if directory is not None:
query = query.filter(list_query(directory, Simulation.directory.like))
if user is not None:
query = query.filter(list_query(user, Simulation.user.like))
if T is not None:
query = query.filter(float_query(T, Simulation.temperature))
for name, value in attributes.items():
if number_shorthand(value):
alias = sql.orm.aliased(FloatAttribute)
query = query.join(alias).filter(sql.and_(alias.name == name, float_query(value, alias.value)))
else:
alias = sql.orm.aliased(StringAttribute)
query = query.join(alias).filter(sql.and_(alias.name == name, list_query(value, alias.value.like)))
return query
def query_evaluation(observable, simulation, selection=None, session=None, **parameters):
"""
Query the database for evaluations.
Args:
observable: Observable of the evaluation
directory: Simulation directory as a string, or a list of the former.
user: Username os list of users
T: Temperature to query, can be a number, a list of numbers or a string giving a range
selection: Selection of the simulation
**attributes: Any attributes of the simulations as keyword arguments
This query function allows some shorthands for the definition of query values.
For string comparisons the SQL statement LIKE is used, which allows the use of the wildcard '%'.
Besides a concrete value, all arguments allow specification of a list of arguments,
which will query any of the given values.
Number values (i.e. temperature or any simulation attribute) can also be given as strings
which represent a range, i. e. '100 - 150'.
Example:
query_observable('isf', ['%/sys1', 'sys2'], T='100-200')
"""
if session is None:
session = Session()
if not isinstance(simulation, Iterable):
simulation = [simulation]
query = session.query(Evaluation).filter(
Evaluation.simulation_id.in_([s.id for s in simulation]),
Evaluation.observable == observable
)
if selection is not None:
query = query.filter(list_query(selection, Evaluation.selection.like))
for name, value in parameters.items():
alias = sql.orm.aliased(Parameter)
query = query.join(alias).filter(sql.and_(alias.name == name, float_query(value, alias.value)))
return query
def get(observable, directory=None, user=None, T=None, selection=None,
evaluation_params={}, simulation_params={}):
"""
Get evaluation data from the database.
Args:
observable: The observable of the evaluation
data: The data of the evaluation
simulation: The evaluated simulation
directory: The simulation directory
user: User of the simulation
T: Temperature of the simulation
selection: Selection of the evaluation
evaluation_params: A dict of parameters of the evaluation
simualtion_params: A dict of parameters of the evaluation
"""
session = Session()
sim_query = query_simulation(directory, user=user, T=T, session=session, **simulation_params)
if sim_query.count() == 0:
return None
#else:
# simulations = sim_query.all()
if isinstance(observable, str):
observable = [observable]
results = []
for obs in observable:
evals = query_evaluation(obs, sim_query, selection=selection, session=session, **evaluation_params)
results.extend([ev.dataframe for ev in evals])
if len(results) > 0:
df = pd.concat(results, ignore_index=True)
res = df.sort_values(by=['directory', 'selection', 'T'])
else:
res = None
session.close()
return res
def merge(observables, *args, **kwargs):
"""
Get the merged datasets of several observables.
Args:
observables: List of observables
*args, **kwargs: Additional Argmunets, passed to :func:`get`
"""
if len(observables) > 1:
df = pd.merge(get(observables[0], *args, **kwargs), merge(observables[1:], *args, **kwargs))
return df.sort_values(by=['system', 'selection', 'T'])
else:
return get(observables[0], *args, **kwargs)
def update(observable, data, simulation=None, directory=None, user=None, T=None, selection=None,
evaluation_params={}, simulation_params={}):
"""
Update an existing evaluation record or insert a new one into the database.
Args:
observable: The observable of the evaluation
data: The data of the evaluation
simulation: The evaluated simulation
directory: The simulation directory
user: User of the simulation
T: Temperature of the simulation
selection: Selection of the evaluation
evaluation_params: A dict of parameters of the evaluation
simualtion_params: A dict of parameters of the evaluation
"""
session = Session()
if simulation is None:
sim_query = query_simulation(directory, user=user, T=T, session=session, **simulation_params)
qu_count = sim_query.count()
if qu_count == 0:
simulation = Simulation(directory, user, T, **simulation_params)
elif qu_count == 1:
simulation = sim_query.first()
else:
session.close()
raise ValueError('Found multiple simulation records while updating: {}'.format(sim_query.all()))
eval_query = query_evaluation(observable, simulation, selection=selection, session=session, **evaluation_params)
qu_count = eval_query.count()
if qu_count == 0:
session.add(Evaluation(observable, simulation, data, selection=selection, parameters=evaluation_params))
elif qu_count == 1:
eval_query.first().data = data
else:
session.close()
raise ValueError('Found more than one record in database while updating.')
session.commit()
session.close()
def delete(*args, **kwargs):
"""
Delete evaluations from the database.
"""
session = Session()
query = query_evaluation(*args, **kwargs)
for item in query:
session.delete(item)
# item.delete()
session.commit()
session.close()
def observables(directory=None, user=None, T=None, **parameters):
"""
List all observables, if system is not None for this specific system.
"""
query = query_simulation(directory, user=user, T=T, **parameters)
obs = set()
for sim in query:
obs.update({ev.observable for ev in sim.evaluations})
return obs
def systems(observable=None, user=None):
"""
List all system, if observable is not None for this specific observable.
"""
query = query_simulation(user=user)
if observable is not None:
query = query.join(Evaluation).filter(Evaluation.observable == observable)
return {sim.directory for sim in query}
def clean_leaves():
"""Delete objects with missing foreign key from the database."""
session = Session()
for cls in (StringAttribute, FloatAttribute):
session.query(cls).filter(cls.simulation_id == None).delete()
session.query(Parameter).filter(Parameter.evaluation_id == None).delete()
# TODO: Delete evaluation leaves, requires joint query with parameter
session.commit()
session.close()
def dump_db(dbfile, user=None, **kwargs):
"""
Dump the database into a new databse.
Args:
dbfile: File descriptor of the new database.
user (opt.): If None, the database of the current user is dumped.
**kwargs: Any keyword arguments are passed to query_simulation
"""
if user is None:
user = os.getlogin()
if '://' not in dbfile:
dbfile = 'sqlite:///' + dbfile
dump_engine = sql.create_engine(dbfile)
Base.metadata.create_all(dump_engine)
dump_session = sql.orm.sessionmaker(bind=dump_engine)()
for sim in query_simulation(user=user, **kwargs):
dump_sim = sim.copy()
for ev in sim.evaluations:
dump_ev = ev.copy(simulation=dump_sim)
dump_session.add(dump_ev)
dump_session.commit()
def dump_cli(args=None):
"""Command line interface to dump the database to a file."""
parser = argparse.ArgumentParser(description='CLI tool to create a databse dump.')
parser.add_argument('dbfile', help='The database file for the dump.')
parser.add_argument(
'--user', '-u',
help='The user for which the database is dumped, default is the current user.'
)
args = parser.parse_args()
dump_db(args.dbfile, user=args.user)
def delete_dirs(dirpattern):
session = Session()
simulations = query_simulation(directory=dirpattern, session=session)
for sim in simulations:
sim.delete(session=session)
session.commit()
session.close()