From 949ed877fb1c580e9f5ebf71088844d1a0405c1c Mon Sep 17 00:00:00 2001 From: Sebastian Kloth Date: Mon, 8 Jan 2024 07:42:04 +0100 Subject: [PATCH] Added module for water structure parameters --- src/mdevaluate/extra/__init__.py | 1 + src/mdevaluate/extra/water.py | 419 +++++++++++++++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 src/mdevaluate/extra/water.py diff --git a/src/mdevaluate/extra/__init__.py b/src/mdevaluate/extra/__init__.py index 8d1d8fb..9066fc3 100644 --- a/src/mdevaluate/extra/__init__.py +++ b/src/mdevaluate/extra/__init__.py @@ -1,2 +1,3 @@ from . import chill from . import free_energy_landscape +from . import water diff --git a/src/mdevaluate/extra/water.py b/src/mdevaluate/extra/water.py new file mode 100644 index 0000000..0a64f3b --- /dev/null +++ b/src/mdevaluate/extra/water.py @@ -0,0 +1,419 @@ +from functools import partial +from typing import Tuple, Callable, Optional + +import numpy as np +from numpy.typing import NDArray, ArrayLike +import pandas as pd +from scipy.spatial import KDTree + +from ..distribution import hbonds +from ..pbc import pbc_points +from ..correlation import shifted_correlation, overlap +from ..coordinates import Coordinates, CoordinateFrame + + +def tanaka_zeta( + trajectory: Coordinates, angle: float = 30, segments: int = 100, skip: float = 0.1 +) -> pd.DataFrame: + frame_indices = np.unique( + np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments)) + ) + sel = trajectory.atom_subset.selection + A = np.where( + trajectory.subset(atom_name="OW", residue_name="SOL").atom_subset.selection[sel] + )[0] + D = np.vstack([A] * 2).T.reshape((-1,)) + H = np.where( + trajectory.subset(atom_name="HW.", residue_name="SOL").atom_subset.selection[ + sel + ] + )[0] + + zeta_dist = [] + zeta_cg_dist = [] + for frame_index in frame_indices: + D_frame = trajectory[frame_index][D] + H_frame = trajectory[frame_index][H] + A_frame = trajectory[frame_index][A] + box = trajectory[frame_index].box + pairs = hbonds( + D_frame, H_frame, A_frame, box, min_cos=np.cos(angle / 180 * np.pi) + ) + pairs[:, 0] = np.int_((pairs[:, 0] / 2)) + pairs = np.sort(pairs, axis=1) + pairs = np.unique(pairs, axis=0) + pairs = pairs.tolist() + + A_PBC, A_index = pbc_points(A_frame, box, thickness=0.7, index=True) + A_tree = KDTree(A_PBC) + dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7) + + dist_index = A_index[dist_index] + zeta = [] + for i, indices in enumerate(dist_index): + dist_hbond = [] + dist_non_hbond = [] + for j, index in enumerate(indices): + if j != 0: + if np.sort([indices[0], index]).tolist() in pairs: + dist_hbond.append(dist[i, j]) + else: + dist_non_hbond.append(dist[i, j]) + try: + zeta.append(np.min(dist_non_hbond) - np.max(dist_hbond)) + except ValueError: + zeta.append(0) + + zeta = np.array(zeta) + + dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7) + dist_index = A_index[dist_index] + dist_index = np.array( + [indices[dist[i] <= 0.35] for i, indices in enumerate(dist_index)] + ) + zeta_cg = np.array([np.mean(zeta[indices]) for indices in dist_index]) + + bins = np.linspace(-0.1, 0.2, 301) + zeta_dist.append(np.histogram(zeta, bins=bins)[0]) + zeta_cg_dist.append(np.histogram(zeta_cg, bins=bins)[0]) + z = bins[1:] - (bins[1] - bins[0]) / 2 + + zeta_dist = np.mean(zeta_dist, axis=0) + zeta_dist = zeta_dist / np.mean(zeta_dist) + + zeta_cg_dist = np.mean(zeta_cg_dist, axis=0) + zeta_cg_dist = zeta_cg_dist / np.mean(zeta_cg_dist) + + return pd.DataFrame({"zeta": z, "result": zeta_dist, "result_cg": zeta_cg_dist}) + + +def chi_four_trans( + trajectory: Coordinates, skip: float = 0.1, segments: int = 10000 +) -> pd.DataFrame: + traj = trajectory.nojump + N = len(trajectory[0]) + t, S = shifted_correlation( + partial(overlap, radius=0.1), traj, skip=skip, segments=segments, average=False + ) + chi = 1 / N * S.var(axis=0)[1:] + return pd.DataFrame({"time": t[1:], "chi": chi}) + + +def tanaka_correlation_map( + trajectory: Coordinates, + data_chi_four_trans: pd.DataFrame, + angle: float = 30, + segments: int = 100, + skip: float = 0.1, +) -> pd.DataFrame: + def tanaka_zeta_cg( + trajectory: Coordinates, + angle: float = 30, + segments: int = 1000, + skip: float = 0.1, + ) -> Tuple[NDArray, NDArray]: + frame_indices = np.unique( + np.int_( + np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments) + ) + ) + sel = trajectory.atom_subset.selection + A = np.where( + trajectory.subset(atom_name="OW", residue_name="SOL").atom_subset.selection[ + sel + ] + )[0] + D = np.vstack([A] * 2).T.reshape((-1,)) + H = np.where( + trajectory.subset( + atom_name="HW.", residue_name="SOL" + ).atom_subset.selection[sel] + )[0] + + zeta_cg = [] + times = [] + for frame_index in frame_indices: + D_frame = trajectory[frame_index][D] + H_frame = trajectory[frame_index][H] + A_frame = trajectory[frame_index][A] + box = trajectory[frame_index].box + pairs = hbonds( + D_frame, H_frame, A_frame, box, min_cos=np.cos(angle / 180 * np.pi) + ) + pairs[:, 0] = np.int_((pairs[:, 0] / 2)) + pairs = np.sort(pairs, axis=1) + pairs = np.unique(pairs, axis=0) + pairs = pairs.tolist() + + A_PBC, A_index = pbc_points(A_frame, box, thickness=0.7, index=True) + A_tree = KDTree(A_PBC) + dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7) + + dist_index = A_index[dist_index] + zeta = [] + for i, indices in enumerate(dist_index): + dist_hbond = [] + dist_non_hbond = [] + for j, index in enumerate(indices): + if j != 0: + if np.sort([indices[0], index]).tolist() in pairs: + dist_hbond.append(dist[i, j]) + else: + dist_non_hbond.append(dist[i, j]) + try: + zeta.append(np.min(dist_non_hbond) - np.max(dist_hbond)) + except ValueError: + zeta.append(0) + zeta = np.array(zeta) + dist_index = np.array( + [indices[dist[i] <= 0.35] for i, indices in enumerate(dist_index)] + ) + zeta_cg.append(np.array([np.mean(zeta[indices]) for indices in dist_index])) + times.append(trajectory[frame_index].time) + return np.array(times), np.array(zeta_cg) + + def delta_r_max( + trajectory: Coordinates, frame: CoordinateFrame, tau_4: float + ) -> NDArray: + dt = trajectory[1].time - trajectory[0].time + index_start = frame.step + index_end = index_start + int(tau_4 / dt) + 1 + frame_indices = np.arange(index_start, index_end + 1) + end_cords = np.array([trajectory[frame_index] for frame_index in frame_indices]) + vectors = trajectory[index_start] - end_cords + + delta_r = np.linalg.norm(vectors, axis=-1) + delta_r = np.max(delta_r, axis=0) + return delta_r + + d = np.array(data_chi_four_trans[["time", "chi"]]) + mask = d[:, 1] >= 0.7 * np.max(d[:, 1]) + fit = np.polyfit(d[mask, 0], d[mask, 1], 4) + p = np.poly1d(fit) + x_inter = np.linspace(d[mask, 0][0], d[mask, 0][-1], 1e6) + y_inter = p(x_inter) + tau_4 = x_inter[y_inter == np.max(y_inter)] + + oxygens = trajectory.nojump.subset(atom_name="OW") + window = tau_4 / trajectory[-1].time + start_frames = np.unique( + np.linspace( + len(trajectory) * skip, + len(trajectory) * (1 - window), + num=segments, + endpoint=False, + dtype=int, + ) + ) + + times, zeta_cg = tanaka_zeta_cg(trajectory, angle=angle) + + zeta_cg_mean = np.array( + [ + np.mean( + zeta_cg[ + (times >= trajectory[start_frame].time) + * (times <= (trajectory[start_frame].time + tau_4)) + ], + axis=0, + ) + for start_frame in start_frames + ] + ).flatten() + delta_r = np.array( + [ + delta_r_max(oxygens, oxygens[start_frame], tau_4) + for start_frame in start_frames + ] + ).flatten() + return pd.DataFrame({"zeta_cg": zeta_cg_mean, "delta_r": delta_r}) + + +def LSI_atom(distances: ArrayLike) -> NDArray: + r_j = distances[distances <= 0.37] + r_j = r_j.tolist() + r_j.append(distances[len(r_j)]) + delta_ji = [r_j[i + 1] - r_j[i] for i in range(0, len(r_j) - 1)] + mean_delta_i = np.mean(delta_ji) + I = 1 / len(delta_ji) * np.sum((mean_delta_i - delta_ji) ** 2) + return I + + +def LSI( + trajectory: Coordinates, segments: int = 10000, skip: float = 0 +) -> pd.DataFrame: + def LSI_distribution( + frame: CoordinateFrame, bins: NDArray, selector: Optional[Callable] = None + ) -> NDArray: + atoms_PBC = pbc_points(frame, frame.box, thickness=0.7) + atoms_tree = KDTree(atoms_PBC) + if selector: + index = selector(frame) + else: + index = np.arange(len(frame)) + dist, _ = atoms_tree.query(frame[index], 50, distance_upper_bound=0.6) + distances = dist[:, 1:] + LSI_values = np.array([LSI_atom(distance) for distance in distances]) + dist = np.histogram(LSI_values, bins=bins, density=True)[0] + return dist + + bins = np.linspace(0, 0.007, 201) + I = bins[1:] - (bins[1] - bins[0]) / 2 + + frame_indices = np.unique( + np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments)) + ) + distributions = np.array( + [ + LSI_distribution(trajectory[frame_index], trajectory, bins, selector=None) + for frame_index in frame_indices + ] + ) + P = np.mean(distributions, axis=0) + return pd.DataFrame({"I": I, "P": P}) + + +def HDL_LDL_positions(frame, trajectory, selector=None): + atoms_PBC = pbc_points(frame, frame.box, thickness=0.7) + atoms_tree = KDTree(atoms_PBC) + if selector: + index = selector(frame) + else: + index = range(len(frame)) + dist = atoms_tree.query(frame[index], 50, distance_upper_bound=0.6)[0] + distances = dist[:, 1:] + LSI_values = np.array([LSI_atom(distance) for distance in distances]) + LDL = LSI_values >= 0.0013 + HDL = LSI_values < 0.0013 + pos_HDL = frame[index][HDL] + pos_LDL = frame[index][LDL] + return pos_HDL, pos_LDL + + +def HDL_LDL_gr(trajectory, segments=10000, skip=0): + def gr_frame(frame, trajectory, bins): + atoms_ALL = frame + atoms_HDL, atoms_LDL = HDL_LDL_positions(frame, trajectory) + + atoms_PBC_ALL = pbc_points(atoms_ALL, frame.box) + atoms_PBC_LDL = pbc_points(atoms_LDL, frame.box) + atoms_PBC_HDL = pbc_points(atoms_HDL, frame.box) + + tree_ALL = KDTree(atoms_PBC_ALL) + tree_LDL = KDTree(atoms_PBC_LDL) + tree_HDL = KDTree(atoms_PBC_HDL) + + dist_ALL_ALL, _ = tree_ALL.query( + atoms_ALL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1 + ) + dist_HDL_HDL, _ = tree_HDL.query( + atoms_HDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1 + ) + dist_LDL_LDL, _ = tree_LDL.query( + atoms_LDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1 + ) + dist_HDL_LDL, _ = tree_LDL.query( + atoms_HDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1 + ) + + dist_ALL_ALL = dist_ALL_ALL[:, 1:].flatten() + dist_HDL_HDL = dist_HDL_HDL[:, 1:].flatten() + dist_LDL_LDL = dist_LDL_LDL[:, 1:].flatten() + dist_HDL_LDL = dist_HDL_LDL.flatten() + + hist_ALL_ALL = np.histogram( + dist_ALL_ALL, bins=bins, range=(0, bins[-1]), density=False + )[0] + hist_HDL_HDL = np.histogram( + dist_HDL_HDL, bins=bins, range=(0, bins[-1]), density=False + )[0] + hist_LDL_LDL = np.histogram( + dist_LDL_LDL, bins=bins, range=(0, bins[-1]), density=False + )[0] + hist_HDL_LDL = np.histogram( + dist_HDL_LDL, bins=bins, range=(0, bins[-1]), density=False + )[0] + + n = [len(atoms_ALL), len(atoms_HDL), len(atoms_LDL)] / np.prod( + np.diag(frame.box) + ) + + return ( + np.array( + [ + hist_ALL_ALL / len(atoms_ALL), + hist_HDL_HDL / len(atoms_HDL), + hist_LDL_LDL / len(atoms_LDL), + hist_HDL_LDL / len(atoms_HDL), + ] + ), + n, + ) + + start_frame = trajectory[int(len(trajectory) * skip)] + upper_bound = round(np.min(np.diag(start_frame.box)) / 2 - 0.05, 1) + bins = np.linspace(0, upper_bound, upper_bound * 500 + 1) + frame_indices = np.unique( + np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments)) + ) + + gr = [] + ns = [] + for frame_index in frame_indices: + hists, n = gr_frame(trajectory[frame_index], trajectory, bins) + gr.append(hists) + ns.append(n) + + gr = np.mean(gr, axis=0) + gr = gr / (4 / 3 * np.pi * bins[1:] ** 3 - 4 / 3 * np.pi * bins[:-1] ** 3) + r = bins[1:] - (bins[1] - bins[0]) / 2 + n = np.mean(ns, axis=0) + + return pd.DataFrame( + {"r": r, "gr_ALL": [0], "gr_HDL": gr[1], "gr_LDL": gr[2], "gr_MIX": gr[3]} + ) + + +def HDL_LDL_concentration(trajectory, segments=10000, skip=0): + def HDL_LDL_concentration_frame(frame, trajectrory, bins): + atoms_HDL, atoms_LDL = HDL_LDL_positions(frame, trajectory) + atoms_PBC_HDL = pbc_points(atoms_HDL, frame.box, thickness=0.61) + atoms_PBC_LDL = pbc_points(atoms_LDL, frame.box, thickness=0.61) + tree_LDL = KDTree(atoms_PBC_LDL) + tree_HDL = KDTree(atoms_PBC_HDL) + dist_HDL_HDL, _ = tree_HDL.query(atoms_HDL, 31, distance_upper_bound=0.6) + dist_HDL_LDL, _ = tree_LDL.query(atoms_HDL, 30, distance_upper_bound=0.6) + HDL_near_HDL = np.sum( + dist_HDL_HDL <= 0.5, axis=-1 + ) # Ausgangsteilchen dazu zählen + LDL_near_HDL = np.sum(dist_HDL_LDL <= 0.5, axis=-1) + x_HDL = HDL_near_HDL / (HDL_near_HDL + LDL_near_HDL) + x_HDL_dist = np.histogram(x_HDL, bins=bins, range=(0, bins[-1]), density=True)[ + 0 + ] + dist_LDL_LDL, _ = tree_LDL.query(atoms_LDL, 31, distance_upper_bound=0.6) + dist_LDL_HDL, _ = tree_HDL.query(atoms_LDL, 30, distance_upper_bound=0.6) + LDL_near_LDL = np.sum( + dist_LDL_LDL <= 0.5, axis=-1 + ) # Ausgangsteilchen dazu zählen + HDL_near_LDL = np.sum(dist_LDL_HDL <= 0.5, axis=-1) + x_LDL = LDL_near_LDL / (LDL_near_LDL + HDL_near_LDL) + x_LDL_dist = np.histogram(x_LDL, bins=bins, range=(0, bins[-1]), density=True)[ + 0 + ] + return x_HDL_dist, x_LDL_dist + + bins = np.linspace(0, 1, 21) + x = bins[1:] - (bins[1] - bins[0]) / 2 + frame_indices = np.unique( + np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments)) + ) + local_concentration_dist = np.array( + [ + HDL_LDL_concentration_frame(trajectory[frame_index], trajectory, bins) + for frame_index in frame_indices + ] + ) + x_HDL = np.mean(local_concentration_dist[:, 0], axis=0) + x_LDL = np.mean(local_concentration_dist[:, 1], axis=0) + return pd.DataFrame({"x": x, "x_HDL": x_HDL, "x_LDL": x_LDL})