Added module for water structure parameters

2024-01-08 07:42:04 +01:00
parent 4f17cfe876
commit 949ed877fb
2 changed files with 420 additions and 0 deletions
--- a/src/mdevaluate/extra/init.py
+++ b/src/mdevaluate/extra/init.py
@@ -1,2 +1,3 @@
 from . import chill
 from . import free_energy_landscape
 from . import water
--- a/src/mdevaluate/extra/water.py
+++ b/src/mdevaluate/extra/water.py
@@ -0,0 +1,419 @@
 from functools import partial
 from typing import Tuple, Callable, Optional
 import numpy as np
 from numpy.typing import NDArray, ArrayLike
 import pandas as pd
 from scipy.spatial import KDTree
 from ..distribution import hbonds
 from ..pbc import pbc_points
 from ..correlation import shifted_correlation, overlap
 from ..coordinates import Coordinates, CoordinateFrame
 def tanaka_zeta(
    trajectory: Coordinates, angle: float = 30, segments: int = 100, skip: float = 0.1
 ) -> pd.DataFrame:
    frame_indices = np.unique(
        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
    )
    sel = trajectory.atom_subset.selection
    A = np.where(
        trajectory.subset(atom_name="OW", residue_name="SOL").atom_subset.selection[sel]
    )[0]
    D = np.vstack([A] * 2).T.reshape((-1,))
    H = np.where(
        trajectory.subset(atom_name="HW.", residue_name="SOL").atom_subset.selection[
            sel
        ]
    )[0]
    zeta_dist = []
    zeta_cg_dist = []
    for frame_index in frame_indices:
        D_frame = trajectory[frame_index][D]
        H_frame = trajectory[frame_index][H]
        A_frame = trajectory[frame_index][A]
        box = trajectory[frame_index].box
        pairs = hbonds(
            D_frame, H_frame, A_frame, box, min_cos=np.cos(angle / 180 * np.pi)
        )
        pairs[:, 0] = np.int_((pairs[:, 0] / 2))
        pairs = np.sort(pairs, axis=1)
        pairs = np.unique(pairs, axis=0)
        pairs = pairs.tolist()
        A_PBC, A_index = pbc_points(A_frame, box, thickness=0.7, index=True)
        A_tree = KDTree(A_PBC)
        dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7)
        dist_index = A_index[dist_index]
        zeta = []
        for i, indices in enumerate(dist_index):
            dist_hbond = []
            dist_non_hbond = []
            for j, index in enumerate(indices):
                if j != 0:
                    if np.sort([indices[0], index]).tolist() in pairs:
                        dist_hbond.append(dist[i, j])
                    else:
                        dist_non_hbond.append(dist[i, j])
            try:
                zeta.append(np.min(dist_non_hbond) - np.max(dist_hbond))
            except ValueError:
                zeta.append(0)
        zeta = np.array(zeta)
        dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7)
        dist_index = A_index[dist_index]
        dist_index = np.array(
            [indices[dist[i] <= 0.35] for i, indices in enumerate(dist_index)]
        )
        zeta_cg = np.array([np.mean(zeta[indices]) for indices in dist_index])
        bins = np.linspace(-0.1, 0.2, 301)
        zeta_dist.append(np.histogram(zeta, bins=bins)[0])
        zeta_cg_dist.append(np.histogram(zeta_cg, bins=bins)[0])
        z = bins[1:] - (bins[1] - bins[0]) / 2
    zeta_dist = np.mean(zeta_dist, axis=0)
    zeta_dist = zeta_dist / np.mean(zeta_dist)
    zeta_cg_dist = np.mean(zeta_cg_dist, axis=0)
    zeta_cg_dist = zeta_cg_dist / np.mean(zeta_cg_dist)
    return pd.DataFrame({"zeta": z, "result": zeta_dist, "result_cg": zeta_cg_dist})
 def chi_four_trans(
    trajectory: Coordinates, skip: float = 0.1, segments: int = 10000
 ) -> pd.DataFrame:
    traj = trajectory.nojump
    N = len(trajectory[0])
    t, S = shifted_correlation(
        partial(overlap, radius=0.1), traj, skip=skip, segments=segments, average=False
    )
    chi = 1 / N * S.var(axis=0)[1:]
    return pd.DataFrame({"time": t[1:], "chi": chi})
 def tanaka_correlation_map(
    trajectory: Coordinates,
    data_chi_four_trans: pd.DataFrame,
    angle: float = 30,
    segments: int = 100,
    skip: float = 0.1,
 ) -> pd.DataFrame:
    def tanaka_zeta_cg(
        trajectory: Coordinates,
        angle: float = 30,
        segments: int = 1000,
        skip: float = 0.1,
    ) -> Tuple[NDArray, NDArray]:
        frame_indices = np.unique(
            np.int_(
                np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments)
            )
        )
        sel = trajectory.atom_subset.selection
        A = np.where(
            trajectory.subset(atom_name="OW", residue_name="SOL").atom_subset.selection[
                sel
            ]
        )[0]
        D = np.vstack([A] * 2).T.reshape((-1,))
        H = np.where(
            trajectory.subset(
                atom_name="HW.", residue_name="SOL"
            ).atom_subset.selection[sel]
        )[0]
        zeta_cg = []
        times = []
        for frame_index in frame_indices:
            D_frame = trajectory[frame_index][D]
            H_frame = trajectory[frame_index][H]
            A_frame = trajectory[frame_index][A]
            box = trajectory[frame_index].box
            pairs = hbonds(
                D_frame, H_frame, A_frame, box, min_cos=np.cos(angle / 180 * np.pi)
            )
            pairs[:, 0] = np.int_((pairs[:, 0] / 2))
            pairs = np.sort(pairs, axis=1)
            pairs = np.unique(pairs, axis=0)
            pairs = pairs.tolist()
            A_PBC, A_index = pbc_points(A_frame, box, thickness=0.7, index=True)
            A_tree = KDTree(A_PBC)
            dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7)
            dist_index = A_index[dist_index]
            zeta = []
            for i, indices in enumerate(dist_index):
                dist_hbond = []
                dist_non_hbond = []
                for j, index in enumerate(indices):
                    if j != 0:
                        if np.sort([indices[0], index]).tolist() in pairs:
                            dist_hbond.append(dist[i, j])
                        else:
                            dist_non_hbond.append(dist[i, j])
                try:
                    zeta.append(np.min(dist_non_hbond) - np.max(dist_hbond))
                except ValueError:
                    zeta.append(0)
            zeta = np.array(zeta)
            dist_index = np.array(
                [indices[dist[i] <= 0.35] for i, indices in enumerate(dist_index)]
            )
            zeta_cg.append(np.array([np.mean(zeta[indices]) for indices in dist_index]))
            times.append(trajectory[frame_index].time)
        return np.array(times), np.array(zeta_cg)
    def delta_r_max(
        trajectory: Coordinates, frame: CoordinateFrame, tau_4: float
    ) -> NDArray:
        dt = trajectory[1].time - trajectory[0].time
        index_start = frame.step
        index_end = index_start + int(tau_4 / dt) + 1
        frame_indices = np.arange(index_start, index_end + 1)
        end_cords = np.array([trajectory[frame_index] for frame_index in frame_indices])
        vectors = trajectory[index_start] - end_cords
        delta_r = np.linalg.norm(vectors, axis=-1)
        delta_r = np.max(delta_r, axis=0)
        return delta_r
    d = np.array(data_chi_four_trans[["time", "chi"]])
    mask = d[:, 1] >= 0.7 * np.max(d[:, 1])
    fit = np.polyfit(d[mask, 0], d[mask, 1], 4)
    p = np.poly1d(fit)
    x_inter = np.linspace(d[mask, 0][0], d[mask, 0][-1], 1e6)
    y_inter = p(x_inter)
    tau_4 = x_inter[y_inter == np.max(y_inter)]
    oxygens = trajectory.nojump.subset(atom_name="OW")
    window = tau_4 / trajectory[-1].time
    start_frames = np.unique(
        np.linspace(
            len(trajectory) * skip,
            len(trajectory) * (1 - window),
            num=segments,
            endpoint=False,
            dtype=int,
        )
    )
    times, zeta_cg = tanaka_zeta_cg(trajectory, angle=angle)
    zeta_cg_mean = np.array(
        [
            np.mean(
                zeta_cg[
                    (times >= trajectory[start_frame].time)
                    * (times <= (trajectory[start_frame].time + tau_4))
                ],
                axis=0,
            )
            for start_frame in start_frames
        ]
    ).flatten()
    delta_r = np.array(
        [
            delta_r_max(oxygens, oxygens[start_frame], tau_4)
            for start_frame in start_frames
        ]
    ).flatten()
    return pd.DataFrame({"zeta_cg": zeta_cg_mean, "delta_r": delta_r})
 def LSI_atom(distances: ArrayLike) -> NDArray:
    r_j = distances[distances <= 0.37]
    r_j = r_j.tolist()
    r_j.append(distances[len(r_j)])
    delta_ji = [r_j[i + 1] - r_j[i] for i in range(0, len(r_j) - 1)]
    mean_delta_i = np.mean(delta_ji)
    I = 1 / len(delta_ji) * np.sum((mean_delta_i - delta_ji) ** 2)
    return I
 def LSI(
    trajectory: Coordinates, segments: int = 10000, skip: float = 0
 ) -> pd.DataFrame:
    def LSI_distribution(
        frame: CoordinateFrame, bins: NDArray, selector: Optional[Callable] = None
    ) -> NDArray:
        atoms_PBC = pbc_points(frame, frame.box, thickness=0.7)
        atoms_tree = KDTree(atoms_PBC)
        if selector:
            index = selector(frame)
        else:
            index = np.arange(len(frame))
        dist, _ = atoms_tree.query(frame[index], 50, distance_upper_bound=0.6)
        distances = dist[:, 1:]
        LSI_values = np.array([LSI_atom(distance) for distance in distances])
        dist = np.histogram(LSI_values, bins=bins, density=True)[0]
        return dist
    bins = np.linspace(0, 0.007, 201)
    I = bins[1:] - (bins[1] - bins[0]) / 2
    frame_indices = np.unique(
        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
    )
    distributions = np.array(
        [
            LSI_distribution(trajectory[frame_index], trajectory, bins, selector=None)
            for frame_index in frame_indices
        ]
    )
    P = np.mean(distributions, axis=0)
    return pd.DataFrame({"I": I, "P": P})
 def HDL_LDL_positions(frame, trajectory, selector=None):
    atoms_PBC = pbc_points(frame, frame.box, thickness=0.7)
    atoms_tree = KDTree(atoms_PBC)
    if selector:
        index = selector(frame)
    else:
        index = range(len(frame))
    dist = atoms_tree.query(frame[index], 50, distance_upper_bound=0.6)[0]
    distances = dist[:, 1:]
    LSI_values = np.array([LSI_atom(distance) for distance in distances])
    LDL = LSI_values >= 0.0013
    HDL = LSI_values < 0.0013
    pos_HDL = frame[index][HDL]
    pos_LDL = frame[index][LDL]
    return pos_HDL, pos_LDL
 def HDL_LDL_gr(trajectory, segments=10000, skip=0):
    def gr_frame(frame, trajectory, bins):
        atoms_ALL = frame
        atoms_HDL, atoms_LDL = HDL_LDL_positions(frame, trajectory)
        atoms_PBC_ALL = pbc_points(atoms_ALL, frame.box)
        atoms_PBC_LDL = pbc_points(atoms_LDL, frame.box)
        atoms_PBC_HDL = pbc_points(atoms_HDL, frame.box)
        tree_ALL = KDTree(atoms_PBC_ALL)
        tree_LDL = KDTree(atoms_PBC_LDL)
        tree_HDL = KDTree(atoms_PBC_HDL)
        dist_ALL_ALL, _ = tree_ALL.query(
            atoms_ALL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
        )
        dist_HDL_HDL, _ = tree_HDL.query(
            atoms_HDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
        )
        dist_LDL_LDL, _ = tree_LDL.query(
            atoms_LDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
        )
        dist_HDL_LDL, _ = tree_LDL.query(
            atoms_HDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
        )
        dist_ALL_ALL = dist_ALL_ALL[:, 1:].flatten()
        dist_HDL_HDL = dist_HDL_HDL[:, 1:].flatten()
        dist_LDL_LDL = dist_LDL_LDL[:, 1:].flatten()
        dist_HDL_LDL = dist_HDL_LDL.flatten()
        hist_ALL_ALL = np.histogram(
            dist_ALL_ALL, bins=bins, range=(0, bins[-1]), density=False
        )[0]
        hist_HDL_HDL = np.histogram(
            dist_HDL_HDL, bins=bins, range=(0, bins[-1]), density=False
        )[0]
        hist_LDL_LDL = np.histogram(
            dist_LDL_LDL, bins=bins, range=(0, bins[-1]), density=False
        )[0]
        hist_HDL_LDL = np.histogram(
            dist_HDL_LDL, bins=bins, range=(0, bins[-1]), density=False
        )[0]
        n = [len(atoms_ALL), len(atoms_HDL), len(atoms_LDL)] / np.prod(
            np.diag(frame.box)
        )
        return (
            np.array(
                [
                    hist_ALL_ALL / len(atoms_ALL),
                    hist_HDL_HDL / len(atoms_HDL),
                    hist_LDL_LDL / len(atoms_LDL),
                    hist_HDL_LDL / len(atoms_HDL),
                ]
            ),
            n,
        )
    start_frame = trajectory[int(len(trajectory) * skip)]
    upper_bound = round(np.min(np.diag(start_frame.box)) / 2 - 0.05, 1)
    bins = np.linspace(0, upper_bound, upper_bound * 500 + 1)
    frame_indices = np.unique(
        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
    )
    gr = []
    ns = []
    for frame_index in frame_indices:
        hists, n = gr_frame(trajectory[frame_index], trajectory, bins)
        gr.append(hists)
        ns.append(n)
    gr = np.mean(gr, axis=0)
    gr = gr / (4 / 3 * np.pi * bins[1:] ** 3 - 4 / 3 * np.pi * bins[:-1] ** 3)
    r = bins[1:] - (bins[1] - bins[0]) / 2
    n = np.mean(ns, axis=0)
    return pd.DataFrame(
        {"r": r, "gr_ALL": [0], "gr_HDL": gr[1], "gr_LDL": gr[2], "gr_MIX": gr[3]}
    )
 def HDL_LDL_concentration(trajectory, segments=10000, skip=0):
    def HDL_LDL_concentration_frame(frame, trajectrory, bins):
        atoms_HDL, atoms_LDL = HDL_LDL_positions(frame, trajectory)
        atoms_PBC_HDL = pbc_points(atoms_HDL, frame.box, thickness=0.61)
        atoms_PBC_LDL = pbc_points(atoms_LDL, frame.box, thickness=0.61)
        tree_LDL = KDTree(atoms_PBC_LDL)
        tree_HDL = KDTree(atoms_PBC_HDL)
        dist_HDL_HDL, _ = tree_HDL.query(atoms_HDL, 31, distance_upper_bound=0.6)
        dist_HDL_LDL, _ = tree_LDL.query(atoms_HDL, 30, distance_upper_bound=0.6)
        HDL_near_HDL = np.sum(
            dist_HDL_HDL <= 0.5, axis=-1
        )  # Ausgangsteilchen dazu zählen
        LDL_near_HDL = np.sum(dist_HDL_LDL <= 0.5, axis=-1)
        x_HDL = HDL_near_HDL / (HDL_near_HDL + LDL_near_HDL)
        x_HDL_dist = np.histogram(x_HDL, bins=bins, range=(0, bins[-1]), density=True)[
            0
        ]
        dist_LDL_LDL, _ = tree_LDL.query(atoms_LDL, 31, distance_upper_bound=0.6)
        dist_LDL_HDL, _ = tree_HDL.query(atoms_LDL, 30, distance_upper_bound=0.6)
        LDL_near_LDL = np.sum(
            dist_LDL_LDL <= 0.5, axis=-1
        )  # Ausgangsteilchen dazu zählen
        HDL_near_LDL = np.sum(dist_LDL_HDL <= 0.5, axis=-1)
        x_LDL = LDL_near_LDL / (LDL_near_LDL + HDL_near_LDL)
        x_LDL_dist = np.histogram(x_LDL, bins=bins, range=(0, bins[-1]), density=True)[
            0
        ]
        return x_HDL_dist, x_LDL_dist
    bins = np.linspace(0, 1, 21)
    x = bins[1:] - (bins[1] - bins[0]) / 2
    frame_indices = np.unique(
        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
    )
    local_concentration_dist = np.array(
        [
            HDL_LDL_concentration_frame(trajectory[frame_index], trajectory, bins)
            for frame_index in frame_indices
        ]
    )
    x_HDL = np.mean(local_concentration_dist[:, 0], axis=0)
    x_LDL = np.mean(local_concentration_dist[:, 1], axis=0)
    return pd.DataFrame({"x": x, "x_HDL": x_HDL, "x_LDL": x_LDL})