From 949ed877fb1c580e9f5ebf71088844d1a0405c1c Mon Sep 17 00:00:00 2001
From: Sebastian Kloth <sebastian.kloth@pkm.tu-darmstadt.de>
Date: Mon, 8 Jan 2024 07:42:04 +0100
Subject: [PATCH] Added module for water structure parameters

---
 src/mdevaluate/extra/__init__.py |   1 +
 src/mdevaluate/extra/water.py    | 419 +++++++++++++++++++++++++++++++
 2 files changed, 420 insertions(+)
 create mode 100644 src/mdevaluate/extra/water.py

diff --git a/src/mdevaluate/extra/__init__.py b/src/mdevaluate/extra/__init__.py
index 8d1d8fb..9066fc3 100644
--- a/src/mdevaluate/extra/__init__.py
+++ b/src/mdevaluate/extra/__init__.py
@@ -1,2 +1,3 @@
 from . import chill
 from . import free_energy_landscape
+from . import water
diff --git a/src/mdevaluate/extra/water.py b/src/mdevaluate/extra/water.py
new file mode 100644
index 0000000..0a64f3b
--- /dev/null
+++ b/src/mdevaluate/extra/water.py
@@ -0,0 +1,419 @@
+from functools import partial
+from typing import Tuple, Callable, Optional
+
+import numpy as np
+from numpy.typing import NDArray, ArrayLike
+import pandas as pd
+from scipy.spatial import KDTree
+
+from ..distribution import hbonds
+from ..pbc import pbc_points
+from ..correlation import shifted_correlation, overlap
+from ..coordinates import Coordinates, CoordinateFrame
+
+
+def tanaka_zeta(
+    trajectory: Coordinates, angle: float = 30, segments: int = 100, skip: float = 0.1
+) -> pd.DataFrame:
+    frame_indices = np.unique(
+        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
+    )
+    sel = trajectory.atom_subset.selection
+    A = np.where(
+        trajectory.subset(atom_name="OW", residue_name="SOL").atom_subset.selection[sel]
+    )[0]
+    D = np.vstack([A] * 2).T.reshape((-1,))
+    H = np.where(
+        trajectory.subset(atom_name="HW.", residue_name="SOL").atom_subset.selection[
+            sel
+        ]
+    )[0]
+
+    zeta_dist = []
+    zeta_cg_dist = []
+    for frame_index in frame_indices:
+        D_frame = trajectory[frame_index][D]
+        H_frame = trajectory[frame_index][H]
+        A_frame = trajectory[frame_index][A]
+        box = trajectory[frame_index].box
+        pairs = hbonds(
+            D_frame, H_frame, A_frame, box, min_cos=np.cos(angle / 180 * np.pi)
+        )
+        pairs[:, 0] = np.int_((pairs[:, 0] / 2))
+        pairs = np.sort(pairs, axis=1)
+        pairs = np.unique(pairs, axis=0)
+        pairs = pairs.tolist()
+
+        A_PBC, A_index = pbc_points(A_frame, box, thickness=0.7, index=True)
+        A_tree = KDTree(A_PBC)
+        dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7)
+
+        dist_index = A_index[dist_index]
+        zeta = []
+        for i, indices in enumerate(dist_index):
+            dist_hbond = []
+            dist_non_hbond = []
+            for j, index in enumerate(indices):
+                if j != 0:
+                    if np.sort([indices[0], index]).tolist() in pairs:
+                        dist_hbond.append(dist[i, j])
+                    else:
+                        dist_non_hbond.append(dist[i, j])
+            try:
+                zeta.append(np.min(dist_non_hbond) - np.max(dist_hbond))
+            except ValueError:
+                zeta.append(0)
+
+        zeta = np.array(zeta)
+
+        dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7)
+        dist_index = A_index[dist_index]
+        dist_index = np.array(
+            [indices[dist[i] <= 0.35] for i, indices in enumerate(dist_index)]
+        )
+        zeta_cg = np.array([np.mean(zeta[indices]) for indices in dist_index])
+
+        bins = np.linspace(-0.1, 0.2, 301)
+        zeta_dist.append(np.histogram(zeta, bins=bins)[0])
+        zeta_cg_dist.append(np.histogram(zeta_cg, bins=bins)[0])
+        z = bins[1:] - (bins[1] - bins[0]) / 2
+
+    zeta_dist = np.mean(zeta_dist, axis=0)
+    zeta_dist = zeta_dist / np.mean(zeta_dist)
+
+    zeta_cg_dist = np.mean(zeta_cg_dist, axis=0)
+    zeta_cg_dist = zeta_cg_dist / np.mean(zeta_cg_dist)
+
+    return pd.DataFrame({"zeta": z, "result": zeta_dist, "result_cg": zeta_cg_dist})
+
+
+def chi_four_trans(
+    trajectory: Coordinates, skip: float = 0.1, segments: int = 10000
+) -> pd.DataFrame:
+    traj = trajectory.nojump
+    N = len(trajectory[0])
+    t, S = shifted_correlation(
+        partial(overlap, radius=0.1), traj, skip=skip, segments=segments, average=False
+    )
+    chi = 1 / N * S.var(axis=0)[1:]
+    return pd.DataFrame({"time": t[1:], "chi": chi})
+
+
+def tanaka_correlation_map(
+    trajectory: Coordinates,
+    data_chi_four_trans: pd.DataFrame,
+    angle: float = 30,
+    segments: int = 100,
+    skip: float = 0.1,
+) -> pd.DataFrame:
+    def tanaka_zeta_cg(
+        trajectory: Coordinates,
+        angle: float = 30,
+        segments: int = 1000,
+        skip: float = 0.1,
+    ) -> Tuple[NDArray, NDArray]:
+        frame_indices = np.unique(
+            np.int_(
+                np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments)
+            )
+        )
+        sel = trajectory.atom_subset.selection
+        A = np.where(
+            trajectory.subset(atom_name="OW", residue_name="SOL").atom_subset.selection[
+                sel
+            ]
+        )[0]
+        D = np.vstack([A] * 2).T.reshape((-1,))
+        H = np.where(
+            trajectory.subset(
+                atom_name="HW.", residue_name="SOL"
+            ).atom_subset.selection[sel]
+        )[0]
+
+        zeta_cg = []
+        times = []
+        for frame_index in frame_indices:
+            D_frame = trajectory[frame_index][D]
+            H_frame = trajectory[frame_index][H]
+            A_frame = trajectory[frame_index][A]
+            box = trajectory[frame_index].box
+            pairs = hbonds(
+                D_frame, H_frame, A_frame, box, min_cos=np.cos(angle / 180 * np.pi)
+            )
+            pairs[:, 0] = np.int_((pairs[:, 0] / 2))
+            pairs = np.sort(pairs, axis=1)
+            pairs = np.unique(pairs, axis=0)
+            pairs = pairs.tolist()
+
+            A_PBC, A_index = pbc_points(A_frame, box, thickness=0.7, index=True)
+            A_tree = KDTree(A_PBC)
+            dist, dist_index = A_tree.query(A_frame, 16, distance_upper_bound=0.7)
+
+            dist_index = A_index[dist_index]
+            zeta = []
+            for i, indices in enumerate(dist_index):
+                dist_hbond = []
+                dist_non_hbond = []
+                for j, index in enumerate(indices):
+                    if j != 0:
+                        if np.sort([indices[0], index]).tolist() in pairs:
+                            dist_hbond.append(dist[i, j])
+                        else:
+                            dist_non_hbond.append(dist[i, j])
+                try:
+                    zeta.append(np.min(dist_non_hbond) - np.max(dist_hbond))
+                except ValueError:
+                    zeta.append(0)
+            zeta = np.array(zeta)
+            dist_index = np.array(
+                [indices[dist[i] <= 0.35] for i, indices in enumerate(dist_index)]
+            )
+            zeta_cg.append(np.array([np.mean(zeta[indices]) for indices in dist_index]))
+            times.append(trajectory[frame_index].time)
+        return np.array(times), np.array(zeta_cg)
+
+    def delta_r_max(
+        trajectory: Coordinates, frame: CoordinateFrame, tau_4: float
+    ) -> NDArray:
+        dt = trajectory[1].time - trajectory[0].time
+        index_start = frame.step
+        index_end = index_start + int(tau_4 / dt) + 1
+        frame_indices = np.arange(index_start, index_end + 1)
+        end_cords = np.array([trajectory[frame_index] for frame_index in frame_indices])
+        vectors = trajectory[index_start] - end_cords
+
+        delta_r = np.linalg.norm(vectors, axis=-1)
+        delta_r = np.max(delta_r, axis=0)
+        return delta_r
+
+    d = np.array(data_chi_four_trans[["time", "chi"]])
+    mask = d[:, 1] >= 0.7 * np.max(d[:, 1])
+    fit = np.polyfit(d[mask, 0], d[mask, 1], 4)
+    p = np.poly1d(fit)
+    x_inter = np.linspace(d[mask, 0][0], d[mask, 0][-1], 1e6)
+    y_inter = p(x_inter)
+    tau_4 = x_inter[y_inter == np.max(y_inter)]
+
+    oxygens = trajectory.nojump.subset(atom_name="OW")
+    window = tau_4 / trajectory[-1].time
+    start_frames = np.unique(
+        np.linspace(
+            len(trajectory) * skip,
+            len(trajectory) * (1 - window),
+            num=segments,
+            endpoint=False,
+            dtype=int,
+        )
+    )
+
+    times, zeta_cg = tanaka_zeta_cg(trajectory, angle=angle)
+
+    zeta_cg_mean = np.array(
+        [
+            np.mean(
+                zeta_cg[
+                    (times >= trajectory[start_frame].time)
+                    * (times <= (trajectory[start_frame].time + tau_4))
+                ],
+                axis=0,
+            )
+            for start_frame in start_frames
+        ]
+    ).flatten()
+    delta_r = np.array(
+        [
+            delta_r_max(oxygens, oxygens[start_frame], tau_4)
+            for start_frame in start_frames
+        ]
+    ).flatten()
+    return pd.DataFrame({"zeta_cg": zeta_cg_mean, "delta_r": delta_r})
+
+
+def LSI_atom(distances: ArrayLike) -> NDArray:
+    r_j = distances[distances <= 0.37]
+    r_j = r_j.tolist()
+    r_j.append(distances[len(r_j)])
+    delta_ji = [r_j[i + 1] - r_j[i] for i in range(0, len(r_j) - 1)]
+    mean_delta_i = np.mean(delta_ji)
+    I = 1 / len(delta_ji) * np.sum((mean_delta_i - delta_ji) ** 2)
+    return I
+
+
+def LSI(
+    trajectory: Coordinates, segments: int = 10000, skip: float = 0
+) -> pd.DataFrame:
+    def LSI_distribution(
+        frame: CoordinateFrame, bins: NDArray, selector: Optional[Callable] = None
+    ) -> NDArray:
+        atoms_PBC = pbc_points(frame, frame.box, thickness=0.7)
+        atoms_tree = KDTree(atoms_PBC)
+        if selector:
+            index = selector(frame)
+        else:
+            index = np.arange(len(frame))
+        dist, _ = atoms_tree.query(frame[index], 50, distance_upper_bound=0.6)
+        distances = dist[:, 1:]
+        LSI_values = np.array([LSI_atom(distance) for distance in distances])
+        dist = np.histogram(LSI_values, bins=bins, density=True)[0]
+        return dist
+
+    bins = np.linspace(0, 0.007, 201)
+    I = bins[1:] - (bins[1] - bins[0]) / 2
+
+    frame_indices = np.unique(
+        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
+    )
+    distributions = np.array(
+        [
+            LSI_distribution(trajectory[frame_index], trajectory, bins, selector=None)
+            for frame_index in frame_indices
+        ]
+    )
+    P = np.mean(distributions, axis=0)
+    return pd.DataFrame({"I": I, "P": P})
+
+
+def HDL_LDL_positions(frame, trajectory, selector=None):
+    atoms_PBC = pbc_points(frame, frame.box, thickness=0.7)
+    atoms_tree = KDTree(atoms_PBC)
+    if selector:
+        index = selector(frame)
+    else:
+        index = range(len(frame))
+    dist = atoms_tree.query(frame[index], 50, distance_upper_bound=0.6)[0]
+    distances = dist[:, 1:]
+    LSI_values = np.array([LSI_atom(distance) for distance in distances])
+    LDL = LSI_values >= 0.0013
+    HDL = LSI_values < 0.0013
+    pos_HDL = frame[index][HDL]
+    pos_LDL = frame[index][LDL]
+    return pos_HDL, pos_LDL
+
+
+def HDL_LDL_gr(trajectory, segments=10000, skip=0):
+    def gr_frame(frame, trajectory, bins):
+        atoms_ALL = frame
+        atoms_HDL, atoms_LDL = HDL_LDL_positions(frame, trajectory)
+
+        atoms_PBC_ALL = pbc_points(atoms_ALL, frame.box)
+        atoms_PBC_LDL = pbc_points(atoms_LDL, frame.box)
+        atoms_PBC_HDL = pbc_points(atoms_HDL, frame.box)
+
+        tree_ALL = KDTree(atoms_PBC_ALL)
+        tree_LDL = KDTree(atoms_PBC_LDL)
+        tree_HDL = KDTree(atoms_PBC_HDL)
+
+        dist_ALL_ALL, _ = tree_ALL.query(
+            atoms_ALL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
+        )
+        dist_HDL_HDL, _ = tree_HDL.query(
+            atoms_HDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
+        )
+        dist_LDL_LDL, _ = tree_LDL.query(
+            atoms_LDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
+        )
+        dist_HDL_LDL, _ = tree_LDL.query(
+            atoms_HDL, len(frame) // 2, distance_upper_bound=bins[-1] + 0.1
+        )
+
+        dist_ALL_ALL = dist_ALL_ALL[:, 1:].flatten()
+        dist_HDL_HDL = dist_HDL_HDL[:, 1:].flatten()
+        dist_LDL_LDL = dist_LDL_LDL[:, 1:].flatten()
+        dist_HDL_LDL = dist_HDL_LDL.flatten()
+
+        hist_ALL_ALL = np.histogram(
+            dist_ALL_ALL, bins=bins, range=(0, bins[-1]), density=False
+        )[0]
+        hist_HDL_HDL = np.histogram(
+            dist_HDL_HDL, bins=bins, range=(0, bins[-1]), density=False
+        )[0]
+        hist_LDL_LDL = np.histogram(
+            dist_LDL_LDL, bins=bins, range=(0, bins[-1]), density=False
+        )[0]
+        hist_HDL_LDL = np.histogram(
+            dist_HDL_LDL, bins=bins, range=(0, bins[-1]), density=False
+        )[0]
+
+        n = [len(atoms_ALL), len(atoms_HDL), len(atoms_LDL)] / np.prod(
+            np.diag(frame.box)
+        )
+
+        return (
+            np.array(
+                [
+                    hist_ALL_ALL / len(atoms_ALL),
+                    hist_HDL_HDL / len(atoms_HDL),
+                    hist_LDL_LDL / len(atoms_LDL),
+                    hist_HDL_LDL / len(atoms_HDL),
+                ]
+            ),
+            n,
+        )
+
+    start_frame = trajectory[int(len(trajectory) * skip)]
+    upper_bound = round(np.min(np.diag(start_frame.box)) / 2 - 0.05, 1)
+    bins = np.linspace(0, upper_bound, upper_bound * 500 + 1)
+    frame_indices = np.unique(
+        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
+    )
+
+    gr = []
+    ns = []
+    for frame_index in frame_indices:
+        hists, n = gr_frame(trajectory[frame_index], trajectory, bins)
+        gr.append(hists)
+        ns.append(n)
+
+    gr = np.mean(gr, axis=0)
+    gr = gr / (4 / 3 * np.pi * bins[1:] ** 3 - 4 / 3 * np.pi * bins[:-1] ** 3)
+    r = bins[1:] - (bins[1] - bins[0]) / 2
+    n = np.mean(ns, axis=0)
+
+    return pd.DataFrame(
+        {"r": r, "gr_ALL": [0], "gr_HDL": gr[1], "gr_LDL": gr[2], "gr_MIX": gr[3]}
+    )
+
+
+def HDL_LDL_concentration(trajectory, segments=10000, skip=0):
+    def HDL_LDL_concentration_frame(frame, trajectrory, bins):
+        atoms_HDL, atoms_LDL = HDL_LDL_positions(frame, trajectory)
+        atoms_PBC_HDL = pbc_points(atoms_HDL, frame.box, thickness=0.61)
+        atoms_PBC_LDL = pbc_points(atoms_LDL, frame.box, thickness=0.61)
+        tree_LDL = KDTree(atoms_PBC_LDL)
+        tree_HDL = KDTree(atoms_PBC_HDL)
+        dist_HDL_HDL, _ = tree_HDL.query(atoms_HDL, 31, distance_upper_bound=0.6)
+        dist_HDL_LDL, _ = tree_LDL.query(atoms_HDL, 30, distance_upper_bound=0.6)
+        HDL_near_HDL = np.sum(
+            dist_HDL_HDL <= 0.5, axis=-1
+        )  # Ausgangsteilchen dazu zählen
+        LDL_near_HDL = np.sum(dist_HDL_LDL <= 0.5, axis=-1)
+        x_HDL = HDL_near_HDL / (HDL_near_HDL + LDL_near_HDL)
+        x_HDL_dist = np.histogram(x_HDL, bins=bins, range=(0, bins[-1]), density=True)[
+            0
+        ]
+        dist_LDL_LDL, _ = tree_LDL.query(atoms_LDL, 31, distance_upper_bound=0.6)
+        dist_LDL_HDL, _ = tree_HDL.query(atoms_LDL, 30, distance_upper_bound=0.6)
+        LDL_near_LDL = np.sum(
+            dist_LDL_LDL <= 0.5, axis=-1
+        )  # Ausgangsteilchen dazu zählen
+        HDL_near_LDL = np.sum(dist_LDL_HDL <= 0.5, axis=-1)
+        x_LDL = LDL_near_LDL / (LDL_near_LDL + HDL_near_LDL)
+        x_LDL_dist = np.histogram(x_LDL, bins=bins, range=(0, bins[-1]), density=True)[
+            0
+        ]
+        return x_HDL_dist, x_LDL_dist
+
+    bins = np.linspace(0, 1, 21)
+    x = bins[1:] - (bins[1] - bins[0]) / 2
+    frame_indices = np.unique(
+        np.int_(np.linspace(len(trajectory) * skip, len(trajectory) - 1, num=segments))
+    )
+    local_concentration_dist = np.array(
+        [
+            HDL_LDL_concentration_frame(trajectory[frame_index], trajectory, bins)
+            for frame_index in frame_indices
+        ]
+    )
+    x_HDL = np.mean(local_concentration_dist[:, 0], axis=0)
+    x_LDL = np.mean(local_concentration_dist[:, 1], axis=0)
+    return pd.DataFrame({"x": x, "x_HDL": x_HDL, "x_LDL": x_LDL})