Fixed formatting

Added autosave to shifted_correlation
Improved performance and split up shifted_correlation
2024-06-14 10:12:50 +02:00 · 2024-06-14 10:12:18 +02:00 · 2024-06-14 10:11:20 +02:00 · 2024-06-14 10:10:58 +02:00 · 2024-06-14 10:10:37 +02:00
12 changed files with 297 additions and 211 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "mdevaluate"
-version = "24.02"
+version = "24.06"
 dependencies = [
    "mdanalysis",
    "pandas",
--- a/src/mdevaluate/init.py
+++ b/src/mdevaluate/init.py
@@ -16,7 +16,7 @@ from . import reader
 from . import system
 from . import utils
 from . import extra
-from .logging_util import logger
+from .logging import logger


 def open(
--- a/src/mdevaluate/autosave.py
+++ b/src/mdevaluate/autosave.py
@@ -5,7 +5,7 @@ from typing import Optional, Callable, Iterable

 import numpy as np
 from .checksum import checksum
-from .logging_util import logger
+from .logging import logger

 autosave_directory: Optional[str] = None
 load_autosave_data = False
--- a/src/mdevaluate/checksum.py
+++ b/src/mdevaluate/checksum.py
@@ -1,14 +1,9 @@
 import functools
 import hashlib
-from .logging_util import logger
+from .logging import logger
 from types import ModuleType, FunctionType
 import inspect
 from typing import Iterable
-import ast
-import io
-import tokenize
-import re
-import textwrap

 import numpy as np

@@ -33,46 +28,19 @@ def version(version_nr: int, calls: Iterable = ()):
    return decorator


-def strip_comments(source: str) -> str:
-    """Removes docstrings, comments, and irrelevant whitespace from Python source code."""
-
-    # Step 1: Remove docstrings using AST
-    def remove_docstrings(node):
-        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)):
-            if (doc := ast.get_docstring(node, clean=False)):
-                first_stmt = node.body[0]
-                if isinstance(first_stmt, ast.Expr) and isinstance(first_stmt.value, ast.Constant):
-                    node.body.pop(0)  # Remove the docstring entirely
-        for child in ast.iter_child_nodes(node):
-            remove_docstrings(child)
-
-    tree = ast.parse(textwrap.dedent(source))
-    remove_docstrings(tree)
-    code_without_docstrings = ast.unparse(tree)
-
-    # Step 2: Remove comments using tokenize
-    tokens = tokenize.generate_tokens(io.StringIO(code_without_docstrings).readline)
-    result = []
-    last_lineno = -1
-    last_col = 0
-
-    for toknum, tokval, (srow, scol), (erow, ecol), line in tokens:
-        if toknum == tokenize.COMMENT:
+def strip_comments(s: str):
+    """Strips comment lines and docstring from Python source string."""
+    o = ""
+    in_docstring = False
+    for l in s.split("\n"):
+        if l.strip().startswith(("#", '"', "'")) or in_docstring:
+            in_docstring = l.strip().startswith(('"""', "'''")) + in_docstring == 1
            continue
-        if srow > last_lineno:
-            last_col = 0
-        if scol > last_col:
-            result.append(" " * (scol - last_col))
-        result.append(tokval)
-        last_lineno, last_col = erow, ecol
-
-    code_no_comments = ''.join(result)
-
-    # Step 3: Remove empty lines (whitespace-only or truly blank)
-    return "\n".join([line for line in code_no_comments.splitlines() if line.strip() != ""])
+        o += l + "\n"
+    return o


-def checksum(*args, csum=None, _seen=None):
+def checksum(*args, csum=None):
    """
    Calculate a checksum of any object, by sha1 hash.

@@ -92,15 +60,7 @@ def checksum(*args, csum=None, _seen=None):
        csum = hashlib.sha1()
        csum.update(str(SALT).encode())

-    if _seen is None:
-        _seen = set()
-
    for arg in args:
-        obj_id = id(arg)
-        if obj_id in _seen:
-            continue
-        _seen.add(obj_id)
-
        if hasattr(arg, "__checksum__"):
            logger.debug("Checksum via __checksum__: %s", str(arg))
            csum.update(str(arg.__checksum__()).encode())
@@ -113,19 +73,17 @@ def checksum(*args, csum=None, _seen=None):
        elif isinstance(arg, FunctionType):
            csum.update(strip_comments(inspect.getsource(arg)).encode())
            c = inspect.getclosurevars(arg)
-            merged = {**c.nonlocals, **c.globals}
-            for key in sorted(merged):  # deterministic ordering
-                v = merged[key]
+            for v in {**c.nonlocals, **c.globals}.values():
                if v is not arg:
-                    checksum(v, csum=csum, _seen=_seen)
+                    checksum(v, csum=csum)
        elif isinstance(arg, functools.partial):
            logger.debug("Checksum via partial for %s", str(arg))
-            checksum(arg.func, csum=csum, _seen=_seen)
+            checksum(arg.func, csum=csum)
            for x in arg.args:
-                checksum(x, csum=csum, _seen=_seen)
+                checksum(x, csum=csum)
            for k in sorted(arg.keywords.keys()):
                csum.update(k.encode())
-                checksum(arg.keywords[k], csum=csum, _seen=_seen)
+                checksum(arg.keywords[k], csum=csum)
        elif isinstance(arg, np.ndarray):
            csum.update(arg.tobytes())
        else:
--- a/src/mdevaluate/coordinates.py
+++ b/src/mdevaluate/coordinates.py
@@ -1,6 +1,6 @@
 from functools import partial, wraps
 from copy import copy
-from .logging_util import logger
+from .logging import logger
 from typing import Optional, Callable, List, Tuple

 import numpy as np
--- a/src/mdevaluate/correlation.py
+++ b/src/mdevaluate/correlation.py
@@ -13,9 +13,95 @@ from .pbc import pbc_diff, pbc_points
 from .coordinates import Coordinates, CoordinateFrame, displacements_without_drift


-def log_indices(first: int, last: int, num: int = 100) -> np.ndarray:
-    ls = np.logspace(0, np.log10(last - first + 1), num=num)
-    return np.unique(np.int_(ls) - 1 + first)
+def _is_multi_selector(selection):
+    if len(selection) == 0:
+        return False
+    elif (
+        isinstance(selection[0], int)
+        or isinstance(selection[0], bool)
+        or isinstance(selection[0], np.integer)
+        or isinstance(selection[0], np.bool_)
+    ):
+        return False
+    else:
+        for indices in selection:
+            if len(indices) == 0:
+                continue
+            elif (
+                isinstance(indices[0], int)
+                or isinstance(indices[0], bool)
+                or isinstance(indices[0], np.integer)
+                or isinstance(indices[0], np.bool_)
+            ):
+                return True
+            else:
+                raise ValueError(
+                    "selector has more than two dimensions or does not "
+                    "contain int or bool types"
+                )
+
+
+def _calc_correlation(
+    frames: Coordinates,
+    start_frame: CoordinateFrame,
+    function: Callable,
+    selection: np.ndarray,
+    shifted_idx: np.ndarray,
+) -> np.ndarray:
+    if len(selection) == 0:
+        correlation = np.zeros(len(shifted_idx))
+    else:
+        start = start_frame[selection]
+        correlation = np.array(
+            [
+                function(start, frames[frame_index][selection])
+                for frame_index in shifted_idx
+            ]
+        )
+    return correlation
+
+
+def _calc_correlation_multi(
+    frames: Coordinates,
+    start_frame: CoordinateFrame,
+    function: Callable,
+    selection: np.ndarray,
+    shifted_idx: np.ndarray,
+) -> np.ndarray:
+    correlations = np.zeros((len(selection), len(shifted_idx)))
+    for i, frame_index in enumerate(shifted_idx):
+        frame = frames[frame_index]
+        for j, current_selection in enumerate(selection):
+            if len(selection) == 0:
+                correlations[j, i] = 0
+            else:
+                correlations[j, i] = function(
+                    start_frame[current_selection], frame[current_selection]
+                )
+    return correlations
+
+
+def _average_correlation(result):
+    averaged_result = []
+    for n in range(result.shape[1]):
+        clean_result = []
+        for entry in result[:, n]:
+            if np.all(entry == 0):
+                continue
+            else:
+                clean_result.append(entry)
+        averaged_result.append(np.average(np.array(clean_result), axis=0))
+    return np.array(averaged_result)
+
+
+def _average_correlation_multi(result):
+    clean_result = []
+    for entry in result:
+        if np.all(entry == 0):
+            continue
+        else:
+            clean_result.append(entry)
+    return np.average(np.array(clean_result), axis=0)


@autosave_data(2)
@@ -28,113 +114,82 @@ def shifted_correlation(
    window: float = 0.5,
    average: bool = True,
    points: int = 100,
-) -> (np.ndarray, np.ndarray):
+) -> tuple[np.ndarray, np.ndarray]:
+    """Compute a time-dependent correlation function for a given trajectory.
+
+    To improve statistics, multiple (possibly overlapping) windows will be
+    layed over the whole trajectory and the correlation is computed for them separately.
+    The start frames of the windows are spaced linearly over the valid region of
+    the trajectory (skipping frames in the beginning given by skip parameter).
+
+    The points within each window are spaced logarithmically.
+
+    Only a certain subset of the given atoms may be selected for each window
+    individually using a selector function.
+
+    Note that this function is specifically optimized for multi selectors, which select
+    multiple selection sets per window, for which the correlation is to be computed
+    separately.
+
+
+    Arguments
+    ---------
+    function:
+        The (correlation) function to evaluate.
+        Should be of the form (CoordinateFrame, CoordinateFrame) -> float
+
+    frames:
+        Trajectory to evaluate on
+
+    selector: (optional)
+        Selection function so select only certain selection sets for each start frame.
+        Should be of the form
+            (CoordinateFrame) -> list[A]
+        where A is something you can index an ndarray with.
+        For example a list of indices or a bool array.
+        Must return the same number of selection sets for every frame.
+
+    segments:
+        Number of start frames
+
+    skip:
+        Percentage of trajectory to skip from the start
+
+    window:
+        Length of each segment given as percentage of trajectory
+
+    average:
+        Whether to return averaged results.
+        See below for details on the returned ndarray.
+
+    points:
+        Number of points per segment
+
+
+    Returns
+    -------
+    times: ndarray
+        1d array of time differences to start frame
+    result: ndarray
+        2d ndarray of averaged (or non-averaged) correlations.
+
+        When average==True (default) the returned array will be of the shape (S, P)
+        where S is the number of selection sets and P the number of points per window.
+        For selection sets that where empty for all start frames all data points will be
+        zero.
+
+        When average==False the returned array will be of shape (W, S) with
+        dtype=object. The elements are either ndarrays of shape (P,) containing the
+        correlation data for the specific window and selection set or None if the
+        corresponding selection set was empty.
+        W is the number of segments (windows).
+        S and P are the same as for average==True.
+
    """
-    Calculate the time series for a correlation function.
-
-    The times at which the correlation is calculated are determined by
-    a logarithmic distribution.
-
-    Args:
-        function: The function that should be correlated
-        frames: The coordinates of the simulation data
-        selector (opt.):
-                    A function that returns the indices depending on
-                    the staring frame for which particles the
-                    correlation should be calculated.
-        segments (int, opt.):
-                    The number of segments the time window will be
-                    shifted
-        skip (float, opt.):
-                    The fraction of the trajectory that will be skipped
-                    at the beginning, if this is None the start index
-                    of the frames slice will be used, which defaults
-                    to 0.1.
-        window (float, opt.):
-                    The fraction of the simulation the time series will
-                    cover
-        average (bool, opt.):
-                    If True, returns averaged correlation function
-        points (int, opt.):
-                    The number of timeshifts for which the correlation
-                    should be calculated
-    Returns:
-        tuple:
-            A list of length N that contains the timeshiftes of the frames at which
-            the time series was calculated and a numpy array of shape (segments, N)
-            that holds the (non-avaraged) correlation data
-
-    Example:
-        Calculating the mean square displacement of a coordinate object
-        named ``coords``:
-
-        >>> time, data = shifted_correlation(msd, coords)
-    """
-
-    def get_correlation(
-        frames: CoordinateFrame,
-        start_frame: CoordinateFrame,
-        index: np.ndarray,
-        shifted_idx: np.ndarray,
-    ) -> np.ndarray:
-        if len(index) == 0:
-            correlation = np.zeros(len(shifted_idx))
-        else:
-            start = frames[start_frame][index]
-            correlation = np.array(
-                [function(start, frames[frame][index]) for frame in shifted_idx]
-            )
-        return correlation
-
-    def apply_selector(
-        start_frame: CoordinateFrame,
-        frames: CoordinateFrame,
-        idx: np.ndarray,
-        selector: Optional[Callable] = None,
-    ):
-        shifted_idx = idx + start_frame
-
-        if selector is None:
-            index = np.arange(len(frames[start_frame]))
-            return get_correlation(frames, start_frame, index, shifted_idx)
-        else:
-            index = selector(frames[start_frame])
-            if len(index) == 0:
-                return np.zeros(len(shifted_idx))
-
-            elif (
-                isinstance(index[0], int)
-                or isinstance(index[0], bool)
-                or isinstance(index[0], np.integer)
-                or isinstance(index[0], np.bool_)
-            ):
-                return get_correlation(frames, start_frame, index, shifted_idx)
-            else:
-                correlations = []
-                for ind in index:
-                    if len(ind) == 0:
-                        correlations.append(np.zeros(len(shifted_idx)))
-
-                    elif (
-                        isinstance(ind[0], int)
-                        or isinstance(ind[0], bool)
-                        or isinstance(ind[0], np.integer)
-                        or isinstance(ind[0], np.bool_)
-                    ):
-                        correlations.append(
-                            get_correlation(frames, start_frame, ind, shifted_idx)
-                        )
-                    else:
-                        raise ValueError(
-                            "selector has more than two dimensions or does not "
-                            "contain int or bool types"
-                        )
-                return correlations
-
    if 1 - skip < window:
        window = 1 - skip

-    start_frames = np.unique(
+    start_frame_indices = np.unique(
        np.linspace(
            len(frames) * skip,
            len(frames) * (1 - window),
@@ -144,28 +199,44 @@ def shifted_correlation(
        )
    )

-    num_frames = int(len(frames) * window)
-    ls = np.logspace(0, np.log10(num_frames + 1), num=points)
-    idx = np.unique(np.int_(ls) - 1)
-    t = np.array([frames[i].time for i in idx]) - frames[0].time
-
-    result = np.array(
-        [
-            apply_selector(start_frame, frames=frames, idx=idx, selector=selector)
-            for start_frame in start_frames
-        ]
+    num_frames_per_window = int(len(frames) * window)
+    logspaced_indices = np.logspace(0, np.log10(num_frames_per_window + 1), num=points)
+    logspaced_indices = np.unique(np.int_(logspaced_indices) - 1)
+    logspaced_time = (
+        np.array([frames[i].time for i in logspaced_indices]) - frames[0].time
    )

-    if average:
-        clean_result = []
-        for entry in result:
-            if np.all(entry == 0):
-                continue
+    if selector is None:
+        multi_selector = False
    else:
-                clean_result.append(entry)
-        result = np.array(clean_result)
-        result = np.average(result, axis=0)
-    return t, result
+        selection = selector(frames[0])
+        multi_selector = _is_multi_selector(selection)
+
+    result = []
+    for start_frame_index in start_frame_indices:
+        shifted_idx = logspaced_indices + start_frame_index
+        start_frame = frames[start_frame_index]
+        if selector is None:
+            selection = np.arange(len(start_frame))
+        else:
+            selection = selector(start_frame)
+        if multi_selector:
+            result_segment = _calc_correlation_multi(
+                frames, start_frame, function, selection, shifted_idx
+            )
+        else:
+            result_segment = _calc_correlation(
+                frames, start_frame, function, selection, shifted_idx
+            )
+        result.append(result_segment)
+    result = np.array(result)
+
+    if average:
+        if multi_selector:
+            result = _average_correlation_multi(result)
+        else:
+            result = _average_correlation(result)
+    return logspaced_time, result


 def msd(
@@ -431,7 +502,7 @@ def non_gaussian_parameter(
    trajectory: Coordinates = None,
    axis: str = "all",
 ) -> float:
-    r"""
+    """
    Calculate the non-Gaussian parameter.
    ..math:
      \alpha_2 (t) =
--- a/src/mdevaluate/distribution.py
+++ b/src/mdevaluate/distribution.py
@@ -182,10 +182,10 @@ def tetrahedral_order(
    )

    # Connection vectors
-    neighbors_1 = pbc_diff(neighbors_1, atoms, box=atoms.box)
-    neighbors_2 = pbc_diff(neighbors_2, atoms, box=atoms.box)
-    neighbors_3 = pbc_diff(neighbors_3, atoms, box=atoms.box)
-    neighbors_4 = pbc_diff(neighbors_4, atoms, box=atoms.box)
+    neighbors_1 -= atoms
+    neighbors_2 -= atoms
+    neighbors_3 -= atoms
+    neighbors_4 -= atoms

    # Normed Connection vectors
    neighbors_1 /= np.linalg.norm(neighbors_1, axis=-1).reshape(-1, 1)
--- a/src/mdevaluate/logging_util.py
+++ b/src/mdevaluate/logging_util.py
--- a/src/mdevaluate/pbc.py
+++ b/src/mdevaluate/pbc.py
@@ -7,7 +7,7 @@ from numpy.typing import ArrayLike, NDArray

 from itertools import product

-from .logging_util import logger
+from .logging import logger

 if TYPE_CHECKING:
    from mdevaluate.coordinates import CoordinateFrame
--- a/src/mdevaluate/reader.py
+++ b/src/mdevaluate/reader.py
@@ -19,13 +19,13 @@ import MDAnalysis
 from scipy import sparse

 from .checksum import checksum
-from .logging_util import logger
+from .logging import logger
 from . import atoms
 from .coordinates import Coordinates

 CSR_ATTRS = ("data", "indices", "indptr")
 NOJUMP_MAGIC = 2016
-Group_RE = re.compile(r"\[ ([-+\w]+) \]")
+Group_RE = re.compile("\[ ([-+\w]+) \]")


 class NojumpError(Exception):
@@ -275,7 +275,7 @@ def load_nojump_matrices(reader: BaseReader):
                "Loaded Nojump matrices: {}".format(nojump_load_filename(reader))
            )
        else:
-            logger.info("Invalid Nojump Data: {}".format(nojump_load_filename(reader)))
+            logger.info("Invlaid Nojump Data: {}".format(nojump_load_filename(reader)))
    except KeyError:
        logger.info("Removing zip-File: %s", zipname)
        os.remove(nojump_load_filename(reader))
--- a/src/mdevaluate/utils.py
+++ b/src/mdevaluate/utils.py
@@ -14,7 +14,7 @@ from scipy.ndimage import uniform_filter1d
 from scipy.interpolate import interp1d
 from scipy.optimize import curve_fit

-from .logging_util import logger
+from .logging import logger
 from .functions import kww, kww_1e


@@ -177,7 +177,7 @@ def coherent_sum(
    func: Callable[[ArrayLike, ArrayLike], float],
    coord_a: ArrayLike,
    coord_b: ArrayLike,
-) -> float:
+) -> NDArray:
    """
    Perform a coherent sum over two arrays :math:`A, B`.

--- a/test/test_correlation.py
+++ b/test/test_correlation.py
@@ -0,0 +1,57 @@
+import os
+import pytest
+
+import mdevaluate
+from mdevaluate import correlation
+import numpy as np
+
+
+@pytest.fixture
+def trajectory(request):
+    return mdevaluate.open(os.path.join(os.path.dirname(__file__), "data/water"))
+
+
+def test_shifted_correlation(trajectory):
+    test_array = np.array([100, 82, 65, 49, 39, 29, 20, 13, 7])
+    OW = trajectory.subset(atom_name="OW")
+    t, result = correlation.shifted_correlation(
+        correlation.isf, OW, segments=10, skip=0.1, points=10
+    )
+    assert (np.array(result * 100, dtype=int) == test_array).all()
+
+
+def test_shifted_correlation_no_average(trajectory):
+    t, result = correlation.shifted_correlation(
+        correlation.isf, trajectory, segments=10, skip=0.1, points=5, average=False
+    )
+    assert result.shape == (10, 5)
+
+
+def test_shifted_correlation_selector(trajectory):
+    test_array = np.array([100, 82, 64, 48, 37, 28, 19, 11, 5])
+
+    def selector(frame):
+        index = np.argwhere((frame[:, 0] >= 0) * (frame[:, 0] < 1))
+        return index.flatten()
+
+    OW = trajectory.subset(atom_name="OW")
+    t, result = correlation.shifted_correlation(
+        correlation.isf, OW, segments=10, skip=0.1, points=10, selector=selector
+    )
+    assert (np.array(result * 100, dtype=int) == test_array).all()
+
+
+def test_shifted_correlation_multi_selector(trajectory):
+    def selector(frame):
+        indices = []
+        for i in range(3):
+            x = frame[:, 0].flatten()
+            index = np.argwhere((x >= i) * (x < i + 1))
+            indices.append(index.flatten())
+        return indices
+
+    OW = trajectory.subset(atom_name="OW")
+    t, result = correlation.shifted_correlation(
+        correlation.isf, OW, segments=10, skip=0.1, points=10, selector=selector
+    )
+    assert result.shape == (3, 9)
Author	SHA1	Message	Date
Sebastian Kloth	adae920527	Fixed formatting	2024-06-14 10:12:50 +02:00
Sebastian Kloth	11793d7960	Added autosave to shifted_correlation	2024-06-14 10:12:18 +02:00
Sebastian Kloth	55b06fa61d	Improved performance and split up shifted_correlation	2024-06-14 10:11:20 +02:00
Sebastian Kloth	3aa91d7482	Updated version number and type hint	2024-06-14 10:10:58 +02:00
Sebastian Kloth	7d8c5d849d	Added tests for shifted_correlation	2024-06-14 10:10:37 +02:00