From 6d8b86c1ef907538647c5cffc122937b961e415e Mon Sep 17 00:00:00 2001
From: robrobo <robin.horstmann@gmx.net>
Date: Mon, 16 Jun 2025 22:15:22 +0200
Subject: [PATCH] extended checksum.strip_comments function to work with
 prefixed docstrings and other small features

---
 src/mdevaluate/checksum.py | 49 +++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/src/mdevaluate/checksum.py b/src/mdevaluate/checksum.py
index 5d00aa4..fa6d85f 100755
--- a/src/mdevaluate/checksum.py
+++ b/src/mdevaluate/checksum.py
@@ -4,6 +4,10 @@ from .logging_util import logger
 from types import ModuleType, FunctionType
 import inspect
 from typing import Iterable
+import ast
+import io
+import tokenize
+import re
 
 import numpy as np
 
@@ -28,16 +32,43 @@ def version(version_nr: int, calls: Iterable = ()):
     return decorator
 
 
-def strip_comments(s: str):
-    """Strips comment lines and docstring from Python source string."""
-    o = ""
-    in_docstring = False
-    for l in s.split("\n"):
-        if l.strip().startswith(("#", '"', "'")) or in_docstring:
-            in_docstring = l.strip().startswith(('"""', "'''")) + in_docstring == 1
+def strip_comments(source: str) -> str:
+    """Return ``source`` without docstrings, comments and blank lines.
+
+    The text is dedented (so the source of a method or nested function
+    parses), parsed with ``ast``, stripped of docstrings -- raw/unicode
+    prefixed and empty ones included -- and regenerated with
+    ``ast.unparse``.  The tree holds no comments, so the regenerated code has
+    none and its formatting is normalised.
+
+    Text that does not parse on its own (e.g. a statement fragment) is returned
+    with only its blank and comment-only lines removed instead of raising.
+    """
+    import textwrap  # imported locally to keep this function self-contained
+
+    try:
+        tree = ast.parse(textwrap.dedent(source))
+    except SyntaxError:
+        # Not valid stand-alone code: strip what can be recognised line by line.
+        kept = (ln for ln in source.splitlines() if ln.strip())
+        return "\n".join(ln for ln in kept if not ln.lstrip().startswith("#"))
+
+    # Node types whose first statement may be a docstring.
+    doc_owners = (ast.Module, ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)
+    # Editing ``body`` during the walk is harmless: a removed docstring
+    # statement is at most visited later and never matches ``doc_owners``.
+    for node in ast.walk(tree):
+        if not (isinstance(node, doc_owners) and node.body):
             continue
-        o += l + "\n"
-    return o
+        first = node.body[0]
+        value = getattr(first, "value", None)
+        # A docstring is a leading expression statement holding a str constant.
+        if isinstance(first, ast.Expr) and isinstance(value, ast.Constant):
+            if isinstance(value.value, str):
+                del node.body[0]
+
+    # ast.unparse separates definitions with blank lines; drop those as well.
+    return "\n".join(ln for ln in ast.unparse(tree).splitlines() if ln.strip())
 
 
 def checksum(*args, csum=None):