From 6d8b86c1ef907538647c5cffc122937b961e415e Mon Sep 17 00:00:00 2001
From: robrobo
Date: Mon, 16 Jun 2025 22:15:22 +0200
Subject: [PATCH] extended checksum.strip_comments function to work with prefixed docstrings and other small features

---
Review notes (this section is ignored when the patch is applied):
strip_comments now dedents its input before parsing, no longer re-tokenizes
the ast.unparse output (it never contains comments), and falls back to a
plain line filter when the input cannot be parsed. The commit id and the
post-image index hash still refer to the original submission.

 src/mdevaluate/checksum.py | 61 ++++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 10 deletions(-)

diff --git a/src/mdevaluate/checksum.py b/src/mdevaluate/checksum.py
index 5d00aa4..fa6d85f 100755
--- a/src/mdevaluate/checksum.py
+++ b/src/mdevaluate/checksum.py
@@ -4,6 +4,8 @@ from .logging_util import logger
 from types import ModuleType, FunctionType
 import inspect
 from typing import Iterable
+import ast
+import textwrap
 
 import numpy as np
 
@@ -28,16 +30,55 @@ def version(version_nr: int, calls: Iterable = ()):
     return decorator
 
 
-def strip_comments(s: str):
-    """Strips comment lines and docstring from Python source string."""
-    o = ""
-    in_docstring = False
-    for l in s.split("\n"):
-        if l.strip().startswith(("#", '"', "'")) or in_docstring:
-            in_docstring = l.strip().startswith(('"""', "'''")) + in_docstring == 1
-            continue
-        o += l + "\n"
-    return o
+def strip_comments(source: str) -> str:
+    """Return Python source without docstrings, comments and blank lines.
+
+    The source is parsed, the docstring of the module and of every class and
+    (async) function is dropped, and the tree is unparsed again. Comments are
+    not part of the AST and ``ast.unparse`` normalizes whitespace and quoting,
+    so edits to comments or layout alone do not change the result.
+
+    Indented snippets, such as the source of a method, are dedented first.
+    Text that still cannot be parsed is not rejected: it is returned with only
+    its blank and comment-only lines removed.
+    """
+    try:
+        tree = ast.parse(textwrap.dedent(source))
+    except (SyntaxError, ValueError):
+        # Fragments (e.g. part of a larger expression) have no AST; degrade to
+        # a plain line filter instead of raising.
+        return "\n".join(
+            line
+            for line in source.splitlines()
+            if line.strip() and not line.lstrip().startswith("#")
+        )
+
+    docstring_owners = (
+        ast.Module,
+        ast.ClassDef,
+        ast.FunctionDef,
+        ast.AsyncFunctionDef,
+    )
+    for node in ast.walk(tree):
+        if not isinstance(node, docstring_owners) or not node.body:
+            continue
+        first = node.body[0]
+        if not (
+            isinstance(first, ast.Expr)
+            and isinstance(first.value, ast.Constant)
+            and isinstance(first.value.value, str)
+        ):
+            continue
+        if len(node.body) == 1 and not isinstance(node, ast.Module):
+            # A docstring-only body must stay non-empty to remain valid code.
+            node.body[0] = ast.Pass()
+        else:
+            del node.body[0]
+
+    # ast.unparse puts blank lines around definitions; they carry no meaning.
+    return "\n".join(
+        line for line in ast.unparse(tree).splitlines() if line.strip()
+    )
 
 
 def checksum(*args, csum=None):