Axivion MISRA C:2012 Compliance Agent

Overview Schema Related Servers Score Discussions

MISRA-MCP
core

c_analyzer.py•70.7 KiB

""" C Analyzer — AST-based code analysis using tree-sitter. Provides deep structural understanding of C source files: • Function body extraction (full text, params, return type) • Parameter usage analysis (read/write counts) • Pointer write-through detection • Symbol scope classification (file/block/external) • Reachability analysis (code after return/break/continue) • Declaration and reference finding • Enum constant value computation • Cross-file analysis via WorkspaceIndex (optional) """ import os import re import logging from typing import List, Dict, Optional, Tuple, Set from dataclasses import dataclass, field import tree_sitter_c as tsc from tree_sitter import Language, Parser, Node logger = logging.getLogger(__name__) # ═══════════════════════════════════════════════════════════════════════ # Type System # ═══════════════════════════════════════════════════════════════════════ @dataclass class CType: """Represents a C type with properties relevant for MISRA analysis.""" name: str width: int # heuristic width in bits (e.g. 8, 16, 32, 64) is_signed: bool is_float: bool = False is_pointer: bool = False def __repr__(self): s = "signed" if self.is_signed else "unsigned" if self.is_float: return f"{self.name} (float{self.width})" if self.is_pointer: return f"{self.name} (ptr)" return f"{self.name} ({s} {self.width}-bit)" class TypeSystem: """ Manages C types, standard MISRA essential type models, and arithmetic conversions. """ def __init__(self): # Standard primitives (assuming 32-bit int, 64-bit long for analysis) self.types: Dict[str, CType] = { "void": CType("void", 0, False), "char": CType("char", 8, True), # Implementation defined, assume signed for now "signed char": CType("signed char", 8, True), "unsigned char": CType("unsigned char", 8, False), "short": CType("short", 16, True), "unsigned short": CType("unsigned short", 16, False), "int": CType("int", 32, True), "unsigned int": CType("unsigned int", 32, False), "unsigned": CType("unsigned int", 32, False), "long": CType("long", 64, True), "unsigned long": CType("unsigned long", 64, False), "float": CType("float", 32, True, is_float=True), "double": CType("double", 64, True, is_float=True), "bool": CType("bool", 1, False), "_Bool": CType("_Bool", 1, False), } # stdint.h types for w in (8, 16, 32, 64): self.types[f"int{w}_t"] = CType(f"int{w}_t", w, True) self.types[f"uint{w}_t"] = CType(f"uint{w}_t", w, False) def get_type(self, type_name: str) -> Optional[CType]: """Resolve a type definition string to a CType.""" name = type_name.strip() # Handle pointers roughly if "*" in name: return CType(name, 64, False, is_pointer=True) # Pointers are 64-bit unsigned-ish # Strip common qualifiers cleaned = re.sub(r'\b(const|volatile|static|register|extern)\b', '', name).strip() cleaned = re.sub(r'\s+', ' ', cleaned) return self.types.get(cleaned) def get_essential_type(self, type_obj: CType) -> str: """ Map to MISRA essential type categories: Boolean, Signed, Unsigned, Floating, Character """ if type_obj.name in ("bool", "_Bool"): return "Boolean" if type_obj.is_float: return "Floating" if type_obj.name in ("char", "signed char", "unsigned char"): return "Character" if type_obj.is_signed: return "Signed" return "Unsigned" def promote(self, left: CType, right: CType) -> CType: """ Determine the result type of a binary operation (Usual Arithmetic Conversions). Simplified for essential type analysis. """ if left.is_float or right.is_float: return left if left.width >= right.width else right # Integer promotion rules (simplified) # If one is unsigned and wider or equal, result is unsigned # If both are signed, larger wins l_width, r_width = left.width, right.width if l_width > r_width: return left elif r_width > l_width: return right else: # Equal width: if either is unsigned, result is unsigned if not left.is_signed or not right.is_signed: return CType(f"uint{l_width}_t", l_width, False) return left C_LANGUAGE = Language(tsc.language()) _parser = Parser(C_LANGUAGE) # ═══════════════════════════════════════════════════════════════════════ # Data types # ═══════════════════════════════════════════════════════════════════════ @dataclass class ParamInfo: """Information about a function parameter.""" name: str type_str: str is_pointer: bool = False read_count: int = 0 write_count: int = 0 read_lines: List[int] = field(default_factory=list) write_lines: List[int] = field(default_factory=list) @dataclass class FunctionInfo: """Structural information about a C function.""" name: str return_type: str signature: str # full signature text body_text: str # full function body including braces start_line: int # 1-indexed end_line: int # 1-indexed params: List[ParamInfo] = field(default_factory=list) is_static: bool = False is_inline: bool = False has_prototype: bool = False # whether a prior declaration exists @dataclass class SymbolRef: """A reference to a symbol in the source.""" name: str line: int # 1-indexed column: int context: str # surrounding line text is_definition: bool = False is_declaration: bool = False is_write: bool = False @dataclass class EnumConstant: """An enumeration constant with its computed value.""" name: str value: int is_implicit: bool # value was not explicitly assigned line: int # ═══════════════════════════════════════════════════════════════════════ # Analyzer # ═══════════════════════════════════════════════════════════════════════ class CAnalyzer: """AST-based C source file analyzer with optional cross-file support.""" def __init__(self, workspace_root: str, workspace_index=None, preprocessor=None): self.workspace_root = workspace_root self.index = workspace_index # Optional WorkspaceIndex for cross-file queries self.preprocessor = preprocessor # Optional PreprocessorEngine self._cache: Dict[str, Tuple[bytes, object]] = {} # path -> (source, tree) self.type_system = TypeSystem() def _resolve(self, file_path: str) -> str: """Resolve a (possibly POSIX-style) relative path to an absolute path.""" # Normalise separators so 'src/main.c' works on Windows too native = file_path.replace("/", os.sep).replace("\\", os.sep) if os.path.isabs(native): return native return os.path.join(self.workspace_root, native) def _get_tree(self, file_path: str) -> Tuple[Optional[bytes], Optional[object]]: """Parse file and cache the result.""" full = self._resolve(file_path) if full in self._cache: return self._cache[full] if not os.path.isfile(full): logger.warning("File not found: %s", full) return None, None try: with open(full, "rb") as f: source = f.read() # Skip binary files if b"\x00" in source[:8192]: logger.warning("Skipping binary file: %s", full) return None, None tree = _parser.parse(source) self._cache[full] = (source, tree) return source, tree except Exception as e: logger.error("Failed to parse %s: %s", full, e) return None, None def _source_lines(self, file_path: str) -> List[str]: source, _ = self._get_tree(file_path) if source is None: return [] return source.decode("utf-8", errors="replace").splitlines(keepends=True) def _node_text(self, node: Node, source: bytes) -> str: return source[node.start_byte:node.end_byte].decode("utf-8", errors="replace") # ──────────────────────────────────────────────────────────────── # Function extraction # ──────────────────────────────────────────────────────────────── def get_functions(self, file_path: str) -> List[FunctionInfo]: """Extract all function definitions from the file.""" source, tree = self._get_tree(file_path) if tree is None: return [] functions = [] for node in self._walk_type(tree.root_node, "function_definition"): fn = self._extract_function(node, source, file_path) if fn: functions.append(fn) return functions def get_function_at_line(self, file_path: str, line: int) -> Optional[FunctionInfo]: """Get the function containing a specific line (1-indexed).""" for fn in self.get_functions(file_path): if fn.start_line <= line <= fn.end_line: return fn return None def _extract_function(self, node: Node, source: bytes, file_path: str) -> Optional[FunctionInfo]: """Build a FunctionInfo from a function_definition node.""" # Get declarator (contains function name and params) declarator = self._find_child(node, "function_declarator") if declarator is None: # Try direct_declarator inside a pointer_declarator ptr_decl = self._find_child(node, "pointer_declarator") if ptr_decl: declarator = self._find_child(ptr_decl, "function_declarator") if declarator is None: return None # Function name name_node = self._find_child(declarator, "identifier") if name_node is None: return None name = self._node_text(name_node, source) # Return type — everything before the declarator ret_type = self._extract_return_type(node, source) # Storage class specifiers is_static = "static" in ret_type is_inline = "inline" in ret_type # Parameters param_list = self._find_child(declarator, "parameter_list") params = self._extract_params(param_list, source) if param_list else [] # Signature sig = self._node_text(node, source).split("{")[0].strip() # Body body_node = self._find_child(node, "compound_statement") body_text = self._node_text(body_node, source) if body_node else "" # Analyze parameter usage within the body if body_node and params: self._analyze_param_usage(params, body_node, source) # Check for prior declaration has_prototype = self._has_prior_declaration(name, node, source) fn = FunctionInfo( name=name, return_type=ret_type, signature=sig, body_text=body_text, start_line=node.start_point[0] + 1, end_line=node.end_point[0] + 1, params=params, is_static=is_static, is_inline=is_inline, has_prototype=has_prototype, ) return fn def _extract_return_type(self, fn_node: Node, source: bytes) -> str: """Extract the return type from a function definition.""" parts = [] for child in fn_node.children: if child.type in ("storage_class_specifier", "type_qualifier", "primitive_type", "sized_type_specifier", "type_identifier", "struct_specifier", "enum_specifier", "union_specifier"): parts.append(self._node_text(child, source)) elif child.type in ("function_declarator", "pointer_declarator", "identifier", "parenthesized_declarator"): break return " ".join(parts) if parts else "int" # implicit int def _extract_params(self, param_list: Node, source: bytes) -> List[ParamInfo]: """Extract parameter info from a parameter_list node.""" params = [] for child in param_list.children: if child.type == "parameter_declaration": name, type_str, is_ptr = self._parse_param_decl(child, source) if name: params.append(ParamInfo( name=name, type_str=type_str, is_pointer=is_ptr )) return params def _parse_param_decl(self, node: Node, source: bytes) -> Tuple[str, str, bool]: """Parse a single parameter declaration.""" full_text = self._node_text(node, source).strip() # Find the identifier (parameter name) ident = None is_ptr = False for child in self._walk_all(node): if child.type == "identifier" and child.parent and child.parent.type != "type_identifier": ident = self._node_text(child, source) if child.type == "pointer_declarator" or child.type == "abstract_pointer_declarator": is_ptr = True if child.type == "*": is_ptr = True # Type is everything except the name type_str = full_text.replace(ident, "").strip() if ident else full_text type_str = re.sub(r'\s+', ' ', type_str).strip() return ident or "", type_str, is_ptr # ──────────────────────────────────────────────────────────────── # Parameter usage analysis # ──────────────────────────────────────────────────────────────── def _analyze_param_usage(self, params: List[ParamInfo], body: Node, source: bytes): """Count read/write references to each parameter within the function body. Also detects when a pointer parameter is passed as a non-const argument to another function (indirect write potential). """ param_names = {p.name for p in params} param_map = {p.name: p for p in params} for node in self._walk_all(body): if node.type != "identifier": continue name = self._node_text(node, source) if name not in param_names: continue line = node.start_point[0] + 1 p = param_map[name] if self._is_write_context(node): p.write_count += 1 p.write_lines.append(line) elif p.is_pointer and self._is_passed_as_nonconst_arg(node, source): # Pointer passed to a function that takes non-const param — # treat as potential write (conservative) p.write_count += 1 p.write_lines.append(line) else: p.read_count += 1 p.read_lines.append(line) def _is_write_context(self, node: Node) -> bool: """Check if an identifier is being written to (LHS of assignment, &, etc.).""" parent = node.parent if parent is None: return False # Direct assignment: x = ... if parent.type == "assignment_expression": if parent.children and parent.children[0] == node: return True # Compound assignment: x += ... if parent.type == "update_expression": return True # Pointer dereference write: *x = ... or x[i] = ... if parent.type == "pointer_expression": grandparent = parent.parent if grandparent and grandparent.type == "assignment_expression": if grandparent.children and grandparent.children[0] == parent: return True # Subscript write: x[i] = ... if parent.type == "subscript_expression": grandparent = parent.parent if grandparent and grandparent.type == "assignment_expression": if grandparent.children and grandparent.children[0] == parent: return True # Address-of (could be write): &x if parent.type == "pointer_expression": op = self._find_child(parent, "&") if op: return True # conservative: treat &x as potential write # Unary increment/decrement if parent.type in ("update_expression",): return True return False def _is_passed_as_nonconst_arg(self, node: Node, source: bytes) -> bool: """Check if a pointer identifier is passed as an argument to a function call. If the callee's corresponding parameter is not const-qualified, the pointer may be written through — so we conservatively treat it as a write. If we cannot determine the callee's parameter types, we assume the worst (non-const). """ # Walk up to find if this identifier is inside a call_expression's # argument_list arg_list = None current = node.parent depth = 0 while current and depth < 8: if current.type == "argument_list": arg_list = current break # Stop if we've left the immediate expression context if current.type in ("compound_statement", "function_definition", "declaration"): return False current = current.parent depth += 1 if arg_list is None: return False # The call_expression is arg_list's parent call_expr = arg_list.parent if call_expr is None or call_expr.type != "call_expression": return False # Determine which argument position this identifier is in arg_index = -1 for i, child in enumerate(arg_list.children): if child.type in (",", "(", ")"): continue if self._node_contains(child, node): arg_index = i // 1 # approximate break # Try to find the callee's declaration to check if the param is const callee_node = call_expr.children[0] if call_expr.children else None if callee_node is None: return True # Cannot determine callee — assume non-const callee_name = self._node_text(callee_node, source) # Well-known safe (const) functions — common C standard library _CONST_SAFE_FUNCS = { "printf", "fprintf", "sprintf", "snprintf", "puts", "fputs", "strlen", "strcmp", "strncmp", "memcmp", "strchr", "strstr", "fwrite", "fread", "sizeof", "assert", "free", } if callee_name in _CONST_SAFE_FUNCS: return False # Look up the callee in the current TU to check param types root = node while root.parent: root = root.parent callee_fn = self._find_function_node(root, callee_name, source) if callee_fn: decl = self._find_child(callee_fn, "function_declarator") if decl: param_list = self._find_child(decl, "parameter_list") if param_list: params = [c for c in param_list.children if c.type == "parameter_declaration"] # Find the arg_index-th parameter, check for const actual_idx = 0 for child in arg_list.children: if child.type in (",", "(", ")"): continue if self._node_contains(child, node): break actual_idx += 1 if actual_idx < len(params): param_text = self._node_text(params[actual_idx], source) if "const" in param_text: return False # Callee takes const — safe # Default conservative: treat as potential write return True @staticmethod def _node_contains(ancestor: Node, descendant: Node) -> bool: """Check if ancestor contains descendant by byte range.""" return (ancestor.start_byte <= descendant.start_byte and ancestor.end_byte >= descendant.end_byte) # ──────────────────────────────────────────────────────────────── # Reachability analysis # ──────────────────────────────────────────────────────────────── def is_unreachable(self, file_path: str, line: int) -> Optional[str]: """ Check if a line is unreachable (after return/break/continue/goto). Returns a reason string if unreachable, None otherwise. """ fn = self.get_function_at_line(file_path, line) if fn is None: return None source, tree = self._get_tree(file_path) if tree is None: return None # Find the compound_statement containing this line fn_node = self._find_function_node(tree.root_node, fn.name, source) if fn_node is None: return None body = self._find_child(fn_node, "compound_statement") if body is None: return None return self._check_reachability(body, line, source) def _check_reachability(self, block: Node, target_line: int, source: bytes) -> Optional[str]: """Check if target_line comes after a terminal statement in the same block.""" terminal_types = {"return_statement", "break_statement", "continue_statement", "goto_statement"} for i, child in enumerate(block.children): child_end_line = child.end_point[0] + 1 if child.type in terminal_types and child_end_line < target_line: # Check if the target is a sibling that comes after this terminal remaining = block.children[i + 1:] for sibling in remaining: sib_start = sibling.start_point[0] + 1 sib_end = sibling.end_point[0] + 1 if sib_start <= target_line <= sib_end: kind = child.type.replace("_statement", "") return f"Code after '{kind}' statement on line {child_end_line}" # Recurse into compound statements but not into sub-scopes like if/for if child.type == "compound_statement": result = self._check_reachability(child, target_line, source) if result: return result return None def _get_unreachable_range(self, file_path: str, flagged_line: int, fn: 'FunctionInfo') -> Optional[Dict]: """Find all unreachable siblings after the terminal statement. Returns {"start_line": int, "end_line": int} covering the full unreachable block, so the fixer can remove it in one pass. """ source, tree = self._get_tree(file_path) if tree is None: return None fn_node = self._find_function_node(tree.root_node, fn.name, source) if fn_node is None: return None body = self._find_child(fn_node, "compound_statement") if body is None: return None return self._find_unreachable_range(body, flagged_line, source) def _find_unreachable_range(self, block: Node, flagged_line: int, source: bytes) -> Optional[Dict]: """Recursively find the range of unreachable siblings in a block.""" terminal_types = {"return_statement", "break_statement", "continue_statement", "goto_statement"} for i, child in enumerate(block.children): child_end_line = child.end_point[0] + 1 if child.type in terminal_types and child_end_line < flagged_line: remaining = block.children[i + 1:] # Check if flagged_line falls within remaining siblings for sibling in remaining: sib_start = sibling.start_point[0] + 1 sib_end = sibling.end_point[0] + 1 if sib_start <= flagged_line <= sib_end: # Found: collect ALL unreachable siblings after terminal first_unreachable = remaining[0].start_point[0] + 1 last_unreachable = remaining[-1].end_point[0] + 1 # Don't include the closing brace of the block for r in reversed(remaining): if r.type == "}": continue last_unreachable = r.end_point[0] + 1 break return { "start_line": first_unreachable, "end_line": last_unreachable, } if child.type == "compound_statement": result = self._find_unreachable_range(child, flagged_line, source) if result: return result return None # ──────────────────────────────────────────────────────────────── # Symbol scope analysis # ──────────────────────────────────────────────────────────────── def get_symbol_scope(self, file_path: str, symbol: str) -> str: """ Classify a symbol's scope: 'external', 'file' (static), 'block', or 'unknown'. """ source, tree = self._get_tree(file_path) if tree is None: return "unknown" for node in self._walk_all(tree.root_node): if node.type != "identifier": continue if self._node_text(node, source) != symbol: continue decl = self._find_enclosing(node, { "declaration", "function_definition", "parameter_declaration" }) if decl is None: continue decl_text = self._node_text(decl, source) # Check if inside a function body fn = self._find_enclosing(node, {"function_definition"}) if fn: body = self._find_child(fn, "compound_statement") if body and body.start_byte <= node.start_byte <= body.end_byte: if "extern" in decl_text: return "block-extern" return "block" # File scope if "static" in decl_text: return "file" if "extern" in decl_text: return "external" return "external" # default linkage for file-scope return "unknown" # ──────────────────────────────────────────────────────────────── # Declaration finding # ──────────────────────────────────────────────────────────────── def find_declarations(self, file_path: str, symbol: str) -> List[SymbolRef]: """Find all declarations (not definitions) of a symbol.""" source, tree = self._get_tree(file_path) if tree is None: return [] refs = [] lines = self._source_lines(file_path) for node in self._walk_all(tree.root_node): if node.type != "identifier": continue if self._node_text(node, source) != symbol: continue line = node.start_point[0] + 1 parent = node.parent is_decl = False is_def = False if parent and parent.type == "function_declarator": fn_def = self._find_enclosing(node, {"function_definition"}) fn_decl = self._find_enclosing(node, {"declaration"}) if fn_def: is_def = True elif fn_decl: is_decl = True if parent and parent.type in ("init_declarator", "declaration"): is_decl = True # Check if it's a definition (has initializer or is a function definition) if parent.type == "init_declarator": for c in parent.children: if c.type == "=": is_def = True line_text = lines[line - 1].rstrip() if line <= len(lines) else "" refs.append(SymbolRef( name=symbol, line=line, column=node.start_point[1], context=line_text, is_definition=is_def, is_declaration=is_decl, )) return refs def find_all_references(self, file_path: str, symbol: str) -> List[SymbolRef]: """Find all references to a symbol (uses + declarations).""" source, tree = self._get_tree(file_path) if tree is None: return [] refs = [] lines = self._source_lines(file_path) for node in self._walk_all(tree.root_node): if node.type != "identifier": continue if self._node_text(node, source) != symbol: continue line = node.start_point[0] + 1 line_text = lines[line - 1].rstrip() if line <= len(lines) else "" refs.append(SymbolRef( name=symbol, line=line, column=node.start_point[1], context=line_text, is_write=self._is_write_context(node), )) return refs # ──────────────────────────────────────────────────────────────── # Enum analysis # ──────────────────────────────────────────────────────────────── def get_enum_values(self, file_path: str) -> List[EnumConstant]: """Extract all enum constants with computed values.""" source, tree = self._get_tree(file_path) if tree is None: return [] constants = [] for node in self._walk_type(tree.root_node, "enum_specifier"): body = self._find_child(node, "enumerator_list") if body is None: continue next_val = 0 for child in body.children: if child.type != "enumerator": continue name_node = self._find_child(child, "identifier") if name_node is None: continue name = self._node_text(name_node, source) # Check for explicit value val_node = None for c in child.children: if c.type == "number_literal": val_node = c elif c.type == "expression" or c.type == "parenthesized_expression": val_node = c is_implicit = True if val_node: try: val_text = self._node_text(val_node, source) next_val = int(val_text, 0) is_implicit = False except ValueError: pass # complex expression, keep incrementing constants.append(EnumConstant( name=name, value=next_val, is_implicit=is_implicit, line=child.start_point[0] + 1, )) next_val += 1 return constants def analyze_for_rule(self, file_path: str, line: int, rule_id: str) -> Dict: """ Run rule-specific AST analysis and return structured findings. This is the main entry point used by the FixEngine. When a WorkspaceIndex is available (self.index), cross-file evidence is added for rules 8.3, 8.4, 8.5, 8.6, 8.8, and 8.13. """ analysis = {"rule_id": rule_id, "line": line} analysis["has_cross_file"] = self.index is not None and self.index.is_built fn = self.get_function_at_line(file_path, line) if fn: analysis["function"] = { "name": fn.name, "signature": fn.signature, "return_type": fn.return_type, "start_line": fn.start_line, "end_line": fn.end_line, "is_static": fn.is_static, "is_inline": fn.is_inline, "body_lines": fn.end_line - fn.start_line + 1, } analysis["params"] = [ { "name": p.name, "type": p.type_str, "is_pointer": p.is_pointer, "read_count": p.read_count, "write_count": p.write_count, "read_lines": p.read_lines, "write_lines": p.write_lines, "unused": p.read_count == 0 and p.write_count == 0, } for p in fn.params ] else: analysis["function"] = None analysis["params"] = [] # ── Rule-specific enrichments ── if rule_id == "MisraC2012-2.1": reason = self.is_unreachable(file_path, line) # Additional preprocessor check: is the line inside an inactive block? # e.g. #if 0 ... #endif if self.preprocessor: active_regions = self.preprocessor.get_active_regions(file_path) # active_regions is list of (start, end) inclusive is_active = False for start, end in active_regions: if start <= line <= end: is_active = True break if not is_active: reason = "Code is inside an inactive preprocessor block (e.g. #if 0)" analysis["unreachable_reason"] = reason # Compute the full unreachable range so the fixer can remove # the entire block, not just the single flagged line if reason and fn: unreachable_range = self._get_unreachable_range(file_path, line, fn) if unreachable_range: analysis["unreachable_range"] = unreachable_range elif rule_id == "MisraC2012-2.7" and fn: analysis["unused_params"] = [ p.name for p in fn.params if p.read_count == 0 and p.write_count == 0 ] elif rule_id == "MisraC2012-8.2": # ── Cross-file: find definition param names for unnamed decl ── sym = self._extract_symbol_name(file_path, line) if not fn else fn.name if sym and self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_2(sym) elif rule_id == "MisraC2012-8.3" and fn: # ── Cross-file: compare header declaration with definition ── if self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_3(fn.name) elif rule_id == "MisraC2012-8.4" and fn: # ── Cross-file: check for prior declaration in headers ── if self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_4( fn.name, file_path ) elif rule_id == "MisraC2012-8.5": sym = self._extract_symbol_name(file_path, line) if sym: analysis["symbol_scope"] = self.get_symbol_scope(file_path, sym) if self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_5(sym) elif rule_id == "MisraC2012-8.6": sym = self._extract_symbol_name(file_path, line) if sym: if self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_6(sym) elif rule_id == "MisraC2012-8.8" and fn: analysis["needs_static"] = not fn.is_static if self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_8( fn.name, file_path ) # Override simple heuristic with cross-file evidence cf = analysis["cross_file"] analysis["safe_to_add_static"] = cf.get("safe_to_add_static", False) elif rule_id == "MisraC2012-8.10" and fn: analysis["needs_static"] = fn.is_inline and not fn.is_static elif rule_id == "MisraC2012-8.11": # ── Cross-file: find array definition and extract size ── sym = self._extract_symbol_name(file_path, line) if sym and self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_11(sym) elif rule_id == "MisraC2012-8.12": enums = self.get_enum_values(file_path) val_map: Dict[int, List[str]] = {} for ec in enums: val_map.setdefault(ec.value, []).append(ec.name) analysis["enum_collisions"] = { v: names for v, names in val_map.items() if len(names) > 1 } elif rule_id == "MisraC2012-8.13" and fn: # Single-file: which pointer params are never written through? # Also checks: # - Already const → skip (not a candidate) # - Passed as non-const arg to another function → counted as write analysis["const_candidates"] = [ { "name": p.name, "type": p.type_str, "reads": p.read_count, "writes": p.write_count, "already_const": "const" in p.type_str, "safe_to_add_const": (p.is_pointer and p.write_count == 0 and "const" not in p.type_str), } for p in fn.params if p.is_pointer ] # ── Cross-file: caller impact analysis ── if self.index and self.index.is_built: analysis["cross_file"] = self.index.check_rule_8_13(fn.name) elif rule_id.startswith("MisraC2012-10."): # Expression/Type rules # First, check if this line is a macro definition macro_analysis = self._analyze_macro_definition(file_path, line) if macro_analysis: analysis["macro_analysis"] = macro_analysis else: # Fallback: try to find expressions at this line in the main tree expr_list = self._analyze_expression_at_line(file_path, line) if expr_list: analysis["expressions"] = expr_list elif rule_id == "MisraC2012-11.9": # Detect usage of '0' as null pointer constant # Logic: Find assignments where LHS is pointer and RHS is literal 0 # Weak heuristic without full type propagation, but catch obvious cases # like: int *p = 0; or p = 0; assigns = [] source, tree = self._get_tree(file_path) if tree and fn: # Limit to function scope for now fn_node = self._find_function_node(tree.root_node, fn.name, source) if fn_node: for node in self._walk_all(fn_node): # check init_declarator: int *p = 0; if node.type == "init_declarator": val = self._find_child(node, "number_literal") if val and self._node_text(val, source) == "0": # check if decl is pointer decl = self._find_child(node, "pointer_declarator") if decl: assigns.append({ "line": val.start_point[0] + 1, "start_byte": val.start_byte, "end_byte": val.end_byte, "context": "init" }) # check assignment: p = 0; elif node.type == "assignment_expression": rhs = node.children[-1] if rhs.type == "number_literal" and self._node_text(rhs, source) == "0": lhs = node.children[0] # Heuristic: if LHS is known pointer param check lhs_name = self._node_text(lhs, source) p_info = next((p for p in fn.params if p.name == lhs_name), None) if p_info and p_info.is_pointer: assigns.append({ "line": rhs.start_point[0] + 1, "start_byte": rhs.start_byte, "end_byte": rhs.end_byte, "context": "assign" }) if assigns: analysis["null_pointer_violations"] = assigns elif rule_id == "MisraC2012-14.4": # Controlling expression not essentially boolean # if (p), while (x), ternary ? ... non_bools = [] source, tree = self._get_tree(file_path) if tree and fn: fn_node = self._find_function_node(tree.root_node, fn.name, source) if fn_node: for node in self._walk_all(fn_node): cond = None if node.type in ("if_statement", "while_statement", "do_statement"): cond = self._find_child(node, "parenthesized_expression") elif node.type == "for_statement": # for (init; cond; update) - cond is 2nd child usually, but verify for c in node.children: # Condition is an expression between semicolons, usually pass # simplistic for now, focused on if/while if cond: # Strip parens inner = cond.children[1] if len(cond.children) > 2 else cond # Check if inner is comparison or logical # Safe: binary_expression (depends on op), true/false is_safe = False if inner.type == "binary_expression": op = inner.children[1].type if op in ("==", "!=", "<", ">", "<=", ">=", "&&", "||"): is_safe = True elif inner.type in ("true", "false", "parenthesized_expression"): is_safe = True # simplify if not is_safe: # Determine if the expression is a pointer # (helps the fixer choose != NULL vs != 0) expr_type = self.get_expression_type(inner, source) is_ptr = (expr_type.is_pointer if expr_type else False) non_bools.append({ "line": inner.start_point[0] + 1, "start_byte": inner.start_byte, "end_byte": inner.end_byte, "text": self._node_text(inner, source), "is_pointer": is_ptr, }) if non_bools: analysis["non_boolean_conditions"] = non_bools elif rule_id == "MisraC2012-15.6": # Body not compound statement missing_braces = [] source, tree = self._get_tree(file_path) if tree and fn: fn_node = self._find_function_node(tree.root_node, fn.name, source) if fn_node: for node in self._walk_all(fn_node): check_nodes = [] if node.type in ("if_statement", "while_statement", "for_statement"): # Last child is usually the body, unless it's if-else if node.type == "if_statement": # consequence is after condition # alternative is after 'else' children = node.children for i, c in enumerate(children): if c.type == "parenthesized_expression" and i+1 < len(children): check_nodes.append(children[i+1]) if c.type == "else" and i+1 < len(children): check_nodes.append(children[i+1]) else: check_nodes.append(node.children[-1]) for body in check_nodes: if body.type != "compound_statement" and body.type != "if_statement": # else if is ok structurally if chained missing_braces.append({ "line": body.start_point[0] + 1, "start_byte": body.start_byte, "end_byte": body.end_byte, "text": self._node_text(body, source) }) if missing_braces: analysis["missing_braces"] = missing_braces elif rule_id.startswith("MisraC2012-8."): # Generic linkage analysis for remaining 8.x rules sym = self._extract_symbol_name(file_path, line) if sym: analysis["symbol_scope"] = self.get_symbol_scope(file_path, sym) analysis["declarations"] = [ {"line": r.line, "context": r.context, "is_definition": r.is_definition} for r in self.find_declarations(file_path, sym) ] return analysis def _extract_symbol_name(self, file_path: str, line: int) -> Optional[str]: """Extract the primary symbol name declared/defined at a given line. Walks the AST looking for a declaration or function definition at the specified line and returns the identifier name. Falls back to a simple regex extraction from the source line if the AST walk finds nothing. """ source, tree = self._get_tree(file_path) if tree is None: return None target_row = line - 1 # tree-sitter uses 0-indexed rows # Strategy 1: walk top-level declarations for a match at this line decl_types = { "function_definition", "declaration", "function_declarator", } for node in self._walk_all(tree.root_node): if node.start_point[0] != target_row: continue if node.type == "function_definition": # Find the function_declarator → identifier for child in self._walk_type(node, "function_declarator"): for gc in child.children: if gc.type == "identifier": return self._node_text(gc, source) if node.type in ("declaration", "init_declarator"): for child in node.children: if child.type == "identifier": return self._node_text(child, source) if child.type in ("function_declarator", "init_declarator", "array_declarator", "pointer_declarator"): for gc in self._walk_all(child): if gc.type == "identifier": return self._node_text(gc, source) # Strategy 2: regex fallback on the source line lines = self._source_lines(file_path) if 0 < line <= len(lines): text = lines[line - 1] # Match common patterns: type name; / type name( / extern type name m = re.search(r'\b(\w+)\s*[(\[;=]', text) if m: # Skip C keywords kw = {"if", "else", "for", "while", "do", "switch", "case", "return", "sizeof", "typedef", "struct", "union", "enum", "extern", "static", "inline", "const", "volatile", "void", "int", "char", "short", "long", "float", "double", "unsigned", "signed", "_Bool"} candidates = re.findall(r'\b(\w+)\s*[(\[;=]', text) for c in candidates: if c not in kw: return c return None def _analyze_macro_definition(self, file_path: str, line: int) -> Optional[Dict]: """ If the line is a #define, try to parse its body as an expression. Returns AST analysis of the macro body if successful. """ source, tree = self._get_tree(file_path) if tree is None: return None # Find the preproc_def node at this line macro_node = None for node in self._walk_all(tree.root_node): if node.start_point[0] + 1 == line and node.type == "preproc_def": macro_node = node break if not macro_node: return None # Extract macro body: usually the last child is the 'value' # e.g. #define FOO (a + b) -> value is (a + b) # Tree-sitter structure for preproc_def: name, value (preproc_arg) # We need the raw text after the identifier ident = self._find_child(macro_node, "identifier") if not ident: return None # Get everything after the identifier from source macro_text = self._node_text(macro_node, source) ident_text = self._node_text(ident, source) # Basic split to get body parts = macro_text.split(ident_text, 1) if len(parts) < 2: return None body_text = parts[1].strip() if not body_text: return None # Calculate absolute start byte of the body in the source file # ident.end_byte gives us the end of the identifier. # We need to skip whitespace after identifier. current_byte = ident.end_byte while current_byte < len(source) and chr(source[current_byte]).isspace(): current_byte += 1 body_start_byte = current_byte # Wrap in a dummy function to parse as expression # void _dummy() { __MACRO_BODY__; } prefix = "void _dummy() { " dummy_source = f"{prefix}{body_text}; }}" try: dummy_tree = _parser.parse(dummy_source.encode("utf-8")) # Find the expression statement in the dummy tree fn_node = self._find_child(dummy_tree.root_node, "function_definition") if fn_node: body = self._find_child(fn_node, "compound_statement") if body: # The first child should be our expression statement stmt = self._find_child(body, "expression_statement") if stmt: expr = stmt.children[0] if stmt.children else None if expr: # Unwrap parenthesized_expression if present while expr.type == "parenthesized_expression" and len(expr.children) >= 3: # child 1 is the inner expression ( ( expr ) ) expr = expr.children[1] analysis = self._analyze_expression_node(expr, dummy_source.encode("utf-8")) analysis["body_start_byte"] = body_start_byte analysis["prefix_len"] = len(prefix) return analysis except Exception as e: logger.warning("Failed to parse macro body for analysis: %s", e) return {"raw_body": body_text} def _analyze_expression_at_line(self, file_path: str, line: int) -> List[Dict]: """Find all relevant expressions at the given line.""" source, tree = self._get_tree(file_path) if not tree: return [] # Find all binary or assignment expressions strictly contained in the line # or covering the line if they are single-line results = [] seen = set() for node in self._walk_all(tree.root_node): # Check if node is on the target line # node.start_point is (row, col) start_line = node.start_point[0] + 1 end_line = node.end_point[0] + 1 # We care about nodes that "contain" the violation on this line. # Usually simple expressions are single-line. # If multi-line, we might include them if they start or end on this line? # Let's restrict to nodes identified as expressions on this line. if start_line <= line <= end_line: if node.type in ("binary_expression", "assignment_expression", "cast_expression", "init_declarator"): logger.debug("Found %s at %d-%d", node.type, start_line, end_line) # Avoid duplicates (tree walk visits same node once, but avoid logic errors) if node.id in seen: continue # We want 'significant' expressions. # _analyze_expression_node handles these types. seen.add(node.id) results.append(self._analyze_expression_node(node, source)) return results def _analyze_expression_node(self, node: Node, source: bytes) -> Dict: """Analyze an expression node for operators and operands.""" info = { "type": node.type, "text": self._node_text(node, source), "start_byte": node.start_byte, "end_byte": node.end_byte, "operator": None, "operands": [] } # Binary expression: left OP right if node.type == "binary_expression": # Operator is the child that is not an expression usually # Tree-sitter: left, operator, right if len(node.children) >= 3: left = node.children[0] op = node.children[1] right = node.children[2] info["left"] = self._node_text(left, source) info["operator"] = self._node_text(op, source) info["right"] = self._node_text(right, source) t_left = self.get_expression_type(left, source) t_right = self.get_expression_type(right, source) info["operands"] = [ { "text": info["left"], "start_byte": left.start_byte, "end_byte": left.end_byte, "type": t_left.__dict__ if t_left else None }, { "text": info["right"], "start_byte": right.start_byte, "end_byte": right.end_byte, "type": t_right.__dict__ if t_right else None } ] # Cast expression: (type)value elif node.type == "cast_expression": type_node = self._find_child(node, "type_descriptor") val_node = node.children[-1] info["cast_to"] = self._node_text(type_node, source) if type_node else "?" info["operand"] = self._node_text(val_node, source) t_op = self.get_expression_type(val_node, source) info["operands"] = [ { "text": info["operand"], "start_byte": val_node.start_byte, "end_byte": val_node.end_byte, "type": t_op.__dict__ if t_op else None } ] # Assignment expression: lhs = rhs elif node.type == "assignment_expression": operator = None for child in node.children: if child.type in ("=", "+=", "-=", "*=", "/=", "%=", "<<=", ">>=", "&=", "^=", "|="): operator = child.text.decode() if hasattr(child.text, 'decode') else str(child.text) break # Skip compound assignments — too complex to cast safely if operator and operator != "=": return info lhs = node.child_by_field_name("left") rhs = node.child_by_field_name("right") if lhs and rhs: info["operator"] = "=" lhs_type = self._resolve_identifier_type(lhs, source) rhs_type = self.get_expression_type(rhs, source) info["operands"] = [ { "text": self._node_text(rhs, source), "start_byte": rhs.start_byte, "end_byte": rhs.end_byte, "type": rhs_type.__dict__ if rhs_type else None, "target_type": lhs_type.__dict__ if lhs_type else None, } ] # Init declarator: type x = expr; elif node.type == "init_declarator": value = node.child_by_field_name("value") if value: decl_type = self._extract_init_declarator_type(node, source) val_type = self.get_expression_type(value, source) info["operator"] = "=" info["operands"] = [ { "text": self._node_text(value, source), "start_byte": value.start_byte, "end_byte": value.end_byte, "type": val_type.__dict__ if val_type else None, "target_type": decl_type.__dict__ if decl_type else None, } ] return info def _extract_init_declarator_type(self, init_decl_node: Node, source: bytes) -> Optional["CType"]: """Walk up to parent 'declaration' node and extract the type specifier. For ``uint8_t x = expr;`` the tree-sitter AST looks like: declaration type_identifier "uint8_t" init_declarator identifier "x" = "=" <value> We collect all type-related children before the first init_declarator. """ parent = init_decl_node.parent if parent and parent.type == "declaration": type_parts: list[str] = [] for child in parent.children: if child.type in ("type_identifier", "primitive_type", "sized_type_specifier", "type_qualifier"): type_parts.append(self._node_text(child, source)) elif child.type == "init_declarator": break # stop before declarator if type_parts: type_name = " ".join(type_parts) return self.type_system.get_type(type_name) return None # ──────────────────────────────────────────────────────────────── # Type Inference # ──────────────────────────────────────────────────────────────── def get_expression_type(self, node: Node, source: bytes) -> Optional[CType]: """Infer the type of an expression node.""" if node.type == "number_literal": text = self._node_text(node, source) if "f" in text.lower(): return self.type_system.get_type("float") if "u" in text.lower(): # Heuristic: verify width based on value? # For now assume minimal fitting unsigned or int return self.type_system.get_type("unsigned int") return self.type_system.get_type("int") if node.type == "string_literal": return CType("char *", 64, False, is_pointer=True) if node.type == "identifier": return self._resolve_identifier_type(node, source) if node.type == "binary_expression": left = node.children[0] right = node.children[2] t_left = self.get_expression_type(left, source) t_right = self.get_expression_type(right, source) if t_left and t_right: return self.type_system.promote(t_left, t_right) return t_left or t_right if node.type == "cast_expression": type_node = self._find_child(node, "type_descriptor") if type_node: type_name = self._node_text(type_node, source) return self.type_system.get_type(type_name) if node.type == "parenthesized_expression": # ( expr ) -> type of expr for child in node.children: if child.type not in ("(", ")"): return self.get_expression_type(child, source) return None def _resolve_identifier_type(self, node: Node, source: bytes) -> Optional[CType]: """Find declaration of identifier and return its type.""" name = self._node_text(node, source) # 1. Local scope (declaration inside function) # Walk up to find Decl current = node while current: if current.type == "compound_statement": # Scan declarations in this block BEFORE the usage for child in current.children: if child.end_byte > node.start_byte: break # passed the usage point if child.type == "declaration": # int x; t = self._extract_type_from_decl(child, name, source) if t: return t if current.type == "function_definition": # Check parameters declarator = self._find_child(current, "function_declarator") if declarator: params = self._find_child(declarator, "parameter_list") if params: for p in params.children: if p.type == "parameter_declaration": t = self._extract_type_from_decl(p, name, source) if t: return t current = current.parent # 2. File scope (global variables) root = node while root.parent: root = root.parent for child in root.children: if child.type == "declaration": t = self._extract_type_from_decl(child, name, source) if t: return t return None def _extract_type_from_decl(self, decl_node: Node, name: str, source: bytes) -> Optional[CType]: """Extract type for 'name' from a declaration node.""" # This is a bit complex as C declarations are nested. # Simplified: look for type_identifier and matches of declarator decl_text = self._node_text(decl_node, source) if name not in decl_text: return None # Parse type specifier (int, uint16_t, etc) type_str = "int" # default # Collect type parts parts = [] for child in decl_node.children: if child.type in ("primitive_type", "type_identifier", "sized_type_specifier", "storage_class_specifier", "type_qualifier"): parts.append(self._node_text(child, source)) if parts: type_str = " ".join(parts) # Check if the specific identifier is declared here # (int a, b;) # We need to find the declarator that matches 'name' # to check for pointers/arrays is_ptr = False def find_decl(n): nonlocal is_ptr if n.type == "identifier" and self._node_text(n, source) == name: return True if n.type == "pointer_declarator": if find_decl(n.children[1]): is_ptr = True return True if n.type in ("init_declarator", "declaration", "parameter_declaration"): for c in n.children: if find_decl(c): return True return False if find_decl(decl_node): if is_ptr: type_str += "*" return self.type_system.get_type(type_str) return None # ──────────────────────────────────────────────────────────────── # Tree traversal helpers # ──────────────────────────────────────────────────────────────── @staticmethod def _walk_type(node: Node, type_name: str): """Yield all descendant nodes of a given type.""" cursor = node.walk() visited = False while True: if not visited and cursor.node.type == type_name: yield cursor.node if not visited and cursor.goto_first_child(): visited = False continue if cursor.goto_next_sibling(): visited = False continue if cursor.goto_parent(): visited = True continue break @staticmethod def _walk_all(node: Node): """Yield all descendant nodes.""" cursor = node.walk() visited = False while True: if not visited: yield cursor.node if not visited and cursor.goto_first_child(): visited = False continue if cursor.goto_next_sibling(): visited = False continue if cursor.goto_parent(): visited = True continue break @staticmethod def _find_child(node: Node, type_name: str) -> Optional[Node]: """Find first direct or indirect child of a given type.""" for child in node.children: if child.type == type_name: return child # Try one level deeper for child in node.children: for grandchild in child.children: if grandchild.type == type_name: return grandchild return None @staticmethod def _find_enclosing(node: Node, types: Set[str]) -> Optional[Node]: """Walk up the tree to find the nearest enclosing node of given types.""" current = node.parent while current: if current.type in types: return current current = current.parent return None def _find_function_node(self, root: Node, name: str, source: bytes) -> Optional[Node]: """Find the function_definition node for a named function.""" for node in self._walk_type(root, "function_definition"): decl = self._find_child(node, "function_declarator") if decl is None: continue ident = self._find_child(decl, "identifier") if ident and self._node_text(ident, source) == name: return node return None def _has_prior_declaration(self, name: str, fn_node: Node, source: bytes) -> bool: """Check if there's a declaration of this function before its definition.""" root = fn_node.parent if root is None: return False for child in root.children: if child == fn_node: break if child.type == "declaration": decl_text = self._node_text(child, source) if name in decl_text and "(" in decl_text: return True return False

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/varunjain-byte/MISRA-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

c_analyzer.py•70.7 KiB