""" File: extract_code_java.py Created Time: 2023-02-07 Author: Krahets (krahets@163.com) """ import re import glob import sys, os.path as osp sys.path.append(osp.dirname(osp.dirname(osp.dirname(osp.abspath(__file__))))) class ExtractCodeBlocksJava: def __init__(self) -> None: self.ind = 4 # Pattern to match function names and class names self.func_pattern = r'(\s*)(public|private|)\s*(static|)\s*(\S+)\s+(\w+)(\(.*\))\s+\{' self.class_pattern = r'(public|)\s*class\s+(\w+)\s*\{' self.func_pattern_keys = ["total", "ind", "scope", "static", "return", "label", "args"] self.class_pattern_keys = ["total", "scope", "label"] # Pattern to match the start and end of a block self.block_start_pattern = '^\s{ind}\/\*.+\*\/' self.block_end_pattern = '^\s{ind}\}' self.block_start_shift = 0 self.block_end_shift = 0 def extract(self, file_path): """ Extract classes and functions from a markdown document """ if not osp.isfile(file_path): return None self.file_path = file_path with open(file_path) as f: self.lines = f.readlines() self.content = "".join(self.lines) # Detect and extract all the classes and fucntions classes = self.extract_class_blocks() funcs = self.extract_function_blocks() self.post_process(classes, funcs) return { "classes": classes, "funcs": funcs, } def search_block(self, header_line, indentation): """ Search class/function block given the header_line and indentation """ start_line, end_line = 0, len(self.lines) block_end_pattern = re.compile( self.block_end_pattern.replace("ind", str(indentation))) block_start_pattern = re.compile( self.block_start_pattern.replace("ind", str(indentation))) # Search the code for i in range(header_line + 1, len(self.lines)): if re.match(block_end_pattern, self.lines[i]) is not None: end_line = i + self.block_end_shift break # Search the header comment for i in range(header_line - 1, -1, -1): if re.search(block_start_pattern, self.lines[i]) is not None: start_line = i + self.block_start_shift break return start_line, end_line, self.lines[start_line:end_line + 1] def extract_function_blocks(self, indentation=0, start_line=-1, end_line=-1): """ Extract all the functions with given indentation """ funcs = {} if start_line == -1: start_line = 0 if end_line == -1: end_line = len(self.lines) - 1 func_pattern = re.compile(self.func_pattern) for line_num in range(start_line, end_line + 1): # Search the function header func_match = func_pattern.match(self.lines[line_num]) if func_match is None: continue # The function should match the input indentation if len(func_match.group(self.func_pattern_keys.index("ind"))) != indentation: continue header_line = line_num # Search the block from the header line start_line, end_line, func_block = self.search_block( header_line, indentation) # Construct the funcs dict func_label = func_match.group(self.func_pattern_keys.index("label")) funcs[func_label] = { "indentation": indentation, "line_number": { "start": start_line, "end": end_line, "header": header_line, }, "block": func_block, } return funcs def extract_class_blocks(self): """ Extract all the classes with given indentation """ classes = {} class_pattern = re.compile(self.class_pattern) for line_num, line in enumerate(self.lines): # Search the class header class_match = class_pattern.match(line) if class_match is None: continue header_line = line_num # Search the block from the header line start_line, end_line, class_block = self.search_block( header_line, 0) # Construct the classes dict class_label = class_match.group(self.class_pattern_keys.index("label")) classes[class_label] = { "indentation": 0, "line_number": { "start": start_line, "end": end_line, "header": header_line, }, "block": class_block, "funcs": self.extract_function_blocks( indentation=self.ind, start_line=start_line, end_line=end_line) } return classes def post_process(self, classes, funcs): """ Process the classes and functions """ def remove_keyword(func): block = func["block"] header_line = func["line_number"]["header"] - \ func["line_number"]["start"] block[header_line] = block[header_line] \ .replace("static ", "", 1).replace("public ", "", 1).replace("private ", "", 1) for clas in classes.values(): remove_keyword(clas) for func in clas["funcs"].values(): remove_keyword(func) for func in funcs.values(): remove_keyword(func) # ext = ExtractCodeBlocksJava() # ext.extract("codes/java/chapter_array_and_linkedlist/my_list.java")