; last updated - 7 minutes read

If you want to parse a java class, it is a clever idea to fetch a java grammar and let JavaCC do the work for you. However, I didn't find a grammar for Java 7, so I decided to write my own parser. This parser can also read Groovy source code and most of the Java 8 source code (apart from defender methods).

Another reason to write the parser was to show how simple file and text manipulations can be. Groovy is optimized for programming these tasks (at the cost of being a little slower). The parser consists of merely 263 lines, including a couple of comments and blank lines. I guess the Java version is a lot more verbose.

License

Feel free to use the source code if you need it. Please note that I provide it to you on an "as-is" basis: if you use the source code, you use it at your own risk. You can download it here.

The regular expressions

The most important file of the project contains the regular expressions I use to extract classes, variables, methods, assignments and annotations:

package de.beyondjava.VariableParser

import java.util.regex.Pattern;

/**
 * The regular expressions used to extract packages, classes, variables, methods,
 * assignments and annotations from Java or Groovy source code.
 *
 * The patterns are composed by interpolating simpler patterns into slashy strings,
 * so the declaration order matters: a pattern must be declared before it is used
 * inside another one.
 */
class RegularExpressions {
    /** Optional whitespace. */
    final static Pattern SPACE = ~ /(\s)*/
    /** A name made of word characters, underscores and dollar signs. */
    final static Pattern IDENTIFIER = ~ /(\w|_|\$)+/
    /** Zero or more of the listed modifiers, each followed by a whitespace character. */
    final static Pattern MODIFIERS = ~ /((\s)*(final|volatile|transient|static|public|private|protected)\s)*/
    /** A parenthesized list consisting of word characters, commas, dollar signs, whitespace and '='. */
    final static Pattern PARAMETERS = ~ /\((\w|_|,|\$|\s|=)*\)/
    /** A single annotation with an optional parameter list, including trailing whitespace. */
    final static Pattern ANNOTATION = ~ /@($IDENTIFIER)($SPACE($PARAMETERS))?$SPACE/
    /** One or more consecutive annotations. */
    final static Pattern ANNOTATIONS = ~ /($ANNOTATION)+/
    /** An assignment: everything from '=' up to and including the next semicolon. */
    final static Pattern ASSIGNMENT = ~ /(?:(\s)*=(?:[^;])*;(\s)*)/
    /**
     * Annotations, modifiers and two identifiers, optionally followed by either a
     * parameter list (a method) or an assignment (an initialized variable).
     */
    final static Pattern VARIABLE_OR_METHOD_REGEXP = ~ /($ANNOTATIONS)?($MODIFIERS)$SPACE($IDENTIFIER)$SPACE($IDENTIFIER)($SPACE)?($PARAMETERS|$ASSIGNMENT)?/

    /** At least one whitespace character. */
    static final Pattern NON_EMPTY_SPACE = ~ /(\s)+/
    /** Like IDENTIFIER, but dots are allowed as well. */
    static final Pattern PACKAGEIDENTIFIER = ~ /(\w|_|\$|\.)+/
    /** The package declaration; the package name is the third capturing group. */
    static final Pattern packageRegExp = ~ /\b(package)$NON_EMPTY_SPACE($PACKAGEIDENTIFIER)(;|\b)/
    /** A class header: annotations, modifiers, the keyword "class" and the class name. */
    static final Pattern CLASS_REGEXP = ~ /($ANNOTATIONS)?($MODIFIERS)(class)$NON_EMPTY_SPACE($IDENTIFIER)/

    /**
     * A single double-quoted string literal. Each literal is matched on its own
     * (a greedy "(.)*" would swallow everything between the first and the last
     * quote of a line), and backslash-escaped characters such as \" do not
     * terminate the literal.
     */
    static final Pattern STRING_REGEXP = ~ /"(?:\\.|[^"\\])*"/

    // see http://ostermiller.org/findcomment.html
    /** Multi-line comments and single-line comments. */
    static final String COMMENT_REGEXP = "(?:/\\*(?:[^*]|(?:\\*+[^*/]))*\\*+/)|(?://.*)"
    /** A block in curly braces that contains no closing brace, i.e. an innermost block. */
    static final String JAVA_BLOCK_REGEXP = "(?:(\\s)*\\{(?:[^\\}])*\\}(\\s)*)"
}

The main class

package de.beyondjava.VariableParser

import java.util.regex.Matcher
import java.util.regex.Pattern
import static RegularExpressions.*

/**
 * This is a simple class parser that reads every java or groovy class in a folder
 * and returns a list of class descriptions.
 */
class SimpleClassParser {

    /**
     * Parse a file or a folder recursively.
     *
     * @param p_folderOrFilename path of the file or folder to parse
     * @return the list of class definitions found (empty if nothing was found)
     */
    public List parse(String p_folderOrFilename) {
        File f = new File(p_folderOrFilename)
        parse(f);
    }

    /**
     * Parse a file or a folder recursively.
     *
     * A folder contributes the classes of all its ".java" and ".groovy" files plus
     * the classes found in its sub-folders. A plain file is parsed directly.
     *
     * @param p_fileOrFolder the file or folder to parse
     * @return the list of class definitions found (empty if nothing was found)
     */
    public List parse(File p_fileOrFolder) {
        List classDefinitions = []
        if (!p_fileOrFolder.exists()) {
            println "File or folder '$p_fileOrFolder' not found"
            return classDefinitions
        }
        if (p_fileOrFolder.isDirectory()) {
            // listFiles() may return null if the folder cannot be read
            List files = (p_fileOrFolder.listFiles() ?: []) as List
            // collect the results of the recursion instead of dropping them
            files.findAll { File g -> g.isDirectory() }.each { File folder ->
                classDefinitions.addAll(parse(folder))
            }
            // only regular source files may be read as text
            files.findAll { File g ->
                !g.isDirectory() && (g.name.endsWith(".java") || g.name.endsWith(".groovy"))
            }.each { File sourceFile ->
                classDefinitions.addAll(parseASingleFile(sourceFile))
            }
        } else {
            classDefinitions.addAll(parseASingleFile(p_fileOrFolder))
        }
        return classDefinitions
    }

    /**
     * Reads a single source file, strips string literals and comments and splits
     * the remaining text into class definitions.
     *
     * @return the class definitions of the file, or an empty list if no package was found
     */
    private List parseASingleFile(File p_file) {
        String theFile = p_file.text
        theFile = removeStrings(theFile)
        theFile = removeComments(theFile)
        String currentPackage = findPackage(theFile);
        if (!currentPackage) return [];
        return splitIntoClasses(theFile, currentPackage)
    }

    /**
     * Extracts the package name.
     *
     * @return the package name, or null unless the text contains exactly one package declaration
     */
    private String findPackage(String theFile) {
        Matcher m = theFile =~ packageRegExp
        if (m.count != 1) {
            println "This is neither a java nor a groovy file"
            return null
        }
        // the package name is the third expression enclosed by parentheses
        return m[0][3]
    }

    /**
     * Creates one ClassDefinition per class header found in the text. The text
     * following a class header (up to the next class header) is treated as the
     * implementation of that class.
     */
    private List splitIntoClasses(String theFile, String p_currentPackage) {
        List classes = []
        Matcher m = theFile =~ CLASS_REGEXP
        // remove the text before the first class definition
        List implementations = theFile.split(CLASS_REGEXP.toString()).toList().drop(1)
        m.eachWithIndex { List classMatch, int index ->
            // split() drops trailing empty strings, so the last class may have no text
            String implementation = index < implementations.size() ? implementations[index] : null
            classes << extractClass(classMatch, implementation, p_currentPackage)
        }
        return classes;
    }

    /**
     * Builds the class definition from the matched class header and the text following it.
     */
    private ClassDefinition extractClass(List p_classPatternMatch, String p_implementation, String p_currentPackage) {
        // ignore the curly braces surrounding the implementation
        String classBody = stripSurroundingBraces(p_implementation)
        String classDeclaration = p_classPatternMatch[0]
        new ClassDefinition(p_currentPackage, classDeclaration, classBody)
    }

    /**
     * Returns the text between the first opening and the last closing curly brace.
     * Anything in front of the opening brace (such as an extends clause) is skipped.
     * If there is no such pair of braces, the trimmed text is returned unchanged;
     * null yields an empty string.
     */
    private String stripSurroundingBraces(String p_implementation) {
        String trimmed = p_implementation?.trim() ?: ""
        int open = trimmed.indexOf('{')
        int close = trimmed.lastIndexOf('}')
        if (open < 0 || close <= open) {
            return trimmed
        }
        return trimmed.substring(open + 1, close)
    }

    /** Keep things simple by removing string literals. */
    private String removeStrings(String p_theFile) {
        p_theFile.replaceAll(STRING_REGEXP, "null")
    }

    /** Keep things simple by removing comments (both single line and multi line comments). */
    private String removeComments(String p_theFile) {
        p_theFile.replaceAll(COMMENT_REGEXP, "")
    }
}

The class definition

This class is the definition of a single class.

package de.beyondjava.VariableParser;

import java.util.List;
import java.util.regex.Matcher
import java.util.regex.Pattern;
import static RegularExpressions.*

/**
 * Description of a class, including a list of its variables and methods (but ignoring the parameter lists of the methods)
 */
public class ClassDefinition {
    String packageName;
    String className;
    List variables = []
    List methods = []
    List modifiers=[];
    List annotations=[];

    /**
     * extract the variables and methods from the implementation
     *
     * @param p_packageName the package the class belongs to
     * @param p_classDeclaration the class header (annotations, modifiers, "class" and the class name)
     * @param p_impl the class body without the surrounding curly braces
     */
    ClassDefinition (String p_packageName, String p_classDeclaration, String p_impl) {
        Matcher m = p_classDeclaration =~ ANNOTATION
        m.each({annotations = annotations + it[0].trim()})
        // simplify expression by removing annotations
        p_classDeclaration = p_classDeclaration.replaceAll(ANNOTATION,"")
        // split at the keyword only; a plain split("class") would also cut
        // class names that contain "class", such as "Subclass"
        List words = p_classDeclaration.split(/\bclass\b/).toList()
        words = words.findAll{it?.trim()}
        className = words[-1].trim()
        if (words.size()>1) {
            // everything in front of the class name
            modifiers = words[0..-2].collect{it?.trim()}
        }
        packageName=p_packageName
        def implementation=removeImplementationDetails(p_impl)
        extractVariablesAndMethods(implementation)
    }

    /**
     * In order to keep things simple, we remove the implementation of the methods.
     *
     * @param p_theSourcecode the class body; null is treated like an empty body
     * @return the source code with every innermost curly-brace block replaced by a line break
     */
    public String removeImplementationDetails(String p_theSourcecode) {
        if (p_theSourcecode == null) {
            return ""
        }
        // This regular expression removes everything enclosed between curly braces
        p_theSourcecode.replaceAll(JAVA_BLOCK_REGEXP,"\n")
    }

    /**
     * Fills the lists of methods and variables. A match ending with a closing
     * parenthesis is treated as a method, every other match as a variable.
     */
    private void extractVariablesAndMethods(String p_impl) {
        Matcher m = p_impl =~ VARIABLE_OR_METHOD_REGEXP
        def meths = m.findAll{it[0].endsWith(")")}.collect{it[0].trim()}
        methods = meths.collect{ new MethodDefinition(it)}
        def vars = m.findAll{!(it[0].endsWith(")"))}.collect{it[0].trim()}
        variables = vars.collect{ new VariableDefinition(it)}
    }
}

package de.beyondjava.VariableParser;

import static RegularExpressions.*
import java.util.List;
import java.util.regex.Matcher

/**
 * This is the definition of a method (without the implementation details and without the parameter list).
 */
public class MethodDefinition {
    List visibility = []
    String name
    String type
    List annotations=[]
    String parameters
    boolean isConstructor

    /**
     * Receives the method declaration (without the body) and extracts the
     * annotations, the parameter list, the name, the return type and the
     * words in front of them.
     *
     * A declaration is treated as a constructor if nothing or only a visibility
     * keyword precedes the name; constructors have no type.
     *
     * @param p_method the method declaration, e.g. "public void foo()"
     */
    MethodDefinition(String p_method) {
        Matcher m = p_method =~ ANNOTATION
        m.each({annotations = annotations + it[0].trim()})
        // simplify expression by removing annotations
        p_method = p_method.replaceAll(ANNOTATION, "")
        // extract parameter list
        m = p_method =~ PARAMETERS
        m.each({parameters = it[0].trim()})
        // simplify expression by removing parameters
        p_method = p_method.replaceAll(PARAMETERS, "")
        // trim first: leading whitespace would produce an empty first word
        String[] words = p_method.trim().split(NON_EMPTY_SPACE.toString())
        name=words[-1]
        if (words.size()==1 || words[-2] == "public" || words[-2] == "private" || words[-2] == "protected") {
            isConstructor = true
            if (words.length > 1) {
                // everything in front of the name
                visibility=words[0..-2]
            }
        } else {
            type=words[-2]
            if (words.length > 2) {
                // everything in front of type and name
                visibility=words[0..-3]
            }
        }
    }
}

The description of a variable

... is almost identical to the description of a method:

package de.beyondjava.VariableParser;

import java.util.regex.Matcher
import java.util.regex.Pattern;
import static RegularExpressions.*

/**
 * This is the definition of a variable.
 */
class VariableDefinition {
    List modifiers=[];
    String name;
    String type;
    List annotations=[];
    String value=null; // optional

    /** Receives the complete variable definition source code and extracts the name,
     * the type, the modifiers, the annotations and (if present) the assignment.
     * @param p_variable variable definition (with or without assignment)
     */
    VariableDefinition(String p_variable) {
        Matcher m = p_variable =~ ANNOTATION
        m.each({annotations = annotations + it[0].trim()})
        // simplify expression by removing annotations
        p_variable = p_variable.replaceAll(ANNOTATION,"")
        // and assignments
        m = p_variable =~ ASSIGNMENT
        m.each({value = it[0].trim()})
        p_variable = p_variable.replaceAll(ASSIGNMENT,"")
        // trim first: leading whitespace would produce an empty first word
        String[] words = p_variable.trim().split(NON_EMPTY_SPACE.toString())
        name=words[-1]
        // a declaration consisting of a single word has no type
        if (words.length > 1) {
            type=words[-2]
        }
        if (words.length > 2) {
            // everything in front of type and name
            modifiers=words[0..-3]
        }
    }
}

The JUnit test and example classes

// NOTE: each "package" statement below starts a separate source file; the three
// files are listed together here.

package de.beyondjava.VariableParser;

import java.util.List;
import de.beyondjava.Beans.Address
import groovy.util.GroovyTestCase;

/**
 * Parses the example beans and checks the number of classes, variables and methods found.
 */
class VariableParserTest extends GroovyTestCase {
    public void testParser() {
        SimpleClassParser parser = new SimpleClassParser()
        List classes = parser.parse("src/de/beyondjava/Beans")
        // the two example beans listed below: Address and Person
        assertEquals(2, classes.size())
        int vars=0
        int meths=0
        classes.each({ClassDefinition c ->
            println "${c.className} contains ${c.variables?.size()} variables and ${c.methods?.size()} methods"
            vars += c.variables?.size()
            meths += c.methods?.size()
        })
        // Address declares 3 fields, Person declares 4
        assertEquals(7, vars)
        // Address declares one constructor and three methods, Person declares none
        assertEquals(4, meths)
    }
}

package de.beyondjava.Beans;

import javax.faces.bean.ManagedBean;
import javax.validation.constraints.NotNull;
import javax.validation.constraints.Size;

// Example bean used as parser input by VariableParserTest.
@ManagedBean
public class Address {
    // the initializer is a string literal containing curly braces
    @Size(max=40) String street = "{ lorem ipsum }";
    @NotNull @Size(min=1, max=40) public String city= null;
    private int zipCode = 64546;

    /**
     * This is the default constructor.
     */
    public Address() {
        street = "unknown";
        city = "in the middle of nowhere";
        zipCode = 0;
    }

    protected void initRandomly() {
        zipCode = (int)( System.currentTimeMillis() % 100000l);
    }

    // overload with a parameter; p_value is not used in the body
    public void initRandomly(int p_value) {
        zipCode = (int)( System.currentTimeMillis() % 100000l);
    }

    void increaseZipCode() {
        zipCode++;
    }
}

package de.beyondjava.Beans;

import javax.validation.constraints.NotNull;
import javax.validation.constraints.Size;

// Example bean used as parser input by VariableParserTest; it declares fields only.
public class Person {
    @NotNull @Size(max=30) String name;
    @NotNull Address adress;
    @NotNull Address secondaryAddress = new Address();
    boolean ownsCar = false;
}

Alternatives

Of course, there are more professional class parsers out there. For instance, you can use the class parser and AST generator of Eclipse. Two small frameworks making Eclipse JDT easily accessible are



Comments