A Day Refactoring Code, Part 0: Overview

I’ve been rewriting much of DoctorJ, which has been suffering some code rot after several years of existence, now at major release 5.

While poking through the code, I found the class ParsingSpellChecker, which as its name implies, spell-checks, in particular working on Javadoc comment blocks, such as skipping <pre> blocks and "{@link" tags:

package org.incava.text.spell;

import java.io.InputStream;
import java.util.*;
import org.incava.ijdk.util.MultiMap;

public class ParsingSpellChecker {
    /**
     * Whether we can spell check, which we can't until we get a word or a
     * dictionary (which is a thing with a lot of words).
     */
    private boolean canCheck;

    /**
     * The spell checker, which doesn't parse.
     */
    private final SpellChecker checker;

    /**
     * The current string we're working on.
     */
    private String str;

    /**
     * The length of the current string.
     */
    private int len;

    /**
     * The current position within the string.
     */
    private int pos;
    
    public ParsingSpellChecker(SpellChecker checker) {
        this(checker, false);
    }

    public ParsingSpellChecker(SpellChecker checker, boolean canCheck) {
        this.checker = checker;
        this.canCheck = canCheck;
    }

    public boolean addDictionary(String dictionary) {
        this.canCheck = this.checker.addDictionary(dictionary) || this.canCheck;
        return this.canCheck;
    }

    public boolean addDictionary(InputStream dictionary) {
        this.canCheck = this.checker.addDictionary(dictionary) || this.canCheck;
        return this.canCheck;
    }

    public void addWord(String word) {
        this.checker.addWord(word);
        this.canCheck = true;
    }

    public void check(String str) {
        if (this.canCheck) {
            this.str = str;
            this.len = this.str.length();
            this.pos = 0;
    
            while (hasMore()) {
                skipToWord();
                if (hasMore()) {
                    if (Character.isLetter(currentChar())) {
                        checkCurrentWord();
                    }
                    else {
                        // not at an alpha character. Might be some screwy formatting or
                        // a nonstandard tag.
                        skipThroughWord();
                    }
                }
            }
        }
    }

    /**
     * Consumes from one string to another.
     */
    protected void consumeFromTo(String from, String to) {
        if (consume(from)) {
            consumeTo(to);
        }
    }

    /**
     * Skips content between pairs of brackets, a la HTML, such as, oh,
     * "<html>foobar</html>." Doesn't handle slash-gt, such as "<html
     * style="lackthereof" />".
     */
    protected void skipBracketedSection(String section) {
        consumeFromTo("<" + section + ">", "</" + section + ">");
    }

    protected void skipLink() {
        consumeFromTo("{@link", "}");
    }
    
    protected void skipBlanks() {
        while (this.pos + 2 < this.len && currentChar() != '<' && !this.str.substring(this.pos, this.pos + 2).equals("{@") && !Character.isLetterOrDigit(currentChar())) {
            ++this.pos;
        }
    }
    
    protected void skipToWord() {
        skipBracketedSection("code");
        skipBracketedSection("pre");
        skipLink();
        consume("&;nbsp;");
    }

    protected void checkWord(String word, int position) {
        MultiMap<Integer, String> nearMatches = new MultiMap<Integer, String>();
        boolean valid = this.checker.checkCorrectness(word, nearMatches);
        if (!valid) {
            wordMisspelled(word, position, nearMatches);
        }
    }

    protected void wordMisspelled(String word, int position, MultiMap<Integer, String> nearMatches) {
        int nPrinted = 0;
        final int printGoal = 15;
        for (int i = 0; nPrinted < printGoal && i < 4; ++i) { // 4 == max edit distance
            Collection<String> matches = nearMatches.get(i);
            if (matches != null) {
                for (String match : matches) {
                    System.out.println("    near match '" + match + "': " + i);
                    ++nPrinted;
                }
            }
        }
    }

    protected boolean consume(String what) {
        skipBlanks();
        if (this.pos + what.length() < this.len && this.str.substring(this.pos).startsWith(what)) {
            this.pos += what.length();
            return true;
        }
        else {
            return false;
        }
    }

    protected void consumeTo(String what) {
        int len = this.str.length();
        while (this.pos < len && this.pos + what.length() < len && !this.str.substring(this.pos).startsWith(what)) {
            ++this.pos;
        }
    }

    protected Character currentChar() {
        return this.str.charAt(this.pos);
    }

    protected boolean hasMore() {
        return this.pos < this.len;
    }

    protected void checkCurrentWord() {
        StringBuffer word = new StringBuffer();
        word.append(currentChar());
        
        int startingPosition = this.pos;
        boolean canCheck = true;

        ++this.pos;

        // spell check words that do not have:
        //     - mixed case (varName)
        //     - embedded punctuation ("wouldn't", "pkg.foo")
        //     - numbers (M16, BR549)
        while (hasMore()) {
            char ch = currentChar();
            if (Character.isWhitespace(ch)) {
                break;
            }
            else if (Character.isLowerCase(ch)) {
                word.append(ch);
                ++this.pos;
            }
            else if (Character.isUpperCase(ch)) {
                skipThroughWord();
                return;
            }
            else if (Character.isDigit(ch)) {
                skipThroughWord();
                return;
            }
            else {
                // must be punctuation, which we can check it if there's nothing
                // but punctuation up to the next space or end of string.
                if (this.pos + 1 == this.len) {
                    // that's OK to check
                    break;
                }
                else {
                    ++this.pos;
                    while (hasMore() && !Character.isWhitespace(currentChar()) && !Character.isLetterOrDigit(currentChar())) {
                        // skipping through punctuation
                        ++this.pos;
                    }
                    if (this.pos == this.len || Character.isWhitespace(currentChar())) {
                        // punctuation ended the word, so we can check this
                        break;
                    }
                    else {
                        // punctuation did NOT end the word, so we can NOT check this
                        skipThroughWord();
                        return;
                    }
                }
            }
        }

        // has to be more than one character:
        if (canCheck && this.pos - startingPosition > 1) {
            checkWord(word.toString(), startingPosition);
        }
    }

    protected void skipThroughWord() {
        ++this.pos;
        while (hasMore() && !Character.isWhitespace(currentChar())) {
            ++this.pos;
        }
    }
}

The problem with ParsingSpellChecker is that the low-level functionality (going through a string by “position” (current index))and checking against string length) don’t match the more high-level behavior of the class.

Another red flag in the class is there are three instance fields (len, pos, and str) that are very tightly connected, with essentially each of them having to be used with one another.

It is also telling that the indentation levels within the class are very deep, also indicating poor abstraction. The fewer indentation levels, the more likely it is that the code is evenly abstracted at the proper level.

Code repetition is another issue, with statements such as this.pos++ found in many locations.

From running PMD, we know that StringBuilder is preferred over StringBuffer for performance reasons. Similarly, by building its own substrings (via StringBuffer.append()), the code does not take advantage of String.substring, which uses offsets in the newly-created String to refer to a substring within the lower-level char array.

That’s the overview. In the next post, we’ll begin rewriting this mess.

Advertisements

2 thoughts on “A Day Refactoring Code, Part 0: Overview

  1. Pingback: Unit Testing Principles | Jeff Pace’s Blog

  2. Pingback: A Day Refactoring Code, Part 1: Rewriting | Jeff Pace's Blog

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s