Matthew T. Caldwell

Sep 19 2011

Setting up Python Code Completion in Mac OS X with Vim

The Problem

The vim editor that is shipped with Mac OS X is not compiled with Python support, so pysmell (a wonderful tool which we will install momentarily) won’t play nicely until we do something about that.

The Solution
1. hg clone https://vim.googlecode.com/hg/ vim

2. cd vim

3. ./configure —prefix=/usr/local —enable-rubyinterp —enable-pythoninterp —with-features=huge

4. make

5. make install

6. export PATH=/usr/local/bin:$PATH

7. pip install pysmell

8. In vimrc: autocmd FileType python setlocal omnifunc=pysmell#Complete

9. You’re all set!

Sep 08 2011

keep your dotfiles on github

Special thanks to my friend djohnston (github link).

Check out my dotfiles here.

May 13 2011

Finding Duplicate Rows in a Table

Simple, but effective:

select attribute, count(*) from table group by attribute having count(*) > 1;

Feb 01 2011
Jan 26 2011
Jan 25 2011

Using Memcache for Locks

Not a good idea in probably 95% of circumstances, but for when it’s desired:

def acquire_lock(lock_name, expires=60, sleep_interval=0.5):
	""" Locks a section of code using a memcache key. """
	while not memcache.add(lock_name, True, time=expires):
		logging.info('waiting to acquire lock %s' % lock_name)
		time.sleep(sleep_interval)
	logging.debug('acquired lock %s' % lock_name)

21 notes

+
Jan 24 2011
+
Jan 11 2011

Code Kata - OCR (my solution)

To revisit the code kata I mentioned in a previous post (outlined here), I’ve posted my solution below.  First, an overview:

Though I’m normally very aware of pattern “over-usage” and make a conscious effort to avoid it, I found that the described problem lent itself very well to several patterns; most notably, the Builder and Flyweight patterns.  These patterns were utilized in the three classes I’ve posted below: AccountNumberBuilder, DigitBuilder, and Digit.

package matt.kata.ocr;

import java.util.HashSet;
import java.util.Set;

/**
 * Builder for AccountNumbers.
 * 
 * @author matt
 *
 */
public class AccountNumberBuilder {
	/** Array of lines representing the account number. **/
	private String[] lines = new String[3];
	
	/** The number of lines already added. **/
	private int numLines = 0;
	
	/**
	 * Scans a line and adds it to the builder.
	 * 
	 * @param line
	 */
	public void scanLine(String line) {
		if(numLines < 3)
			lines[numLines++] = line;
	}
	
	/**
	 * Purges the builder so that it can be reused.
	 */
	public void purge() {
		lines = new String[3];
		numLines = 0;
	}

	/**
	 * Builder method to return an account number.
	 * 
	 * @return An AccountNumber represented by the scanned lines.  Returns
	 * null if the scanned lines were not of an appropriate length or if the
	 * number of lines read is not sufficient or appropriate for representing
	 * an account number. 
	 */
	public AccountNumber build() {
		return build(true); // Turn AI on by default
	}

	/**
	 * Builder method to return an account number.
	 * See the above method description.
	 * 
	 * @param attemptBestGuess
	 * @return
	 */
	public AccountNumber build(boolean attemptBestGuess) {
		AccountNumber accountNumber = null;
		Digit[] digits = new Digit[9];
		if((numLines == 3) 	&& 	(lines[0].length() == 27)
							&&	(lines[1].length() == 27)
							&&	(lines[2].length() == 27))
		{
			/* Read digits from left to right. */
			Digit digit = null;
			DigitBuilder digitBuilder = new DigitBuilder();
			for(int i=0; i < 27; i+=3) {
				digitBuilder.scanLine(lines[0].substring(i, i+3));
				digitBuilder.scanLine(lines[1].substring(i, i+3));
				digitBuilder.scanLine(lines[2].substring(i, i+3));
				digit = digitBuilder.build();
				digitBuilder.purge(); // purge so we can reuse the builder
				
				if(digit != null) {
					digits[i/3] = digit;
				} else {
					// This shouldn't happen
					break;
				}
			}
			
			accountNumber = new AccountNumber(digits);
		}

		if(attemptBestGuess) {
			// If the account number is invalid, try to correct errors
			Set closeMatches;
			Set candidates = new HashSet();
			if(!accountNumber.isValid()) {
				closeMatches = AccountNumber.getCloseMatches(accountNumber);
	
				for(AccountNumber match : closeMatches) {
					if(match.isValid()) {
						candidates.add(match);
					}
				}
	
				if(candidates.size() == 1) {
					accountNumber = candidates.iterator().next();
					candidates.clear();
				}
	
				accountNumber.setBestGuesses(candidates);
			}
		}
		
		return accountNumber;
	}
}



package matt.kata.ocr;

/**
 * Builder for Digits.
 * 
 * @author matt
 *
 */
public class DigitBuilder {
	/** A StringBuilder used to build the digit. **/
	private StringBuilder builder = new StringBuilder();
	
	/** An int representing the number of lines scanned. **/
	private int numLines = 0;
	
	/**
	 * Scans a line of the digit.
	 * @param line
	 */
	public void scanLine(String line) {
		builder.append(line);
		if(++numLines < 3)
			builder.append("\n");
	}
	
	/**
	 * Purges the builder so that it can be reused.
	 */
	public void purge() {
		builder = new StringBuilder();
		numLines = 0;
	}
	
	/**
	 * Builds the digit and returns it.
	 * 
	 * @return The Digit represented by the scanned lines.  Returns null if
	 * the block of text representing the digit is not formatted correctly.
	 */
	public Digit build() {
		Digit digit = null;
		if((numLines == 3) && (builder.length() == 11)) {
			digit = Digit.getDigit(builder.toString());
		}
		
		return digit;
	}
}




package matt.kata.ocr;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * Flyweight class representing OCR digits.
 * 
 * @author matt
 *
 */
public class Digit {
	/** Static flyweight map containing Digits which have been encountered. **/
	private static Map digits = new HashMap();
	
	/** The block of text representing Digit. **/
	private String blockRep = null;
	
	/** A map representing acceptable blocks of characters and the string representation of their
	 * numerical equivalents.
	 */
	public static final Map VALID_DIGITS;

	/** Enum of valid digits. **/
	public static enum DigitRep {
		ZERO 	(	        " _ \n" +
				 	"| |\n" +
				 	"|_|"		),
		
		ONE		(	"   \n" +
					"  |\n" +
					"  |"		),

		TWO		(	" _ \n" +
					" _|\n" +
					"|_ "		),

		THREE	(	        " _ \n" +
					" _|\n" +
					" _|"		),

		FOUR	(	        "   \n" +
					"|_|\n" +
					"  |"		),

		FIVE	(	        " _ \n" +
					"|_ \n" +
					" _|"		),

		SIX		(	" _ \n" +
					"|_ \n" +
					"|_|"		),

		SEVEN	(	        " _ \n" +
					"  |\n" +
					"  |"		),

		EIGHT	(	        " _ \n" +
					"|_|\n" +
					"|_|"		),

		NINE	(	        " _ \n" +
					"|_|\n" +
					" _|"		);

		private String blockRep;

		DigitRep(String blockRep) {
			this.blockRep = blockRep;
		}

		@Override
		public String toString() {
			return blockRep;
		}
	}
	
	static {
		/* Initialize the map of acceptable character blocks. */
		Map tmpMap = new HashMap();
		tmpMap.put(	DigitRep.ZERO.toString(), 	"0");
		tmpMap.put(	DigitRep.ONE.toString(), 	"1");
		tmpMap.put(	DigitRep.TWO.toString(), 	"2");
		tmpMap.put(	DigitRep.THREE.toString(), 	"3");
		tmpMap.put(	DigitRep.FOUR.toString(), 	"4");
		tmpMap.put(	DigitRep.FIVE.toString(), 	"5");
		tmpMap.put(	DigitRep.SIX.toString(), 	"6");
		tmpMap.put(	DigitRep.SEVEN.toString(), 	"7");
		tmpMap.put(	DigitRep.EIGHT.toString(), 	"8");
		tmpMap.put(	DigitRep.NINE.toString(), 	"9");		
		VALID_DIGITS = Collections.unmodifiableMap(tmpMap);
	}
	
	/**
	 * Private constructor to enforce flyweight/factory pattern.
	 * 
	 * @param blockRep
	 */
	private Digit(String blockRep) {
		this.blockRep = blockRep;
	}

	/**
	 * Flyweight factory method to create a Digit object from the "block"
	 * representation of the digit.
	 * @param blockRep The "block" representation of a digit.  Should look
	 * something like:
	 * " _ \n"
	 * "|_|\n"
	 * "|_|"
	 * @return A flyweight Digit object representing the digit.
	 */
	public static synchronized Digit getDigit(String blockRep) {
		Digit digit = digits.get(blockRep);
		if(digit == null) {
			digit = new Digit(blockRep);
			digits.put(blockRep, digit);
		}
		
		return digit;
	}
	
	/**
	 * Determines whether or not the digit is valid.
	 * @return true if the digit is valid, else false.
	 */
	public boolean isValid() {
		return VALID_DIGITS.containsKey(blockRep);
	}

	/**
	 * Accessor method to get the block representation of this digit.
	 * 
	 * @return
	 */
	public String getBlockRep() {
		return blockRep;
	}

	@Override
	public String toString() {
		return VALID_DIGITS.containsKey(blockRep) ? VALID_DIGITS.get(blockRep) : "?";
	}
    }

The Flyweight pattern works well in this case, since there are only 10 canonical Digit objects, each of which can occur multiple times. Flyweight ensures that we don’t waste memory in the course of their processing.  Finally, we have the AccountNumber class and the unit test class, TestOCR (the OCRException class is just a generic Exception wrapper, so I’ve decided it’s too trivial to post).

package matt.kata.ocr;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * Class representing an account number.
 * 
 * @author matt
 *
 */
public class AccountNumber {
	/** An array of 9 digits comprising the account number. **/
	private Digit[] digits = new Digit[9];

	/** A set representing the best guesses for a valid account number (if it's invalid). **/
	private Set bestGuesses = new HashSet();
	
	/**
	 * Creates an AccountNumber from the array of digits.
	 * @param digits
	 */
	public AccountNumber(Digit[] digits) {
		this.digits = digits;
	}

	/**
	 * Returns a set of AccountNumbers which have a Levenshtein distance of 1 from
	 * accountNumber.  The set is not exhaustive, as transpositions, deletions, and
	 * insertions are not considered (to stay within the bounds of the problem scope).
	 * Only alterations are considered, choosing from the alphabet of (" ", "_", "|").
	 * The algorithm could be extended to consider Levenshtein distances of 2 or
	 * greater, but that is outside the scope of the problem.
	 * 
	 * References:
	 *  - http://en.wikipedia.org/wiki/Levenshtein_distance
	 *  - http://norvig.com/spell-correct.html
	 * 
	 * @param accountNumber
	 * @return
	 */
	public static Set getCloseMatches(AccountNumber accountNumber) {
		Set adjacentAcctNums = new HashSet();
		Digit[] digits = accountNumber.getDigits();

		// tmp vars
		Digit[] modifiedDigits;
		Character currentChar;
		String blockRep;
		String modifiedBlockRep;
		String prefix;
		String postfix;

		for(int i=0; i < digits.length; i++) {
			blockRep = digits[i].getBlockRep();
			modifiedDigits = Arrays.copyOf(digits, digits.length); // deep copy the original array

			for(int j=0; j < blockRep.length(); j++) {
				currentChar = blockRep.charAt(j);

				if(currentChar != '\n') {
					prefix = blockRep.substring(0, j);
					postfix = blockRep.substring(j+1, blockRep.length());

					// TODO: I don't like all these array deep copies.  This can be refactored to be more performant.
					if(currentChar == ' ') {
						modifiedBlockRep = prefix + "|" + postfix;
						modifiedDigits[i] = Digit.getDigit(modifiedBlockRep);
						adjacentAcctNums.add(new AccountNumber(Arrays.copyOf(modifiedDigits, modifiedDigits.length)));
						
						modifiedBlockRep = prefix + "_" + postfix;
						modifiedDigits[i] = Digit.getDigit(modifiedBlockRep);
						adjacentAcctNums.add(new AccountNumber(Arrays.copyOf(modifiedDigits, modifiedDigits.length)));
					} else if(currentChar == '|') {
						modifiedBlockRep = prefix + " " + postfix;
						modifiedDigits[i] = Digit.getDigit(modifiedBlockRep);
						adjacentAcctNums.add(new AccountNumber(Arrays.copyOf(modifiedDigits, modifiedDigits.length)));
						
						modifiedBlockRep = prefix + "_" + postfix;
						modifiedDigits[i] = Digit.getDigit(modifiedBlockRep);
						adjacentAcctNums.add(new AccountNumber(Arrays.copyOf(modifiedDigits, modifiedDigits.length)));
					} else if(currentChar == '_') {
						modifiedBlockRep = prefix + "|" + postfix;
						modifiedDigits[i] = Digit.getDigit(modifiedBlockRep);
						adjacentAcctNums.add(new AccountNumber(Arrays.copyOf(modifiedDigits, modifiedDigits.length)));
						
						modifiedBlockRep = prefix + " " + postfix;
						modifiedDigits[i] = Digit.getDigit(modifiedBlockRep);
						adjacentAcctNums.add(new AccountNumber(Arrays.copyOf(modifiedDigits, modifiedDigits.length)));
					} else {
						// TODO: Error condition (invalid character).
					}
				}
			}
		}

		return adjacentAcctNums;
	}

	@Override
	public String toString() {
		StringBuilder builder = new StringBuilder();
		for(Digit digit : digits) {
			builder.append(digit.toString());
		}
		
		return builder.toString();
	}

	/**
	 * Returns a detailed string including a status message.
	 * 
	 * @return
	 */
	public String toDetailedString() {
		return toString() + getStatusString();
	}
	
	/**
	 * Calculates and returns the checksum for this account number.
	 * 
	 * @return The calculated checksum. Returns -1 if the checksum is
	 * not calculable (i.e. the account number contains a "?").
	 */
	public int calculateChecksum() {
		int total = 0;

		if(!Arrays.toString(digits).matches("(.*)\\?(.*)")) {
			int tmpInt;
			for(int i=1; i <= digits.length; i++) {
				tmpInt = Integer.parseInt(digits[i-1].toString());
				total += (10-i)*tmpInt;
			}
		} else {
			total = -1; // if the account number contains "?", return -1
		}
		
		return total;
	}
	
	/**
	 * Determines whether or not the AccountNumber is valid, according to its
	 * checksum.  The checksum will be invalid if the account number contains
	 * illegal characters.
	 * 
	 * @return true if the AccountNumber is valid, else false.
	 */
	public boolean isValid() {
		return calculateChecksum() % 11 == 0;
	}

	/**
	 * Accessor method to return the array of Digits comprising this account number.
	 * 
	 * @return
	 */
	public Digit[] getDigits() {
		return digits;
	}
	
	/**
	 * Returns a status string for the account number.  "ILL" indicates the account number
	 * contains illegal characters.  "ERR" indicates the checksum was not valid.  "AMB"
	 * indicates that there were errors in scanning which was determined to be ambiguous,
	 * but the ambiguities could not be resolved.
	 * @return
	 */
	public String getStatusString() {
		String status = "";

		if(!isValid()) {
			if(!bestGuesses.isEmpty()) {
				status = " AMB [";

				for(AccountNumber acctNum : bestGuesses) {
					status += "'" + acctNum + "', ";
				}

				status = status.substring(0, status.length() -2);
				status += "]";
			} else {
				if(toString().contains("?")) {
					status = " ILL";
				} else {
					status = " ERR";
				}
			}
		}

		return status;
	}

	/**
	 * Mutator method to set the best guesses for this AccountNumber.
	 * 
	 * @param bestGuesses
	 */
	public void setBestGuesses(Set bestGuesses) {
		this.bestGuesses = bestGuesses;
	}

	/**
	 * Accessor method to get the best guesses for this AccountNumber.
	 * 
	 * @return
	 */
	public Set getBestGuesses() {
		return bestGuesses;
	}
}



package matt.kata.ocr;

import java.util.Set;
import junit.framework.TestCase;

/**
 * Test cases.
 * 
 * @author matt
 *
 */
public class TestOCR extends TestCase {
	public void testScanValidDigit() {
		String[] digitString = {
			" _ ",
			" _|",
			"|_ "
		};
		
		DigitBuilder builder = new DigitBuilder();
		builder.scanLine(digitString[0]);
		builder.scanLine(digitString[1]);
		builder.scanLine(digitString[2]);

		Digit digit = builder.build();
		assertTrue(digit.isValid());
		assertEquals("2", digit.toString());
	}
	
	public void testScanInvalidDigit() {
		String[] digitString = {
			" _ ",
			" _ ",
			"|_ "
		};
			
		DigitBuilder builder = new DigitBuilder();
		builder.scanLine(digitString[0]);
		builder.scanLine(digitString[1]);
		builder.scanLine(digitString[2]);

		Digit digit = builder.build();
		assertFalse(digit.isValid());
		assertEquals("?", digit.toString());
	}

	public void testScanDigitInvalidFormat() {
		String[] digitString = {
			" ___", // 4 characters per line (not 3)
			" _  ", // this is invalid
			"|_  "
		};
	
		DigitBuilder builder = new DigitBuilder();
		builder.scanLine(digitString[0]);
		builder.scanLine(digitString[1]);
		builder.scanLine(digitString[2]);

		Digit digit = builder.build();
		assertNull(digit);
	}

	public void testChecksumCalc() {
		String[] accountNumString = {
			"    _  _     _  _  _  _  _ ",
			"  | _| _||_||_ |_   ||_||_|",
			"  ||_  _|  | _||_|  ||_| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertEquals(165, accountNumber.calculateChecksum());
	}

	public void testScanValidAccountNumber1() {
		String[] accountNumString = {
			" _  _  _  _  _  _  _  _  _ ",
			"| || || || || || || || || |",
			"|_||_||_||_||_||_||_||_||_|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertTrue(accountNumber.isValid());
		assertEquals("000000000", accountNumber.toString());
	}

	public void testScanValidAccountNumber2() {
		String[] accountNumString = {
			"    _  _     _  _  _  _  _ ",
			"  | _| _||_||_ |_   ||_||_|",
			"  ||_  _|  | _||_|  ||_| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertTrue(accountNumber.isValid());
		assertEquals("123456789", accountNumber.toString());
	}

	public void testScanInvalidAccountNumber1() {
		String[] accountNumString = {
			" _  _  _     _  _  _  _  _ ",
			" _| _| _||_||_ |_   ||_||_|",
			"|_ |_  _|  | _||_|  ||_| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build(false); // don't attempt to correct
		assertFalse(accountNumber.isValid()); // invalid checksum
		assertEquals("223456789", accountNumber.toString());
	}

	public void testScanInvalidAccountNumber2() {
		String[] accountNumString = {
			" _  _        _  _  _  _  _ ", // invalid digits
			" _| _| _||_||_ |_   ||_||_|",
			"|_ |_  _|  | _ |_|  ||_| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertFalse(accountNumber.isValid()); // invalid checksum
		assertEquals("22?4?6789", accountNumber.toString());
	}

	public void testGenerateCloseMatches() {
		String[] accountNumString = {
			"    _  _     _  _  _  _  _ ",
			"  | _| _||_||_ |_   ||_||_|",
			"   |_  _|  | _||_|  ||_| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build(false); // don't attempt to correct
		assertEquals("?23456789", accountNumber.toString());
		Set matches = AccountNumber.getCloseMatches(accountNumber);
		assertEquals(9*9*2, matches.size());

		boolean found = false;
		for(AccountNumber acctNum : matches) {
			if(acctNum.toString().equals("123456789")) {
				found = true;
				break;
			}
		}

		assertTrue(found);
	}

	public void testCorrectMisreadCharacter1() {
		String[] accountNumString = {
			"    _  _     _  _  _  _  _ ",
			"  | _| _||_||_ |_   ||_||_|",
			"   |_  _|  | _||_|  ||_| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertEquals("123456789", accountNumber.toString());
	}

	public void testAmbiguous1() {
		String[] accountNumString = {
			" _  _  _  _  _  _  _  _  _ ",
			"|_||_||_||_||_||_||_||_||_|",
			"|_||_||_||_||_||_||_||_||_|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertTrue(accountNumber.toDetailedString().matches("888888888 AMB \\[(.*)'888886888'(.*)\\]"));
		assertTrue(accountNumber.toDetailedString().matches("888888888 AMB \\[(.*)'888888880'(.*)\\]"));
		assertTrue(accountNumber.toDetailedString().matches("888888888 AMB \\[(.*)'888888988'(.*)\\]"));
	}

	public void testAmbiguous2() {
		String[] accountNumString = {
			" _  _  _  _  _  _  _  _  _ ",
			"|_ |_ |_ |_ |_ |_ |_ |_ |_ " ,
			"|_||_||_||_||_||_||_||_||_|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertTrue(accountNumber.toDetailedString().matches("666666666 AMB \\[(.*)'686666666'(.*)\\]"));
		assertTrue(accountNumber.toDetailedString().matches("666666666 AMB \\[(.*)'666566666'(.*)\\]"));
	}

	public void testStatusStringIllegal() {
		String[] accountNumString = {
			" _           _  _  _  _  _ ",
			" _|  | _||_| _ |_   ||_||_|",
			" _|  | _|  | _ |_|  | _| _|"
		};

		AccountNumberBuilder builder = new AccountNumberBuilder();
		builder.scanLine(accountNumString[0]);
		builder.scanLine(accountNumString[1]);
		builder.scanLine(accountNumString[2]);

		AccountNumber accountNumber = builder.build();
		assertFalse(accountNumber.isValid());
		assertEquals("31?4?6799", accountNumber.toString());
		assertEquals(" ILL", accountNumber.getStatusString());
		assertEquals("31?4?6799 ILL", accountNumber.toDetailedString());
	}
}

19 notes

Page 1 of 2