In the previous posts, the frequency of n-grams in a given corpus was calculated using suffix arrays. Although this works well, I wondered if there was a more accessible way to find a string’s frequency. The Map type is being used a lot, so I rewrote some previous code using a Map:

import qualified Data.Map as M
import qualified Data.List as L
import System.Environment (getArgs)
import System.IO
import Data.Char (toLower, toUpper)

main = do
	(command:args) <- getArgs
	case command of
		"freq" -> freq args
		_ -> putStrLn "Error: command does not existnPossible commands are:↵
n-freq string filenamen"

{-frequency: calculate frequency of word -}		
freq :: [String] -> IO()
freq [string, fileName] = do
		fileHandle <- openFile fileName ReadMode
		contents <- hGetContents fileHandle
		let corpus = prepare contents
		let alpha = words string
		putStrLn ("Frequency "" ++ string ++ "": " ++ (show $ frequency corpus alpha))
		hClose fileHandle
freq _ = do
		putStrLn "Error: the freq command requires two arguments: freq word filename"

frequency :: (Ord a) => [a] -> [a] -> Maybe Int
frequency corpus alpha = biFrequency nMap alpha
		where 
			nMap = createMap $ L.sort $ nGrams (length alpha) corpus 

biFrequency :: (Ord a) => M.Map Int [a] -> [a] -> Maybe Int
biFrequency nMap alpha  = do
			sLast 	<- lastOccur nMap alpha 0 (M.size nMap - 1)
			sFirst	<- firstOccur nMap alpha 0 sLast
			return (sLast - sFirst + 1)

firstOccur :: (Ord a) => M.Map Int [a] -> [a] -> Int -> Int -> Maybe Int
firstOccur nMap alpha lower upper	
			| M.null nMap = Nothing
			| upper == lower = case compare alpha (nMap M.! lower) of 
				EQ -> Just lower
				_ -> Nothing
			| otherwise = case compare alpha (nMap M.! middle) of
				GT -> firstOccur nMap alpha (middle + 1) upper
				_ -> firstOccur nMap alpha lower middle
			where middle = (lower + upper) `div` 2

lastOccur :: (Ord a) => M.Map Int [a] -> [a] -> Int -> Int -> Maybe Int
lastOccur nMap alpha lower upper 	
			| M.null nMap = Nothing
			| upper == lower = case compare alpha (nMap M.! lower) of
					EQ -> Just lower
					_ -> Nothing
			| otherwise = case compare alpha (nMap M.! middle) of
					LT -> lastOccur nMap alpha lower (middle - 1)
					_ -> lastOccur nMap alpha middle upper
			where middle = ((lower + upper) `div` 2) + 1

--create a list of n-grams
nGrams :: Int -> [a] -> [[a]]
nGrams n [] = []
nGrams n xs = take n xs : nGrams n (tail xs)

--convert list to Map
createMap :: (Ord a) => [[a]] -> M.Map Int [a]
createMap list = M.fromList (zip index list)
			where index = [0..(length list - 1)]

--clean a String: replace punctuation | map to lowercase | convert to a list of words
prepare :: String -> [String]
prepare = words . map toLower . replace'

replace' :: [Char] -> [Char]
replace' [] = []
replace' (x:xs) = case x of
				'.' -> " _." ++ replace' xs
				',' -> " _," ++ replace' xs
				';' -> " _;" ++ replace' xs
				'"' -> " _"" ++ replace' xs
				'!' -> " _!" ++ replace' xs
				':' -> " _:" ++ replace' xs
				'?' -> " _?" ++ replace' xs
				_ -> x : replace' xs

flCompare :: (Ord a) => M.Map Int [a] -> [a] -> [a] -> Ordering
flCompare nMap x y = compare (biFrequency nMap x) (biFrequency nMap y)

This works as it should:

>time ./final_brown freq “the” brown.txt
Frequency “the”: Just 69174

real 0m17.524s
user 0m16.890s
sys 0m0.467s
>time ./final_brown freq “the vice president” brown.txt
Frequency “the vice president”: Just 6

real 0m22.502s
user 0m21.838s
sys 0m0.541s

The next step is finding the most frequent n-gram in the Brown Corpus using the Map type. Time to put those little grey cells of mine to some use :-).