A short post again. I wrote functions that add the default “NN” tag to the model if the token does not contain a tag. More precisely: when the tag value is Nothing instead of Just a tag. I also copied some functions that should run the model against an untagged test file. The distinction between modeling and testing proves a bit artificial, I might change it in the future.

As an aside, the following days will be filled with live jazz music, so don’t be surprised if I don’t publish a post very soon.

import qualified Data.Map as M
import qualified Data.List as L
import Data.List.Zipper as Z
import Data.Maybe as DM
import System.Environment (getArgs)
import System.IO
 
{-----------
DATA TYPES
-----------}
type Token = String
type Tag = String
type TokenTag = (Token, Tag)
 
{-----------}
{-------------------
MODEL TRAINER
-------------------}
-- split the tokens and the tags
toTokenTag :: Char -> String -> TokenTag
toTokenTag separator string =
   let (token, tag, _) = rsplit_ separator string
   in (token, tag)
 
rsplit_ :: (Eq a) => a -> [a] -> ([a], [a], Bool)
rsplit_ separator = foldr (splitBool separator) ([],[],False)
   where
      splitBool separator letter (token, tag, True) = (letter:token, tag, True)
      splitBool separator letter (token, tag, False) | letter == separator = (token, tag, True)
                                                     | otherwise = (token, letter:tag, False)
-- Step 1: Frequency
--  Calculate the frequency of tags for a token
tokenTagFreqs :: [TokenTag] -> M.Map Token (M.Map Tag Int)
tokenTagFreqs = L.foldl' countWord M.empty
   where
      countWord map1 (token, tag) = M.insertWith (countTag tag) token (M.singleton tag 1) map1
      countTag tag _ map2 = M.insertWith ( newFreq oldFreq -> oldFreq + newFreq) tag 1 map2
 
-- Find the most frequent tag for a token
tokenMostFreqTag :: M.Map Token (M.Map Tag Int) -> M.Map Token Tag
tokenMostFreqTag = M.map (fst . M.foldlWithKey findMax ("NIL", 0))
   where findMax acc@(_, maxFreq) tag freq
           | freq > maxFreq = (tag, freq)
           | otherwise = acc
 
-- Step 2: add default tag
-- add default tag to all tokens without a tag (probably"NN")
defaultTag :: Tag -> (Token, Maybe Tag) -> (Token, Tag)
defaultTag baseTag (token, Nothing) = (token, baseTag)
defaultTag baseTag (token, Just tag) = (token, tag)
 
{-------------
MODEL TESTER
--------------}
lookupToken :: M.Map Token Tag -> Token -> Maybe Tag
lookupToken myMap token = M.lookup token myMap
 
freqTest :: [Token] -> M.Map Token Tag -> [(Token, Maybe Tag)]
freqTest wordList freqModel = L.zip wordList tagList
   where tagList = map (lookupToken freqModel) wordList
 
baseTest :: [(Token, Maybe Tag)] -> Tag -> [(Token, Tag)]
baseTest tokenTag baseTag = map (defaultTag baseTag) tokenTag