I haven’t posted something for a while, but I have been programming and rewriting parts of the posTagger from www.nlpwp.org:

  • the tagger now works with three different files (training, testing, evaluation)
  • it is trained based on the frequency of tags
  • and adds the most common tag (“NN”).

Next step is the implementation of transformational rules, but I have to dive into the Data.List.Zipper package first. There is also a small difference in statistics from the nlpwp.org version that I have to check.

posTagger   
  1. import qualified Data.Map as M
  2. import qualified Data.List as L
  3. import Data.List.Zipper as Z
  4. import Data.Maybe as DM
  5. import System.Environment (getArgs)
  6. import System.IO
  7.  
  8. {-----------
  9. DATA TYPES
  10. -----------}
  11. type Token = String
  12. type Tag = String
  13. type TokenTag = (Token, Tag)
  14.  
  15. {----------------
  16. IO: dealing with the outside world
  17. -----------------}
  18. main = do
  19. (command:args) <- getArgs
  20. case command of
  21. "tag" -> tag args
  22. _ -> putStrLn "Error: command does not existnPossible commands are:n-tag filenamen"
  23.  
  24. tag :: [String] -> IO()
  25. tag [trainName, testName, evalName] = do
  26. -- open files and retrieve contents
  27. trainFile <- openFile trainName ReadMode
  28. trainContents <- hGetContents trainFile
  29. testFile <- openFile testName ReadMode
  30. testContents <- hGetContents testFile
  31. evalFile <- openFile evalName ReadMode
  32. evalContents <- hGetContents evalFile
  33. let model = tokenMostFreqTag $ tokenTagFreqs $ map (toTokenTag '/') $ words trainContents
  34. -- test frequency tagging and print results
  35. putStrLn "***Frequency tagging***"
  36. let freqTags = freqTest model $ words testContents
  37. let evalTokenTag = map (toTokenTag '/') $ words evalContents
  38. let statFreqTags = compareMaybe evalTokenTag freqTags [0,0,0]
  39. prettyPrint statFreqTags
  40. -- test frequency tagging + base tagging and print results
  41. putStrLn "***Base tagging***"
  42. let baseTags = baseTest freqTags "NN"
  43. let statBaseTags = compareTag evalTokenTag baseTags [0,0]
  44. prettyPrint statBaseTags
  45. hClose testFile
  46. hClose evalFile
  47. test _ = do
  48. putStrLn "Error: the tag command requires three arguments: tag trainName testName evalName"
  49.  
  50. {-------------------
  51. MODEL TRAINER
  52. -------------------}
  53. -- split the tokens and the tags
  54. toTokenTag :: Char -> String -> TokenTag
  55. toTokenTag separator string =
  56. let (token, tag, _) = rsplit_ separator string
  57. in (token, tag)
  58.  
  59. rsplit_ :: (Eq a) => a -> [a] -> ([a], [a], Bool)
  60. rsplit_ separator = foldr (splitBool separator) ([],[],False)
  61. where
  62. splitBool separator letter (token, tag, True) = (letter:token, tag, True)
  63. splitBool separator letter (token, tag, False) | letter == separator = (token, tag, True)
  64. | otherwise = (token, letter:tag, False)
  65. -- Step 1: Frequency
  66. -- Calculate the frequency of tags for a token
  67. tokenTagFreqs :: [TokenTag] -> M.Map Token (M.Map Tag Int)
  68. tokenTagFreqs = L.foldl' countWord M.empty
  69. where
  70. countWord map1 (token, tag) = M.insertWith (countTag tag) token (M.singleton tag 1) map1
  71. countTag tag _ map2 = M.insertWith ( newFreq oldFreq -> oldFreq + newFreq) tag 1 map2
  72.  
  73. -- Find the most frequent tag for a token
  74. tokenMostFreqTag :: M.Map Token (M.Map Tag Int) -> M.Map Token Tag
  75. tokenMostFreqTag = M.map (fst . M.foldlWithKey findMax ("NIL", 0))
  76. where findMax acc@(_, maxFreq) tag freq
  77. | freq > maxFreq = (tag, freq)
  78. | otherwise = acc
  79.  
  80. -- Step 2: add default tag
  81. -- add default tag to all tokens without a tag (probably"NN")
  82. defaultTag :: Tag -> (Token, Maybe Tag) -> (Token, Tag)
  83. defaultTag baseTag (token, Nothing) = (token, baseTag)
  84. defaultTag baseTag (token, Just tag) = (token, tag)
  85.  
  86. {-------------
  87. MODEL TESTER
  88. --------------}
  89. lookupToken :: M.Map Token Tag -> Token -> Maybe Tag
  90. lookupToken myMap token = M.lookup token myMap
  91.  
  92. freqTest :: M.Map Token Tag -> [Token] -> [(Token, Maybe Tag)]
  93. freqTest freqModel wordList = L.zip wordList tagList
  94. where tagList = map (lookupToken freqModel) wordList
  95.  
  96. baseTest :: [(Token, Maybe Tag)] -> Tag -> [(Token, Tag)]
  97. baseTest tokenTag baseTag = map (defaultTag baseTag) tokenTag
  98.  
  99. {-------------
  100. MODEL EVALUATOR
  101. ---------------}
  102. compareTag :: [(Token, Tag)] -> [(Token, Tag)] -> [Int] -> [Int]
  103. compareTag [] [] [number, correct] = [number, correct]
  104. compareTag _ [] [number, correct] = [number, correct]
  105. compareTag [] _ [number, correct] = [number, correct]
  106. compareTag (x:xs) (y:ys) [number, correct] = compareTag xs ys [(number + 1), (correct + (eval x y))]
  107. where eval tagOriginal tagComputed | tagOriginal == tagComputed = 1
  108. | otherwise = 0
  109.  
  110. compareMaybe :: [(Token, Tag)] -> [(Token, Maybe Tag)] -> [Int] -> [Int]
  111. compareMaybe [] [] [number, correct, empty] = [number, correct, empty]
  112. compareMaybe _ [] [number, correct, empty] = [number, correct, empty]
  113. compareMaybe [] _ [number, correct, empty] = [number, correct, empty]
  114. compareMaybe (x:xs) (y:ys) [number, correct, empty] =
  115. compareMaybe xs ys [(number + 1), (correct + (evalJust x y)), (empty + (evalNothing x y))]
  116. where
  117. evalJust (_,tagOriginal) (_,tagComputed) | tagOriginal == (DM.fromMaybe "NIHIL" tagComputed) = 1
  118. | otherwise = 0
  119. evalNothing (_, tagOriginal) (_,tagComputed) | tagComputed == Nothing = 1
  120. | otherwise = 0
  121.  
  122. prettyPrint :: [Int] -> IO()
  123. prettyPrint [number, correct, empty] = do
  124. putStrLn ("Total tagged: " ++ (show $ (((number - empty) `myDiv` number) * 100)) ++ " %")
  125. putStrLn ("Total tagged correctly: " ++ (show $ ((correct `myDiv` number) * 100)) ++ " %")
  126. putStrLn ("Total empty: " ++ (show $ ((empty `myDiv` number) * 100)) ++ " %")
  127. putStrLn ("Total tagged known: " ++ (show $ ((correct `myDiv` (number-empty)) * 100)) ++ " %")
  128. prettyPrint [number, correct] = do
  129. putStrLn ("Total tagged correctly: " ++ (show $ ((correct `myDiv` number) * 100)) ++ " %")
  130.  
  131. myDiv :: Int -> Int -> Float
  132. myDiv n1 n2 = (fromIntegral n1) / (fromIntegral n2)

When you run this in the Terminal, it goes a little something like this:

koenroelandt$ ./posTagger tag train.txt test.txt eval.txt
***Frequency tagging***
Total tagged: 95.773186 %
Total tagged correctly: 87.66148 %
Total empty: 4.226818 %
Total tagged known: 91.530304 %
***Base tagging***
Total tagged correctly: 88.52606 %