summaryrefslogtreecommitdiff
path: root/examples/trigram/Trigram.gf
diff options
context:
space:
mode:
authorkrasimir <krasimir@chalmers.se>2009-10-08 13:13:48 +0000
committerkrasimir <krasimir@chalmers.se>2009-10-08 13:13:48 +0000
commita84cdb32c75b11a2e7bae2906137cefae4ad03ce (patch)
tree5576798ba6f130070615beab0511ad1fd6e942db /examples/trigram/Trigram.gf
parentffb5fdcd2580b00203456636be887adcc9759bda (diff)
just for fun - grammar for trigram models in GF
Diffstat (limited to 'examples/trigram/Trigram.gf')
-rw-r--r--examples/trigram/Trigram.gf34
1 files changed, 34 insertions, 0 deletions
diff --git a/examples/trigram/Trigram.gf b/examples/trigram/Trigram.gf
new file mode 100644
index 000000000..2e6e18fdd
--- /dev/null
+++ b/examples/trigram/Trigram.gf
@@ -0,0 +1,34 @@
+abstract Trigram = {
+
+cat
+ -- A lexicon is a set of 'Word's
+ Word ;
+
+ -- All N-gram instances seen in the corpus are abstract syntax constants
+ Unigram (a : Word) ;
+ Bigram (a,b : Word) ;
+ Trigram (a,b,c : Word) ;
+
+ -- A text is a sequence of words, where the sequence is indexed by its last two tokens
+ Seq (a,b : Word) ;
+
+ -- The estimated probability of the trigram 'a b c' is the total probability of all
+ -- trees of type Prob a b c.
+ Prob (a,b,c : Word) ;
+
+data
+ -- Here we construct a sequence by using nil and cons. The Prob argument ensures
+ -- that the sequence contains only valid N-grams and contributes the right
+ -- probability mass.
+ nil : (a,b,c : Word) -> Prob a b c -> Seq b c ;
+ cons : ({a,b} : Word) -> Seq a b -> (c : Word) -> Prob a b c -> Seq b c ;
+
+ -- Here we construct probabilities. There are three ways: by trigrams, by bigrams and
+ -- by unigrams. Since the trigramP, bigramP and unigramP functions have some associated
+ -- probabilities as well, this results in linear smoothing between the unigram, bigram
+ -- and trigram models.
+ trigramP : ({a,b,c} : Word) -> Trigram a b c -> Prob a b c ;
+ bigramP : ({a,b,c} : Word) -> Bigram a b -> Bigram b c -> Prob a b c ;
+ unigramP : ({a,b,c} : Word) -> Unigram a -> Unigram b -> Unigram c -> Prob a b c ;
+
+} \ No newline at end of file