From 604ec0a8c9c0c2da963f3abb9c00ceadd399d3aa Mon Sep 17 00:00:00 2001 From: aarne Date: Mon, 9 Oct 2006 07:37:25 +0000 Subject: gfcc doc --- src/GF/Canon/GFCC/doc/gfcc.html | 201 ++++++++++++++++++++++++++++------------ 1 file changed, 140 insertions(+), 61 deletions(-) (limited to 'src/GF/Canon/GFCC/doc/gfcc.html') diff --git a/src/GF/Canon/GFCC/doc/gfcc.html b/src/GF/Canon/GFCC/doc/gfcc.html index ef88980a4..87f80922c 100644 --- a/src/GF/Canon/GFCC/doc/gfcc.html +++ b/src/GF/Canon/GFCC/doc/gfcc.html @@ -34,7 +34,8 @@ October 3, 2006
  • Running the compiler and the GFCC interpreter
  • The reference interpreter -
  • Some things to do +
  • Interpreter in C++ +
  • Some things to do

    @@ -102,18 +103,18 @@ as translated to GFCC. The representations are aligned, with the exceptions due to the alphabetical sorting of GFCC grammars.

    -                                      grammar Ex (Eng Swe);
    +                                      grammar Ex(Eng,Swe);
       
       abstract Ex = {                     abstract {
         cat 
           S ; NP ; VP ;
         fun 
    -      Pred : NP -> VP -> S ;            Pred : NP VP -> S = (Pred);
    +      Pred : NP -> VP -> S ;            Pred : NP,VP -> S = (Pred);
           She, They : NP ;                  She : -> NP = (She);
           Sleep : VP ;                      Sleep : -> VP = (Sleep); 
                                             They : -> NP = (They);
       }                                     } ;
    -                                      ;
    +                                      
       concrete Eng of Ex = {              concrete Eng {
         lincat
           S  = {s : Str} ;
    @@ -122,7 +123,7 @@ due to the alphabetical sorting of GFCC grammars.
         param
           Num = Sg | Pl ;
         lin
    -      Pred np vp = {                    Pred = [($0[1], $1[0][$0[0]])] ;
    +      Pred np vp = {                    Pred = [(($0!1),(($1!0)!($0!0)))];
             s = np.s ++ vp.s ! np.n} ;      
           She = {s = "she" ; n = Sg} ;      She = [0, "she"];
           They = {s = "they" ; n = Pl} ;    
    @@ -141,13 +142,12 @@ due to the alphabetical sorting of GFCC grammars.
         param
           Num = Sg | Pl ;
         lin
    -      Pred np vp = {                    Pred = [($0[1], $1[0])];
    +      Pred np vp = {                    Pred = [(($0!0),($1!0))];
             s = np.s ++ vp.s} ;
           She = {s = "hon"} ;               She = ["hon"];
           They = {s = "de"} ;               They = ["de"];
           Sleep = {s = "sover"} ;           Sleep = ["sover"];
    -                                        } ;
    -  }                                   ;
    +  }                                     } ;                                   
     

    @@ -161,9 +161,9 @@ the concrete languages. The abstract syntax and the concrete syntaxes themselves follow.

    -    Grammar  ::= Header ";" Abstract ";" [Concrete] ";" ;
    +    Grammar  ::= Header ";" Abstract ";" [Concrete] ;
         Header   ::= "grammar" CId "(" [CId] ")" ;
    -    Abstract ::= "abstract" "{" [AbsDef] "}" ";" ;
    +    Abstract ::= "abstract" "{" [AbsDef] "}" ;
         Concrete ::= "concrete" CId "{" [CncDef] "}" ;
     

    @@ -224,24 +224,27 @@ literal.

    Concrete syntax

    Linearization terms (Term) are built as follows. +Constructor names are shown to make the later code +examples readable.

    -    Term     ::= "[" [Term] "]" ;          -- array
    -    Term     ::= Term "[" Term "]" ;       -- access to indexed field
    -    Term     ::= "(" [Term] ")" ;          -- sequence with ++
    -    Term     ::= Tokn ;                    -- token
    -    Term     ::= "$" Integer ;             -- argument subtree
    -    Term     ::= Integer ;                 -- array index
    -    Term     ::= "[|" [Term] "|]" ;        -- free variation
    +    R.  Term ::= "[" [Term] "]" ;        -- array
    +    P.  Term ::= "(" Term "!" Term ")" ; -- access to indexed field
    +    S.  Term ::= "(" [Term] ")" ;        -- sequence with ++
    +    K.  Term ::= Tokn ;                  -- token
    +    V.  Term ::= "$" Integer ;           -- argument
    +    C.  Term ::= Integer ;               -- array index
    +    FV. Term ::= "[|" [Term] "|]" ;      -- free variation
    +    TM. Term ::= "?" ;                   -- linearization of metavariable
     

    Tokens are strings or (maybe obsolescent) prefix-dependent variant lists.

    -    Tokn     ::= String ;
    -    Tokn     ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
    -    Variant  ::= [String] "/" [String] ;
    +    KS.  Tokn     ::= String ;
    +    KP.  Tokn     ::= "[" "pre" [String] "[" [Variant] "]" "]" ;
    +    Var. Variant  ::= [String] "/" [String] ;
     

    Three special forms of terms are introduced by the compiler @@ -250,9 +253,9 @@ their presence makes grammars much more compact. Their semantics will be explained in a later section.

    -    Term     ::= CId ;                     -- global constant
    -    Term     ::= "(" String "+" Term ")" ; -- prefix + suffix table
    -    Term     ::= "(" Term "@" Term ")";    -- record parameter alias
    +    F.  Term ::= CId ;                     -- global constant
    +    W.  Term ::= "(" String "+" Term ")" ; -- prefix + suffix table
    +    RP. Term ::= "(" Term "@" Term ")";    -- record parameter alias
     

    Identifiers are like Ident in GF and GFC, except that @@ -282,7 +285,7 @@ in which linearization is performed. AS s -> R [kks (show s)] -- quoted AI i -> R [kks (show i)] AF d -> R [kks (show d)] - AM -> R [kks "?"] ---- TODO: proper lincat + AM -> TM where lin = linExp mcfg lang comp = compute mcfg lang @@ -301,6 +304,7 @@ a string using the following algorithm. K (KP s _) -> unwords s ---- prefix choice TODO W s t -> s ++ realize t FV (t:_) -> realize t + TM -> "?"

    Since the order of record fields is not necessarily @@ -320,39 +324,48 @@ needed:

    -The code is cleaned from debugging information present in the working -version. -

    -
    -    compute :: GFCC -> CId -> [Term] -> Term -> Term
    -    compute mcfg lang args = comp where
    -      comp trm = case trm of
    -        P r (FV ts) -> FV $ Prelude.map (comp . P r) ts
    +The code is presented in one-level pattern matching, to
    +enable reimplementations in languages that do not permit
    +deep patterns (such as Java and C++).
    +

    +
    +  compute :: GFCC -> CId -> [Term] -> Term -> Term
    +  compute mcfg lang args = comp where
    +    comp trm = case trm of
    +      P r p  -> proj (comp r) (comp p)
    +      RP i t -> RP (comp i) (comp t)
    +      W s t  -> W s (comp t)
    +      R ts   -> R $ Prelude.map comp ts
    +      V i    -> idx args (fromInteger i)  -- already computed
    +      F c    -> comp $ look c             -- not computed (if contains V)
    +      FV ts  -> FV $ Prelude.map comp ts
    +      S ts   -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts
    +      _ -> trm
       
    -        P r p -> case (comp r, comp p) of 
    +    look = lookLin mcfg lang
       
    -          -- for the suffix optimization
    -          (W s (R ss), p') -> case comp $ idx ss (getIndex p') of
    -            K (KS u) -> kks (s ++ u)
    +    idx xs i = xs !! i
       
    -          (r', p') -> comp $ (getFields r') !! (getIndex p')
    +    proj r p = case (r,p) of
    +      (_,     FV ts) -> FV $ Prelude.map (proj r) ts
    +      (W s t, _)     -> kks (s ++ getString (proj t p))
    +      _              -> comp $ getField r (getIndex p)
       
    -        RP i t -> RP (comp i) (comp t)
    -        W s t -> W s (comp t)
    -        R ts  -> R $ Prelude.map comp ts
    -        V i   -> args !! (fromInteger i)    -- already computed
    -        S ts  -> S $ Prelude.filter (/= S []) $ Prelude.map comp ts
    -        F c   -> comp $ lookLin mcfg lang   -- not yet computed
    -        FV ts -> FV $ Prelude.map comp ts
    -        _ -> trm
    +    getString t = case t of
    +      K (KS s) -> s
    +      _ -> trace ("ERROR in grammar compiler: string from "++ show t) "ERR"
       
    -      getIndex t =  case t of
    -        C i -> fromInteger i
    -        RP p _ -> getIndex p
    +    getIndex t =  case t of
    +      C i    -> fromInteger i
    +      RP p _ -> getIndex p
    +      TM     -> 0  -- default value for parameter
    +      _ -> trace ("ERROR in grammar compiler: index from " ++ show t) 0
       
    -      getFields t = case t of
    -        R rs -> rs
    -        RP _ r -> getFields r
    +    getField t i = case t of
    +      R rs   -> idx rs i
    +      RP _ r -> getField r i
    +      TM     -> TM
    +      _ -> trace ("ERROR in grammar compiler: field from " ++ show t) t
     

    @@ -365,7 +378,7 @@ explanation. Global constants

    -    Term     ::= CId ;
    +    Term ::= CId ;
     

    are shorthands for complex terms. They are produced by the @@ -378,7 +391,7 @@ its definition. Prefix-suffix tables

    -    Term     ::= "(" String "+" Term ")" ; 
    +    Term ::= "(" String "+" Term ")" ; 
     

    represent tables of word forms divided to the longest common prefix @@ -410,7 +423,7 @@ take effect. The most curious construct of GFCC is the parameter array alias,

    -    Term     ::= "(" Term "@" Term ")";
    +    Term ::= "(" Term "@" Term ")";
     

    This form is used as the value of parameter records, such as the type @@ -451,8 +464,8 @@ we get the encoding The GFCC computation rules are essentially

    -    t [(i @ r)]  = t[i]
    -    (i @ r) [j]  = r[j]
    +    (t ! (i @ _)) = (t ! i)
    +    ((_ @ r) ! j)  =(r ! j)
     

    @@ -574,11 +587,11 @@ This expression must first be translated to a case expression, which can then be translated to the GFCC term

    -    [2,5][$0[$1]]  
    +    ([2,5] ! ($0 ! $1))  
     

    -assuming that the variable $np$ is the first argument and that its -$Number$ field is the second in the record. +assuming that the variable np is the first argument and that its +Number field is the second in the record.

    This transformation of course has to be performed recursively, since @@ -693,9 +706,71 @@ The available commands are +

    Interpreter in C++

    +

    +A base-line interpreter in C++ has been started. +Its main functionality is random generation of trees and linearization of them. +

    +

    +Here are some results from running the different interpreters, compared +to running the same grammar in GF, saved in .gfcm format. +The grammar contains the English, German, and Norwegian +versions of Bronzeage. The experiment was carried out on +Ubuntu Linux laptop with 1.5 GHz Intel centrino processor. +

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    GFgfcc(hs)gfcc++
    program size7249k803k113k
    grammar size336k119k119k
    read grammar1150ms510ms150ms
    generate 2229500ms450ms800ms
    memory21M10M2M
    + +

    +

    +To summarize: +

    +
      +
    • going from GF to gfcc is a major win in both code size and efficiency +
    • going from Haskell to C++ interpreter is a win in code size and memory, + but not so much in speed +
    + +

    Some things to do

    -Interpreters in Java and C++. +Interpreter in Java.

    Parsing via MCFG @@ -706,7 +781,11 @@ Parsing via MCFG

    -File compression of GFCC output. +Hand-written parsers for GFCC grammars to reduce code size +(and efficiency?) of interpreters. +

    +

    +Binary format and/or file compression of GFCC output.

    Syntax editor based on GFCC. -- cgit v1.2.3