2010-04-19 85 views
0
#include<stdio.h> 
#include<ctype.h> 
#include<string.h> 

/* this is a lexer which recognizes constants , variables ,symbols, identifiers , functions , comments and also header files . It stores the lexemes in 3 different files . One file contains all the headers and the comments . Another file will contain all the variables , another will contain all the symbols. */ 

int main() 
{ 
    int i=0,j,k,count=0; 
    char a,b[100],c[10000],d[100]; 
    memset (d, 0, 100); 
    j=30; 



    FILE *fp1,*fp2; 


    fp1=fopen("source.txt","r"); //the source file is opened in read only mode which will passed through the lexer 
    fp2=fopen("lext.txt","w"); 
    //now lets remove all the white spaces and store the rest of the words in a file 


    if(fp1==NULL) 
    { 
     perror("failed to open source.txt"); 
     //return EXIT_FAILURE; 
    } 
    i=0; 
    k=0; 
    while(!feof(fp1)) 
    { 


     a=fgetc(fp1); 


     if(a!=' '&&a!='\n') 
     { 
       if (!isalpha(a)) 
        { 

        switch(a) 
         { 

         case '+':{fprintf(fp2,"+ ----> PLUS \n"); 
           i=0;break;} 
         case '-':{fprintf(fp2,"- ---> MINUS \n"); 
           i=0;break;} 
         case '*':{fprintf(fp2, "* --->MULT \n"); 
           i=0;break;} 
         case '/':{fprintf(fp2, "/ --->DIV \n"); 
           i=0;break;} 
         //case '+=':fprintf(fp2, "%.20s\n", "ADD_ASSIGN"); 
         //case '-=':fprintf(fp2, "%.20s\n", "SUB_ASSIGN"); 
         case '=':{fprintf(fp2, "= ---> ASSIGN \n"); 
           i=0;break;} 
         case '%':{fprintf(fp2, "% ---> MOD \n"); 
           i=0;break;} 
         case '<':{fprintf(fp2, "< ---> LESSER_THAN \n"); 
           i=0;break;} 
         case '>':{fprintf(fp2, "> --> GREATER_THAN \n"); 
           i=0;break;} 
         //case '++':fprintf(fp2, "%.20s\n", "INCREMENT"); 
         //case '--':fprintf(fp2, "%.20s\n", "DECREMENT"); 
         //case '==':fprintf(fp2, "%.20s\n", "ASSIGNMENT"); 
         case ';':{fprintf(fp2, "; --->SEMI_COLUMN \n"); 
           i=0;break;} 
         case ':':{fprintf(fp2, ": --->COLUMN \n"); 
           i=0;break;} 
         case '(':{fprintf(fp2, "(--->LPAR \n"); 
           i=0;break;} 
         case ')':{fprintf(fp2, ") --->RPAR \n"); 
           i=0;break;} 
         case '{':{fprintf(fp2, "{ --->LBRACE \n"); 
           i=0;break;} 
         case '}':{fprintf(fp2, "} ---> RBRACE \n"); 
           i=0;break;} 
         } 
        } 
      else 
       { 

        d[i]=a; 
        //printf("%c\n",d[i]); 
        i=i+1; 


       } 
         //} 
         /* we can make the lexer more complex by including even more depths of checks for the symbols*/ 









     } 
     else 
      { 


      d[i+1]='\0'; 


     printf("\n"); 

      if((strcmp(d,"if ")==0)){fprintf(fp2,"if ----> IDENTIFIER \n"); 
         //printf("%s \n",d); 
         memset (d, 0, 100); 
         //printf("%s \n",d); 
         count=count+1;} 

      else if(strcmp(d,"then")==0){fprintf(fp2,"then ----> IDENTIFIER \n"); 
         count=count+1;} 

       else if(strcmp(d,"else")==0){fprintf(fp2,"else ----> IDENTIFIER \n"); 
          count=count+1;} 

       else if(strcmp(d,"switch")==0){fprintf(fp2,"switch ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"printf")==0){fprintf(fp2,"prtintf ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"scanf")==0){fprintf(fp2,"scanf ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"NULL")==0){fprintf(fp2,"NULL ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"int")==0){fprintf(fp2,"INT ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"char")==0){fprintf(fp2,"char ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"float")==0){fprintf(fp2,"float ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"long")==0){fprintf(fp2,"long ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"double")==0){fprintf(fp2,"double ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"const")==0){fprintf(fp2,"const ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"continue")==0)fprintf(fp2,"continue ----> IDENTIFIER \n"); 

       else if(strcmp(d,"size of")==0){fprintf(fp2,"size of ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"register")==0){fprintf(fp2,"register ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"short")==0){fprintf(fp2,"short ----> IDENTIFIER \n"); 
          count=count+1;} 
      else if(strcmp(d,"auto")==0){fprintf(fp2,"auto ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"while")==0){fprintf(fp2,"while ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"do")==0){fprintf(fp2,"do ----> IDENTIFIER \n"); 
          count=count+1;} 
       else if(strcmp(d,"case")==0){fprintf(fp2,"case ----> IDENTIFIER \n"); 
          count=count+1;} 
     else if (isdigit(d[i])) 
      { 
       fprintf(fp2,"%s ---->NUMBER",d); 
      } 
     else if (isalpha(a)) 
      { 
       fprintf(fp2,"%s ----> Variable",d); 
       //printf("%s",d); 
       // memset (d, 0, 100);} 
       //fprintf(fp2, "s\n", b); 
       i=0; 
     k=k+1; 

       continue; 
      } 

     i=i+1; 
    k=k+1; 


    } 
fclose(fp1); 
fclose(fp2); 
printf("%d",count); 
return 0; 
} 

在这段代码中,我的source.txt有if(a + b)存储。但只有(,+和)被写入lext.txt而不是标识符if或变量a和b。有什么特别的原因?在编码一个词法分析器时出错c

+2

一个有用的提示:C允许你用比单个字符长的标识符来命名你的变量。 – 2010-04-19 17:38:37

+0

因为你的代码有错误。 – qrdl 2010-04-19 17:57:46

+1

下一次**缩进**您的代码和**删除不需要的空**行。这使我们可以轻松分析您的代码。 – codaddict 2010-04-19 18:00:43

回答

1

有不少问题:

一旦你找到一个空间或尝试和比较的是字符串d使用一系列strcmp一个换行符。但是您正在重置else if (isalpha(a))中的i的值,因为a是空格或换行符,因此它将永远不会执行。你应该无条件设置的i0的价值就在串此事

d[i+1]='\0'; 

空间后,所以

if((strcmp(d,"if ")==0)) // d will never have a space as you never stuff it with one. 

应该

if((strcmp(d,"if")==0)) 
1

“if”关键字的比较与"if "(带空格)进行比较,但代码不会将该空间复制到缓冲区d。您最好的选择是使用调试器来浏览它,看看发生了什么。

0
/* 
ref: http://msdn2.microsoft.com/en-us/library/y39145bk(vs.80).aspx 

C language Tokens 
token: 
keyword 
identifier 
constant 
string-literal 
operator 
punctuator 


operator: one of 
[ ] () . –> ++  & * + – ~ ! sizeof/ % << >> <> <= >= == != ^ | && !!? := *= /= %= += –= <<= >>= &= ^= |=, # ## 

assignment-operator: one of 
= *= /= %= += –= <<= >>= &= ^= |= 

punctuator: one of 
[ ] () { } * , : = ; ... # 




* This is a generalized program working for any kind of input(input C program). 
* It fails to recognise stdio.h in "#include<stdio.h>" as one token. 
* Instead it identifies stdio as Identifier, . as dot operator , h as Identifier. 
* this is becoz i considered .(dot) as a separator b/w tokens of structure variables 

ex: 
struct book 
{ 
int price; 
}b1; 

b1.price; //here dot is used as seperator b/w b1 token and price token......... 

This program is not exactly correct....... 
It neither parses the comments nor the header files..... 

At first time if u get an error saying "Abnormal program termination" 
simply reduce the TableSize macro value. Becoz TC cannot allocated more global memory... 
*/ 


#include<stdio.h> 
#include<string.h> 
#include<conio.h> 

//=============================================================================================== 

#define true 1 
#define false 0 

//simple macros for moving back and front in the file 
#define MoveFront(units) fseek(fp,+(units),SEEK_CUR) 
#define MoveBack(units) fseek(fp,-(units),SEEK_CUR) 

//Token Table size 
#define TableSize  9144 

//=============================================================================================== 

//used to open the file 
int FileOpen(const char * path); 

//Core function which splits the given program in to tokens.... 
int CreateTokens(); 

//forms the token string based on the starting(start) of file pointer 
//position and ending(end) of file pointer position of the token 
void GetToken(int start,int end); 

//recognises whether the formed token as keyword,identifier, numeric constant,....... 
void IdentifyTokenType(int index); 

int EOFReached(); 
//=============================================================================================== 
//Keywords are arranged in such a way that most frequently used onces come first..... 
//So that the comparison b/w retrieved tokens and keywords become more efficient.... 
const char *keywords[]= 
{ 
    "int",  "char",  "double", "float", 
    "if",  "else",  "for",  "while", 
    "return", "switch", "case",  "break", 
    "do",  "default", "void",  "struct", 
    "long",  "const", "static", "union", 
    "enum",  "register", "short", "unsigned", 
    "continue", "goto",  "sizeof", "signed", 
    "auto",  "volatile", "typedef", "extern", 
}; 
//=============================================================================================== 
const char *preprocess[]= 
{ 
    "#define", "#include", // here rest of the preprocessors can be included too............. 
}; 

//=============================================================================================== 
struct TokenEntry 
{ 
    int start, //start contains starting of token's file pointer position.... 
     end; //end contains one more than the ending of token's file pointer position.... 
    char * type; //Type of the token ,whether it is identifier ,keyword,numeric constant.... 
}tokenTable[TableSize]; //TableSize is the TC limit of TokenArray :(

//=============================================================================================== 
struct DelimEntry 
{ 
    char * delim; //delimiters .... "[","{".... 
    char * type; //delimiter's name ................"LSquare","LBrace".... 
}; 

//=============================================================================================== 
//These are the delimiters in C language........(Not all of them are called as Delimiters...) 
struct DelimEntry const DelimTable[]= 
{ 
    //Single Character Delimiters //Set 1 
    {"[","LSquare"},    //0 
    {"]","RSquare"},    //1 
    {"(","LParen"}, 
    {")","RParen"}, 
    {"{","LBrace"}, 
    {"}","RBrace"}, 
    {",","Comma"}, 
    {";","SemiColon"}, 
    {":","Colon"}, 
    {"?","QuestionMark"}, 
    {"~","BitwiseNOT"}, 
    {".","Dot"},     //11 

    //Singles in triple character delimiters.... //Set 2 
    //When ever u encounter a Set 2 character .... 
    //U cannot confirm it as "Singles in triple character delimiters" 
    //U need to check for its next character also..... 
    //similarly When ever u encounter a Set 3 character .... 
    //U need to check for its next character also..... 
    // 
    {"<","LessThan"},    //12 
    {">","GreaterThan"}, 
    {"&","BitwiseAND"}, 
    {"|","BitwiseOR"}, 
    {"^","XOR"}, 
    {"=","Assignment"}, 
    {"!","Not"}, 
    {"%","Remainder"}, 
    {"-","Minus"}, 
    {"+","Plus"}, 
    {"*","Multiply"}, 
    {"/","DividedBy"},   //23 

    //Doubles in triple character delimiters.... //Set 3 
    {"<<","LeftShift"},   //24 
    {"<=","LessThanOrEqual"}, 
    {">>","RightShift"}, 
    {">=","GreaterThanOrEqual"}, 
    {"&&","LogicalAND"}, 
    {"&=","BitwiseANDEqual"}, 
    {"||","LogicalOR"}, 
    {"|=","BitwiseOREqual"}, 
    {"^=","XOREqual"}, 
    {"==","LogicalEqual"}, 
    {"!=","LogicalNotEqual"}, 
    {"%=","RemainderEquals"}, 
    {"-=","MinusEquals"}, 
    {"->","PointerArrow"}, 
    {"--","DecrementOperator"}, 
    {"++","IncrementOperator"}, 
    {"+=","PlusEquals"}, 
    {"*=","MultiplyEquals"}, 
    {"/=","DividedByEquals"},  //42 

    //Triples in triple character delimiters.... //Set 4 
    {"<<=","LeftShiftEquals"}, //43 
    {">>=","RightShiftEquals"}, //44 

}; 
//=============================================================================================== 
FILE * fp = NULL; 
int index;  //Variable used to iterate over the tokenTable Array.......... 
char buf[200];  //temporary variable to hold the token string........ 
//=============================================================================================== 
main(int argc , char *argv[]) 
{ 
    int i; 
    if(!FileOpen(argv[1])){ 
     printf("\nUnable to Open the File : %s",argv[1]); 
     return; 
    } 
    if(!CreateTokens()){ 
     //It is the problem of TC ..... :(
     printf("\nUnable to Create Tokens - May be the given program contains tokens more than the maximum token table size"); 
     return; 
    } 

    //Printing the Created Tokens.......... 
    printf("\n%-5s %-16s %-18s %-8s %-8s\n","No","Token","Token Type","Begin","End"); 
    printf("============================================================"); 
    for(i=0 ; i<index;i++){ 
     GetToken(tokenTable[i].start,tokenTable[i].end); 
     IdentifyTokenType(i); 
     printf("\n%-5d %-16s %-18s %-6d %-6d",i,buf,tokenTable[i].type,tokenTable[i].start,tokenTable[i].end); 
    } 
} 
//=============================================================================================== 
void IdentifyTokenType(int index) 
{ 
    int no,i; 
    if(strcmp(tokenTable[index].type ,"Unknown")==0) 
    { 
     //determining keywords present in tokens 
     no = sizeof(keywords)/sizeof(int); 
     for(i = 0;i<no;i++) 
      if(strcmp(buf,keywords[i]) == 0){ 
       tokenTable[index].type = "Keyword"; 
       return; 
      } 

     //determining identifiers present in tokens 
     if((buf[0]>='a'&&buf[0]<='z')|| (buf[0]>='A'&&buf[0]<='Z')||buf[0] == '_'){ 
      tokenTable[index].type = "Identifier"; 
      return; 
     } 


     //determining Preprocessor directives.. 
     no = sizeof(preprocess)/sizeof(int); 
     for(i = 0;i<no;i++) 
      if(strcmp(buf,preprocess[i]) == 0){ 
       tokenTable[index].type = "Preprocessor"; 
       return; 
      } 

     //Determining String Literals 
     if(buf[0] == '"' && buf[strlen(buf)-1] == '"'){ 
      tokenTable[index].type = "String Literal"; 
      return; 
     } 
     //Determining Char Literals 


     //Determining Numeric constants 
     for(i=0;buf[i]!='\0';i++) 
      if(!(buf[i]>='0'&&buf[i]<='9'))return; 
     tokenTable[index].type = "Numeric Constant"; 
     return; 
    } 
} 
//=============================================================================================== 
void GetToken(int start,int end) 
{ 
    int i=0; 
    fseek(fp,start,SEEK_SET); 
    while(i<end-start) 
     buf[i++] = fgetc(fp); 

    buf[i] = '\0'; 
    //Trim trailing newline chars.... 
    for(i--;i>=0;i--) 
     if(buf[i] == '\n')buf[i] = '\0'; 
} 
//=============================================================================================== 
int EOFReached() 
{ 
    return feof(fp) != 0; 
} 

//=============================================================================================== 
//Implements the state machine for splitting the given program(input) in to tokens............... 
int CreateTokens() 
{ 
    int i=0,j=0,k=0; 
    char c[4]={'\0','\0','\0','\0'}; //Array holding temporary characters..... 
    do{ 
     //state1 
     c[0] = fgetc(fp); 

     //--------             //Path A 
     if(c[0] == ' ' || c[0] == '\t' || c[0] == '\n' || EOFReached()) goto End; 

     /* 
     //-------- Skipping the comments..... 
     if(c[0] == '/') 
     { 
     c[1] = fgetc(fp); 
     //Skipping // type of comments.............. 
     if(c[1] == '/') 
     { 
     do 
     c[2] = fgetc(fp); 
     while(c[2]!='\n'&&!EOFReached()); 
     goto End; 
     } 
     //Skipping /* type of comments.............. 
     else if(c[1] == '*') 
     { 
     do{ 
     c[1] = fgetc(fp); 
     c[2] = fgetc(fp); 
     ungetc(c[2],fp); 
     }while((c[1] != '*' || c[2] != '/')&&!EOFReached()); 
     goto End; 
     } 
     MoveBack(1); 
     } 
     */ 
     MoveBack(1); 
     tokenTable[index].start = ftell(fp); 
     MoveFront(1); 

     //String literals 
     if(c[0] == '"'){ 
      do{ 
       c[1] = fgetc(fp); 
       if(c[1] =='\\') { fgetc(fp); c[1] = fgetc(fp);}  //skip \" character in b/w a string literal 
      }while(c[1] != '"'&&!EOFReached()); 
      tokenTable[index].type = "Unknown"; 
      tokenTable[index++].end = ftell(fp); 
      goto End; 
     } 
     //-------- 
     for(i=0;i<12;i++) 
     { 
      if(strcmp(DelimTable[i].delim,c) == 0)     //Path B 
      { 
       tokenTable[index].type = DelimTable[i].type; 
       tokenTable[index++].end = ftell(fp); 
       goto End; 
      } 
     } 

     //-------- 
     for(i=12;i<24;i++) 
     { 
      //Checking for single char in triples 
      if(strcmp(DelimTable[i].delim,c) == 0)     //Path C 
      { 
       c[1] = fgetc(fp);         //State 2 
       for(j=24;j<43&&!EOFReached();j++) 
       { 
        //Checking for double char in triples 
        if(strcmp(DelimTable[j].delim,c) == 0)   //Path E 
        { 
         c[2] = fgetc(fp);       //State 3 
         for(k=43;k<45&& !EOFReached();k++) 
         { //Checking for triplets in triples 
          if(strcmp(DelimTable[k].delim,c) == 0) //Path G 
          { 
           tokenTable[index].type = DelimTable[k].type; 
           tokenTable[index++].end = ftell(fp); 
           goto End; 
          } 
         } 
         //Path F 
         if(!EOFReached())MoveBack(1); 
         tokenTable[index].type = DelimTable[j].type; 
         tokenTable[index++].end = ftell(fp); 
         goto End; 
        } 
       } 
       //Path D 
       if(!EOFReached())MoveBack(1); 
       tokenTable[index].type = DelimTable[i].type; 
       tokenTable[index++].end = ftell(fp); 
       goto End; 
      } 
     } 
     //--------- 
     do{ 
      //Path H 
      c[0] = fgetc(fp);          //State 4 
      //Path I 
      //Checking for White Spaces 
      if(c[0] == ' ' || c[0] == '\t' || c[0] == '\n' ||EOFReached()) 
      { 
       if(!EOFReached()) MoveBack(1); 
       tokenTable[index].type = "Unknown"; 
       tokenTable[index++].end = ftell(fp); 
       goto End; 
      } 
      //Checking for Single char Delims //Checking for Single char Delims in triplets 
      for(i=0;i<24;i++) 
       if(strcmp(DelimTable[i].delim,c) == 0) 
       { 
        MoveBack(1); 
        tokenTable[index].type = "Unknown"; 
        tokenTable[index++].end = ftell(fp); 
        goto End; 
       } 

     }while(!feof(fp)); 

End: 
     c[0] = c[1] = c[2] = c[3] = '\0'; 
    }while((!feof(fp))&&index<TableSize); 

    //Tokens May(Not) be Ready 
    return index < TableSize; 
} 

//=============================================================================================== 
int FileOpen(const char *path) 
{ 
    fp = fopen(path,"r"); 
    if(fp == NULL) return false; 
    return true; 
} 
//=============================================================================================== 

希望这有助于你........

+1

'#'和'##'只是预处理器宏定义中的运算符。另外,如果这个答案帮助了任何人,我会感到惊讶。代码太多,解释太少。 – 2010-04-20 02:45:12