2012-03-24 75 views
2

我的程序无法识别中文。如何用 Boost Spirit 来识别中文?我使用 wstring,并将其转换为 UTF-16。如何使用 Boost Spirit 解析中文(Unicode UTF-16)?

这里是我的头文件:

#pragma once 

#define BOOST_SPIRIT_UNICODE 

#include <boost/spirit/include/qi.hpp> 
#include <string> 
#include <vector> 
#include <map> 
using namespace std; 



namespace qi = boost::spirit::qi; 
namespace ascii = boost::spirit::ascii; 


// Attribute types produced by the WordPaser grammar rules.

// A pair of two wide strings; it is the attribute of the mRulePM rule.
typedef pair<wstring,wstring> WordMeaningType; 
// A sequence of WordMeaningType pairs; it is the attribute of the mRulePMs rule.
typedef vector<WordMeaningType> WordMeaningsType; 
// A wide string paired with its WordMeaningsType list; attribute of mRuleCurWPM.
typedef pair<wstring,WordMeaningsType> WordType; 
// A sequence of WordType entries; attribute of the grammar's start rule.
typedef vector<WordType> WordListType; 

// Boost.Spirit Qi grammar over a wstring iterator range. Its synthesized
// attribute is a WordListType and it is declared with the ascii::space skipper.
// The rules are only declared here; the constructor defines them.
struct WordPaser 
    : qi::grammar<wstring::iterator, WordListType(), ascii::space_type>
{
    // Iterator and skipper types shared by every rule of this grammar.
    typedef wstring::iterator IteratorType;
    typedef ascii::space_type SkipperType;

    // Rules that synthesize a plain wide string.
    qi::rule<IteratorType, wstring(), SkipperType> mRuleWordPart;
    qi::rule<IteratorType, wstring(), SkipperType> mRuleWordMeaning;
    qi::rule<IteratorType, wstring(), SkipperType> mRuleWord;

    // Rules that synthesize the composite pair / list attributes.
    qi::rule<IteratorType, WordMeaningType(), SkipperType> mRulePM;
    qi::rule<IteratorType, WordMeaningsType(), SkipperType> mRulePMs;
    qi::rule<IteratorType, WordType(), SkipperType> mRuleCurWPM;

    // Start rule: yields the complete WordListType.
    qi::rule<IteratorType, WordListType(), SkipperType> mRuleEntrence;

    // Value members, one per attribute type used by the rules above.
    wstring mCurWord;
    wstring mCurWordPart;
    wstring mCurWordMeaning;
    WordMeaningType mCurPM;
    WordMeaningsType mCurPMs;
    WordType mCurWPM;

    // Defined out of line; builds the rules and selects the start rule.
    WordPaser();
};

和我的CPP文件:

#include <boost/tuple/tuple.hpp> 
#include <boost/spirit/include/qi.hpp> 
#include <boost/spirit/include/phoenix.hpp> 
#include "WordPaser.h" 

namespace fusion = boost::fusion; 
namespace phoenix = boost::phoenix; 

using qi::_val; 
using qi::_1; 
using qi::_2; 
using qi::lit; 
using qi::lexeme; 
using qi::space; 
using ascii::char_; 
using ascii::string; 
using qi::graph; 
using qi::word; 

// Constructs the grammar: defines every rule and registers mRuleEntrence as
// the start rule.
//
// Grammar shape (whitespace between tokens is handled by the skipper):
//   entry   := word-entry+
//   entry   -> (word, (part, meaning)+)
//
// The input is a wstring, so the character parsers come from
// boost::spirit::standard_wide. The narrow ascii/standard parsers only
// classify single-byte characters and therefore never match Chinese text.
// NOTE(review): standard_wide classification is backed by the C library's
// wide-character functions, whose results depend on the active LC_CTYPE
// locale -- confirm the program sets a suitable locale.
// NOTE(review): the rules are still declared with the ascii::space skipper
// in the header; a wide skipper is probably wanted there too -- confirm.
WordPaser::WordPaser() 
    : WordPaser::base_type(mRuleEntrence) 
{ 
    namespace wide = boost::spirit::standard_wide;

    // Word: one or more letters, then any run of letters and parentheses.
    // The trace marker is wrapped in phoenix::val() so that it is printed
    // lazily on every match; a bare `std::wcout << L'|'` is evaluated
    // eagerly and would print the marker only once, at construction time.
    mRuleWord %= lexeme[
        +(wide::alpha[std::wcout << phoenix::val(L'|') << _1])
        >> *(wide::alpha | wide::char_(L'(') | wide::char_(L')'))
    ];

    // Word part: one or more letters, then any run of letters, '/' and '.'.
    mRuleWordPart %= lexeme[
        +(wide::alpha[std::wcout << phoenix::val(L'@') << _1])
        >> *((wide::alpha | wide::char_(L'/') | wide::char_(L'.'))
                [std::wcout << phoenix::val(L'@') << _1])
    ];

    // Word meaning: one or more printable, non-space characters or
    // parentheses. The trace action is attached to the whole alternative;
    // attached only to the last branch it could never fire, because graph
    // already accepts parentheses.
    mRuleWordMeaning %= lexeme[
        +((wide::graph | wide::char_(L'(') | wide::char_(L')'))
                [std::wcout << phoenix::val(L':') << _1])
    ];

    // (part, meaning) pair: copy the two sub-attributes into the pair.
    mRulePM = (mRuleWordPart >> mRuleWordMeaning) 
     [ 
      phoenix::bind(&WordMeaningType::first, _val) = _1, 
      phoenix::bind(&WordMeaningType::second, _val) = _2 
     ]; 

    // One or more (part, meaning) pairs, collected in order.
    mRulePMs = +(mRulePM 
     [ 
      phoenix::push_back(_val, _1) 
     ]); 

    // A word followed by its list of (part, meaning) pairs.
    mRuleCurWPM = (mRuleWord >> mRulePMs) 
     [ 
      phoenix::bind(&WordType::first, _val) = _1, 
      phoenix::bind(&WordType::second, _val) = _2 
     ]; 

    // Start rule: one or more word entries, collected in order.
    mRuleEntrence = +mRuleCurWPM 
     [ 
      phoenix::push_back(_val, _1) 
     ]; 
} 
+1

请至少给出示例输入和预期输出。就目前的内容来看,这里并没有一个明确的问题。 – sehe 2012-03-25 20:38:06

回答

1

您应该使用来自另一个命名空间的解析器/跳过器(skipper),而不是来自 ascii 命名空间的。我想,在你的情况下,应该使用 standard_wide。

+0

认为你的回答是“standard_wide”! – Vapor 2012-03-30 07:39:58

4

此代码可以解析中文。

#define BOOST_TEST_DYN_LINK 
    #define BOOST_SPIRIT_USE_PHOENIX_V3 
    #define BOOST_SPIRIT_UNICODE 
    #include <boost/config/warning_disable.hpp> 
    #include <boost/spirit/include/qi.hpp> 
    #include <boost/spirit/include/support_standard_wide.hpp> 
    #include <boost/spirit/include/karma.hpp> 
    #include <boost/spirit/include/qi_parse.hpp> 
    #include <boost/phoenix.hpp> 
    #include <boost/fusion/include/std_pair.hpp> 
    #define BOOST_TEST_MODULE MyTest 
    #include <boost/test/unit_test.hpp> 
    using namespace std; 

// Verifies that a Unicode-aware Qi rule can consume a wide string holding
// Chinese characters and reproduce it unchanged in its attribute.
//
// `*char_` also matches the empty sequence, so the return value of
// phrase_parse alone proves nothing. The test therefore also checks that
// the whole input was consumed and that the collected text equals the input.
BOOST_AUTO_TEST_CASE(parse_chinese)
{
    namespace qi = boost::spirit::qi;
    namespace encoding = boost::spirit::unicode;
    using namespace qi::labels;

    std::wstring test = L"中国";
    std::wstring found;

    // Appends every parsed character to the rule's wstring attribute.
    qi::rule<std::wstring::iterator, std::wstring(), encoding::space_type> unicode_string;
    unicode_string = *qi::unicode::char_[_val += _1];

    // A named iterator lets us see how far the parser advanced.
    std::wstring::iterator first = test.begin();
    const std::wstring::iterator last = test.end();
    const bool ok = qi::phrase_parse(first, last, unicode_string, encoding::space, found);

    BOOST_CHECK(ok);
    BOOST_CHECK(first == last);   // the entire input was consumed
    BOOST_CHECK(found == test);   // no character was lost or altered
}
相关问题