2011-11-30 75 views
0
之间提取文本

问题:用 C# 正则表达式去除 C 风格注释,并提取花括号之间的文本

我需要从下面这段 JavaScript 中自动提取所有 name 属性(分别针对 providers_large 和 providers_small):

/* 
    Simple OpenID Plugin 
    http://code.google.com/p/openid-selector/ 

    This code is licensed under the New BSD License. 
*/ 

/*
    providers_large: object literal keyed by provider id (google, yahoo, aol,
    myopenid, openid). Every entry has a `name` and a `url`; some entries
    also have a `label`. Several urls contain the token `{username}`
    (presumably a placeholder filled in by the selector - confirm), and the
    `openid` entry has `url : null`.
*/
var providers_large = { 
    google : { 
     name : 'Google', 
     url : 'https://www.google.com/accounts/o8/id' 
    }, 
    yahoo : { 
     name : 'Yahoo', 
     url : 'http://me.yahoo.com/' 
    }, 
    aol : { 
     name : 'AOL', 
     label : 'Enter your AOL screenname.', 
     url : 'http://openid.aol.com/{username}' 
    }, 
    myopenid : { 
     name : 'MyOpenID', 
     label : 'Enter your MyOpenID username.', 
     url : 'http://{username}.myopenid.com/' 
    }, 
    openid : { 
     name : 'OpenID', 
     label : 'Enter your OpenID.', 
     url : null 
    } 
}; 

/*
    providers_small: object literal keyed by provider id, with the same
    entry shape as providers_large (`name`, `label`, `url`). The flickr,
    technorati, vidoop and launchpad entries are disabled by being wrapped
    in block comments, so they are not part of the object.
*/
var providers_small = { 
    livejournal : { 
     name : 'LiveJournal', 
     label : 'Enter your Livejournal username.', 
     url : 'http://{username}.livejournal.com/' 
    }, 
    /* flickr: { 
     name: 'Flickr',   
     label: 'Enter your Flickr username.', 
     url: 'http://flickr.com/{username}/' 
    }, */ 
    /* technorati: { 
     name: 'Technorati', 
     label: 'Enter your Technorati username.', 
     url: 'http://technorati.com/people/technorati/{username}/' 
    }, */ 
    wordpress : { 
     name : 'Wordpress', 
     label : 'Enter your Wordpress.com username.', 
     url : 'http://{username}.wordpress.com/' 
    }, 
    blogger : { 
     name : 'Blogger', 
     label : 'Your Blogger account', 
     url : 'http://{username}.blogspot.com/' 
    }, 
    verisign : { 
     name : 'Verisign', 
     label : 'Your Verisign username', 
     url : 'http://{username}.pip.verisignlabs.com/' 
    }, 
    /* vidoop: { 
     name: 'Vidoop', 
     label: 'Your Vidoop username', 
     url: 'http://{username}.myvidoop.com/' 
    }, */ 
    /* launchpad: { 
     name: 'Launchpad', 
     label: 'Your Launchpad username', 
     url: 'https://launchpad.net/~{username}' 
    }, */ 
    claimid : { 
     name : 'ClaimID', 
     label : 'Your ClaimID username', 
     url : 'http://claimid.com/{username}' 
    }, 
    clickpass : { 
     name : 'ClickPass', 
     label : 'Enter your ClickPass username', 
     url : 'http://clickpass.com/public/{username}' 
    }, 
    google_profile : { 
     name : 'Google Profile', 
     label : 'Enter your Google Profile username', 
     url : 'http://www.google.com/profiles/{username}' 
    } 
}; 

/*
    Locale settings and UI strings assigned as properties of `openid`.
    The `openid` object itself is not defined in this snippet.
*/
openid.locale = 'en'; 
openid.sprite = 'en'; // reused in German & Japanese localization 
openid.demo_text = 'In client demo mode. Normally would have submitted OpenID:'; 
openid.signin_text = 'Sign-In'; 
openid.image_title = 'log in with {provider}'; 

所以我需要: A)删除所有 C 风格的注释;B)在注释被删除之后,获取 providers_large 和 providers_small 中所有 name 的值。

到目前为止,我已经试过用正则表达式删除 C 风格的注释(失败了),也试过用正则表达式获取花括号之间的所有内容(同样失败了)。

后来我尝试把它当作 JSON 读取,但这当然失败了,报错为 "Invalid JSON primitive"(无效的 JSON 基元)。

下面的代码注释中列出了我参考过的 Stack Overflow 页面,这段代码就是我目前为止尝试过的示例:

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 


namespace ConsoleExperiments 
{ 

    public class Program 
    { 

     // http://stackoverflow.com/questions/2538279/strip-out-c-style-multi-line-comments 
     // NOT working 
     /// <summary>
     /// Removes every C-style block comment (<c>/* ... */</c>) from the input,
     /// including comments that span several lines.
     /// </summary>
     /// <param name="strInput">Source text to strip; must not be null.</param>
     /// <returns>The input with all block comments removed.</returns>
     /// <remarks>The stripped text is also written to the console.</remarks>
     static string RemoveCstyleComments(string strInput) 
     { 
      // The lazy ".*?" stops at the first closing "*/".
      // RegexOptions.Singleline is what lets "." match '\n'; RegexOptions.Multiline
      // only changes the meaning of ^ and $, so with it a comment that spans
      // more than one line is never matched.
      const string strPattern = @"/\*.*?\*/"; 

      string strOutput = System.Text.RegularExpressions.Regex.Replace(strInput, strPattern, string.Empty, System.Text.RegularExpressions.RegexOptions.Singleline); 
      Console.WriteLine(strOutput); 
      return strOutput; 
     } 


     // http://stackoverflow.com/questions/413071/regex-to-get-string-between-curly-braces-i-want-whats-between-the-curly-brace 
     // http://stackoverflow.com/questions/5337166/regular-expression-get-string-between-curly-braces 
     // http://stackoverflow.com/questions/1904617/regex-for-removing-curly-brackets-with-nested-curly-brackets 
     // http://stackoverflow.com/questions/378415/how-do-i-extract-a-string-of-text-that-lies-between-two-brackets-using-net 
     /// <summary>
     /// Collects the text enclosed by each innermost pair of curly braces.
     /// </summary>
     /// <param name="strInput">Text to scan; must not be null.</param>
     /// <returns>
     /// The captured contents in order of appearance, joined with
     /// <see cref="Environment.NewLine"/>; an empty string when the input
     /// contains no brace pair.
     /// </returns>
     /// <remarks>Each captured value is also written to the console.</remarks>
     static string GetCurlyValues(string strInput) 
     { 
      // [^{}]* cannot cross a brace, so every match is an innermost "{...}" pair.
      // Nested outer braces are not reported as separate values.
      const string strPattern = @"\{([^{}]*)\}"; 

      System.Text.StringBuilder sbValues = new System.Text.StringBuilder(); 

      foreach (System.Text.RegularExpressions.Match mMatch in System.Text.RegularExpressions.Regex.Matches(strInput, strPattern)) 
      { 
       string strValue = mMatch.Groups[1].Value; 
       Console.WriteLine("Group: " + strValue); 

       if (sbValues.Length > 0) 
       { 
        sbValues.Append(Environment.NewLine); 
       } 

       sbValues.Append(strValue); 
      } 

      return sbValues.ToString(); 
     } 


     /// <summary>
     /// Writes the contents of "TestFile.txt" to the console line by line.
     /// Prints a notice when the file does not exist, and prints the
     /// exception message when reading fails.
     /// </summary>
     static void ReadFile() 
     { 
      try 
      { 
       string strFilePath = @"TestFile.txt"; 

       // Guard clause: nothing to read, report and leave.
       if (!System.IO.File.Exists(strFilePath)) 
       { 
        Console.WriteLine("File \"" + strFilePath + "\" does not exist."); 
        return; 
       } 

       // The using statement disposes the reader when the block is left.
       using (System.IO.StreamReader srInput = new System.IO.StreamReader(strFilePath)) 
       { 
        // ReadLine returns null once the end of the file is reached.
        for (string strLine = srInput.ReadLine(); strLine != null; strLine = srInput.ReadLine()) 
        { 
         Console.WriteLine(strLine); 
        } 

        srInput.Close(); 
       } 
      } 
      catch (Exception ex) 
      { 
       // Let the user know what went wrong.
       Console.WriteLine("The file could not be read:"); 
       Console.WriteLine(ex.Message); 
      } 
     } // End Sub 

     /// <summary>
     /// Sample provider entry with the three public string fields
     /// <c>name</c>, <c>label</c> and <c>url</c>, each initialised to a
     /// placeholder value.
     /// </summary>
     public class cProvider 
     { 
      public string name = "abc"; 
      public string label ="def"; 
      public string url ="url"; 
     } 


     /// <summary>
     /// Sample container holding a list of <see cref="cProvider"/> entries in
     /// the public field <c>foo</c>; Main serializes an instance of it.
     /// </summary>
     public class cProviders_large 
     { 
      public List<cProvider> foo = new List<cProvider>(); 
     } 


     /// <summary>
     /// Entry point of the experiment: echoes the JavaScript file to the
     /// console, serializes a sample <c>cProviders_large</c> instance holding
     /// the same <c>cProvider</c> twice, prints the result and waits for a key.
     /// </summary>
     /// <param name="args">Command-line arguments (not used).</param>
     static void Main(string[] args) 
     { 
      string strScript = System.IO.File.ReadAllText(@"D:\UserName\Downloads\openid-selector-1.3\openid-selector\js\openid-en - Kopie.js.txt"); 
      Console.WriteLine(strScript); 

      System.Web.Script.Serialization.JavaScriptSerializer serializer = new System.Web.Script.Serialization.JavaScriptSerializer(); 

      // Build the sample object graph: one provider instance, listed twice.
      cProviders_large sampleProviders = new cProviders_large(); 
      cProvider sampleProvider = new cProvider(); 
      for (int i = 0; i < 2; i++) 
      { 
       sampleProviders.foo.Add(sampleProvider); 
      } 

      Console.WriteLine(serializer.Serialize(sampleProviders)); 

      Console.WriteLine(Environment.NewLine); 
      Console.WriteLine(" --- Press any key to continue --- "); 
      Console.ReadKey(); 
     } // End Sub Main 

    } // End Class Program 


} // End namespace ConsoleExperiments 

有没有比我更懂正则表达式的人,能为我提供所需的正则表达式? 目前看来,我最终只能手动逐个修改每个文件,而我真的非常讨厌这种耗时的做法......

编辑: 顺便说一句,V8 的包装库是用 C++ .NET 编写的,因此无法在 Linux 上使用,尽管 V8 引擎本身在 Linux 上运行得很好。

所以我坚持通过JSON转换来解决问题。

+1

我相信这个问题与下面这个问题等价: http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 JavaScript 不是一种正则语言。删除注释应该是可行的,但是如果剩下的部分无法完成,这样做还有用吗? – Stilgar

+0

@Stilgar:其实,其余的部分要比正确删除注释复杂得多。其余的部分我已经完成了 90% 到 95%。 –

回答

0

Darin Dimitrov 的回答肯定是最简单的。
然而,Noesis.Javascript最令人恼火的是用C++ .NET写的,这意味着它不能在Linux上编译,尽管C#/ .NET(通过mono)和v8引擎在Linux上运行得非常好。

因此,下面是通过转换为 JSON 再进行反序列化(deserialization)的变通办法:

/// <summary>
/// Removes every C-style block comment (<c>/* ... */</c>) from the input,
/// including comments that span several lines.
/// </summary>
/// <param name="strInput">Source text to strip; must not be null.</param>
/// <returns>The input with all block comments removed.</returns>
/// <remarks>The stripped text is also written to the console.</remarks>
static string RemoveCstyleComments(string strInput) 
     { 
      // Pattern taken from
      // http://stackoverflow.com/questions/462843/improving-fixing-a-regex-for-c-style-block-comments
      // After "/*" it consumes runs of non-'*' characters, or a '*' that is not
      // followed by '/', inside atomic groups (no backtracking), then the
      // closing "*/". [^*] matches newlines, so no RegexOptions are needed.
      const string strPattern = @"/\*(?>(?:(?>[^*]+)|\*(?!/))*)\*/"; 

      string strOutput = System.Text.RegularExpressions.Regex.Replace(strInput, strPattern, string.Empty); 
      Console.WriteLine(strOutput); 
      return strOutput; 
     } // End Function RemoveCstyleComments 




     /// <summary>
     /// Rewrites the comment-free provider script into one JSON object text:
     /// the two <c>var providers_*</c> declarations become quoted members,
     /// provider ids and the name/url/label keys are quoted, single quotes
     /// become double quotes, everything from <c>openid.locale</c> to the end
     /// is dropped, and the result is wrapped in braces.
     /// </summary>
     /// <param name="strInput">Script text with block comments already removed.</param>
     /// <returns>The rewritten text enclosed in <c>{</c> and <c>}</c>.</returns>
     static string ReplaceVariables(string strInput) 
    { 
     // Ordered rewrite rules as { pattern, replacement } pairs. They run top
     // to bottom, and later rules rely on the output of earlier ones.
     string[][] astrRules = new string[][] 
     { 
      // "var providers_large = {"  ->  "providers_large" : {
      new string[] { @"var\s+providers_large(\s+)?=(\s+)?{(\s+)?", "\"providers_large\" : {" + Environment.NewLine }, 
      // "var providers_small = {"  ->  , "providers_small" : {
      new string[] { @"(\s+)?var\s+providers_small(\s+)?=(\s+)?{(\s+)?", ", \"providers_small\" : {" + Environment.NewLine }, 
      // "};"  ->  "}"
      new string[] { @"}(\s+)?;(\s+)?", "}" + Environment.NewLine }, 
      // line end followed by "<id> : {"  ->  "<id>" : {
      new string[] { @"$(\s+)?(\w+)(\s+)?:(\s+)?{", "\"$2\" : {" }, 
      // Quote the three known property keys in front of a single-quoted value.
      new string[] { @"name(\s+)?:(\s+)?'", "\"name\" : '" }, 
      new string[] { @"url(\s+)?:(\s+)?'", "\"url\" : '" }, 
      new string[] { @"label(\s+)?:(\s+)?'", "\"label\" : '" } 
     }; 

     string strJson = strInput; 
     foreach (string[] astrRule in astrRules) 
     { 
      strJson = System.Text.RegularExpressions.Regex.Replace(strJson, astrRule[0], astrRule[1], System.Text.RegularExpressions.RegexOptions.Multiline); 
     } // Next astrRule 

     // Turn the single-quoted string delimiters into double quotes.
     strJson = strJson.Replace("'", "\""); 

     // Singleline lets ".*" run across newlines, so this removes everything
     // from "openid.locale" to the end of the text.
     strJson = System.Text.RegularExpressions.Regex.Replace(strJson, "openid\\.locale.*", "", System.Text.RegularExpressions.RegexOptions.Singleline); 

     return "{" + strJson + "}"; 
    } // End Function ReplaceVariables 


     /// <summary>
     /// Strips block comments from the provider script, rewrites it to JSON,
     /// deserializes it and collects the provider ids found under
     /// <c>providers_large</c> and <c>providers_small</c>.
     /// </summary>
     /// <param name="strInput">Raw content of the provider JavaScript file.</param>
     /// <returns>
     /// A case-insensitive collection with the keys "providers_large" and
     /// "providers_small"; each key holds the provider ids in the order the
     /// deserializer enumerates them. A key is absent when its table is missing.
     /// </returns>
     static System.Collections.Specialized.NameValueCollection TrySerialize(string strInput) 
     { 
      strInput = RemoveCstyleComments(strInput); 
      strInput = ReplaceVariables(strInput); 

      System.Collections.Specialized.NameValueCollection nvc = new System.Collections.Specialized.NameValueCollection(StringComparer.OrdinalIgnoreCase); 

      System.Web.Script.Serialization.JavaScriptSerializer js = new System.Web.Script.Serialization.JavaScriptSerializer(); 

      // Typed cast instead of dynamic: a root that is not an object yields
      // an empty result rather than a runtime binder failure.
      System.Collections.Generic.IDictionary<string, object> dictRoot = js.DeserializeObject(strInput) as System.Collections.Generic.IDictionary<string, object>; 
      if (dictRoot == null) 
      { 
       return nvc; 
      } 

      string[] astrTableNames = { "providers_small", "providers_large" }; 

      foreach (System.Collections.Generic.KeyValuePair<string, object> kvp in dictRoot) 
      { 
       // Map the member name to its canonical table name; skip anything else.
       string strTableName = null; 
       foreach (string strCandidate in astrTableNames) 
       { 
        if (StringComparer.OrdinalIgnoreCase.Equals(kvp.Key, strCandidate)) 
        { 
         strTableName = strCandidate; 
         break; 
        } 
       } // Next strCandidate 

       if (strTableName == null) 
       { 
        continue; 
       } 

       // Only an object value has provider ids to enumerate.
       System.Collections.Generic.IDictionary<string, object> dictProviders = kvp.Value as System.Collections.Generic.IDictionary<string, object>; 
       if (dictProviders == null) 
       { 
        continue; 
       } 

       foreach (string strProviderId in dictProviders.Keys) 
       { 
        nvc.Add(strTableName, strProviderId); 
       } // Next strProviderId 

      } // Next kvp 

      return nvc; 
     } // End Function TrySerialize 


     /// <summary>
     /// Reads the openid-ru.js provider script, extracts the provider ids via
     /// TrySerialize and prints the ids of the large and the small provider
     /// table to the console.
     /// </summary>
     public static void GetProviders() 
     { 
      // Only one file is read. Reading openid-en.js first and overwriting the
      // result would waste a read and fail needlessly if that file is missing.
      string strContent = System.IO.File.ReadAllText(@"D:\UserName\Downloads\openid-selector-1.3\openid-selector\js\openid-ru.js"); 

      System.Collections.Specialized.NameValueCollection nvc = TrySerialize(strContent); 

      PrintProviderIds(nvc, "providers_large", "providers large: "); 
      PrintProviderIds(nvc, "providers_small", "providers small: "); 
     } // End Sub GetProviders 

     /// <summary>
     /// Prints a blank line, the heading and then every value stored under
     /// <paramref name="strKey"/>, one per line.
     /// </summary>
     private static void PrintProviderIds(System.Collections.Specialized.NameValueCollection nvc, string strKey, string strHeading) 
     { 
      Console.WriteLine(Environment.NewLine); 
      Console.WriteLine(strHeading); 

      // GetValues returns null when the key is absent; print only the heading then.
      string[] astrValues = nvc.GetValues(strKey); 
      if (astrValues == null) 
      { 
       return; 
      } 

      foreach (string strValue in astrValues) 
      { 
       Console.WriteLine(" " + strValue); 
      } // Next strValue 
     } // End Sub PrintProviderIds 
0

可以考虑使用 JavaScriptSerializer,它提供 JSON 反序列化;只要先去掉变量声明和注释,它应该就能构建出对象图。

4

你可以使用一个javascript engine

using System; 
using System.IO; 
using Noesis.Javascript; 

/// <summary>
/// Runs test.js inside a Noesis.Javascript context and prints the name and
/// url of every entry of the script's <c>providers_large</c> object.
/// </summary>
class Program 
{ 
    static void Main() 
    { 
     // The context is disposed deterministically when the block is left.
     using (var context = new JavascriptContext()) 
     { 
      // The script assigns properties on "openid", so an object with that
      // name has to exist before the script runs.
      context.SetParameter("openid", new object()); 
      context.Run(File.ReadAllText("test.js")); 

      dynamic providers_large = context.GetParameter("providers_large"); 
      foreach (var provider in providers_large) 
      { 
       Console.WriteLine(
        "name: {0}, url: {1}", 
        provider.Value["name"], 
        provider.Value["url"] 
       ); 
      } 
     } 
    } 
} 

在我的控制台上打印出如下内容:

name: Google, url: https://www.google.com/accounts/o8/id 
name: Yahoo, url: http://me.yahoo.com/ 
name: AOL, url: http://openid.aol.com/{username} 
name: MyOpenID, url: http://{username}.myopenid.com/ 
name: OpenID, url: