2012-06-19 57 views
0

我通过流读取器(StreamReader)读取到如下节点,这样的节点可能有很多个。我只想提取节点中的少数几个组,例如 REPLICATE_ID、ASSAY_NUMBER 以及几个日期字段。其余的组该如何忽略?

节点中字段的排序可能不同,有时候也可能会出现新字段,但我想提取的字段不会更改。

到目前为止,我的正则表达式匹配的是整个节点,所以一旦节点中出现新字段或字段顺序不同,匹配就会失败。能否只匹配我感兴趣的那些组?

TEST_REPLICATE 
    { 
     REPLICATE_ID   453w 
     ASSAY_NUMBER   334 
     ASSAY_VERSION   4 
     ASSAY_STATUS   test 
     DILUTION_ID   1 
     SAMPLE_ID   "NC_dede" 
     SAMPLE_TYPE   Specimen 
     TEST_ORDER_DATE   05.23.2012 
     TEST_ORDER_TIME   04:25:07 
     TEST_INITIATION_DATE  05.23.2012 
     TEST_INITIATION_TIME  05:19:43 
     TEST_COMPLETION_DATE  05.23.2012 
     TEST_COMPLETION_TIME  05:48:01 
     ASSAY_CALIBRATION_DATE  NA 
     ASSAY_CALIBRATION_TIME  NA 
     TRACK   1 
     PROCESSING_LANE  1 
     MODULE_SN  "EP004" 
     LOAD_LIST_NAME   C:\BwedwQwedw_SCC\edwLoadlist2RACKSB.json 
     OPERATOR_ID   "Q_dwe" 
     DARK_SUBREADS   16 23 19 20 16 18 21 16 17 18 19 19 20 22 19 20 19 20 18 20 17 20 21 16 19 23 20 22 19 20 
     SIGNAL_SUBREADS   18 17 20 21 42 61 41 31 30 30 26 26 25 22 24  DARK_COUNT   577 
     SIGNAL_COUNT   781 
     CORRECTED_COUNT   204 
     STD_BAK    1.95965044971226 
     AVG_BAK    19.2333333333333 
     STD_FOR    8.67212471810898 
     AVG_FOR    26.0333333333333 
     SHAPE    NA 
     EXCEPTION_STRING  TestException - Parameters:Unable to process test, background read failure. 
     RESULT    NA 
     REPORTED_RESULT   NA 
     REPORTED_RESULT_UNITS  NA 
     REAGENT_MASTER_LOT  13600LI02 
     REAGENT_SERIAL_NUMBER  25022 
     RESULT_FLAGS   RUO 
     RESULT_INTERPRETATION  NA 
     DILUTION_PROTOCOL  UNDILUTED 
     RESULT_COMMENT   frer 1 LANE A 
     DATA_MANAGEMENT_FIELD_1  NA 
     DATA_MANAGEMENT_FIELD_2  NA 
     DATA_MANAGEMENT_FIELD_3  NA 
     DATA_MANAGEMENT_FIELD_4  NA 
    } 

    // Pattern that matches one whole TEST_REPLICATE { ... } record in a single pass.
    // The field names are spelled out one after another, so they must appear in the
    // input in exactly this order; each value is captured lazily with ([^}]*?), i.e.
    // any run of characters that does not contain a closing brace.
    // NOTE(review): ASSAY_STATUS is not named in the pattern, so in the sample record
    // the text between ASSAY_VERSION and DILUTION_ID falls into the ASSAY_VERSION capture.
    string pat = @"TEST_REPLICATE\s*{\s*REPLICATE_ID\s*([^}]*?)\s+ASSAY_NUMBER\s*([^}]*?)\s+ASSAY_VERSION\s*([^}]*?)\s+DILUTION_ID\s*([^}]*?)\s+SAMPLE_ID\s*([^}]*?)\s+SAMPLE_TYPE\s*([^}]*?)\s+TEST_ORDER_DATE\s*([^}]*?)\s+TEST_ORDER_TIME\s*([^}]*?)\s+TEST_INITIATION_DATE\s*([^}]*?)\s+TEST_INITIATION_TIME\s*([^}]*?)\s+TEST_COMPLETION_DATE\s*([^}]*?)\s+TEST_COMPLETION_TIME\s*([^}]*?)\s+ASSAY_CALIBRATION_DATE\s*([^}]*?)\s+ASSAY_CALIBRATION_TIME\s*([^}]*?)\s+TRACK\s*([^}]*?)\s+PROCESSING_LANE\s*([^}]*?)\s+MODULE_SN\s*([^}]*?)\s+LOAD_LIST_NAME\s*([^}]*?)\s+OPERATOR_ID\s*([^}]*?)\s+DARK_SUBREADS\s*([^}]*?)\s+SIGNAL_SUBREADS\s*([^}]*?)\s+DARK_COUNT\s*([^}]*?)\s+SIGNAL_COUNT\s*([^}]*?)\s+CORRECTED_COUNT\s*([^}]*?)\s+STD_BAK\s*([^}]*?)\s+AVG_BAK\s*([^}]*?)\s+STD_FOR\s*([^}]*?)\s+AVG_FOR\s*([^}]*?)\s+SHAPE\s*([^}]*?)\s+EXCEPTION_STRING\s*([^}]*?)\s+RESULT\s*([^}]*?)\s+REPORTED_RESULT\s*([^}]*?)\s+REPORTED_RESULT_UNITS\s*([^}]*?)\s+REAGENT_MASTER_LOT\s*([^}]*?)\s+REAGENT_SERIAL_NUMBER\s*([^}]*?)\s+RESULT_FLAGS\s*([^}]*?)\s+RESULT_INTERPRETATION\s*([^}]*?)\s+DILUTION_PROTOCOL\s*([^}]*?)\s+RESULT_COMMENT\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_1\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_2\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_3\s*([^}]*?)\s+DATA_MANAGEMENT_FIELD_4\s*([^}]*?)\s*}"; 
+3

别再这样折腾了,直接按正确的结构来解析即可。每一行由一个键(名称)、若干空白和一个值(该行余下的部分)组成。逐行遍历,把它们全部解析到哈希表/字典中,或者只保存您需要的那些。 – Qtax

回答

0

是的,你大概应该直接把记录解析成键值对。

下面是一个从记录中提取键值对的代码示例。
找到匹配项之后,可以逐个检查捕获集合中的键,筛选出您需要的那些键。

你也可以调整正则表达式中对记录开头/结尾的匹配方式。
但是不要改动核心部分(原子分组),它可以防止灾难性回溯。

正则表达式的几种可选写法:

# Record starts on a new line, closing brace can be anywhere 

^ [^\S\n]*TEST_REPLICATE\s*\{ 
(?> 
     \s* (?<key> [^\s{}]+) [^\S\n]* (?<val> [^\n{}]*?) [^\S\n]* (?:$|(?=\})) 
)* 
\s*\} 


# Record starts anywhere, closing brace is on a new line 

TEST_REPLICATE\s*\{ 
(?> 
     \s* (?<key> [^\s{}]+) [^\S\n]* (?<val> [^\n{}]*?) [^\S\n]* $ 
)* 
\s*\} 

C#测试代码:

// Regex that matches one complete TEST_REPLICATE { ... } record per match.
// Inside the braces, every "<key> <value>" line adds one capture to the named
// groups "key" and "val", so a single Match carries all fields of its record.
// IgnorePatternWhitespace lets the pattern be laid out with spaces and inline
// '#' comments; Multiline makes '^' and '$' match at the start/end of each line.
Regex testRx = new Regex(
@" 
^[^\S\n]* TEST_REPLICATE  # Record, starts on a newline 
    \s*       # Optional whitespaces (trims blank lines) 
    \{       # Record opening brace 
     (?>       # Atomic group 
     \s*       # Optional many whitespace (trims blank lines) 
     # Line in record to be recorded 
     (?<key> [^\s{}]+)    # required <key>, not whitespacs nor braces 
     [^\S\n]*       # trim whitespaces (don't include newline) 
     (?<val> [^\n{}]*?)    # optional <value>, not newlines nor braces 
     [^\S\n]*       # trim whitespaces (don't include newline) 
     (?:$|(?=\}))     # End of line, or next char is a closing brace 
    )*       # End atomic group, do many times (optional) 
    \s*       # Optional whitespaces (trims blank lines) 
    \}       # Record closing brace 
", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline); 

// Sample input for the record pattern. It holds three TEST_REPLICATE records:
// an empty one ("{}"), a short one with the brace on the header line, and a full
// one with the brace on its own line. The full record also contains a key with
// no value (REPLICATE_ID), a blank line and two unindented lines.
// Doubled quotes ("") are the verbatim-string escape for a single literal quote.
string testdata = @" 
TEST_REPLICATE{} 
TEST_REPLICATE{ 
    REPLICATE_ID   1asdf985 
    ASSAY_NUMBER   123sdg 
    ASSAY_VERSION   4sdgn 
    ASSAY_TYPE   unknown 
} 

TEST_REPLICATE 
{ 
    REPLICATE_ID    
    ASSAY_NUMBER   123  
    ASSAY_VERSION   4 
    ASSAY_TYPE   unknown 
    DILUTION_ID   1 
    SAMPLE_ID   ""NC_HIV1"" 
    SAMPLE_TYPE   Specimen 
    TEST_ORDER_DATE   05.21.2012 
    TEST_ORDER_TIME   03:44:01 
    TEST_INITIATION_DATE  05.21.2012 
    TEST_INITIATION_TIME  04:03:36 

TEST_COMPLETION_DATE  05.21.2012 
TEST_COMPLETION_TIME  04:29:32 
    ASSAY_CALIBRATION_DATE    NA 
    ASSAY_CALIBRATION_TIME  NA 
    TRACK   1 
    PROCESSING_LANE  1 
    MODULE_SN  ""EP004"" 
    LOAD_LIST_NAME   C:\sdddd 
    OPERATOR_ID   ""Q_SI"" 
    DARK_SUBREADS   NA 
    SIGNAL_SUBREADS   NA 
    DARK_COUNT   NA 
    SIGNAL_COUNT   NA 
    CORRECTED_COUNT   NA 
    STD_BAK    NA 
    AVG_BAK    NA 
    STD_FOR    NA 
    AVG_FOR    NA 
    SHAPE    NA 
    EXCEPTION_STRING  Test execution was stopped. 
    RESULT    NA 
    REPORTED_RESULT   NA 
    REPORTED_RESULT_UNITS  NA 
    REAGENT_MASTER_LOT  2345 
    REAGENT_SERIAL_NUMBER  25022 
    RESULT_FLAGS   NA 
    RESULT_INTERPRETATION  NA 
    DILUTION_PROTOCOL  UNDILUTED 
    RESULT_COMMENT   HIV NC 1 
    DATA_MANAGEMENT_FIELD_1  NA 
    DATA_MANAGEMENT_FIELD_2  NA 
    DATA_MANAGEMENT_FIELD_3  NA 
    DATA_MANAGEMENT_FIELD_4  NA 
} 
    "; 

// Walk every record found in testdata: one successful Match corresponds to one
// TEST_REPLICATE record, and NextMatch() advances to the following record.
for (Match record = testRx.Match(testdata); record.Success; record = record.NextMatch())
{
    Console.WriteLine("New Record\n------------------------");

    // Both named groups sit inside the same repeated group of the pattern, so
    // capture number n of "key" pairs with capture number n of "val".
    Group keys = record.Groups["key"];
    Group vals = record.Groups["val"];

    int index = 0;
    foreach (Capture key in keys.Captures)
    {
        Console.WriteLine("'{0}' = '{1}'", key.Value, vals.Captures[index].Value);

        // Filter on the keys of interest here, e.g.
        // if (key.Value == "REAGENT_SERIAL_NUMBER") ...

        index++;
    }

    Console.WriteLine("------------------------");
}