2011-05-07 82 views
1

我试图从短的非统一产品描述中提取数据,以部分自动化为我公司的在线商店制作产品网页。不幸的是,描述并不统一。感谢这个网站,我已经学到了足够多的关于正则表达式来公平地刺激它。Java正则表达式从字符串中提取度量值

在产品测量方法中,这里是一个失败的测试。

w. This product is 68 cm by 22 cm by 73 cm -- Length: 68 cm Width: 73 cm Height:

但这非常相似的测试不会失败。为什么?

x. This product is 68 cm x 22 cm x 73 cm -- Length: 68 cm Width: 22 cm Height: 73 cm

下面是测试类。作为一个正则表达式noob,我确信我没有这么做,所以关于效率的建议会很棒。如果你能想到其他任何失败的测试用例,请告诉我。

import java.util.regex.Matcher; 
import java.util.regex.Pattern; 
public class TesterClass { 
    public static final String[] testArray = 
    { 
     "a. Dynabrade 4\" Discs 34-333-102", 
     "b. Mercer 4 Inch Discs", 
     "c. Mercer 4in Discs", 
     "d. Carbo CleanAir 6\' Vacuum Tube Attachment", 
     "e. 4 feet", 
     "f. 4 Ft", 
     "g. 4 foot", 
     "h. Carborundum 2-3/4\" Tape 4 yd Roll 97580", 
     "i. I want 5 Inches and later 6 Feet", 
     "j. I don't want this one pyrex 9", 
     "k. This is 4 Inches x 5 Inches x 6 Feet in Size", 
     "l. This one is 6 x 5 Inches", 
     "m. This is 4\" x 5\" x 6\' in size", 
     "n. Something 4-3/4\" Long", 
     "o. I don't want 9 xtreme things", 
     "p. I don't want 9 men", 
     "q. 674m", 
     "r. 4 Inches", 
     "s. 5x8", 
     "t. P58\"", 
     "u. 5 x 7", 
     "v. 6 yards", 
     "w. This product is 68 cm by 22 cm by 73 cm", 
     "x. This product is 68 cm x 22 cm x 73 cm" 
    }; 
    public static final String[] FIELDNAMES = {"Length: ","Width: ","Height: "}; 

    public static final String 
     MEASURE = "(f(ee)?(oo)?t|in(ch)?(es)?|y(ar)?d(s)?|cm|m|meter(s)?|\"|\')", 
     MAYBE_MEASURE = MEASURE + "?", 
     NUMBER = "([0-9\\-/]+)", 
     X = "[(x)(by)]", 
     SPACE = "\\s", 
     MAYBE_SPACE = "\\s?", 
     SPACE_OR_END = "(\\s|$)", 
     START = "^", 
     END = "$", 
     TAB = "\t"; 

    public static final Pattern 
     regular = Pattern.compile(NUMBER + MAYBE_SPACE + MEASURE + SPACE_OR_END, Pattern.CASE_INSENSITIVE), 
     lengthXwidth = Pattern.compile(NUMBER + MAYBE_SPACE + MAYBE_MEASURE + MAYBE_SPACE + X + MAYBE_SPACE + NUMBER + MAYBE_MEASURE, Pattern.CASE_INSENSITIVE), 
     beforeX = Pattern.compile(NUMBER + MAYBE_SPACE + MAYBE_MEASURE + MAYBE_SPACE + END, Pattern.CASE_INSENSITIVE), 
     afterX = Pattern.compile(START + MAYBE_SPACE + NUMBER + MAYBE_SPACE + MAYBE_MEASURE, Pattern.CASE_INSENSITIVE), 
     measure = Pattern.compile(MAYBE_SPACE + MEASURE, Pattern.CASE_INSENSITIVE); 


    public static void main(String[] args) 
    { 
     for (String testString:testArray) 
     { 
      String[] fields = {"","",""}; 
      int match = 0; 
      Matcher lengthXwidthMatcher = lengthXwidth.matcher(testString); 
      Matcher regularMatcher = regular.matcher(testString); 

      if (lengthXwidthMatcher.find()) 
      { 
       String[] split = testString.split(X); 
       for (int i = 0; i<split.length; i++) 
       { 
        Matcher beforeXMatcher = beforeX.matcher(split[i]); 
        Matcher afterXMatcher = afterX.matcher(split[i]); 
        if (beforeXMatcher.find() && match==0) 
        { 
         fields[0] = beforeXMatcher.group(); 
         match++; 
        } 
        if (afterXMatcher.find()) 
        { 
         if (match==1) 
         { 
          fields[1] = afterXMatcher.group(); 
          match++; 
         } 
         else if (match==2) 
         { 
          fields[2] = afterXMatcher.group(); 
          match++; 
         } 
        } 
       } 
       Matcher lengthHasMeasure = measure.matcher(fields[0]); 
       Matcher widthHasMeasure = measure.matcher(fields[1]); 
       Matcher heightHasMeasure = measure.matcher(fields[2]); 
       if (heightHasMeasure.find()==true) 
       { 
        if (lengthHasMeasure.find()==false) 
        { 
         fields[0] = fields[0] + heightHasMeasure.group(); 
        } 
        if (widthHasMeasure.find()==false) 
        { 
         fields[1] = fields[1] + heightHasMeasure.group(); 
        } 
       } 
       else if (widthHasMeasure.find()==true) 
       { 
        if (lengthHasMeasure.find()==false) 
        { 
         fields[0] = fields[0] + widthHasMeasure.group(); 
        } 
       } 
      } 
      else if(regularMatcher.find()) 
      { 
       fields[0] = regularMatcher.group(); 
       match++; 
       while (regularMatcher.find() && match<3) 
       { 
        fields[match] = regularMatcher.group(); 
       } 
      } 

      System.out.println(testString + " -- " + TAB + FIELDNAMES[0] + fields[0] + TAB + FIELDNAMES[1] + fields[1] + TAB + FIELDNAMES[2] + fields[2]); 
     } 
    } 
} 

回答

2

看起来好像您的X生产被指定为字符类,而不是交替。试试:

X = "(x|by)" 
+0

这是行得通!我知道它必须是一些noob错误。 – DA555 2011-05-07 21:35:20