2016-11-10 37 views
3

U-SQL 执行缓慢。我创建了一个简单的脚本,用来计算两个字符串之间的相似度得分。下面是 U-SQL 脚本以及后端的 .NET 代码。

CN_Matcher.usql:

// Make the FuzzyString assembly registered in the master database available
// to the expressions below (used for the Jaro-Winkler call).
REFERENCE ASSEMBLY master.FuzzyString;

// Step 1: read the raw rows (an id plus the two names to compare) from a
// tab-separated input file.
@rawPairs =
    EXTRACT ID int,
            Input_CN string,
            Output_CN string
    FROM "/CN_Matcher/Input/sample.txt"
    USING Extractors.Tsv();

// Step 2: run both names through the code-behind cleanser once, so the
// scoring step below works on the cleansed values only.
@cleansedPairs =
    SELECT ID,
           Input_CN,
           Output_CN,
           CN_Validator.trial.cleanser(Input_CN) AS Input_CN_Cleansed,
           CN_Validator.trial.cleanser(Output_CN) AS Output_CN_Cleansed
    FROM @rawPairs;

// Step 3: compute the three similarity / distance scores per row.
@scoredPairs =
    SELECT ID,
           Input_CN,
           Output_CN,
           Input_CN_Cleansed,
           Output_CN_Cleansed,
           CN_Validator.trial.Hamming(Input_CN_Cleansed, Output_CN_Cleansed) AS HammingScore,
           CN_Validator.trial.LevinstienDistance(Input_CN_Cleansed, Output_CN_Cleansed) AS LevinstienDistance,
           FuzzyString.ComparisonMetrics.JaroWinklerDistance(Input_CN_Cleansed, Output_CN_Cleansed) AS JaroWinklerDistance
    FROM @cleansedPairs;

// Step 4: write the scored rows out as tab-separated text.
OUTPUT @scoredPairs
TO "/CN_Matcher/CN_Full_Run.txt"
USING Outputters.Tsv();

CN_Matcher.usql.cs:

using Microsoft.Analytics.Interfaces; 
using Microsoft.Analytics.Types.Sql; 
using System; 
using System.Collections.Generic; 
using System.IO; 
using System.Linq; 
using System.Text; 

namespace CN_Validator
{
    /// <summary>
    /// String cleansing and distance helpers called from the U-SQL script.
    /// All members are static and keep no per-call state.
    /// </summary>
    public static class trial
    {
        /// <summary>
        /// Words stripped by <see cref="cleanser"/>. Built once when the type is
        /// first used instead of on every call, and stored in a hash set so each
        /// membership test is O(1).
        /// </summary>
        private static readonly HashSet<string> WordsToRemove = new HashSet<string>(
            "l.p. registered pc bldg pllc lp. l.c. div. national l p l.l.c international r. limited school azioni joint co-op corporation corp., (corp) inc., societa company llp liability l.l.l.p llc bancorporation manufacturing c dst (inc) jv ltd. llc. technology ltd., s.a. mfg rllp incorporated per venture l.l.p c. p.l.l.c l.p.. p. partnership corp co-operative s.p.a tech schl bancorp association lllp n r ltd inc. l.l.p. p.c. co district int intl assn. sa inc l.p co, co. division lc intl. lp professional corp. a l. l.l.c. building r.l.l.p co.,".Split(' '),
            StringComparer.Ordinal);

        /// <summary>
        /// Lower-cases <paramref name="val"/>, splits it on single spaces and
        /// removes every word that appears in the removal list.
        /// </summary>
        /// <param name="val">Text to cleanse; may be null.</param>
        /// <returns>
        /// The remaining words joined with single spaces, in their original
        /// order, or null when <paramref name="val"/> is null.
        /// </returns>
        public static string cleanser(string val)
        {
            if (val == null)
            {
                return null;
            }

            // ToLowerInvariant keeps the match against the (ASCII) word list
            // independent of the culture the job happens to run under.
            // A Where filter is used rather than Except: Except is a set
            // difference and would also collapse repeated words in the input.
            var keptWords = val.ToLowerInvariant()
                               .Split(' ')
                               .Where(word => !WordsToRemove.Contains(word));
            return string.Join(" ", keptWords);
        }

        /// <summary>
        /// Counts the positions at which two equal-length strings differ.
        /// </summary>
        /// <param name="source">First string; null is treated as empty.</param>
        /// <param name="target">Second string; null is treated as empty.</param>
        /// <returns>
        /// The number of differing positions, or the sentinel 99999 when the
        /// two strings have different lengths.
        /// </returns>
        public static int Hamming(string source, string target)
        {
            source = source ?? string.Empty;
            target = target ?? string.Empty;

            if (source.Length != target.Length)
            {
                return 99999;
            }

            int distance = 0;
            for (int i = 0; i < source.Length; i++)
            {
                if (source[i] != target[i])
                {
                    distance++;
                }
            }
            return distance;
        }

        /// <summary>
        /// Computes the edit distance between two strings, where an insertion,
        /// a deletion and a substitution each cost 1.
        /// </summary>
        /// <param name="source">First string; null is treated as empty.</param>
        /// <param name="target">Second string; null is treated as empty.</param>
        /// <returns>The minimum number of single-character edits.</returns>
        public static int LevinstienDistance(string source, string target)
        {
            source = source ?? string.Empty;
            target = target ?? string.Empty;

            int n = source.Length;
            int m = target.Length;

            // Trivial cases first, so no matrix is allocated for them.
            if (n == 0) return m;
            if (m == 0) return n;

            // d[i, j] = distance between the first i chars of source and the
            // first j chars of target.
            int[,] d = new int[n + 1, m + 1];
            for (int i = 0; i <= n; i++)
            {
                d[i, 0] = i;
            }
            for (int j = 0; j <= m; j++)
            {
                d[0, j] = j;
            }

            for (int i = 1; i <= n; i++)
            {
                for (int j = 1; j <= m; j++)
                {
                    // Compare chars directly; Substring would allocate two
                    // one-character strings for every cell of the matrix.
                    int cost = source[i - 1] == target[j - 1] ? 0 : 1;
                    d[i, j] = Math.Min(
                        Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
                        d[i - 1, j - 1] + cost);
                }
            }
            return d[n, m];
        }
    }
}

我先用一个包含 100 条输入的样本批次运行了该作业,并行度设为 1,优先级设为 1000,作业在 1.6 分钟内完成。

随后我想用 1000 条输入测试同一个作业,并行度同样设为 1,优先级设为 1000。既然 100 条输入需要 1.6 分钟,我估计 1000 条输入大约需要 20 分钟;但作业运行了 50 多分钟,我仍然看不到任何进展。

于是我又提交了一个 100 条输入的作业进行测试,它的运行时间和之前一样。接着我想提高并行度,把它调到 3 后再次运行,但即使过了 1 小时作业也没有完成。

JOB_ID = 07c0850d-0770-4430-a288-5cddcfc26699

主要问题是,我看不到任何进度或状态信息。

如果我有什么地方做错了,请告诉我。

另外,在 U-SQL 中有没有办法使用构造函数?如果可以的话,我就不需要一遍又一遍地执行相同的清洗步骤了。

回答

2

我猜你是在用文件集(file set)语法指定 1000 个文件?遗憾的是,文件集当前的默认实现扩展性不好,编译(准备)阶段会花费很长时间,执行阶段也是如此。我们目前有一个处于预览阶段的更好的实现。你可以发邮件到 usql at microsoft dot com,我会告诉你如何试用这个预览实现。

感谢 迈克尔

+0

嗨,迈克尔,不是 1000 个文件,而是一个包含 1000 条输入的文件。我会给你发邮件。感谢你的回复。 – The6thSense

0

我尝试用一种更加基于集合的方式来实现。例如,不要把要移除的词保存在代码隐藏(code-behind)文件里,而是把它们存放在一张 U-SQL 表中,这样以后添加新词会很方便:

// Lookup table holding one unwanted word per row; created only when it does
// not already exist. Rows are clustered on and hash-distributed by the word.
// NOTE(review): the index name reads "Remvoe", which looks like a typo for
// "Remove" - confirm nothing depends on the name before renaming it.
CREATE TABLE IF NOT EXISTS dbo.wordsToRemove 
(
    word string, 

    INDEX cdx_wordsToRemvoe CLUSTERED (word ASC) 
    DISTRIBUTED BY HASH (word) 
); 

// Seed dbo.wordsToRemove with the list of unwanted words, one literal per row.
// All literals are lower-case and some include punctuation (e.g. "corp.,").
// NOTE(review): this is a plain INSERT, so running the script more than once
// presumably appends the same words again - confirm it is meant to run once.
INSERT INTO dbo.wordsToRemove (word) 
SELECT word 
FROM (
VALUES 
    ("l.p."), 
    ("registered"), 
    ("pc"), 
    ("bldg"), 
    ("pllc"), 
    ("lp."), 
    ("l.c."), 
    ("div."), 
    ("national"), 
    ("l"), 
    ("p"), 
    ("l.l.c"), 
    ("international"), 
    ("r."), 
    ("limited"), 
    ("school"), 
    ("azioni"), 
    ("joint"), 
    ("co-op"), 
    ("corporation"), 
    ("corp.,"), 
    ("(corp)"), 
    ("inc.,"), 
    ("societa"), 
    ("company"), 
    ("llp"), 
    ("liability"), 
    ("l.l.l.p"), 
    ("llc"), 
    ("bancorporation"), 
    ("manufacturing"), 
    ("c"), 
    ("dst"), 
    ("(inc)"), 
    ("jv"), 
    ("ltd."), 
    ("llc."), 
    ("technology"), 
    ("ltd.,"), 
    ("s.a."), 
    ("mfg"), 
    ("rllp"), 
    ("incorporated"), 
    ("per"), 
    ("venture"), 
    ("l.l.p"), 
    ("c."), 
    ("p.l.l.c"), 
    ("l.p.."), 
    ("p."), 
    ("partnership"), 
    ("corp"), 
    ("co-operative"), 
    ("s.p.a"), 
    ("tech"), 
    ("schl"), 
    ("bancorp"), 
    ("association"), 
    ("lllp"), 
    ("n"), 
    ("r"), 
    ("ltd"), 
    ("inc."), 
    ("l.l.p."), 
    ("p.c."), 
    ("co"), 
    ("district"), 
    ("int"), 
    ("intl"), 
    ("assn."), 
    ("sa"), 
    ("inc"), 
    ("l.p"), 
    ("co,"), 
    ("co."), 
    ("division"), 
    ("lc"), 
    ("intl."), 
    ("lp"), 
    ("professional"), 
    ("corp."), 
    ("a"), 
    ("l."), 
    ("l.l.c."), 
    ("building"), 
    ("r.l.l.p"), 
    ("co.,") 
) AS words(word); 

然后在做比较时,我把原始短语拆分成单词,去掉不需要的词,再把短语重新拼接起来,大致如下:

// Set-based variant of the cleansing step: split both name columns into
// words, drop every word found in dbo.wordsToRemove, re-join the remaining
// words per id, then score the cleansed pairs with the code-behind functions.
// Note that @Input_CN and @Output_CN are each reassigned three times below;
// every statement reads the most recent definition, so statement order matters.

//DECLARE @inputFile string = "input/input.csv"; // 500 companies, Standard & Poor 500 companies from wikipedia 
DECLARE @inputFile string = "input/input2.csv"; // 850,000 companies, part 1 of extract from Companies House 


// Read (id, Input_CN, Output_CN) rows from the CSV file named by @inputFile.
// NOTE(review): silent : true presumably makes the extractor skip rows it
// cannot parse instead of failing the job - confirm against the extractor docs.
@searchlog = 
    EXTRACT id int, 
      Input_CN string, 
      Output_CN string 
    FROM @inputFile 
    USING Extractors.Csv(silent : true); 
    //USING Extractors.Csv(skipFirstNRows:1); 


// Split the input string to remove unwanted words 
// NOTE(review): Split is called directly on the column value, so a null
// Input_CN / Output_CN would presumably fail here - confirm the data has none.
@Input_CN = 
    SELECT id, 
      new SQL.ARRAY<string>(Input_CN.Split(' ')) AS splitWords 
    FROM @searchlog; 


// Same split for the output string.
@Output_CN = 
    SELECT id, 
      new SQL.ARRAY<string>(Output_CN.Split(' ')) AS splitWords 
    FROM @searchlog; 


// Remove unwanted words from input string 
// The inner query produces one lower-cased row per word; the anti-semijoin
// keeps only the rows whose word has no match in dbo.wordsToRemove.
@Input_CN = 
    SELECT * 
    FROM 
    (
     SELECT o.id, 
       x.splitWord.ToLower() AS splitWord 
     FROM @Input_CN AS o 
      CROSS APPLY 
       EXPLODE(splitWords) AS x(splitWord) 
    ) AS y  
    ANTISEMIJOIN 
     dbo.wordsToRemove AS w 
    ON y.splitWord == w.word; 

// Remove unwanted words from output string 
@Output_CN = 
    SELECT * 
    FROM 
    (
     SELECT o.id, 
       x.splitWord.ToLower() AS splitWord 
     FROM @Output_CN AS o 
      CROSS APPLY 
       EXPLODE(splitWords) AS x(splitWord) 
    ) AS y 
    ANTISEMIJOIN 
     dbo.wordsToRemove AS w 
    ON y.splitWord == w.word; 




// Put the input string back together again 
// NOTE(review): ARRAY_AGG is used without any ordering, so the re-joined
// words may not come back in their original order - confirm, since the
// distance functions below are order-sensitive.
@Input_CN = 
    SELECT id, 
      String.Join(" ", ARRAY_AGG (splitWord)) AS Input_CN_Cleansed 
    FROM @Input_CN 
    GROUP BY id; 


// Put the output string back together again (same caveat on word order).
@Output_CN = 
    SELECT id, 
      String.Join(" ", ARRAY_AGG (splitWord)) AS Output_CN_Cleansed 
    FROM @Output_CN 
    GROUP BY id; 



// Pair the cleansed input and output strings by id and score each pair.
// NOTE(review): an id whose words were all removed on either side has no row
// left after the anti-semijoin, so this INNER JOIN drops it - verify that
// losing those ids is acceptable.
@output = 
    SELECT i.id, 
      i.Input_CN_Cleansed, 
      o.Output_CN_Cleansed, 
      CN_Validator.trial.Hamming(i.Input_CN_Cleansed, o.Output_CN_Cleansed) AS HammingScore, 
      CN_Validator.trial.LevinstienDistance(i.Input_CN_Cleansed, o.Output_CN_Cleansed) AS LevinstienDistance 
    FROM @Input_CN AS i 
     INNER JOIN 
      @Output_CN AS o 
     ON i.id == o.id; 



// Write the scored rows out as CSV.
OUTPUT @output 
    TO "/output/output.csv" 
    USING Outputters.Csv(); 

我发现两种写法的性能相近,但这种设计可能更易于维护。无论如何,我的代码处理 85 万多条记录只需要几分钟,而不是 50 分钟以上,所以也许问题出在别的地方。注意:我没有 FuzzyString 库,所以我的测试中没有包含它——这也许可以解释这一差异。

如果你从微软那里得到了进一步的消息,请回帖更新到这个帖子里;如果你愿意,也可以把它标记为答案。

+0

如果这个问题得到解决,我一定会在这里发帖说明。感谢你对代码的改写。因为我听说不建议在 SQL 中做数据规范化,所以才想在 .NET 里完成这一步;不过你的代码看起来很易于维护,而且充分利用了 U-SQL 的能力。 – The6thSense

相关问题