2016-07-28 47 views
0

我正在处理一个查询问题,一直没能解决。我有一个存放姓名的数据库。我想做的是找出数据库中对应同一个 ID、却登记了多个彼此非常相似的名称的那些人:识别数据库中的类似字段(但不重复)

ID       Name 
-------------    ---------- 
123ABC      Joe Smith 

123ABC      Joseph Smith 

345XYZ      Michael Johnson 

345XYZ      MikeJohnson 

678LMN      Suzyjones 

678LMN      Suzanne Mary Jones 

所以我希望建立一个可以识别这些人的查询。有人有任何建议或意见吗?显然,这可能相当棘手,因为我们处理的不是完全重复的记录,而是细微的差异。

+0

修改标签为实际的数据库 – dbmitch

+0

你看过 http://stackoverflow.com/a/38513900/5221944 吗?这个回答解决了这类细微差别(适用于 BigQuery),并且可以轻松移植到您的新示例中。顺便说一句 - 您在之前的问题中成功实现它了吗? –

+0

@dbmitch - 你是什么意思? – wizkids121

回答

0

做一个自连接,条件是 ID 相同而名称不同:

-- List every ordered pair of differing names recorded under the same ID.
SELECT
    lhs.ID,
    lhs.NAME,
    rhs.NAME
FROM your_table lhs
INNER JOIN your_table rhs
    ON rhs.ID = lhs.ID
-- For an inner join this filter is equivalent to placing it in the ON clause.
WHERE lhs.NAME <> rhs.NAME
+0

对,但这不是我要找的。我的问题是识别数据库中具有两个彼此类似但不相同的名称的ID。我知道如何找到这些ID,但难点在于找出名称属于这种相似类型的例子。 – wizkids121

0

您可以通过多种方式实现这一点,我建议使用 GROUP BY 子句。

下面的查询假设表中只有不重复的记录,因此同一个名称在一个 ID 下只会出现一次。

-- Return every row whose ID occurs on more than one row of the table.
-- <yourTable> is a placeholder for the real table name.
;WITH CTE AS
(
    -- IDs that appear on at least two rows
    SELECT ID
    FROM <yourTable>
    GROUP BY ID
    HAVING COUNT(*) > 1
)
SELECT T.*
FROM CTE C
JOIN <yourTable> T
    -- The join predicate must be an equality; the original used "-" here,
    -- which is an arithmetic expression rather than a boolean condition.
    ON C.ID = T.ID

如果同一个表中存在多个同名的行,那么只需要事先应用 DISTINCT 子句。

0

请查看下面的查询 - 它应该适用于您的情况。
查询末尾的条件是 WHERE similarity > -1 - 把 -1 换成其他值,就可以控制相似度阈值。阈值越接近 1,捕捉到的名称对就越相似;越接近 0,捕捉到的名称对就越多!

// Scores every pair of different names that share an ID (BigQuery legacy SQL).
// similarity = 1 - levenshtein(Name1, Name2) / length(Name1), computed on the
// lower-cased names. Raise the WHERE threshold towards 1 to keep only close pairs.
SELECT ID, Name1, Name2, similarity FROM
JS(// input table
(
    SELECT one.ID AS ID, one.Name AS Name1, two.Name AS Name2
    FROM YourTable AS one
    JOIN YourTable AS two ON one.ID = two.ID
    // Name1 < Name2 keeps each unordered pair once and drops self-pairs
    HAVING Name1 < Name2
) ,
// input columns
ID, Name1, Name2,
// output schema
"[{name: 'ID', type:'string'},
    {name: 'Name1', type:'string'},
    {name: 'Name2', type:'string'},
    {name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {

    var Levenshtein = {
        /**
         * Calculate levenshtein distance of the two strings.
         *
         * @param str1 String the first string.
         * @param str2 String the second string.
         * @return Integer the levenshtein distance (0 and above).
         */
        get: function(str1, str2) {
            // base cases
            if (str1 === str2) return 0;
            if (str1.length === 0) return str2.length;
            if (str2.length === 0) return str1.length;

            // only one row of the distance matrix is kept in memory
            var prevRow = new Array(str2.length + 1),
                curCol, nextCol, i, j, tmp;

            // initialise previous row
            for (i = 0; i < prevRow.length; ++i) {
                prevRow[i] = i;
            }

            // calculate current row distance from previous row
            for (i = 0; i < str1.length; ++i) {
                nextCol = i + 1;

                for (j = 0; j < str2.length; ++j) {
                    curCol = nextCol;

                    // substitution
                    nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);
                    // insertion
                    tmp = curCol + 1;
                    if (nextCol > tmp) {
                        nextCol = tmp;
                    }
                    // deletion
                    tmp = prevRow[j + 1] + 1;
                    if (nextCol > tmp) {
                        nextCol = tmp;
                    }

                    // copy current col value into previous (in preparation for next iteration)
                    prevRow[j] = curCol;
                }

                // copy last col value into previous (in preparation for next iteration)
                prevRow[j] = nextCol;
            }

            return nextCol;
        }
    };

    // Both names are declared locally so that neither becomes an implicit global.
    var the_Name1, the_Name2;

    // Names are URI-decoded when possible; a malformed escape sequence makes
    // decodeURI throw, in which case the raw value is used instead.
    try {
        the_Name1 = decodeURI(r.Name1).toLowerCase();
    } catch (ex) {
        the_Name1 = r.Name1.toLowerCase();
    }

    try {
        the_Name2 = decodeURI(r.Name2).toLowerCase();
    } catch (ex) {
        the_Name2 = r.Name2.toLowerCase();
    }

    // The distance is normalised by the length of Name1 only, so the score can
    // drop below 0 when Name2 is much longer than Name1.
    emit({ID: r.ID, Name1: the_Name1, Name2: the_Name2,
        similarity: 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length});

    }"
)
WHERE similarity > -1
ORDER BY similarity DESC

您可以用下面的例子进行测试

// Self-contained demo of the name-similarity query (BigQuery legacy SQL).
// The inline sub-selects stand in for a real table; in legacy SQL a
// comma-separated list of tables in FROM is a UNION ALL of those tables.
// similarity = 1 - levenshtein(Name1, Name2) / length(Name1), computed on the
// lower-cased names.
SELECT ID, Name1, Name2, similarity FROM
JS(// input table
(
    SELECT one.ID AS ID, one.Name AS Name1, two.Name AS Name2
    FROM (
        SELECT ID, Name FROM
            (SELECT '123ABC' AS ID, 'Joe Smith' AS Name),
            (SELECT '123ABC' AS ID, 'Joseph Smith' AS Name),
            (SELECT '345XYZ' AS ID, 'Michael Johnson' AS Name),
            (SELECT '345XYZ' AS ID, 'MikeJohnson' AS Name),
            (SELECT '678LMN' AS ID, 'Suzyjones' AS Name),
            (SELECT '678LMN' AS ID, 'Suzanne Mary Jones' AS Name),
            (SELECT 'AAA' AS ID, 'Jordan Tigani' AS Name),
            (SELECT 'AAA' AS ID, 'Felipe Hoffa' AS Name),
            (SELECT 'BBB' AS ID, 'Mikhail Berlyant' AS Name),
            (SELECT 'BBB' AS ID, 'Michael Sheldon' AS Name)
    ) AS one
    JOIN (
        SELECT ID, Name FROM
            (SELECT '123ABC' AS ID, 'Joe Smith' AS Name),
            (SELECT '123ABC' AS ID, 'Joseph Smith' AS Name),
            (SELECT '345XYZ' AS ID, 'Michael Johnson' AS Name),
            (SELECT '345XYZ' AS ID, 'MikeJohnson' AS Name),
            (SELECT '678LMN' AS ID, 'Suzyjones' AS Name),
            (SELECT '678LMN' AS ID, 'Suzanne Mary Jones' AS Name),
            (SELECT 'AAA' AS ID, 'Jordan Tigani' AS Name),
            (SELECT 'AAA' AS ID, 'Felipe Hoffa' AS Name),
            (SELECT 'BBB' AS ID, 'Mikhail Berlyant' AS Name),
            (SELECT 'BBB' AS ID, 'Michael Sheldon' AS Name)
    ) AS two
    ON one.ID = two.ID
    // Name1 < Name2 keeps each unordered pair once and drops self-pairs
    HAVING Name1 < Name2
) ,
// input columns
ID, Name1, Name2,
// output schema
"[{name: 'ID', type:'string'},
    {name: 'Name1', type:'string'},
    {name: 'Name2', type:'string'},
    {name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {

    var Levenshtein = {
        /**
         * Calculate levenshtein distance of the two strings.
         *
         * @param str1 String the first string.
         * @param str2 String the second string.
         * @return Integer the levenshtein distance (0 and above).
         */
        get: function(str1, str2) {
            // base cases
            if (str1 === str2) return 0;
            if (str1.length === 0) return str2.length;
            if (str2.length === 0) return str1.length;

            // only one row of the distance matrix is kept in memory
            var prevRow = new Array(str2.length + 1),
                curCol, nextCol, i, j, tmp;

            // initialise previous row
            for (i = 0; i < prevRow.length; ++i) {
                prevRow[i] = i;
            }

            // calculate current row distance from previous row
            for (i = 0; i < str1.length; ++i) {
                nextCol = i + 1;

                for (j = 0; j < str2.length; ++j) {
                    curCol = nextCol;

                    // substitution
                    nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);
                    // insertion
                    tmp = curCol + 1;
                    if (nextCol > tmp) {
                        nextCol = tmp;
                    }
                    // deletion
                    tmp = prevRow[j + 1] + 1;
                    if (nextCol > tmp) {
                        nextCol = tmp;
                    }

                    // copy current col value into previous (in preparation for next iteration)
                    prevRow[j] = curCol;
                }

                // copy last col value into previous (in preparation for next iteration)
                prevRow[j] = nextCol;
            }

            return nextCol;
        }
    };

    // Both names are declared locally so that neither becomes an implicit global.
    var the_Name1, the_Name2;

    // Names are URI-decoded when possible; a malformed escape sequence makes
    // decodeURI throw, in which case the raw value is used instead.
    try {
        the_Name1 = decodeURI(r.Name1).toLowerCase();
    } catch (ex) {
        the_Name1 = r.Name1.toLowerCase();
    }

    try {
        the_Name2 = decodeURI(r.Name2).toLowerCase();
    } catch (ex) {
        the_Name2 = r.Name2.toLowerCase();
    }

    // The distance is normalised by the length of Name1 only, so the score can
    // drop below 0 when Name2 is much longer than Name1.
    emit({ID: r.ID, Name1: the_Name1, Name2: the_Name2,
        similarity: 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length});

    }"
)
WHERE similarity > -1
ORDER BY similarity DESC

它产生以下结果

ID   Name1    Name2    similarity 
123ABC  joe smith   joseph smith  0.6666666666666667 
345XYZ  michael johnson  mikejohnson   0.6666666666666667 
678LMN  suzanne mary jones suzyjones   0.5 
BBB   michael sheldon  mikhail berlyant 0.4666666666666667 
AAA   felipe hoffa  jordan tigani  0.0