找到最长的重复子串

5

看一看http://en.wikipedia.org/wiki/Suffix_array以及 - 他们是有相当的空间，高效，有一些合理的可编程的算法来生产它们，如“简单的线性工作后缀数组建设”，由Karkkainen和桑德斯

来源

2012-04-27 18:09:21 mcdowella

0

有太多事情会影响性能我们只回答您提供给我们的问题。（操作系统，语言，记忆问题，代码本身）

如果你只是想找的算法效率的数学分析，你可能要改变的问题。

编辑

当我提到“内存问题”和“代码”我没有提供所有的细节。你将要分析的字符串的长度是一个很大的因素。此外，代码不会单独运行 - 它必须位于程序内部才能有用。那个程序影响这个算法的使用和性能的特点是什么？

基本上，你无法进行性能调整，直到你有一个实际情况进行测试。你可以对可能表现最好的内容做出非常有教育的猜测，但直到你有真实的数据和真实的代码，你永远不会确定。

来源

2012-04-27 17:25:08

+2

我正在使用4 GB的RAM在Windows 7上使用C++编写 – kukit 2012-04-27 17:30:12

+0

谢谢澄清，最大字符串长度为5000个字符，它将是一个工作线程，只读该字符串并写入结果，以便您可以假定程序中没有其他代码。 – kukit 2012-04-27 17:39:12

+0

为什么这会得到票选？每个人都应该知道预先优化是浪费时间。我们可以选择通常是一个好主意的算法，但是如果不测量特定的场景，我们就不能选择“最好的”。 – 2015-02-16 16:18:43

7

退房此链接：http://introcs.cs.princeton.edu/java/42sort/LRS.java.html

/************************************************************************* 
* Compilation: javac LRS.java 
* Execution: java LRS < file.txt 
* Dependencies: StdIn.java 
* 
* Reads a text corpus from stdin, replaces all consecutive blocks of 
* whitespace with a single space, and then computes the longest 
* repeated substring in that corpus. Suffix sorts the corpus using 
* the system sort, then finds the longest repeated substring among 
* consecutive suffixes in the sorted order. 
* 
* % java LRS < mobydick.txt 
* ',- Such a funny, sporty, gamy, jesty, joky, hoky-poky lad, is the Ocean, oh! Th' 
* 
* % java LRS 
* aaaaaaaaa 
* 'aaaaaaaa' 
* 
* % java LRS 
* abcdefg 
* '' 
* 
*************************************************************************/ 


import java.util.Arrays; 

public class LRS { 

    // return the longest common prefix of s and t 
    public static String lcp(String s, String t) { 
     int n = Math.min(s.length(), t.length()); 
     for (int i = 0; i < n; i++) { 
      if (s.charAt(i) != t.charAt(i)) 
       return s.substring(0, i); 
     } 
     return s.substring(0, n); 
    } 


    // return the longest repeated string in s 
    public static String lrs(String s) { 

     // form the N suffixes 
     int N = s.length(); 
     String[] suffixes = new String[N]; 
     for (int i = 0; i < N; i++) { 
      suffixes[i] = s.substring(i, N); 
     } 

     // sort them 
     Arrays.sort(suffixes); 

     // find longest repeated substring by comparing adjacent sorted suffixes 
     String lrs = ""; 
     for (int i = 0; i < N - 1; i++) { 
      String x = lcp(suffixes[i], suffixes[i+1]); 
      if (x.length() > lrs.length()) 
       lrs = x; 
     } 
     return lrs; 
    } 



    // read in text, replacing all consecutive whitespace with a single space 
    // then compute longest repeated substring 
    public static void main(String[] args) { 
     String s = StdIn.readAll(); 
     s = s.replaceAll("\\s+", " "); 
     StdOut.println("'" + lrs(s) + "'"); 
    } 
}

来源

2013-01-20 06:19:55 user1071840

+0

非常感谢资源 – 2014-06-04 03:11:46

+0

这个代码的时间复杂度是多少 – Newbie 2015-12-01 20:57:16

+0

它是O（n^2）..因为lcp在最坏的情况下需要O（n）.. – paramvir 2017-02-06 03:52:39

3

下面是一个简单的实现使用简单的后缀树最长重复子的。后缀树通过这种方式很容易实现。

#include <iostream> 
#include <vector> 
#include <unordered_map> 
#include <string> 
using namespace std; 

class Node 
{ 
public: 
    char ch; 
    unordered_map<char, Node*> children; 
    vector<int> indexes; //store the indexes of the substring from where it starts 
    Node(char c):ch(c){} 
}; 

int maxLen = 0; 
string maxStr = ""; 

void insertInSuffixTree(Node* root, string str, int index, string originalSuffix, int level=0) 
{ 
    root->indexes.push_back(index); 

    // it is repeated and length is greater than maxLen 
    // then store the substring 
    if(root->indexes.size() > 1 && maxLen < level) 
    { 
     maxLen = level; 
     maxStr = originalSuffix.substr(0, level); 
    } 

    if(str.empty()) return; 

    Node* child; 
    if(root->children.count(str[0]) == 0) { 
     child = new Node(str[0]); 
     root->children[str[0]] = child; 
    } else { 
     child = root->children[str[0]]; 
    } 

    insertInSuffixTree(child, str.substr(1), index, originalSuffix, level+1); 
} 

int main() 
{ 
    string str = "banana"; //"abcabcaacb"; //"banana"; //"mississippi"; 
    Node* root = new Node('@'); 

    //insert all substring in suffix tree 
    for(int i=0; i<str.size(); i++){ 
     string s = str.substr(i); 
     insertInSuffixTree(root, s, i, s); 
    } 

    cout << maxLen << "->" << maxStr << endl; 

    return 1; 
} 

/* 
s = "mississippi", return "issi" 
s = "banana", return "ana" 
s = "abcabcaacb", return "abca" 
s = "aababa", return "aba" 
*/

来源

2014-03-30 00:29:07

0

public class LongestSubString { 

    public static void main(String[] args) { 
     String s = findMaxRepeatedString("ssssssssssss this is a ddddddd word with iiiiiiiiiis and loads of these are ppppppppppppps"); 
     System.out.println(s); 
    } 

    private static String findMaxRepeatedString(String s) { 
     Processor p = new Processor(); 
     char[] c = s.toCharArray(); 
     for (char ch : c) { 
      p.process(ch); 
     } 
     System.out.println(p.bigger()); 
     return new String(new char[p.bigger().count]).replace('\0', p.bigger().letter); 
    } 

    static class CharSet { 
     int count; 
     Character letter; 
     boolean isLastPush; 

     boolean assign(char c) { 
      if (letter == null) { 
       count++; 
       letter = c; 
       isLastPush = true; 
       return true; 
      } 
      return false; 
     } 

     void reassign(char c) { 
      count = 1; 
      letter = c; 
      isLastPush = true; 
     } 

     boolean push(char c) { 
      if (isLastPush && letter == c) { 
       count++; 
       return true; 
      } 
      return false; 
     } 

     @Override 
     public String toString() { 
      return "CharSet [count=" + count + ", letter=" + letter + "]"; 
     } 

    } 

    static class Processor { 

     Character previousLetter = null; 
     CharSet set1 = new CharSet(); 
     CharSet set2 = new CharSet(); 

     void process(char c) { 
      if ((set1.assign(c)) || set1.push(c)) { 
       set2.isLastPush = false; 
      } else if ((set2.assign(c)) || set2.push(c)) { 
       set1.isLastPush = false;     
      } else { 
       set1.isLastPush = set2.isLastPush = false; 
       smaller().reassign(c); 
      } 
     }  

     CharSet smaller() { 
      return set1.count < set2.count ? set1 : set2; 
     } 

     CharSet bigger() { 
      return set1.count < set2.count ? set2 : set1; 
     } 

    } 
}

来源

2016-12-01 22:57:44

+0

关于为什么这是最好的方法进行详细说明？ – 2016-12-01 23:43:25

0

的LRS问题是一个通过使用一个后缀树或后缀数组最好解决。这两种方法的最佳时间复杂度为O（n）。

这里是O（n日志（n））的使用后缀数组解决LRS问题。如果您有后缀数组的线性构建时间算法（这很难实现），我的解决方案可以改进为O（n）。代码取自我的library。如果你想在后缀数组是如何工作一定要看看我的tutorials

/** 
* Finds the longest repeated substring(s) of a string. 
* 
* Time complexity: O(nlogn), bounded by suffix array construction 
* 
* @author William Fiset, [email protected] 
**/ 

import java.util.*; 

public class LongestRepeatedSubstring { 

    // Example usage 
    public static void main(String[] args) { 

    String str = "ABC$BCA$CAB"; 
    SuffixArray sa = new SuffixArray(str); 
    System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs()); 

    str = "aaaaa"; 
    sa = new SuffixArray(str); 
    System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs()); 

    str = "abcde"; 
    sa = new SuffixArray(str); 
    System.out.printf("LRS(s) of %s is/are: %s\n", str, sa.lrs()); 

    } 

} 

class SuffixArray { 

    // ALPHABET_SZ is the default alphabet size, this may need to be much larger 
    int ALPHABET_SZ = 256, N; 
    int[] T, lcp, sa, sa2, rank, tmp, c; 

    public SuffixArray(String str) {  
    this(toIntArray(str));  
    } 

    private static int[] toIntArray(String s) { 
    int[] text = new int[s.length()]; 
    for(int i=0;i<s.length();i++)text[i] = s.charAt(i); 
    return text;  
    } 

    // Designated constructor 
    public SuffixArray(int[] text) { 
    T = text; 
    N = text.length; 
    sa = new int[N]; 
    sa2 = new int[N]; 
    rank = new int[N]; 
    c = new int[Math.max(ALPHABET_SZ, N)]; 
    construct(); 
    kasai(); 
    } 

    private void construct() { 
    int i, p, r; 
    for (i=0; i<N; ++i) c[rank[i] = T[i]]++; 
    for (i=1; i<ALPHABET_SZ; ++i) c[i] += c[i-1]; 
    for (i=N-1; i>=0; --i) sa[--c[T[i]]] = i; 
    for (p=1; p<N; p <<= 1) { 
     for (r=0, i=N-p; i<N; ++i) sa2[r++] = i; 
     for (i=0; i<N; ++i) if (sa[i] >= p) sa2[r++] = sa[i] - p; 
     Arrays.fill(c, 0, ALPHABET_SZ, 0); 
     for (i=0; i<N; ++i) c[rank[i]]++; 
     for (i=1; i<ALPHABET_SZ; ++i) c[i] += c[i-1]; 
     for (i=N-1; i>=0; --i) sa[--c[rank[sa2[i]]]] = sa2[i]; 
     for (sa2[sa[0]] = r = 0, i=1; i<N; ++i) { 
      if (!(rank[sa[i-1]] == rank[sa[i]] && 
       sa[i-1]+p < N && sa[i]+p < N && 
       rank[sa[i-1]+p] == rank[sa[i]+p])) r++; 
      sa2[sa[i]] = r; 
     } tmp = rank; rank = sa2; sa2 = tmp; 
     if (r == N-1) break; ALPHABET_SZ = r + 1; 
    } 
    } 

    // Use Kasai algorithm to build LCP array 
    private void kasai() { 
    lcp = new int[N]; 
    int [] inv = new int[N]; 
    for (int i = 0; i < N; i++) inv[sa[i]] = i; 
    for (int i = 0, len = 0; i < N; i++) { 
     if (inv[i] > 0) { 
     int k = sa[inv[i]-1]; 
     while((i + len < N) && (k + len < N) && T[i+len] == T[k+len]) len++; 
     lcp[inv[i]-1] = len; 
     if (len > 0) len--; 
     } 
    } 
    } 

    // Finds the LRS(s) (Longest Repeated Substring) that occurs in a string. 
    // Traditionally we are only interested in substrings that appear at 
    // least twice, so this method returns an empty set if this is not the case. 
    // @return an ordered set of longest repeated substrings 
    public TreeSet <String> lrs() { 

    int max_len = 0; 
    TreeSet <String> lrss = new TreeSet<>(); 

    for (int i = 0; i < N; i++) { 
     if (lcp[i] > 0 && lcp[i] >= max_len) { 

     // We found a longer LRS 
     if (lcp[i] > max_len) 
      lrss.clear(); 

     // Append substring to the list and update max 
     max_len = lcp[i]; 
     lrss.add(new String(T, sa[i], max_len)); 

     } 
    } 

    return lrss; 

    } 

    public void display() { 
    System.out.printf("-----i-----SA-----LCP---Suffix\n"); 
    for(int i = 0; i < N; i++) { 
     int suffixLen = N - sa[i]; 
     String suffix = new String(T, sa[i], suffixLen); 
     System.out.printf("% 7d % 7d % 7d %s\n", i, sa[i],lcp[i], suffix); 
    } 
    } 

}

来源

2017-08-30 02:53:04

找到最长的重复子串

回答

相关问题