0

标题:在 Matlab 中使用 KNN 进行文本分类

我正在做一个使用聚类和 KNN 算法进行文本分类的项目,但是我的分类结果不正确。目前我只是把每个字符的 ASCII 值作为特征来分类,结果实际上只是按照数据的长度(大小)在分类。我对这个项目了解不多,希望能得到一些关于 KNN 分类的资料。

我已经获得了路透社 Reuters-21578 数据集,以及 Reuters Transcribed Subset(路透社转录子集)数据,但不知道该如何使用它们。

这是我的示例代码

function [out2, out3, tme] = knnk_latests(data, foll)
% KNNK_LATESTS  Match a text sample to its nearest training document (1-NN).
%
%   [out2, out3, tme] = knnk_latests(data, foll)
%
%   The training set is read from <current folder>/<foll>. Every sub-folder
%   of that folder is treated as one class and every file inside a
%   sub-folder as one training document. Documents and the sample are
%   compared on their raw character codes, right-padded with spaces to a
%   common length.
%
%   Inputs
%     data - the single text sample to classify (char row vector or a
%            one-element cell array of char).
%     foll - name of the training folder, relative to the current folder.
%            Defaults to 'tempdat' when empty or omitted.
%
%   Outputs
%     out2 - name of the class folder containing the nearest document.
%     out3 - out2 concatenated with the nearest document's file name minus
%            its last four characters.
%     tme  - elapsed time in milliseconds, as a string.
%
%   NOTE(review): raw character codes are a very weak text feature; the
%   distance is dominated by document length and character alignment.

    tic;

    if nargin < 2 || isempty(foll)
        foll = 'tempdat';
    end

    root = fullfile(pwd, foll);

    % List class folders by name instead of assuming that '.' and '..' are
    % always the first two entries returned by dir.
    classDirs = dir(root);
    classDirs = classDirs([classDirs.isdir]);
    dirNames  = {classDirs.name};
    classDirs = classDirs(~(strcmp(dirNames, '.') | strcmp(dirNames, '..')));

    docs     = {};   % text of every training document (row vectors)
    docNames = {};   % file name of every training document
    docClass = [];   % index into classDirs for every training document
    docPos   = [];   % 1-based position of the document inside its class

    for j = 1:numel(classDirs)
        classPath = fullfile(root, classDirs(j).name);
        files = dir(classPath);
        files = files(~[files.isdir]);

        for i = 1:numel(files)
            filePath = fullfile(classPath, files(i).name);
            fid = fopen(filePath);
            if fid == -1
                error('knnk_latests:cannotOpen', ...
                      'Cannot open training file "%s".', filePath);
            end
            % fread returns a column vector; transpose it so that each
            % document is a row of characters.
            ct = fread(fid, 10000000, 'uint8=>char')';
            fclose(fid);

            docs{end + 1}     = ct;             %#ok<AGROW>
            docNames{end + 1} = files(i).name;  %#ok<AGROW>
            docClass(end + 1) = j;              %#ok<AGROW>
            docPos(end + 1)   = i;              %#ok<AGROW>
        end
    end

    if isempty(docs)
        error('knnk_latests:noTrainingData', ...
              'No training documents found under "%s".', root);
    end

    % One row per document, right-padded with spaces by char().
    trainset = localShiftLeadingSpace(char(docs(:)));
    lngt     = size(trainset, 1);

    samples = localShiftLeadingSpace(char(data));
    if size(samples, 1) ~= 1
        error('knnk_latests:singleSample', ...
              'Exactly one text sample is expected, got %d.', size(samples, 1));
    end

    trainset1 = double(trainset);
    samples1  = double(samples);

    % Pad the narrower matrix with spaces (code 32) so both have the same
    % number of columns.
    a = size(samples1);
    b = size(trainset1);
    if a(2) > b(2)
        trainset1 = [trainset1, 32 * ones(b(1), a(2) - b(2))];
    else
        samples1 = [samples1, 32 * ones(a(1), b(2) - a(2))];
    end

    % Every training document is its own group, so the predicted label is
    % the row index of the nearest document.
    grp = (1:lngt)';
    if exist('knnclassify', 'file')
        matchIdx = knnclassify(samples1, trainset1, grp);
    else
        % knnclassify is not available in recent releases; fitcknn defaults
        % to one neighbour with Euclidean distance.
        mdl = fitcknn(trainset1, grp);
        matchIdx = predict(mdl, samples1);
    end

    % Elapsed time in milliseconds.
    tme = num2str(toc * 1000);

    hllp    = docPos(matchIdx);
    out2    = classDirs(docClass(matchIdx)).name;
    hlp_nme = docNames{matchIdx}(1:end - 4);

    fprintf('\n');
    fprintf('\n');
    fprintf('Output:- ');
    fprintf('\n');
    fprintf(2, 'The input is matched to the class : ')
    disp(out2);
    fprintf(2, 'Sub class : ')
    out3  = [out2 hlp_nme];
    outt3 = [out2 ',' num2str(hllp)];
    disp(outt3);
    fprintf('\n');

function txt = localShiftLeadingSpace(txt)
% Shift rows that start with a space one column to the left, keeping width.
    if isempty(txt)
        return;
    end
    rows = txt(:, 1) == ' ';
    txt(rows, :) = [txt(rows, 2:end), repmat(' ', nnz(rows), 1)];

回答