2017-06-07 20 views
0

总体目标是根据百分位数对数值变量进行分层。我想把它分成10个等级(例如第10、20、……、100百分位数):如果它落入第10个百分位,则将其重新编码为1;如果落入第20个百分位,则将其重新编码为2,依此类推。此方法需要适用于我输入的任何数据集,我希望此过程尽可能自动化。下面,我生成了一些测试数据。问题是:如何(在SAS中)使用表2中的信息重新编码表1中的变量?

/* Build WORK.TEST: 1000 rows of three uniform random variables,       */
/* a in [0,4], b in [0,10], c in [0,7.5], each rounded to 2 decimals.  */
/* The calls are kept in a, b, c order so the seeded stream is         */
/* consumed exactly as before.                                         */
data test;
    do row = 1 to 1000;
        a = round(uniform(1) * 4,   0.01);
        b = round(uniform(1) * 10,  0.01);
        c = round(uniform(1) * 7.5, 0.01);
        output;
    end;
    /* No input data set is read, so the step runs exactly once. */
    drop row;
run;

下面的宏用于创建一个表,给出每个变量在各个十分位(第10、20、……、100百分位数)处的截断值。我在代码下面添加了示例输出的图片。

/* Build WORK.PCTS: one row per decile (10th, 20th, ..., 100th percentile)  */
/* holding, for every requested variable, the cutoff value at that decile.  */
/*                                                                          */
/*   var   comma-separated list of numeric variables; quote it with %str()  */
/*         or %bquote() so the commas survive the macro call                */
/*   data  input data set (default: test)                                   */
/*                                                                          */
/* Side effects: creates one work data set per variable (named after the    */
/* variable), plus WORK.PERCENTILES and WORK.PCTS.                          */
%macro percentiles(var, data=test);
    /* Keep the loop index and scratch name out of the caller's scope */
    %local count i variables;

    /* Count the number of variable names in the comma-separated list */
    %let count=%sysfunc(countw(&var,%str(,)));

    /* Loop through the variables: one PROC UNIVARIATE + transpose each */
    %do i = 1 %to &count;
        %let variables=%qscan(&var,&i,%str(,));

        /* One row, ten columns: <var>10 <var>20 ... <var>100 */
        proc univariate data=&data noprint;
            var &variables;
            output out=pcts pctlpts = 10 20 30 40 50 60 70 80 90 100
                pctlpre = &variables;
        run;

        /* Flip to ten rows, one column named after the variable */
        proc transpose data=pcts out=&variables (rename=(col1=&variables) drop=_NAME_ _LABEL_);
        run;
    %end;

    /* Row labels: recode group 1..10 and the percentile it represents */
    data percentiles (drop=i);
        do i=1 to 10;
            recode=i;
            percentile=i*10;
            output;
        end;
        stop;
    run;

    /* Side-by-side (no BY) merge of the labels with every variable's cutoffs. */
    /* The comma list becomes a blank-separated list of data set names.        */
    data pcts;
        merge percentiles %sysfunc(tranwrd(&var.,%str(,),%str( )));
    run;
%mend;
%percentiles(%str(a,b,c));

output from above macro

下面的代码是我目前重新编码变量的方式。我使用上面宏生成的表格,为每个变量填入每个百分位的截断值。正如你所看到的,如果需要重新编码的变量很多,这会非常单调乏味,而且难以维护。有没有更好的办法,或者说怎样才能把这一部分自动化?

/* Recode a, b and c into decile groups 1-10 using cutoffs copied by hand  */
/* from the percentile table. A value above the last cutoff leaves the     */
/* recode variable unassigned, exactly like the if/else chain it replaces. */
data test;
    set test;

    /* Upper bound of each decile; row 1 = a, row 2 = b, row 3 = c. */
    /* Initial values fill the array row by row.                    */
    array cutoff[3,10] _temporary_
        ( .415  .785 1.255 1.61  2.03 2.42  2.76  3.18  3.64 3.99
          .845 1.88  2.86  4.005 5.03 6.07  6.995 8.035 9.16 10
          .86  1.58  2.34  3.15  3.85 4.615 5.315 5.96  6.75 7.5 );
    array source[3] a b c;
    array target[3] recode_a recode_b recode_c;

    do _v = 1 to 3;
        /* First cutoff that is >= the value decides the group */
        do _grp = 1 to 10;
            if source[_v] <= cutoff[_v, _grp] then do;
                target[_v] = _grp;
                leave;
            end;
        end;
    end;
    drop _v _grp;
run;

/* Show the first five recoded rows */
proc print data=test (obs=5);
run;

sample of desired output

回答

1

另一种选择——PROC RANK。你可以让它更“自动化”,但它本身已经非常简单。使用 PROC RANK 还可以指定处理并列值(ties)的不同方式。请注意,它给出的分组编号是0到9而不是1到10,但这只需很小的改动即可调整。

/* Rebuild the 1000-row demo data: a in [0,4], b in [0,10], c in [0,7.5], */
/* each rounded to two decimals. Call order a, b, c is preserved.         */
data test;
    do row = 1 to 1000;
        a = round(uniform(1) * 4,   0.01);
        b = round(uniform(1) * 10,  0.01);
        c = round(uniform(1) * 7.5, 0.01);
        output;
    end;
    /* No input data set is read, so the step runs exactly once. */
    drop row;
run;

/* Split each variable into 10 groups; the group numbers run 0-9. */
/* RANKS names the new variables in the same order as VAR.        */
proc rank data=test groups=10 out=want;
    var   a     b     c;
    ranks rankA rankB rankC;
run;
+0

该死的......我只希望我早点看到你的帖子,这会为我节省很多时间和精力。我从别人的答案中学到了很多有用的东西,但这正是我所期待的。它整洁,简短,简单。非常感谢你! – NicChik

+0

如果我想创建一个宏,并且这是我的值被输入的方式: '%percentiles(var = abc);' 我怎样才能使得生成的新变量被称为“recode_ [无论输入是什么]“而不必将它们输入排名声明中? – NicChik

+0

是的,我建议查询SASHELP表来生成名称列表,类似于@foxer解决方案。在某些方面,重新命名或重新合并到主数据集可能更容易。如果您不使用RANK语句,它将使用相同的变量名称生成等级。 – Reeza

1

下面的做法应该能动态地满足你的需求,无需任何硬编码——我已经编辑过,把它压缩成了一个宏。本质上,它先把你需要的变量放入一个列表,用你的宏输出创建一个数据集,然后根据该数据集的内容把 if-then 语句拼成长字符串。接着把这些字符串放入一个宏变量,在最后的数据步中调用它。同样,不涉及任何硬编码。

/* Recode every numeric variable of &library..&input into decile groups   */
/* (recode_<var> = 1..10) by generating the if/else-if chain from the     */
/* cutoff table that %percentiles writes to WORK.PCTS.                    */
/*                                                                        */
/*   library  libref of the input data set                                */
/*   input    input data set name                                         */
/*   output   name of the data set to create                              */
/*                                                                        */
/* NOTE(review): %percentiles computes its cutoffs from the data set      */
/* TEST, so the cutoffs only match when &input is that data set.          */
%MACRO stratify(library=,input=,output=);
    %local varlist varlist_space data_step_list;

    /* Fetch the numeric variable names once, as a comma-separated list  */
    /* (for %percentiles) and a blank-separated list (for VAR).          */
    /* Character columns are skipped: PROC UNIVARIATE cannot take them.  */
    proc sql noprint;
        select name, name
            into :varlist       separated by ",",
                 :varlist_space separated by " "
        from dictionary.columns
        where libname=upcase("&library.") and memname=upcase("&input.")
              and type="num";
    quit;

    /* Nothing to stratify: stop before generating invalid code */
    %if %length(&varlist_space.)=0 %then %do;
        %put ERROR: stratify found no numeric variables in &library..&input..;
        %return;
    %end;

    %percentiles(%bquote(&varlist.));

    /* Put the cutoff table into long format: one row per variable x decile */
    proc transpose data = pcts out=pcts_long;
        by recode percentile;
        var &varlist_space.;
    run;

    /* Sort so each variable's cutoffs are ascending (if / else-if order) */
    proc sort data = pcts_long;
        by _NAME_ percentile;
    run;

    /* Build one "if ... then recode_x = n;" statement per row.            */
    /* $200 leaves room for 32-character names; best32. keeps the cutoff   */
    /* at full precision.                                                  */
    data str;
        length STR $200 _prefix $8;
        set pcts_long;
        by _NAME_;
        bin = percentile/10;
        if first._NAME_ then _prefix = "if";
        else _prefix = "else if";
        STR = catx(" ", _prefix, _NAME_, "<=", put(COL1, best32.), "then",
                   catx("_", "recode", _NAME_), "=", cats(bin, ";"));
        drop _prefix;
    run;

    /* Concatenate the statements into one macro variable */
    proc sql noprint;
        select STR
            into :data_step_list separated by " "
        from STR;
    quit;

    /* Run the generated statements against the requested input */
    data &output.;
        set &library..&input.;
        &data_step_list.
    run;

    proc print data = &output.(obs=5);
    run;

%MEND;

%stratify(library=work,input=test,output=final);
+0

非常感谢!完美无瑕地工作。 :) – NicChik

+0

没问题。我再次编辑,以便它可以处理任何变量名称,而不仅仅是'a,b,c'。它只是创建另一个变量列表只有空格分开。 – Foxer

0

不需要所有的代码生成。只需使用一个数组。基本上将由PROC UNIVARIATE生成的数据集中的百分位数加载到二维数组中,然后为您的实际值找到十分位数。

/* Recode each variable in &varlist into decile groups 1..10.             */
/*                                                                        */
/*   varlist  blank-separated list of numeric variables (positional)      */
/*   in       input data set                                              */
/*   out      output data set (falls back to WANT when left blank)        */
/*   pcts     name for the one-row percentile data set (default: pcts)    */
/*                                                                        */
/* The output gains recode_<var> for every variable. The helper columns   */
/* pctl_<var>10 ... pctl_<var>100 are dropped again.                      */
%macro stratify(varlist,in=,out=,pcts=pcts);
    %local nvars pctls droplist recodes pctvars var i j;
    %let varlist=%sysfunc(compbl(&varlist));
    %let nvars=%sysfunc(countw(&varlist,%str( )));
    %if %length(&out)=0 %then %let out=want;

    /* Build, per variable: the PCTLPRE prefix, the DROP prefix, the     */
    /* recode name and the ten percentile column names, in order.        */
    %do i=1 %to &nvars;
        %let var=%scan(&varlist,&i,%str( ));
        %let pctls=&pctls pctl_&var;
        %let droplist=&droplist pctl_&var:;
        %let recodes=&recodes recode_&var;
        %do j=10 %to 100 %by 10;
            %let pctvars=&pctvars pctl_&var&j;
        %end;
    %end;

    proc univariate data=&in noprint ;
        var &varlist;
        output out=&pcts pctlpre=&pctls
            pctlpts = 10 20 30 40 50 60 70 80 90 100
        ;
    run;

    data &out ;
        /* Load the single row of cutoffs once; SET retains it for all rows */
        if _n_=1 then set &pcts ;
        /* Row = variable, column = decile. The columns are listed by name  */
        /* so the layout does not depend on the order they were written in. */
        array _pcts (&nvars,10) &pctvars ;
        set &in ;
        array _in &varlist ;
        array _out &recodes ;
        do i=1 to dim(_in);
            /* Advance j until the value no longer exceeds the j-th cutoff */
            do j=1 to 10 while(_in(i) > _pcts(i,j));
            end;
            _out(i)=j;
        end;
        drop i j &droplist;
    run;
%mend stratify;

下面是在打开 MPRINT 选项的情况下,用你生成的样本数据运行时的日志。

1093 %stratify(a b c,in=test,out=want); 
MPRINT(STRATIFY): proc univariate data=test noprint ; 
MPRINT(STRATIFY): var a b c; 
MPRINT(STRATIFY): output out=pcts pctlpre=pctl_a pctl_b pctl_c pctlpts = 10 20 30 40 50 
60 70 80 90 100 ; 
MPRINT(STRATIFY): run; 

NOTE: The data set WORK.PCTS has 1 observations and 30 variables. 
NOTE: PROCEDURE UNIVARIATE used (Total process time): 
     real time   0.01 seconds 
     cpu time   0.01 seconds 


MPRINT(STRATIFY): data want ; 
MPRINT(STRATIFY): if _n_=1 then set pcts ; 
MPRINT(STRATIFY): array _pcts (10,3) _numeric_; 
MPRINT(STRATIFY): set test; 
MPRINT(STRATIFY): array _in a b c ; 
MPRINT(STRATIFY): array out recode_a recode_b recode_c ; 
MPRINT(STRATIFY): do i=1 to dim(_in); 
MPRINT(STRATIFY): do j=1 to 10 while(_in(i) > _pcts(j,i)); 
MPRINT(STRATIFY): end; 
MPRINT(STRATIFY): out(i)=j; 
MPRINT(STRATIFY): end; 
MPRINT(STRATIFY): drop i j pctl_a: pctl_b: pctl_c:; 
MPRINT(STRATIFY): run; 

NOTE: There were 1 observations read from the data set WORK.PCTS. 
NOTE: There were 1000 observations read from the data set WORK.TEST. 
NOTE: The data set WORK.WANT has 1000 observations and 6 variables 

前五个观测值是:

enter image description here

相关问题