2014-10-05 55 views
0

我很难找到在 RapidMiner(6)中对一个数据表集合做联合(union)的解决方案。我基本上有一个集合(如果你不了解它,可以把它想象成一个对象数组),其中包含数量可变的数据表,每个数据表至少有一个“ID”列(各表中的 ID 相同)以及另一列,而这另一列在每个表中的名称各不相同。我想把所有这些表合并为一个联合(也就是连接,join),得到形如 [id,column_from_data_table_1,column_from_data_table2,...] 的记录。RapidMiner:对集合(数据表)做联合

例如,集合中将包含 N 个如下形式的数据表(集合中的表):

Table 1 
id col1 
1 0.5 
2 0.7 

Table 2 
id col2 
1 0.1 
2 0.0 

........ 
........ 
........ 

Table N 
id colN 
1 0.0 
2 0.8 

最终,联合(连接)的结果应该是这样的:

Result 
id col1 col2 ... colN 
1 0.5 0.1 ... 0.0 
2 0.7 0.0 ... 0.8 

请注意,每个表的记录数相同,并且分配给这些记录的 ID 也相同(此外,除了 id 以外,各表中的列名都是唯一的——换句话说,数据已经理想得不能再理想了)。

回答

1

这需要一个相当高级的流程,涉及 Remember、Recall、Branch、Join、Select 和 Loop Collection 等算子。下面是一个精简的示例。

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: fold a collection of example sets into one joined table.

  Flow of the top-level process:
    1. "Subprocess" builds a demo collection of three example sets
       (id + col1, id + col2, id + col3).
    2. "Multiply (2)" copies the collection to "Select (2)" and to
       "Loop Collection".
    3. "Select (2)" picks one element of the collection and "Remember" stores
       it under the name "1" as the running result.
    4. "Loop Collection" visits each element; from the second iteration on,
       the element is joined with the remembered result and the join is
       remembered again under the same name.
    5. "Recall (2)" fetches the final result and delivers it to "result 1".

  All operators use compatibility="6.0.008" rather than the cloud build marker
  "6.1.000-SNAPSHOT", which the author of this process reports may fail to
  load in the regular standalone version.
-->
<process version="6.1.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- Demo data only: replace this subprocess with the real source of the collection. -->
      <operator activated="true" class="subprocess" compatibility="6.0.008" expanded="true" height="76" name="Subprocess" width="90" x="112" y="30">
        <process expanded="true">
          <!-- Table 1: two generators (id=1 and id=2, column col1) appended into one example set. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="6.0.008" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="id" value="1"/>
              <parameter key="col1" value="48"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.0.008" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="120">
            <list key="attribute_values">
              <parameter key="id" value="2"/>
              <parameter key="col1" value="4"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="6.0.008" expanded="true" height="94" name="Append" width="90" x="179" y="30"/>
          <!-- Table 2: same ids, column col2. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="6.0.008" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="210">
            <list key="attribute_values">
              <parameter key="id" value="1"/>
              <parameter key="col2" value="9"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.0.008" expanded="true" height="60" name="Generate Data by User Specification (4)" width="90" x="45" y="300">
            <list key="attribute_values">
              <parameter key="id" value="2"/>
              <parameter key="col2" value="7"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="6.0.008" expanded="true" height="94" name="Append (2)" width="90" x="179" y="210"/>
          <!-- Table 3: same ids, column col3. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="6.0.008" expanded="true" height="60" name="Generate Data by User Specification (5)" width="90" x="45" y="390">
            <list key="attribute_values">
              <parameter key="id" value="1"/>
              <parameter key="col3" value="88"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="6.0.008" expanded="true" height="60" name="Generate Data by User Specification (6)" width="90" x="45" y="480">
            <list key="attribute_values">
              <parameter key="id" value="2"/>
              <parameter key="col3" value="78"/>
            </list>
            <list key="set_additional_roles">
              <parameter key="id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="6.0.008" expanded="true" height="94" name="Append (3)" width="90" x="179" y="390"/>
          <!-- Bundle the three appended tables into the collection handed to the outer process. -->
          <operator activated="true" class="collect" compatibility="6.0.008" expanded="true" height="112" name="Collect" width="90" x="380" y="210"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Collect" to_port="input 1"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append (2)" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append (2)" to_port="example set 2"/>
          <connect from_op="Append (2)" from_port="merged set" to_op="Collect" to_port="input 2"/>
          <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append (3)" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append (3)" to_port="example set 2"/>
          <connect from_op="Append (3)" from_port="merged set" to_op="Collect" to_port="input 3"/>
          <connect from_op="Collect" from_port="collection" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- The collection is needed twice: once to seed the running result, once to iterate over. -->
      <operator activated="true" class="multiply" compatibility="6.0.008" expanded="true" height="94" name="Multiply (2)" width="90" x="246" y="30"/>
      <!-- Seed: Select runs with default parameters (presumably the first element; confirm in the operator help). -->
      <operator activated="true" class="select" compatibility="6.0.008" expanded="true" height="60" name="Select (2)" width="90" x="447" y="30"/>
      <!-- The store name "1" is shared by every Remember/Recall in this process. -->
      <operator activated="true" class="remember" compatibility="6.0.008" expanded="true" height="60" name="Remember" width="90" x="581" y="30">
        <parameter key="name" value="1"/>
      </operator>
      <!-- set_iteration_macro exposes the loop counter as a macro, read below as %{iteration}. -->
      <operator activated="true" class="loop_collection" compatibility="6.0.008" expanded="true" height="76" name="Loop Collection" width="90" x="447" y="165">
        <parameter key="set_iteration_macro" value="true"/>
        <process expanded="true">
          <operator activated="true" class="branch" compatibility="6.0.008" expanded="true" height="76" name="Branch" width="90" x="112" y="120">
            <parameter key="condition_type" value="expression"/>
            <parameter key="condition_value" value="%{iteration}==1"/>
            <!-- First inner process (condition true, iteration 1): pass the element through without joining,
                 since the seed was already remembered outside the loop. -->
            <process expanded="true">
              <connect from_port="condition" to_port="input 1"/>
              <portSpacing port="source_condition" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_input 1" spacing="0"/>
              <portSpacing port="sink_input 2" spacing="0"/>
            </process>
            <!-- Second inner process (all later iterations): join the current element with the running
                 result and store the join back under the same name. -->
            <process expanded="true">
              <operator activated="true" class="recall" compatibility="6.0.008" expanded="true" height="60" name="Recall" width="90" x="112" y="75">
                <parameter key="name" value="1"/>
              </operator>
              <!-- No explicit key attributes: presumably the join falls back to the attribute with the
                   id role (set in the generators above); confirm against the Join operator defaults. -->
              <operator activated="true" class="join" compatibility="6.0.008" expanded="true" height="76" name="Join" width="90" x="246" y="30">
                <list key="key_attributes"/>
              </operator>
              <operator activated="true" class="remember" compatibility="6.0.008" expanded="true" height="60" name="Remember (2)" width="90" x="380" y="30">
                <parameter key="name" value="1"/>
              </operator>
              <connect from_port="condition" to_op="Join" to_port="left"/>
              <connect from_op="Recall" from_port="result" to_op="Join" to_port="right"/>
              <connect from_op="Join" from_port="join" to_op="Remember (2)" to_port="store"/>
              <connect from_op="Remember (2)" from_port="stored" to_port="input 1"/>
              <portSpacing port="source_condition" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_input 1" spacing="0"/>
              <portSpacing port="sink_input 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="single" to_op="Branch" to_port="condition"/>
          <connect from_op="Branch" from_port="input 1" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Recall (2) has no incoming connection, so it must stay listed after Loop Collection
           to pick up the fully joined table rather than an intermediate one. -->
      <operator activated="true" class="recall" compatibility="6.0.008" expanded="true" height="60" name="Recall (2)" width="90" x="581" y="165">
        <parameter key="name" value="1"/>
      </operator>
      <connect from_op="Subprocess" from_port="out 1" to_op="Multiply (2)" to_port="input"/>
      <connect from_op="Multiply (2)" from_port="output 1" to_op="Select (2)" to_port="collection"/>
      <connect from_op="Multiply (2)" from_port="output 2" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Select (2)" from_port="selected" to_op="Remember" to_port="store"/>
      <connect from_op="Recall (2)" from_port="result" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

希望它有帮助。

+0

我注意到上述流程可能存在一个问题:它是用限量开放的云版本创建的,因此可能无法在普通的独立版本中加载。要修复它,请将“6.1.000-SNAPSHOT”改为“6.0.008”。 – awchisholm 2014-10-12 17:46:23