2011-03-17 104 views
1

我用一个从 IMDB 收集信息并将其写入 MySQL 数据库的应用程序导入了一些数据。(标题:MySQL - 将数据拆分为多行)

看起来这些字段没有经过规范化,单个字段中包含了多个值。例如:

Table Movie 
MovieID   Movie_Title   Written_By 
1    Movie1    Person1, Person2 
2    Movie2    Person3 
3    Movie3    Person4, Person2, Person6 

有没有办法把这些值拆分开,并像下面这样插入到另一张表中,且不产生任何重复?

Table Writers 
WriterID   Written_By    MovieId  
1    Person1     1 
2    Person2     1 
3    Person3     3 

我在 Google 上搜索了一下,发现应该用 PHP 来处理这些数据,但我完全不懂 PHP。

有没有办法只用 MySQL 来转换这些数据?

+0

你会持续收到这样的数据源(feed)吗?还是这只是一次性操作?此外,源数据在导入 MySQL 之前是什么格式?很可能你需要的只是换一种方式来加载 MySQL 表。 – Jai 2011-03-17 17:16:04

+0

重复问题? http://stackoverflow.com/questions/3936088/mysql-split-comma-separated-list-into-multiple-rows – TheSean 2011-03-17 17:17:55

+0

也读这个 - http://stackoverflow.com/questions/1096679/can-mysql-split-a-column – Jai 2011-03-17 17:19:05

回答

2

你可以用一个使用游标的存储过程来解决这个问题。它不是很优雅,不过逗号分隔的编剧列表本身也谈不上优雅!

下面的代码是我之前为一个类似问题写的,手头正好有,但你最好彻底检查一遍。

希望它能帮助:)

mysql> select * from movies_unf; 
+---------+-------------+------------------------------------------------------+ 
| movieID | movie_title | written_by           | 
+---------+-------------+------------------------------------------------------+ 
|  1 | movie1  | person1, person2          | 
|  2 | movie2  | person3            | 
|  3 | movie3  | person4, person2, person6       | 
|  4 | movie4  | person4, person4, person1, person2, person1,person8, | 
|  5 | movie1  | person1, person2          | 
+---------+-------------+------------------------------------------------------+ 
5 rows in set (0.00 sec) 

call normalise_movies_unf(); 

mysql> select * from movies; 
+----------+--------+ 
| movie_id | title | 
+----------+--------+ 
|  1 | movie1 | 
|  2 | movie2 | 
|  3 | movie3 | 
|  4 | movie4 | 
+----------+--------+ 
4 rows in set (0.00 sec) 

mysql> select * from writers; 
+-----------+---------+ 
| writer_id | name | 
+-----------+---------+ 
|   1 | person1 | 
|   2 | person2 | 
|   3 | person3 | 
|   4 | person4 | 
|   6 | person6 | 
|  12 | person8 | 
+-----------+---------+ 
6 rows in set (0.00 sec) 

mysql> select * from movie_writers; 
+----------+-----------+ 
| movie_id | writer_id | 
+----------+-----------+ 
|  1 |   1 | 
|  1 |   2 | 
|  2 |   3 | 
|  3 |   2 | 
|  3 |   4 | 
|  3 |   6 | 
|  4 |   1 | 
|  4 |   2 | 
|  4 |   4 | 
|  4 |  12 | 
+----------+-----------+ 
10 rows in set (0.00 sec) 

例表

-- Example schema and sample data for the normalisation demo.
--   movies_unf    : un-normalised import; written_by holds a comma-separated list.
--   movies        : one row per distinct movie title.
--   writers       : one row per distinct writer name.
--   movie_writers : junction table linking movies to writers.

-- Drop the junction table first: it references movies and writers, so those
-- tables cannot be dropped while it still exists.
drop table if exists movie_writers;

drop table if exists movies_unf;
create table movies_unf
(
movieID int unsigned not null primary key,
movie_title varchar(255) not null,
written_by varchar(1024) not null
)engine=innodb;

-- Explicit column list so the insert keeps working if columns are added or reordered.
insert into movies_unf (movieID, movie_title, written_by) values
(1,'movie1','person1, person2'),
(2,'movie2','person3'),
(3,'movie3','person4, person2, person6'),
(4,'movie4','person4, person4, person1, person2, person1,person8,'), -- dodgy writers: duplicates, missing space, trailing comma
(5,'movie1','person1, person2'); -- dodgy movie: duplicate title

drop table if exists movies;
create table movies
(
movie_id int unsigned not null auto_increment primary key,
title varchar(255) unique not null
)engine=innodb;

drop table if exists writers;
create table writers
(
writer_id int unsigned not null auto_increment primary key,
name varchar(255) unique not null
)engine=innodb;

create table movie_writers
(
movie_id int unsigned not null,
writer_id int unsigned not null,
primary key (movie_id, writer_id),
-- Named foreign keys stop the junction table from pointing at missing rows.
constraint movie_writers_movie_id_fk foreign key (movie_id) references movies (movie_id),
constraint movie_writers_writer_id_fk foreign key (writer_id) references writers (writer_id)
)engine=innodb;

存储过程

-- normalise_movies_unf()
-- Reads every row of movies_unf, splits its comma-separated written_by list and
-- fills movies, writers and movie_writers. Titles and writer names are
-- de-duplicated through the unique keys on those tables (insert ignore).
-- Takes no parameters and returns no result set.
-- WARNING: movies_unf is truncated at the end; inspect the normalised tables
-- on a copy of the data before relying on this.
drop procedure if exists normalise_movies_unf;

delimiter #

create procedure normalise_movies_unf()
begin

declare v_movie_title varchar(255);
declare v_writers varchar(1024);

declare v_movie_id int unsigned default 0;
declare v_writer_id int unsigned default 0;
declare v_name varchar(255);

declare v_csv_idx int unsigned default 1; -- 1-based character position of the current token
declare v_csv_len int unsigned default 0; -- length of the list in characters
declare v_csv_end int unsigned default 0; -- position of the comma ending the current token

declare v_done tinyint default 0;
declare v_cursor cursor for
    select movie_title, written_by from movies_unf order by movieID;

-- Fires when fetch runs out of rows. It would also fire for a select ... into
-- that matches nothing; the lookups below follow an insert of the same value,
-- so they are expected to always find a row.
declare continue handler for not found set v_done = 1;

start transaction;

open v_cursor;
read_loop: loop
    fetch v_cursor into v_movie_title, v_writers;

    -- Test the flag straight after the fetch so that a failed fetch (end of
    -- data, or an empty table) never runs the body with stale or NULL values.
    if v_done then
        leave read_loop;
    end if;

    set v_movie_title = trim(v_movie_title);

    -- insert the movie (duplicate titles are ignored) and look up its id
    insert ignore into movies (title) values (v_movie_title);
    select movie_id into v_movie_id from movies where title = v_movie_title;

    -- split out the writers and insert them
    -- char_length, not length: substring/locate count characters, not bytes.
    set v_csv_idx = 1;
    set v_csv_len = coalesce(char_length(v_writers), 0);

    while v_csv_idx <= v_csv_len do
        set v_csv_end = locate(',', v_writers, v_csv_idx);
        if v_csv_end = 0 then
            -- no further comma: the token runs to the end of the list
            set v_csv_end = v_csv_len + 1;
        end if;

        -- Trim only around the token so spaces inside a name are kept.
        set v_name = trim(substring(v_writers, v_csv_idx, v_csv_end - v_csv_idx));

        -- Advance past the comma using the raw token boundaries, not the trimmed name.
        set v_csv_idx = v_csv_end + 1;

        -- Empty tokens ("a,,b", trailing comma) are skipped rather than ending the parse.
        if char_length(v_name) > 0 then
            insert ignore into writers (name) values (v_name);
            select writer_id into v_writer_id from writers where name = v_name;
            insert ignore into movie_writers (movie_id, writer_id) values (v_movie_id, v_writer_id);
        end if;
    end while;

end loop read_loop;
close v_cursor;

commit;

truncate table movies_unf;

end#

delimiter ;

编辑

修正了sproc,使它不会跳过关键值!

-- normalise_movies_unf()  (revised version)
-- Reads every row of movies_unf, splits its comma-separated written_by list and
-- fills movies, writers and movie_writers. Each title or writer name is only
-- inserted when it is not already present, so the auto-increment keys do not
-- skip values on duplicates.
-- Takes no parameters and returns no result set.
-- WARNING: movies_unf is truncated at the end; inspect the normalised tables
-- on a copy of the data before relying on this.
drop procedure if exists normalise_movies_unf;

delimiter #

create procedure normalise_movies_unf()
begin

declare v_movie_title varchar(255);
declare v_writers varchar(1024);

declare v_movie_id int unsigned default 0;
declare v_writer_id int unsigned default 0;
declare v_name varchar(255);

declare v_csv_idx int unsigned default 1; -- 1-based character position of the current token
declare v_csv_len int unsigned default 0; -- length of the list in characters
declare v_csv_end int unsigned default 0; -- position of the comma ending the current token

declare v_done tinyint default 0;
declare v_cursor cursor for
    select movie_title, written_by from movies_unf order by movieID;

-- Fires when fetch runs out of rows. It would also fire for a select ... into
-- that matches nothing; the lookups below follow an insert of the same value,
-- so they are expected to always find a row.
declare continue handler for not found set v_done = 1;

start transaction;

open v_cursor;
read_loop: loop
    fetch v_cursor into v_movie_title, v_writers;

    -- Test the flag straight after the fetch so that a failed fetch (end of
    -- data, or an empty table) never runs the body with stale or NULL values.
    if v_done then
        leave read_loop;
    end if;

    set v_movie_title = trim(v_movie_title);

    -- insert the movie only if it is new, then look up its id
    if not exists (select 1 from movies where title = v_movie_title) then
        insert ignore into movies (title) values (v_movie_title);
    end if;
    select movie_id into v_movie_id from movies where title = v_movie_title;

    -- split out the writers and insert them
    -- char_length, not length: substring/locate count characters, not bytes.
    set v_csv_idx = 1;
    set v_csv_len = coalesce(char_length(v_writers), 0);

    while v_csv_idx <= v_csv_len do
        set v_csv_end = locate(',', v_writers, v_csv_idx);
        if v_csv_end = 0 then
            -- no further comma: the token runs to the end of the list
            set v_csv_end = v_csv_len + 1;
        end if;

        -- Trim only around the token so spaces inside a name are kept.
        set v_name = trim(substring(v_writers, v_csv_idx, v_csv_end - v_csv_idx));

        -- Advance past the comma using the raw token boundaries, not the trimmed name.
        set v_csv_idx = v_csv_end + 1;

        -- Empty tokens ("a,,b", trailing comma) are skipped rather than ending the parse.
        if char_length(v_name) > 0 then
            if not exists (select 1 from writers where name = v_name) then
                insert ignore into writers (name) values (v_name);
            end if;
            select writer_id into v_writer_id from writers where name = v_name;
            insert ignore into movie_writers (movie_id, writer_id) values (v_movie_id, v_writer_id);
        end if;
    end while;

end loop read_loop;
close v_cursor;

commit;

truncate table movies_unf;

end#

delimiter ;
+0

谢谢你代码!有一个小问题。 writers表中的writer_id不是增量式的。 3,4,6,12等 – huor11 2011-03-17 23:52:47

+0

这并不重要——它仍然是唯一键。之所以会跳过一些值,是因为我用的是 insert ignore,而不是在插入前先检查编剧是否已存在。如果你认为这是个问题,很容易修改 – 2011-03-17 23:57:10

+0

请参阅编辑 - 新的sproc代码:) – 2011-03-18 00:01:53

0

MySQL 并不特别擅长这类字符串操作。你很可能会发现,把数据导出、用一种常规编程语言(perl、php、ruby、python 等)处理后再导回去要容易得多,这些语言具有更强大的文本处理功能。

而且,在做任何不可逆的操作之前,你最好先检查一下结果,尤其是当名字中可能包含逗号时。

Alice,Eve,Bob 

很容易按逗号拆分,但下面这种情况就不那么容易了:

Alice,Eve,Esquire.,Bob 
0

遗憾的是,MySQL 没有内置的字符串拆分函数。这里有一篇相关帖子(与你的问题不完全重复),它把一个字符串拆分成多列。