如何在大数据量下优化此查询

-- Count tickets per (ticket_type, remaining_uses, source), using only each
-- ticket's latest history row and keeping rows with uses left and status = 1.
select ticket_type, f_rows.remaining_uses, t.source, count(t.id) as total
FROM (
    -- Filter rows to get those where remaining_uses > 0 and status = 1
    SELECT * FROM (
     -- Get the latest row (highest history id) for each ticket
     SELECT ticket_id, final_remaining_uses AS remaining_uses, final_status AS status, action_when
     FROM TicketHistory th
     -- max(id) is deliberately unqualified: a derived table in FROM cannot
     -- reference the outer alias th (that would require LATERAL).
     INNER JOIN (SELECT max(id) AS id FROM TicketHistory GROUP BY ticket_id) maxid ON th.id = maxid.id
    ) latest_rows
    WHERE remaining_uses > 0 AND status = 1 --and (action_when < current_date and action_when > current_date -30)
) f_rows
INNER JOIN Ticket t ON f_rows.ticket_id = t.id
-- Not expired as of yesterday, selected sources only, created within the last 30 days (excluding today)
WHERE t.expiry_date >= current_date -1 and source in (0,1,2,6,7,8) and (created_date < current_date and created_date > current_date - 30)
GROUP BY ticket_type, f_rows.remaining_uses, t.source
order by source, ticket_type, remaining_uses;

我在这里做的是：先从历史记录表中获取每张票的最新一行，然后筛掉非活动状态或已没有剩余使用次数的票，最后再按过期日期等条件进行过滤。

有没有办法优化这个查询？目前这个查询耗时很长，PostgreSQL 在返回任何数据之前就崩溃了。

Ticket 和 TicketHistory 这两张表各有超过 1100 万行。

编辑

-- Ticket table definition as posted in the question.
-- NOTE(review): this definition looks abridged. The foreign key below uses
-- sale_id, and the indexes created on this table use profile_id, ticket_number
-- and topup_for, but none of those columns is declared in this column list.
-- Confirm against the real schema before running this DDL.
CREATE TABLE ticket 
(
    id serial NOT NULL, 
    source integer NOT NULL, 
    status integer NOT NULL, 
    ticket_type integer NOT NULL, 
    remaining_uses integer NOT NULL, 
    expiry_date timestamp with time zone NOT NULL, 
    price numeric(20,2) NOT NULL, 
    created_date timestamp with time zone NOT NULL, 
    pax_type integer NOT NULL, 
    last_updated timestamp with time zone NOT NULL, 
    service integer, 
    client_id character varying(50), 
    CONSTRAINT skybus_ticket_pkey PRIMARY KEY (id), 
    -- NOTE(review): sale_id is not declared above; skybus_sale is defined elsewhere.
    CONSTRAINT skybus_ticket_sale_id_fkey FOREIGN KEY (sale_id) 
     REFERENCES skybus_sale (id) MATCH SIMPLE 
     ON UPDATE NO ACTION ON DELETE NO ACTION DEFERRABLE INITIALLY DEFERRED 
) 
WITH (
    OIDS=FALSE 
); 
-- Hand ownership of the table to role umd.
ALTER TABLE ticket 
    OWNER TO umd; 

-- Secondary btree indexes on ticket.
-- NOTE(review): profile_id, sale_id, ticket_number and topup_for are not
-- declared in the CREATE TABLE ticket statement shown in this file; confirm
-- against the real schema.

-- Index: ticket_client_id_idx 

-- DROP INDEX ticket_client_id_idx; 

CREATE INDEX ticket_client_id_idx 
    ON ticket 
    USING btree 
    (client_id COLLATE pg_catalog."default"); 

-- Index: ticket_profile_id_idx 

-- DROP INDEX ticket_profile_id_idx; 

CREATE INDEX ticket_profile_id_idx 
    ON ticket 
    USING btree 
    (profile_id); 

-- Index: skybus_ticket_sale_id 

-- DROP INDEX skybus_ticket_sale_id; 

CREATE INDEX skybus_ticket_sale_id 
    ON ticket 
    USING btree 
    (sale_id); 

-- Index: ticket_ticket_number 

-- DROP INDEX ticket_ticket_number; 

CREATE INDEX ticket_ticket_number 
    ON ticket 
    USING btree 
    (ticket_number COLLATE pg_catalog."default"); 

-- Index: ticket_ticket_number_like 
-- Second index on ticket_number, this one with the varchar_pattern_ops
-- operator class (presumably to serve LIKE prefix searches, going by the
-- index name -- confirm against the queries that use it).

-- DROP INDEX ticket_ticket_number_like; 

CREATE INDEX ticket_ticket_number_like 
    ON ticket 
    USING btree 
    (ticket_number COLLATE pg_catalog."default" varchar_pattern_ops); 

-- Index: ticket_topup_for_idx 

-- DROP INDEX ticket_topup_for_idx; 

CREATE INDEX ticket_topup_for_idx 
    ON ticket 
    USING btree 
    (topup_for COLLATE pg_catalog."default");

- ===============================

-- History rows for tickets; ticket_id points at ticket.id.
-- final_status / final_remaining_uses presumably hold the ticket's state
-- after the recorded action -- confirm against the writer of this table.
create table tickethistory (
    id                   serial                   not null,
    ticket_id            integer,
    action               integer                  not null,
    action_result        integer                  not null,
    initial_status       integer                  not null,
    final_status         integer                  not null,
    final_remaining_uses integer                  not null,
    ticket_type          integer                  not null,
    action_when          timestamp with time zone not null,
    last_updated         timestamp with time zone not null,
    service              integer,
    constraint tickethistory_pkey primary key (id),
    constraint tickethistory_ticket_id_fkey
        foreign key (ticket_id)
        references ticket (id) match simple
        on update no action
        on delete no action
        deferrable initially deferred
)
with (oids = false);

-- Hand ownership of the table to role umd.
alter table tickethistory owner to umd;

-- Index: tickethistory_ticket_id
-- Btree index on tickethistory.ticket_id.
-- To remove it: DROP INDEX tickethistory_ticket_id;
create index tickethistory_ticket_id
    on tickethistory using btree (ticket_id);

- ===== 执行计划 - 这是使用 ROW_NUMBER() 写法的查询的执行计划

"HashAggregate (cost=4526158.63..4526158.64 rows=1 width=16) (actual time=382849.323..382849.376 rows=41 loops=1)" 
" -> Nested Loop (cost=3880592.94..4526158.62 rows=1 width=16) (actual time=380338.613..382825.688 rows=11745 loops=1)" 
"  -> Subquery Scan on sub (cost=3880592.94..4463424.47 rows=6563 width=8) (actual time=126346.043..258837.523 rows=293717 loops=1)" 
"    Filter: ((sub.remaining_uses > 0) AND (sub.rn = 1) AND (sub.status = 1))" 
"    Rows Removed by Filter: 15244064" 
"    -> WindowAgg (cost=3880592.94..4191436.42 rows=15542174 width=203) (actual time=126345.775..237172.180 rows=15537781 loops=1)" 
"     -> Sort (cost=3880592.94..3919448.38 rows=15542174 width=203) (actual time=126345.757..180461.191 rows=15537781 loops=1)" 
"       Sort Key: th.ticket_id, th.*" 
"       Sort Method: external merge Disk: 3050616kB" 
"       -> Seq Scan on skybus_tickethistory th (cost=0.00..483544.74 rows=15542174 width=203) (actual time=14.091..53312.782 rows=15537781 loops=1)" 
"  -> Index Scan using skybus_ticket_pkey on skybus_ticket t (cost=0.00..9.55 rows=1 width=12) (actual time=0.418..0.418 rows=0 loops=293717)" 
"    Index Cond: (id = sub.ticket_id)" 
"    Filter: ((source = ANY ('{0,1,2,6,7,8}'::integer[])) AND (created_date < ('now'::cstring)::date) AND (expiry_date >= (('now'::cstring)::date - 1)) AND (created_date > (('now'::cstring)::date - 30)) AND (ticket_type = ANY ('{2,3,4,5,6,7,16,17, (...)" 
"    Rows Removed by Filter: 1" 
"Total runtime: 383045.381 ms"

来源

2017-04-03 Nouman Bhatti

@a_horse_with_no_name，我编辑了问题 –

这不是您的查询的执行计划——该计划中有一个 'WindowAgg' 步骤，但您的查询并没有使用窗口函数。原始查询的执行计划可能会更有帮助（或许再加上使用 'distinct on ()' 方案的执行计划） –

在 Postgres 中，distinct on () 通常是解决 greatest-n-per-group 问题的最快方法：

-- Same report as the question's query, but the latest history row per ticket
-- is picked with DISTINCT ON instead of a self-join on max(id).
select ticket_type,f_rows.remaining_uses,t.source,count(t.id) as total 
FROM (
    -- Filter rows to get those where remaining_uses > 0 and status = 1 
    SELECT * 
    FROM (
     -- Get the latest row for each ticket: distinct on (ticket_id) keeps the
     -- first row per ticket_id in the ORDER BY below, i.e. the highest id
     SELECT distinct on (ticket_id) 
       ticket_id, 
       final_remaining_uses AS remaining_uses, 
       final_status AS status, action_when 
     FROM TicketHistory th 
     order by ticket_id, id desc 
    ) latest_rows 
    WHERE remaining_uses > 0 
     AND status = 1 --and (action_when < current_date and action_when > current_date -30) 
) f_rows 
    JOIN Ticket t ON f_rows.ticket_id = t.id 
WHERE t.expiry_date >= current_date -1 
    and source in (0,1,2,6,7,8) 
    -- created within the last 30 days, excluding today
    and created_date < current_date 
    and created_date > current_date - 30 
GROUP BY ticket_type, f_rows.remaining_uses, t.source 
order by source, ticket_type, remaining_uses;

distinct on () 与 order by 配合使用，会为每个 ticket_id 返回 tickethistory.id 值最大的那一行。

在 tickethistory (ticket_id, id desc) 上建立索引可能会有所帮助。在 tickethistory (ticket_id, id desc, final_remaining_uses, final_status, action_when) 上建立索引甚至可能启用仅索引扫描（index-only scan）。

不过，使用一个记录创建时刻的时间戳列可能会更准确。如果 tickethistory.id 是通过序列生成的（它是 serial 类型，所以确实如此），那么这些值不一定反映实际的插入顺序。

来源

2017-04-03 06:20:37

你可以使用 row_number() 获取每张票的最新一行：

-- Sketch: pick the latest history row per ticket with row_number(), then join
-- it to ticket. Further filters from the original query still need adding.
with last_history as 
     (
     select * 
     from (
       -- rn = 1 marks the row with the highest id within each ticket_id
       select row_number() over (partition by ticket_id 
              order by id desc) rn 
       ,  * 
       from TicketHistory 
       ) sub 
     where rn = 1 -- Latest history row only 
     ) 
select * 
from ticket t 
join last_history th  -- the CTE must be referenced by its name; th is only an alias
on  t.id = th.ticket_id 
-- Filter on the latest history row; an unqualified remaining_uses would
-- resolve to ticket.remaining_uses instead.
where th.final_remaining_uses > 0 
     -- and <... other conditions ...>

来源

2017-04-03 06:06:29 Andomar

这确实改善了查询时间。那么我应该使用 distinct on 还是这种方法？我原本以为我现在的写法更快 –

我会选择 a_horse_with_no_name 的答案，它比 'row_number()' 更具可读性 :) – Andomar

如何在大数据量下优化此查询

回答

相关问题