Unix Technical Forum

tsearch2: ts_headline performance problem

This is a discussion on tsearch2: ts_headline performance problem within the pgsql Sql forums, part of the PostgreSQL category; --> Hi there people, I'm trying to build a text search engine for relatively large PDF documents with a web ...


Go Back   Unix Technical Forum > Database Server Software > PostgreSQL > pgsql Sql

FAQ Members List Calendar Search Today's Posts Mark Forums Read
  #1 (permalink)  
Old 04-19-2008, 07:00 PM
=?ISO-8859-1?Q?Anders_=D8stergaard_Jensen?=
 
Posts: n/a
Default tsearch2: ts_headline performance problem

Hi there people,

I'm trying to build a text search engine for relatively large PDF
documents with a web frontend. I use PostgreSQL 8.3 beta3/4 with
tsearch2 for full-text indexing. Everything performs quite well, until
I start indexing 800 pages of text into the database (with the utf8
danish locale and dictionary) -- then ts_headline performs
unreasonably slow. I use a GIN index on the table (called
document_revisions which is linked to a master document table called
'documents' -- the system is made for version tracking of multiple
documents into the same index) and an update trigger for maintaining
the tsvector row on the table.

Omitting ts_headline yields the following EXPLAIN result:

<snip>
metabase=# explain analyze select * from f_search_revision('website');
NOTICE: f_search_revisions_arr: loop: (1) doc_id = (1000)
CONTEXT: SQL statement "SELECT f_search_revision_arr( $1 ) AS res"
PL/pgSQL function "f_search_revision_int" line 13 at SQL statement
SQL function "f_search_revision" statement 1
NOTICE: hest: ({1000})
CONTEXT: SQL function "f_search_revision" statement 1
NOTICE: total ranking for (1) eq (0.0865452) for id = (1000)
CONTEXT: SQL function "f_search_revision" statement 1
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------
Function Scan on f_search_revision (cost=0.00..260.00 rows=1000
width=408) (actual time=2413.384..2413.384 rows=1 loops=1)
Total runtime: 2413.465 ms
(2 rows)
</snip>

The function f_search_revision invokes another function that fetches
all matching document_revisions without calling ts_headline (method
f_search_revision_arr), followed by a post-processing of the matching
rows for fetching the ts_ranking values and ts_headline. The following
snippet shows the query plan for the function call with the same query:

<snip>
metabase=# explain analyze select * from
f_search_revision_arr('website');
NOTICE: f_search_revisions_arr: loop: (1) doc_id = (1000)
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------
Function Scan on f_search_revision_arr (cost=0.00..0.26 rows=1
width=32) (actual time=3.417..3.418 rows=1 loops=1)
Total runtime: 3.442 ms
(2 rows)
</snip>

Here is the code for the functions that I am using:

CREATE OR REPLACE FUNCTION f_search_revision_arr(q TEXT) RETURNS
INTEGER[] AS $$
declare
iter record;
results integer[];
i integer;
begin
i := 1;
for iter in SELECT * FROM documents doc, plainto_tsquery(q) AS tsq
WHERE (doc.search_idx @@ tsq)
OR doc.id IN (SELECT document_id FROM document_revisions dr,
plainto_tsquery(q) AS tsq2
WHERE dr.search_idx @@ tsq2) loop
results[i] = iter.id;
raise notice 'f_search_revisions_arr: loop: (%) doc_id = (%)', i,
results[i];
i := i + 1;
end loop;
return results;
end;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION f_search_revision_int(q TEXT) RETURNS
t_search_result[] AS $$
declare
doc_ids integer[];
doc_rankings float[];
i integer;
iter record;
pointer_doc record;
pointer_rev record;
pointer_type record;
results t_search_result[];
begin
i := 1;

SELECT INTO iter f_search_revision_arr(q) AS res;
doc_ids := iter.res;
raise notice 'hest: (%)', doc_ids;

-- Ensure that something was found:
if (doc_ids IS NULL) then
return results;
end if;

for i in 1..array_upper(doc_ids, 1) loop
-- raise notice 'upper = (%)', doc_ids[i];
SELECT INTO pointer_doc ts_rank(doc.search_idx, tsq) AS rank_doc,
ts_headline(doc.name, tsq) AS headline_name,
ts_headline(doc.description, tsq) AS headline_description,
ts_headline(doc.keywords, tsq) AS headline_keywords,
doc.name,
doc.description,
doc.keywords,
doc.document_type_id
FROM documents doc, plainto_tsquery(q) AS tsq
WHERE doc.id = doc_ids[i];

SELECT INTO pointer_rev ts_rank(rev.search_idx, tsq) AS rank_rev,
ts_headline(rev.content, tsq) AS headline_content,
rev.id AS revision_id
FROM document_revisions rev, plainto_tsquery(q) AS tsq
WHERE rev.document_id = doc_ids[i];

-- Select the document type:
SELECT INTO pointer_type mime_type, type_name, image_url,
extension, internal_type
FROM document_types WHERE id = pointer_doc.document_type_id;


doc_rankings[i] = pointer_doc.rank_doc + pointer_rev.rank_rev; --
SUM!
raise notice 'total ranking for (%) eq (%) for id = (%)', i,
doc_rankings[i], doc_ids[i];
results[i] = ROW(pointer_rev.revision_id,
doc_ids[i],
pointer_doc.name, pointer_doc.description, pointer_doc.keywords,
pointer_doc.document_type_id, pointer_type.mime_type,
pointer_type.type_name, pointer_type.image_url,
pointer_type.extension, pointer_type.internal_type,
doc_rankings[i], pointer_doc.headline_name,
pointer_doc.headline_description, pointer_doc.headline_keywords,
pointer_rev.headline_content,
0);
end loop;
return results;
end;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION f_search_revision(q TEXT) RETURNS SETOF
t_search_result AS $$
SELECT * FROM f_unnest(f_search_revision_int($1));
$$ LANGUAGE sql;

------

How come that ts_headline yields so bad performance? Is there any way
of tuning ts_headline? I need the highlighting functionality for
presenting the search results in the web frontend properly.
Alternately, are there any alternatives to the ts_headline function?

Sincerely,

Anders




---------------------------(end of broadcast)---------------------------
TIP 2: Don't 'kill -9' the postmaster

Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #2 (permalink)  
Old 04-19-2008, 07:00 PM
Oleg Bartunov
 
Posts: n/a
Default Re: tsearch2: ts_headline performance problem

Anders,

headline is a slow, since it should read and process an original document,
so you should avoid simple search query, which calculates ts_headline()
for each document in the result set. Use subselect instead !

In example below, 2nd query is way faster the 1rd !

select id,headline(body,q),rank(ti,q) as rank
from apod, to_tsquery('stars') q
where ti @@ q order by rank desc limit 10;

and

select id,headline(body,q),rank
from ( select id,body,q, rank(ti,q) as rank from apod, to_tsquery('stars') q
where ti @@ q order by rank desc limit 10) as foo;


Oleg
On Sun, 6 Jan 2008, Anders ?stergaard Jensen wrote:

> Hi there people,
>
> I'm trying to build a text search engine for relatively large PDF documents
> with a web frontend. I use PostgreSQL 8.3 beta3/4 with tsearch2 for full-text
> indexing. Everything performs quite well, until I start indexing 800 pages of
> text into the database (with the utf8 danish locale and dictionary) -- then
> ts_headline performs unreasonably slow. I use a GIN index on the table
> (called document_revisions which is linked to a master document table called
> 'documents' -- the system is made for version tracking of multiple documents
> into the same index) and an update trigger for maintaining the tsvector row
> on the table.
>
> Omitting ts_headline yields the following EXPLAIN result:
>
> <snip>
> metabase=# explain analyze select * from f_search_revision('website');
> NOTICE: f_search_revisions_arr: loop: (1) doc_id = (1000)
> CONTEXT: SQL statement "SELECT f_search_revision_arr( $1 ) AS res"
> PL/pgSQL function "f_search_revision_int" line 13 at SQL statement
> SQL function "f_search_revision" statement 1
> NOTICE: hest: ({1000})
> CONTEXT: SQL function "f_search_revision" statement 1
> NOTICE: total ranking for (1) eq (0.0865452) for id = (1000)
> CONTEXT: SQL function "f_search_revision" statement 1
> QUERY PLAN
> -----------------------------------------------------------------------------------------------------------------------------
> Function Scan on f_search_revision (cost=0.00..260.00 rows=1000 width=408)
> (actual time=2413.384..2413.384 rows=1 loops=1)
> Total runtime: 2413.465 ms
> (2 rows)
> </snip>
>
> The function f_search_revision invokes another function that fetches all
> matching document_revisions without calling ts_headline (method
> f_search_revision_arr), followed by a post-processing of the matching rows
> for fetching the ts_ranking values and ts_headline. The following snippet
> shows the query plan for the function call with the same query:
>
> <snip>
> metabase=# explain analyze select * from f_search_revision_arr('website');
> NOTICE: f_search_revisions_arr: loop: (1) doc_id = (1000)
> QUERY PLAN
> ---------------------------------------------------------------------------------------------------------------------
> Function Scan on f_search_revision_arr (cost=0.00..0.26 rows=1 width=32)
> (actual time=3.417..3.418 rows=1 loops=1)
> Total runtime: 3.442 ms
> (2 rows)
> </snip>
>
> Here is the code for the functions that I am using:
>
> CREATE OR REPLACE FUNCTION f_search_revision_arr(q TEXT) RETURNS INTEGER[] AS
> $$
> declare
> iter record; results integer[];
> i integer;
> begin
> i := 1;
> for iter in SELECT * FROM documents doc, plainto_tsquery(q)
> AS tsq
> WHERE (doc.search_idx @@ tsq)
> OR doc.id IN (SELECT document_id FROM
> document_revisions dr, plainto_tsquery(q) AS tsq2
> WHERE dr.search_idx @@ tsq2) loop
> results[i] = iter.id;
> raise notice 'f_search_revisions_arr: loop: (%)
> doc_id = (%)', i, results[i];
> i := i + 1;
> end loop;
> return results;
> end;
> $$ LANGUAGE plpgsql;
>
> CREATE OR REPLACE FUNCTION f_search_revision_int(q TEXT) RETURNS
> t_search_result[] AS $$
> declare
> doc_ids integer[];
> doc_rankings float[];
> i integer;
> iter record;
> pointer_doc record;
> pointer_rev record;
> pointer_type record;
> results t_search_result[];
> begin
> i := 1;
>
> SELECT INTO iter f_search_revision_arr(q) AS res;
> doc_ids := iter.res;
> raise notice 'hest: (%)', doc_ids;
>
> -- Ensure that something was found:
> if (doc_ids IS NULL) then
> return results;
> end if;
>
> for i in 1..array_upper(doc_ids, 1) loop
> -- raise notice 'upper = (%)', doc_ids[i];
> SELECT INTO pointer_doc ts_rank(doc.search_idx, tsq)
> AS rank_doc,
> ts_headline(doc.name, tsq) AS headline_name,
> ts_headline(doc.description, tsq) AS
> headline_description,
> ts_headline(doc.keywords, tsq) AS
> headline_keywords,
> doc.name,
> doc.description,
> doc.keywords,
> doc.document_type_id
> FROM documents doc, plainto_tsquery(q) AS tsq
> WHERE doc.id = doc_ids[i];
>
> SELECT INTO pointer_rev ts_rank(rev.search_idx, tsq)
> AS rank_rev,
> ts_headline(rev.content, tsq) AS
> headline_content,
> rev.id AS revision_id
> FROM document_revisions rev,
> plainto_tsquery(q) AS tsq
> WHERE rev.document_id = doc_ids[i];
>
> -- Select the document type:
> SELECT INTO pointer_type mime_type, type_name,
> image_url, extension, internal_type
> FROM document_types WHERE id =
> pointer_doc.document_type_id;
>
> doc_rankings[i] = pointer_doc.rank_doc +
> pointer_rev.rank_rev; -- SUM!
> raise notice 'total ranking for (%) eq (%) for id =
> (%)', i, doc_rankings[i], doc_ids[i];
> results[i] = ROW(pointer_rev.revision_id,
> doc_ids[i],
> pointer_doc.name,
> pointer_doc.description, pointer_doc.keywords,
> pointer_doc.document_type_id,
> pointer_type.mime_type, pointer_type.type_name, pointer_type.image_url,
> pointer_type.extension, pointer_type.internal_type,
> doc_rankings[i],
> pointer_doc.headline_name, pointer_doc.headline_description,
> pointer_doc.headline_keywords, pointer_rev.headline_content,
> 0);
> end loop;
> return results;
> end;
> $$ LANGUAGE plpgsql;
>
> CREATE OR REPLACE FUNCTION f_search_revision(q TEXT) RETURNS SETOF
> t_search_result AS $$
> SELECT * FROM f_unnest(f_search_revision_int($1));
> $$ LANGUAGE sql;
>
> ------
>
> How come that ts_headline yields so bad performance? Is there any way of
> tuning ts_headline? I need the highlighting functionality for presenting the
> search results in the web frontend properly. Alternately, are there any
> alternatives to the ts_headline function?
>
> Sincerely,
>
> Anders
>
>
>
>
> ---------------------------(end of broadcast)---------------------------
> TIP 2: Don't 'kill -9' the postmaster


Regards,
Oleg
__________________________________________________ ___________
Oleg Bartunov, Research Scientist, Head of AstroNet (www.astronet.ru),
Sternberg Astronomical Institute, Moscow University, Russia
Internet: oleg@sai.msu.su, http://www.sai.msu.su/~megera/
phone: +007(495)939-16-83, +007(495)939-23-83

---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
choose an index scan if your joining column's datatypes do not
match

Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
Reply


Thread Tools
Display Modes

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

vB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Trackbacks are On
Pingbacks are On
Refbacks are On
Forum Jump


All times are GMT. The time now is 10:20 AM.


Powered by vBulletin® Version 3.6.5
Copyright ©2000 - 2008, Jelsoft Enterprises Ltd.
SEO by vBSEO 3.2.0
www.UnixAdminTalk.com