From a9b7b2ffc1bc19f7770f3a1aadbaca1d95c91d45 Mon Sep 17 00:00:00 2001 From: Graham Knop Date: Mon, 4 Nov 2024 22:40:09 +0100 Subject: [PATCH] contributors: fetch all contrib pauseids in one query Rather than trying to fill in each pause id on a separate query, run one query for all contributors. Significantly speeds up fetches for large contributor lists. --- lib/MetaCPAN/Query/Release.pm | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/lib/MetaCPAN/Query/Release.pm b/lib/MetaCPAN/Query/Release.pm index 8a3a49802..d114f1cdf 100644 --- a/lib/MetaCPAN/Query/Release.pm +++ b/lib/MetaCPAN/Query/Release.pm @@ -153,6 +153,7 @@ sub get_contributors { $dupe ? () : $info; } ( @$authors, @$contribs ); + my %want_email; for my $contrib (@contribs) { # heuristic to autofill pause accounts @@ -165,20 +166,27 @@ sub get_contributors { } - # check if contributor's email points to a registered author - if ( !$contrib->{pauseid} ) { - for my $email ( @{ $contrib->{email} } ) { - my $check_author = $self->es->search( - es_doc_path('author'), - body => { - query => { term => { email => $email } }, - size => 10, - } - ); + push @{ $want_email{$_} }, $contrib for @{ $contrib->{email} }; + } + + if (%want_email) { + my $check_author = $self->es->search( + es_doc_path('author'), + body => { + query => { terms => { email => [ sort keys %want_email ] } }, + _source => [ 'email', 'pauseid' ], + size => 100, + }, + ); - if ( hit_total($check_author) ) { - $contrib->{pauseid} - = uc $check_author->{hits}{hits}[0]{_source}{pauseid}; + for my $author ( @{ $check_author->{hits}{hits} } ) { + my $emails = $author->{_source}{email}; + $emails = [$emails] + if !ref $emails; + my $pauseid = uc $author->{_source}{pauseid}; + for my $email (@$emails) { + for my $contrib ( @{ $want_email{$email} } ) { + $contrib->{pauseid} = $pauseid; } } }