[PATCH v2 2/2] post_pw: Store submitted checks locally as well
Aaron Conole
aconole at redhat.com
Tue Jan 23 00:14:51 CET 2024
Michael Santana <msantana at redhat.com> writes:
> On Mon, Jan 22, 2024 at 2:32 PM Aaron Conole <aconole at redhat.com> wrote:
>>
>> Jeremy Kerr reports that our PW checks reporting submitted 43000 API calls
>> in just a single day. That is alarmingly unacceptable. We can store the
>> URLs we've already submitted and then just skip over any additional
>> processing at least on the PW side.
>>
>> This patch does two things to try and mitigate this issue:
>>
>> 1. Store each patch ID and URL in the series DB to show that we reported
>> the check. This means we don't need to poll patchwork for check status
>>
>> 2. Store the last modified time of the reports mailing list. This means
>> we only poll the mailing list when a new email has surely landed.
>>
>> Signed-off-by: Aaron Conole <aconole at redhat.com>
>> ---
>> v2: fixed up the Last-Modified grep and storage
>>
>> post_pw.sh | 38 +++++++++++++++++++++++++++++++++++++-
>> series_db_lib.sh | 25 +++++++++++++++++++++++++
>> 2 files changed, 62 insertions(+), 1 deletion(-)
>>
>> diff --git a/post_pw.sh b/post_pw.sh
>> index 9163ea1..a23bdc5 100755
>> --- a/post_pw.sh
>> +++ b/post_pw.sh
>> @@ -20,6 +20,7 @@
>> # License for the specific language governing permissions and limitations
>> # under the License.
>>
>> +[ -f "$(dirname $0)/series_db_lib.sh" ] && source "$(dirname $0)/series_db_lib.sh" || exit 1
>> [ -f "${HOME}/.mail_patchwork_sync.rc" ] && source "${HOME}/.mail_patchwork_sync.rc"
>>
>> # Patchwork instance to update with new reports from mailing list
>> @@ -75,6 +76,13 @@ send_post() {
>> if [ -z "$context" -o -z "$state" -o -z "$description" -o -z "$patch_id" ]; then
>> echo "Skpping \"$link\" due to missing context, state, description," \
>> "or patch_id" 1>&2
>> + # Just don't want to even bother seeing these "bad" patches as well.
>> + add_check_scanned_url "$patch_id" "$target_url"
>> + return 0
>> + fi
>> +
>> + if check_id_exists "$patch_id" "$target_url" ; then
>> + echo "Skipping \"$link\" - already reported." 1>&2
>> return 0
>> fi
>>
>> @@ -84,6 +92,7 @@ send_post() {
>> "$api_url")"
>> if [ $? -ne 0 ]; then
>> echo "Failed to get proper server response on link ${api_url}" 1>&2
>> + # Don't store these as processed in case the server has a temporary issue.
>> return 0
>> fi
>>
>> @@ -95,6 +104,9 @@ send_post() {
>> jq -e "[.[].target_url] | contains([\"$mail_url\"])" >/dev/null
>> then
>> echo "Report ${target_url} already pushed to patchwork. Skipping." 1>&2
>> + # Somehow this was not stored (for example, first time we apply the tracking
>> + # feature). Store it now.
>> + add_check_scanned_url "$patch_id" "$target_url"
>> return 0
>> fi
>>
>> @@ -114,12 +126,34 @@ send_post() {
>> if [ $? -ne 0 ]; then
>> echo -e "Failed to push retults based on report ${link} to the"\
>> "patchwork instance ${pw_instance} using the following REST"\
>> - "API Endpoint ${api_url} with the following data:\n$data\n"
>> + "API Endpoint ${api_url} with the following data:\n$data\n" 1>&2
>> return 0
>> fi
>> +
>> + add_check_scanned_url "$patch_id" "$target_url"
>> }
>>
>> +# Collect the date. NOTE: this needs some accomodate to catch the month change-overs
>> year_month="$(date +"%Y-%B")"
>> +
>> +# Get the last modified time
>> +report_last_mod=$(curl --head -A "(pw-ci) pw-post" -sSf "${mail_archive}${year_month}/thread.html" | grep -i Last-Modified)
>> +
>> +mailing_list_save_file=$(echo ".post_pw_${mail_archive}${year_month}" | sed -e "s@/@_ at g" -e "s@:@_ at g" -e "s,@,_,g")
>> +
>> +if [ -e "${HOME}/${mailing_list_save_file}" ]; then
>> + last_read_date=$(cat "${HOME}/${mailing_list_save_file}")
>> + if [ "$last_read_date" -a "$last_read_date" == "$report_last_mod" ]; then
>> + echo "Last modified times match. Skipping list parsing."
>> + exit 0
>> + else
>> + last_read_date="$report_last_mod"
>> + fi
>> +else
>> + last_read_date="Failed curl."
>> + touch "${HOME}/${mailing_list_save_file}"
> One last thing, sorry
>
> Instead of touch, could we propagate with $report_last_mod ?
> Im looking at it from the POV the first time this script is run and
> the file does not exist, it creates the timestamp. Next time it runs,
> if the timestamps are the same the script exits, which is what we
> want. If we keep it as its written, the second time the script runs
> the variable will be propagated with "Failed curl" and the script will
> run fully. This might not be ideal if the timestamps match but just
> not yet propagated on the file
>
> or maybe another last_read_date="$report_last_mod" might do the job
>
> Otherwise, everything looks good!
Ack. Will send a v3.
>> +fi
>> +
>> reports="$(curl -A "(pw-ci) pw-post" -sSf "${mail_archive}${year_month}/thread.html" | \
>> grep -i 'HREF=' | sed -e 's@[0-9]*<LI><A HREF="@\|@' -e 's@">@\|@')"
>> if [ $? -ne 0 ]; then
>> @@ -132,3 +166,5 @@ echo "$reports" | while IFS='|' read -r blank link title; do
>> send_post "${mail_archive}${year_month}/$link"
>> fi
>> done
>> +
>> +echo "$last_read_date" > "${HOME}/${mailing_list_save_file}"
>> diff --git a/series_db_lib.sh b/series_db_lib.sh
>> index c5f42e0..0635469 100644
>> --- a/series_db_lib.sh
>> +++ b/series_db_lib.sh
>> @@ -130,6 +130,17 @@ recheck_sync INTEGER
>> EOF
>> run_db_command "INSERT INTO series_schema_version(id) values (8);"
>> fi
>> +
>> + run_db_command "select * from series_schema_version;" | egrep '^9$' > /dev/null 2>&1
>> + if [ $? -eq 1 ]; then
>> + sqlite3 ${HOME}/.series-db <<EOF
>> +CREATE TABLE check_id_scanned (
>> +check_patch_id INTEGER,
>> +check_url STRING
>> +)
>> +EOF
>> + run_db_command "INSERT INTO series_schema_version(id) values (9);"
>> + fi
>> }
>>
>> function series_db_exists() {
>> @@ -468,3 +479,17 @@ function set_recheck_request_state() {
>>
>> echo "UPDATE recheck_requests set recheck_sync=$recheck_state where patchwork_instance=\"$recheck_instance\" and patchwork_project=\"$recheck_project\" and recheck_requested_by=\"$recheck_requested_by\" and recheck_series=\"$recheck_series\";" | series_db_execute
>> }
>> +
>> +function add_check_scanned_url() {
>> + local patch_id="$1"
>> + local url="$2"
>> +
>> + echo "INSERT into check_id_scanned (check_patch_id, check_url) values (${patch_id}, \"$url\");" | series_db_execute
>> +}
>> +
>> +function check_id_exists() {
>> + local patch_id="$1"
>> + local url="$2"
>> +
>> + echo "select * from check_id_scanned where check_patch_id=$patch_id and check_url=\"$url\";" | series_db_execute | grep "$url" >/dev/null 2>&1
>> +}
>> --
>> 2.41.0
>>
More information about the ci
mailing list