From 3693441d9be6fd1123443afc36cc701f16c40620 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Wed, 5 Sep 2018 15:46:32 -0700
Subject: [PATCH] lmkd: rate-limit and cleanup failed kill reports

Excessive number of failed kill reports when lmkd can't find an eligible
process to kill or frees not enough memory pollutes logs and bugreports.
Cleanup kill reports to remove duplicate information and rate limit failed
kill attempts at 1 report per sec. The number of suppressed failed kills
will be reported in the next lmkd report.

Bug: 113864581
Test: Verified using lmkd_unit_test
Change-Id: I67fa1fec97613f136c7582115edcbc56b1503c9c
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 lmkd/lmkd.c              | 81 ++++++++++++++++++++++------------------
 lmkd/tests/lmkd_test.cpp |  2 +-
 2 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/lmkd/lmkd.c b/lmkd/lmkd.c
index ce2421e6a2..1980dc6608 100644
--- a/lmkd/lmkd.c
+++ b/lmkd/lmkd.c
@@ -97,6 +97,8 @@
 
 #define min(a, b) (((a) < (b)) ? (a) : (b))
 
+#define FAIL_REPORT_RLIMIT_MS 1000
+
 /* default to old in-kernel interface if no memory pressure events */
 static bool use_inkernel_interface = true;
 static bool has_inkernel_module;
@@ -1097,8 +1099,7 @@ static struct proc *proc_get_heaviest(int oomadj) {
 }
 
 /* Kill one process specified by procp.  Returns the size of the process killed */
-static int kill_one_process(struct proc* procp, int min_score_adj,
-                            enum vmpressure_level level) {
+static int kill_one_process(struct proc* procp) {
     int pid = procp->pid;
     uid_t uid = procp->uid;
     char *taskname;
@@ -1132,11 +1133,8 @@ static int kill_one_process(struct proc* procp, int min_score_adj,
 
     /* CAP_KILL required */
     r = kill(pid, SIGKILL);
-    ALOGI(
-        "Killing '%s' (%d), uid %d, adj %d\n"
-        "   to free %ldkB because system is under %s memory pressure (min_oom_adj=%d)\n",
-        taskname, pid, uid, procp->oomadj, tasksize * page_k,
-        level_name[level], min_score_adj);
+    ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB",
+        taskname, pid, uid, procp->oomadj, tasksize * page_k);
     pid_remove(pid);
 
     TRACE_KILL_END();
@@ -1163,8 +1161,7 @@ static int kill_one_process(struct proc* procp, int min_score_adj,
  * If pages_to_free is set to 0 only one process will be killed.
  * Returns the size of the killed processes.
  */
-static int find_and_kill_processes(enum vmpressure_level level,
-                                   int min_score_adj, int pages_to_free) {
+static int find_and_kill_processes(int min_score_adj, int pages_to_free) {
     int i;
     int killed_size;
     int pages_freed = 0;
@@ -1183,7 +1180,7 @@ static int find_and_kill_processes(enum vmpressure_level level,
             if (!procp)
                 break;
 
-            killed_size = kill_one_process(procp, min_score_adj, level);
+            killed_size = kill_one_process(procp);
             if (killed_size >= 0) {
 #ifdef LMKD_LOG_STATS
                 if (enable_stats_log && !lmk_state_change_start) {
@@ -1282,6 +1279,7 @@ static void mp_event_common(int data, uint32_t events __unused) {
     enum vmpressure_level lvl;
     union meminfo mi;
     union zoneinfo zi;
+    struct timespec curr_tm;
     static struct timespec last_kill_tm;
     static unsigned long kill_skip_count = 0;
     enum vmpressure_level level = (enum vmpressure_level)data;
@@ -1312,14 +1310,12 @@ static void mp_event_common(int data, uint32_t events __unused) {
         }
     }
 
-    if (kill_timeout_ms) {
-        struct timespec curr_tm;
-
-        if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
-            ALOGE("Failed to get current time");
-            return;
-        }
+    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
+        ALOGE("Failed to get current time");
+        return;
+    }
 
+    if (kill_timeout_ms) {
         if (get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) {
             kill_skip_count++;
             return;
@@ -1425,7 +1421,7 @@ static void mp_event_common(int data, uint32_t events __unused) {
 do_kill:
     if (low_ram_device) {
         /* For Go devices kill only one task */
-        if (find_and_kill_processes(level, level_oomadj[level], 0) == 0) {
+        if (find_and_kill_processes(level_oomadj[level], 0) == 0) {
             if (debug_process_killing) {
                 ALOGI("Nothing to kill");
             }
@@ -1434,6 +1430,8 @@ do_kill:
         }
     } else {
         int pages_freed;
+        static struct timespec last_report_tm;
+        static unsigned long report_skip_count = 0;
 
         if (!use_minfree_levels) {
             /* If pressure level is less than critical and enough free swap then ignore */
@@ -1461,30 +1459,41 @@ do_kill:
             min_score_adj = level_oomadj[level];
         }
 
-        pages_freed = find_and_kill_processes(level, min_score_adj, pages_to_free);
+        pages_freed = find_and_kill_processes(min_score_adj, pages_to_free);
 
-        if (use_minfree_levels) {
-            ALOGI("Killing because cache %ldkB is below "
-                  "limit %ldkB for oom_adj %d\n"
-                  "   Free memory is %ldkB %s reserved",
-                  other_file * page_k, minfree * page_k, min_score_adj,
-                  other_free * page_k, other_free >= 0 ? "above" : "below");
+        if (pages_freed == 0) {
+            /* Rate limit kill reports when nothing was reclaimed */
+            if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
+                report_skip_count++;
+                return;
+            }
         }
 
-        if (pages_freed < pages_to_free) {
-            ALOGI("Unable to free enough memory (pages to free=%d, pages freed=%d)",
-                  pages_to_free, pages_freed);
+        /* Log meminfo whenever we kill or when report rate limit allows */
+        meminfo_log(&mi);
+        if (pages_freed >= pages_to_free) {
+            /* Reset kill time only if reclaimed enough memory */
+            last_kill_tm = curr_tm;
+        }
+
+        if (use_minfree_levels) {
+            ALOGI("Killing to reclaim %ldkB, reclaimed %ldkB, cache(%ldkB) and "
+                "free(%" PRId64 "kB)-reserved(%" PRId64 "kB) below min(%ldkB) for oom_adj %d",
+                pages_to_free * page_k, pages_freed * page_k,
+                other_file * page_k, mi.field.nr_free_pages * page_k,
+                zi.field.totalreserve_pages * page_k,
+                minfree * page_k, min_score_adj);
         } else {
-            ALOGI("Reclaimed enough memory (pages to free=%d, pages freed=%d)",
-                  pages_to_free, pages_freed);
-            if (clock_gettime(CLOCK_MONOTONIC_COARSE, &last_kill_tm) != 0) {
-                ALOGE("Failed to get current time");
-                return;
-            }
+            ALOGI("Killing to reclaim %ldkB, reclaimed %ldkB at oom_adj %d",
+                pages_to_free * page_k, pages_freed * page_k, min_score_adj);
         }
-        if (pages_freed > 0) {
-            meminfo_log(&mi);
+
+        if (report_skip_count > 0) {
+            ALOGI("Suppressed %lu failed kill reports", report_skip_count);
+            report_skip_count = 0;
         }
+
+        last_report_tm = curr_tm;
     }
 }
 
diff --git a/lmkd/tests/lmkd_test.cpp b/lmkd/tests/lmkd_test.cpp
index 1996bae29a..f54b25cf7f 100644
--- a/lmkd/tests/lmkd_test.cpp
+++ b/lmkd/tests/lmkd_test.cpp
@@ -39,7 +39,7 @@ using namespace android::base;
 #define LMKDTEST_RESPAWN_FLAG "LMKDTEST_RESPAWN"
 
 #define LMKD_LOGCAT_MARKER "lowmemorykiller"
-#define LMKD_KILL_MARKER_TEMPLATE LMKD_LOGCAT_MARKER ": Killing '%s'"
+#define LMKD_KILL_MARKER_TEMPLATE LMKD_LOGCAT_MARKER ": Kill '%s'"
 #define OOM_MARKER "Out of memory"
 #define OOM_KILL_MARKER "Killed process"
 #define MIN_LOG_SIZE 100
-- 
GitLab