change our tokenize to yield empty tokens

Our tokenize function currently skips empty tokens.  This means our
seccomp filter parser incorrectly accepts invalid policy syntax such as:
	close: arg0 == 1 |||||| arg0 == 2

Change the tokenizer helper to yield an empty string in this case so
that we can correctly detect & reject such input.  There are currently
no scenarios where we actually want to allow empty tokens either (and
if there were, the callers could skip them themselves).  Note that an
empty input string now yields a single empty token instead of NULL.

Bug: None
Test: unittests pass
Change-Id: I282e4e4544a24c0e5a7036b693429bdd209339cf
diff --git a/syscall_filter_unittest.cc b/syscall_filter_unittest.cc
index 6bc044d..db01fbb 100644
--- a/syscall_filter_unittest.cc
+++ b/syscall_filter_unittest.cc
@@ -1261,6 +1261,19 @@
   ASSERT_NE(res, 0);
 }
 
+TEST(FilterTest, invalid_tokens) {  // policy with doubled "||" must not compile
+  struct sock_fprog actual;
+  const char *policy = "read: arg0 == 1 |||| arg0 == 2\n";  // "||" twice in a row
+
+  FILE *policy_file = write_policy_to_pipe(policy, strlen(policy));
+  ASSERT_NE(policy_file, nullptr);
+
+  int res =
+      compile_filter("policy", policy_file, &actual, USE_RET_KILL, NO_LOGGING);
+  fclose(policy_file);
+  ASSERT_NE(res, 0);  // expect a non-zero (presumably error) result
+}
+
 TEST(FilterTest, nonexistent) {
   struct sock_fprog actual;
   int res = compile_filter("policy", NULL, &actual, USE_RET_KILL, NO_LOGGING);
diff --git a/util.c b/util.c
index 14c028a..9bb37ca 100644
--- a/util.c
+++ b/util.c
@@ -257,8 +257,8 @@
 {
 	char *ret = NULL;
 
-	/* If the string is NULL or empty, there are no tokens to be found. */
-	if (stringp == NULL || *stringp == NULL || **stringp == '\0')
+	/* If the string is NULL, there are no tokens to be found. */
+	if (stringp == NULL || *stringp == NULL)
 		return NULL;
 
 	/*
@@ -271,33 +271,19 @@
 		return ret;
 	}
 
-	char *found;
-	while (**stringp != '\0') {
-		found = strstr(*stringp, delim);
-
-		if (!found) {
-			/*
-			 * The delimiter was not found, so the full string
-			 * makes up the only token, and we're done.
-			 */
-			ret = *stringp;
-			*stringp = NULL;
-			break;
-		}
-
-		if (found != *stringp) {
-			/* There's a non-empty token before the delimiter. */
-			*found = '\0';
-			ret = *stringp;
-			*stringp = found + strlen(delim);
-			break;
-		}
-
+	char *found = strstr(*stringp, delim);
+	if (!found) {
 		/*
-		 * The delimiter was found at the start of the string,
-		 * skip it and keep looking for a non-empty token.
+		 * The delimiter was not found, so the full string
+		 * makes up the only token, and we're done.
 		 */
-		*stringp += strlen(delim);
+		ret = *stringp;
+		*stringp = NULL;
+	} else {
+		/* There's a token here, possibly empty.  That's OK. */
+		*found = '\0';
+		ret = *stringp;
+		*stringp = found + strlen(delim);
 	}
 
 	return ret;
diff --git a/util.h b/util.h
index 9ec88ce..7ff86b8 100644
--- a/util.h
+++ b/util.h
@@ -83,6 +83,18 @@
 int parse_size(size_t *size, const char *sizespec);
 
 char *strip(char *s);
+
+/*
+ * tokenize: locate the next token in @stringp using the @delim
+ * @stringp A pointer to the string to scan for tokens
+ * @delim   The delimiter to split by
+ *
+ * Note that, unlike strtok, @delim is not a set of characters, but the full
+ * delimiter.  e.g. "a,;b,;c" with a delim of ",;" will yield ["a","b","c"].
+ *
+ * Note that, unlike strtok, this may return an empty token.  e.g. "a,,b" with
+ * strtok will yield ["a","b"], but this will yield ["a","","b"].
+ */
 char *tokenize(char **stringp, const char *delim);
 
 char *path_join(const char *external_path, const char *internal_path);
diff --git a/util_unittest.cc b/util_unittest.cc
index b5cdff7..ec3d714 100644
--- a/util_unittest.cc
+++ b/util_unittest.cc
@@ -65,3 +65,29 @@
   ASSERT_EQ(nullptr, p);
   ASSERT_EQ(nullptr, tokenize(&p, ","));
 }
+
+// Check edge case with an empty string.
+TEST(tokenize, empty_string) {
+  char str[] = "";
+  char *p = str;
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));  // one empty token comes back
+  ASSERT_EQ(nullptr, p);  // the scan pointer is cleared after that token
+  ASSERT_EQ(nullptr, tokenize(&p, ","));  // nothing further to return
+}
+
+// Check behavior with empty tokens at the start/middle/end.
+TEST(tokenize, empty_tokens) {
+  char str[] = ",,a,b,,,c,,";  // 9 tokens expected, 6 of them empty
+  char *p = str;
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));  // two empty tokens before "a"
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));
+  ASSERT_EQ("a", std::string(tokenize(&p, ",")));
+  ASSERT_EQ("b", std::string(tokenize(&p, ",")));
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));  // two empty tokens before "c"
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));
+  ASSERT_EQ("c", std::string(tokenize(&p, ",")));
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));  // two empty tokens after "c"
+  ASSERT_EQ("", std::string(tokenize(&p, ",")));
+  ASSERT_EQ(nullptr, p);  // the scan pointer is cleared once input is used up
+  ASSERT_EQ(nullptr, tokenize(&p, ","));  // nothing further to return
+}