Merge pull request #17678 from MathiasVP/modernize-unclear-array-inde…

…x-validation C++: Modernize `cpp/unclear-array-index-validation`
github · Oct 9, 2024 · b087fde · b087fde
2 parents 918e435 + 61a012f
commit b087fde
Show file tree

Hide file tree

Showing 6 changed files with 136 additions and 93 deletions.
diff --git a/cpp/ql/lib/change-notes/2024-10-07-range-analysis-of-getc.md b/cpp/ql/lib/change-notes/2024-10-07-range-analysis-of-getc.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The `SimpleRangeAnalysis` library (`semmle.code.cpp.rangeanalysis.SimpleRangeAnalysis`) now generates more precise ranges for calls to `fgetc` and `getc`.
diff --git a/cpp/ql/lib/semmle/code/cpp/rangeanalysis/SimpleRangeAnalysis.qll b/cpp/ql/lib/semmle/code/cpp/rangeanalysis/SimpleRangeAnalysis.qll
@@ -192,6 +192,37 @@ private class UnsignedMulExpr extends MulExpr {
   }
 }
 
+/**
+ * Gets the value of the `EOF` macro.
+ *
+ * This is typically `"-1"`, but this is not guaranteed to be the case on all
+ * systems.
+ */
+private int getEofValue() {
+  exists(MacroInvocation mi |
+    mi.getMacroName() = "EOF" and
+    result = unique( | | mi.getExpr().getValue().toInt())
+  )
+}
+
+/** Get standard `getc` function or related variants. */
+private class Getc extends Function {
+  Getc() { this.hasGlobalOrStdOrBslName(["fgetc", "getc"]) }
+}
+
+/** A call to `getc` */
+private class CallToGetc extends FunctionCall {
+  CallToGetc() { this.getTarget() instanceof Getc }
+}
+
+/**
+ * A call to `getc` that we can analyze because we know
+ * the value of the `EOF` macro.
+ */
+private class AnalyzableCallToGetc extends CallToGetc {
+  AnalyzableCallToGetc() { exists(getEofValue()) }
+}
+
 /**
  * Holds if `expr` is effectively a multiplication of `operand` with the
  * positive constant `positive`.
@@ -287,6 +318,8 @@ private predicate analyzableExpr(Expr e) {
     or
     e instanceof RemExpr
     or
+    e instanceof AnalyzableCallToGetc
+    or
     // A conversion is analyzable, provided that its child has an arithmetic
     // type. (Sometimes the child is a reference type, and so does not get
     // any bounds.) Rather than checking whether the type of the child is
@@ -861,6 +894,14 @@ private float getLowerBoundsImpl(Expr expr) {
       )
     )
     or
+    exists(AnalyzableCallToGetc getc |
+      expr = getc and
+      // from https://en.cppreference.com/w/c/io/fgetc:
+      // On success, returns the obtained character as an unsigned char
+      // converted to an int. On failure, returns EOF.
+      result = min([typeLowerBound(any(UnsignedCharType pct)), getEofValue()])
+    )
+    or
     // If the conversion is to an arithmetic type then we just return the
     // lower bound of the child. We do not need to handle truncation and
     // overflow here, because that is done in `getTruncatedLowerBounds`.
@@ -1055,6 +1096,14 @@ private float getUpperBoundsImpl(Expr expr) {
       )
     )
     or
+    exists(AnalyzableCallToGetc getc |
+      expr = getc and
+      // from https://en.cppreference.com/w/c/io/fgetc:
+      // On success, returns the obtained character as an unsigned char
+      // converted to an int. On failure, returns EOF.
+      result = max([typeUpperBound(any(UnsignedCharType pct)), getEofValue()])
+    )
+    or
     // If the conversion is to an arithmetic type then we just return the
     // upper bound of the child. We do not need to handle truncation and
     // overflow here, because that is done in `getTruncatedUpperBounds`.

diff --git a/cpp/ql/src/Security/CWE/CWE-129/ImproperArrayIndexValidation.ql b/cpp/ql/src/Security/CWE/CWE-129/ImproperArrayIndexValidation.ql
@@ -14,102 +14,56 @@
 
 import cpp
 import semmle.code.cpp.controlflow.IRGuards
-import semmle.code.cpp.security.FlowSources
-import semmle.code.cpp.ir.dataflow.TaintTracking
-import semmle.code.cpp.rangeanalysis.RangeAnalysisUtils
+import semmle.code.cpp.security.FlowSources as FS
+import semmle.code.cpp.dataflow.new.TaintTracking
+import semmle.code.cpp.rangeanalysis.SimpleRangeAnalysis
 import ImproperArrayIndexValidation::PathGraph
-import semmle.code.cpp.security.Security
 
-predicate hasUpperBound(VariableAccess offsetExpr) {
-  exists(BasicBlock controlled, StackVariable offsetVar, SsaDefinition def |
-    controlled.contains(offsetExpr) and
-    linearBoundControls(controlled, def, offsetVar) and
-    offsetExpr = def.getAUse(offsetVar)
-  )
+predicate isFlowSource(FS::FlowSource source, string sourceType) {
+  sourceType = source.getSourceType()
 }
 
-pragma[noinline]
-predicate linearBoundControls(BasicBlock controlled, SsaDefinition def, StackVariable offsetVar) {
-  exists(GuardCondition guard, boolean branch |
-    guard.controls(controlled, branch) and
-    cmpWithLinearBound(guard, def.getAUse(offsetVar), Lesser(), branch)
+predicate guardChecks(IRGuardCondition g, Expr e, boolean branch) {
+  exists(Operand op | op.getDef().getConvertedResultExpression() = e |
+    // `op < k` is true and `k > 0`
+    g.comparesLt(op, any(int k | k > 0), true, any(BooleanValue bv | bv.getValue() = branch))
+    or
+    // `op < _ + k` is true and `k > 0`.
+    g.comparesLt(op, _, any(int k | k > 0), true, branch)
+    or
+    // op == k
+    g.comparesEq(op, _, true, any(BooleanValue bv | bv.getValue() = branch))
+    or
+    // op == _ + k
+    g.comparesEq(op, _, _, true, branch)
   )
 }
 
-predicate readsVariable(LoadInstruction load, Variable var) {
-  load.getSourceAddress().(VariableAddressInstruction).getAstVariable() = var
-}
-
-predicate hasUpperBoundsCheck(Variable var) {
-  exists(RelationalOperation oper, VariableAccess access |
-    oper.getAnOperand() = access and
-    access.getTarget() = var and
-    // Comparing to 0 is not an upper bound check
-    not oper.getAnOperand().getValue() = "0"
+/**
+ * Holds if `arrayExpr` accesses an `ArrayType` with a constant size `N`, and
+ * the value of `offsetExpr` is known to be smaller than `N`.
+ */
+predicate offsetIsAlwaysInBounds(ArrayExpr arrayExpr, VariableAccess offsetExpr) {
+  exists(ArrayType arrayType |
+    arrayType = arrayExpr.getArrayBase().getUnspecifiedType() and
+    arrayType.getArraySize() > upperBound(offsetExpr.getFullyConverted())
   )
 }
 
-predicate nodeIsBarrierEqualityCandidate(DataFlow::Node node, Operand access, Variable checkedVar) {
-  readsVariable(node.asInstruction(), checkedVar) and
-  any(IRGuardCondition guard).ensuresEq(access, _, _, node.asInstruction().getBlock(), true)
-}
-
-predicate isFlowSource(FlowSource source, string sourceType) { sourceType = source.getSourceType() }
-
-predicate predictableInstruction(Instruction instr) {
-  instr instanceof ConstantInstruction
-  or
-  instr instanceof StringConstantInstruction
-  or
-  // This could be a conversion on a string literal
-  predictableInstruction(instr.(UnaryInstruction).getUnary())
-}
-
 module ImproperArrayIndexValidationConfig implements DataFlow::ConfigSig {
   predicate isSource(DataFlow::Node source) { isFlowSource(source, _) }
 
   predicate isBarrier(DataFlow::Node node) {
-    hasUpperBound(node.asExpr())
-    or
-    // These barriers are ported from `DefaultTaintTracking` because this query is quite noisy
-    // otherwise.
-    exists(Variable checkedVar |
-      readsVariable(node.asInstruction(), checkedVar) and
-      hasUpperBoundsCheck(checkedVar)
-    )
-    or
-    exists(Variable checkedVar, Operand access |
-      readsVariable(access.getDef(), checkedVar) and
-      nodeIsBarrierEqualityCandidate(node, access, checkedVar)
-    )
-    or
-    // Don't use dataflow into binary instructions if both operands are unpredictable
-    exists(BinaryInstruction iTo |
-      iTo = node.asInstruction() and
-      not predictableInstruction(iTo.getLeft()) and
-      not predictableInstruction(iTo.getRight()) and
-      // propagate taint from either the pointer or the offset, regardless of predictability
-      not iTo instanceof PointerArithmeticInstruction
-    )
-    or
-    // don't use dataflow through calls to pure functions if two or more operands
-    // are unpredictable
-    exists(Instruction iFrom1, Instruction iFrom2, CallInstruction iTo |
-      iTo = node.asInstruction() and
-      isPureFunction(iTo.getStaticCallTarget().getName()) and
-      iFrom1 = iTo.getAnArgument() and
-      iFrom2 = iTo.getAnArgument() and
-      not predictableInstruction(iFrom1) and
-      not predictableInstruction(iFrom2) and
-      iFrom1 != iFrom2
-    )
+    node = DataFlow::BarrierGuard<guardChecks/3>::getABarrierNode()
   }
 
+  predicate isBarrierOut(DataFlow::Node node) { isSink(node) }
+
   predicate isSink(DataFlow::Node sink) {
     exists(ArrayExpr arrayExpr, VariableAccess offsetExpr |
       offsetExpr = arrayExpr.getArrayOffset() and
       sink.asExpr() = offsetExpr and
-      not hasUpperBound(offsetExpr)
+      not offsetIsAlwaysInBounds(arrayExpr, offsetExpr)
     )
   }
 }

diff --git a/cpp/ql/src/change-notes/2024-10-07-unclear-array-index-validation.md b/cpp/ql/src/change-notes/2024-10-07-unclear-array-index-validation.md
@@ -0,0 +1,4 @@
+---
+category: minorAnalysis
+---
+* The `cpp/unclear-array-index-validation` ("Unclear validation of array index") query has been improved to reduce false positives increase true positives.
diff --git a/...ity/CWE/CWE-129/semmle/ImproperArrayIndexValidation/ImproperArrayIndexValidation.expected b/...ity/CWE/CWE-129/semmle/ImproperArrayIndexValidation/ImproperArrayIndexValidation.expected
@@ -2,29 +2,36 @@ edges
 | test1.c:7:26:7:29 | **argv | test1.c:8:11:8:14 | call to atoi | provenance | TaintFunction |
 | test1.c:8:11:8:14 | call to atoi | test1.c:9:9:9:9 | i | provenance |  |
 | test1.c:8:11:8:14 | call to atoi | test1.c:11:9:11:9 | i | provenance |  |
+| test1.c:8:11:8:14 | call to atoi | test1.c:12:9:12:9 | i | provenance |  |
 | test1.c:8:11:8:14 | call to atoi | test1.c:13:9:13:9 | i | provenance |  |
-| test1.c:9:9:9:9 | i | test1.c:16:16:16:16 | i | provenance |  |
-| test1.c:11:9:11:9 | i | test1.c:32:16:32:16 | i | provenance |  |
-| test1.c:13:9:13:9 | i | test1.c:48:16:48:16 | i | provenance |  |
-| test1.c:16:16:16:16 | i | test1.c:18:16:18:16 | i | provenance |  |
-| test1.c:32:16:32:16 | i | test1.c:33:11:33:11 | i | provenance |  |
-| test1.c:48:16:48:16 | i | test1.c:51:3:51:7 | ... = ... | provenance |  |
-| test1.c:51:3:51:7 | ... = ... | test1.c:53:15:53:15 | j | provenance |  |
+| test1.c:9:9:9:9 | i | test1.c:18:16:18:16 | i | provenance |  |
+| test1.c:11:9:11:9 | i | test1.c:34:16:34:16 | i | provenance |  |
+| test1.c:12:9:12:9 | i | test1.c:42:16:42:16 | i | provenance |  |
+| test1.c:13:9:13:9 | i | test1.c:50:16:50:16 | i | provenance |  |
+| test1.c:18:16:18:16 | i | test1.c:20:16:20:16 | i | provenance |  |
+| test1.c:34:16:34:16 | i | test1.c:35:11:35:11 | i | provenance |  |
+| test1.c:42:16:42:16 | i | test1.c:43:11:43:11 | i | provenance |  |
+| test1.c:50:16:50:16 | i | test1.c:53:3:53:7 | ... = ... | provenance |  |
+| test1.c:53:3:53:7 | ... = ... | test1.c:55:15:55:15 | j | provenance |  |
 nodes
 | test1.c:7:26:7:29 | **argv | semmle.label | **argv |
 | test1.c:8:11:8:14 | call to atoi | semmle.label | call to atoi |
 | test1.c:9:9:9:9 | i | semmle.label | i |
 | test1.c:11:9:11:9 | i | semmle.label | i |
+| test1.c:12:9:12:9 | i | semmle.label | i |
 | test1.c:13:9:13:9 | i | semmle.label | i |
-| test1.c:16:16:16:16 | i | semmle.label | i |
 | test1.c:18:16:18:16 | i | semmle.label | i |
-| test1.c:32:16:32:16 | i | semmle.label | i |
-| test1.c:33:11:33:11 | i | semmle.label | i |
-| test1.c:48:16:48:16 | i | semmle.label | i |
-| test1.c:51:3:51:7 | ... = ... | semmle.label | ... = ... |
-| test1.c:53:15:53:15 | j | semmle.label | j |
+| test1.c:20:16:20:16 | i | semmle.label | i |
+| test1.c:34:16:34:16 | i | semmle.label | i |
+| test1.c:35:11:35:11 | i | semmle.label | i |
+| test1.c:42:16:42:16 | i | semmle.label | i |
+| test1.c:43:11:43:11 | i | semmle.label | i |
+| test1.c:50:16:50:16 | i | semmle.label | i |
+| test1.c:53:3:53:7 | ... = ... | semmle.label | ... = ... |
+| test1.c:55:15:55:15 | j | semmle.label | j |
 subpaths
 #select
-| test1.c:18:16:18:16 | i | test1.c:7:26:7:29 | **argv | test1.c:18:16:18:16 | i | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
-| test1.c:33:11:33:11 | i | test1.c:7:26:7:29 | **argv | test1.c:33:11:33:11 | i | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
-| test1.c:53:15:53:15 | j | test1.c:7:26:7:29 | **argv | test1.c:53:15:53:15 | j | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
+| test1.c:20:16:20:16 | i | test1.c:7:26:7:29 | **argv | test1.c:20:16:20:16 | i | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
+| test1.c:35:11:35:11 | i | test1.c:7:26:7:29 | **argv | test1.c:35:11:35:11 | i | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
+| test1.c:43:11:43:11 | i | test1.c:7:26:7:29 | **argv | test1.c:43:11:43:11 | i | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
+| test1.c:55:15:55:15 | j | test1.c:7:26:7:29 | **argv | test1.c:55:15:55:15 | j | An array indexing expression depends on $@ that might be outside the bounds of the array. | test1.c:7:26:7:29 | **argv | a command-line argument |
diff --git a/cpp/ql/test/query-tests/Security/CWE/CWE-129/semmle/ImproperArrayIndexValidation/test1.c b/cpp/ql/test/query-tests/Security/CWE/CWE-129/semmle/ImproperArrayIndexValidation/test1.c
@@ -11,6 +11,8 @@ int main(int argc, char *argv[]) {
   test3(i);
   test4(i);
   test5(i);
+  test6(i);
+  test7(argv[1]);
 }
 
 void test1(int i) {
@@ -38,7 +40,7 @@ void test3(int i) {
 }
 
 void test4(int i) {
-  myArray[i] = 0; // BAD: i has not been validated [NOT REPORTED]
+  myArray[i] = 0; // BAD: i has not been validated
 
   if ((i < 0) || (i >= 10)) return;
 
@@ -52,3 +54,26 @@ void test5(int i) {
 
   j = myArray[j]; // BAD: j has not been validated
 }
+
+extern int myTable[256];
+
+void test6(int i) {
+  unsigned char s = i;
+
+  myTable[s] = 0; // GOOD: Input is small [FALSE POSITIVE]
+}
+
+typedef void FILE;
+#define EOF (-1)
+
+int getc(FILE*);
+
+extern int myMaxCharTable[256];
+
+void test7(FILE* fp) {
+  int ch;
+  while ((ch = getc(fp)) != EOF) {
+    myMaxCharTable[ch] = 0; // GOOD
+  }
+}
+