Skip to content

Commit

Permalink
sarif: initial implementation of csdiff/v0 fingerprints
Browse files Browse the repository at this point in the history
It hashes the data that csdiff uses in its matching algorithm.
The interfaces are already prepared for csdiff/v1, which will
also take the line content into account when available.

From the updated tests it is obvious that these hashes already
have numerous collisions on the existing test data.

Related: https://issues.redhat.com/browse/OSH-9
Related: #98
  • Loading branch information
kdudka committed Mar 29, 2024
1 parent cf0c993 commit 1c3d4cb
Show file tree
Hide file tree
Showing 14 changed files with 3,679 additions and 867 deletions.
1 change: 1 addition & 0 deletions src/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_library(cs STATIC
cwe-name-lookup.cc
deflookup.cc
filter.cc
finger-print.cc
instream.cc
msg-filter.cc
parser.cc
Expand Down
84 changes: 84 additions & 0 deletions src/lib/finger-print.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (C) 2024 Red Hat, Inc.
*
* This file is part of csdiff.
*
* csdiff is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* csdiff is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with csdiff. If not, see <http://www.gnu.org/licenses/>.
*/

#include "finger-print.hh"

#include "hash-util.hh"
#include "msg-filter.hh"

#include <cassert>

#include <boost/uuid/name_generator.hpp> // for boost::uuids::detail::sha1

/// return SHA1 hash of `str` as hex-encoded string
static std::string computeHexSHA1(const std::string &str)
{
using boost::uuids::detail::sha1;
return hexHashStr<sha1>(str);
}

/// private data of FingerPrinter, reachable through its `d` pointer
struct FingerPrinter::Private {
/// checker, filtered file path, key event name and filtered message of the
/// defect, joined by `sep`; this is the input of the FPV_CSDIFF hash
std::string basicData;
/// appended (after `sep`) to basicData for FPV_CSDIFF_WITH_LINE_CONTENT;
/// while it is empty, getHash() returns "" for that fingerprint version
std::string lineContent;
};

// delimiter placed between the individual strings that get concatenated
// into a single buffer before the buffer is hashed
static const std::string sep("\n");

// TODO: consider lazy evaluation of basicData/lineContent
/// Collect the data of `def` that the fingerprints are computed from.
///
/// The checker name, the filtered path of the key event, the key event name
/// and the filtered key event message are joined by `sep` into `basicData`.
///
/// If `def.keyEventIdx` does not refer to an existing event, `basicData` is
/// left empty, which makes getHash() return an empty string.
FingerPrinter::FingerPrinter(const Defect &def):
    d(new Private)
{
    // no key event => nothing to match on (and indexing would be out of range)
    if (def.events.size() <= static_cast<size_t>(def.keyEventIdx))
        return;

    // filter that csdiff uses to drop details insignificant for matching
    const MsgFilter &filt = MsgFilter::inst();

    // read and transform file path
    const DefEvent &keyEvt = def.events[def.keyEventIdx];
    const std::string path =
        filt.filterPath(keyEvt.fileName, /* forceFullPath */ true);

    // initialize basicData by taking all that DefLookup::lookup() looks at
    d->basicData =
        /* checker   */ def.checker + sep +
        /* file path */ path + sep +
        /* key event */ keyEvt.event + sep +
        /* message   */ filt.filterMsg(keyEvt.msg, def.checker);
}

// defined in this file rather than implicitly in the header: `d` is
// a std::unique_ptr<Private> and Private is only forward-declared there
FingerPrinter::~FingerPrinter() = default;

// TODO: consider caching of SHA1 hashes for subsequent calls
/// Return the hex-encoded SHA1 fingerprint of version `fpv`, or an empty
/// string when the data needed for that version is not available.
std::string FingerPrinter::getHash(const EFingerPrintVer fpv) const
{
    const std::string &basic = d->basicData;
    if (basic.empty()) {
        // not enough data to compute the hash from
        return "";
    }

    if (FPV_CSDIFF == fpv) {
        // the basic fingerprint hashes basicData only
        return computeHexSHA1(basic);
    }

    // the only other supported version also needs the line content
    assert(FPV_CSDIFF_WITH_LINE_CONTENT == fpv);
    const std::string &line = d->lineContent;
    if (line.empty()) {
        // no line content available
        return "";
    }

    // hash basicData AND lineContent joined by the separator
    const std::string combined = basic + sep + line;
    return computeHexSHA1(combined);
}
46 changes: 46 additions & 0 deletions src/lib/finger-print.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (C) 2024 Red Hat, Inc.
*
* This file is part of csdiff.
*
* csdiff is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* csdiff is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with csdiff. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef H_GUARD_FINGER_PRINT_H
#define H_GUARD_FINGER_PRINT_H

#include "defect.hh"

#include <memory>

/// versions of the fingerprint that FingerPrinter::getHash() can be asked for
enum EFingerPrintVer {
/// hash of the basic defect data (checker, path, key event, message)
FPV_CSDIFF = 0,
/// hash of the basic defect data combined with the line content
FPV_CSDIFF_WITH_LINE_CONTENT,
/// number of fingerprint versions above; not a valid argument of getHash()
FPV_MAX
};

/// Computes fingerprints (hex-encoded hashes) of a single defect.
class FingerPrinter {
    public:
        /// Gather the data to be hashed from the given defect.
        /// `explicit` prevents a Defect from silently converting to
        /// a FingerPrinter.
        explicit FingerPrinter(const Defect &);

        /// Declared here and defined in the implementation file because
        /// `Private` is an incomplete type at this point.
        ~FingerPrinter();

        /// Return the fingerprint of the selected version, or an empty
        /// string if that fingerprint cannot be computed.
        std::string getHash(EFingerPrintVer) const;

    private:
        struct Private;
        std::unique_ptr<Private> d;
};

#endif /* H_GUARD_FINGER_PRINT_H */
51 changes: 51 additions & 0 deletions src/lib/hash-util.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Copyright (C) 2024 Red Hat, Inc.
*
* This file is part of csdiff.
*
* csdiff is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* csdiff is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with csdiff. If not, see <http://www.gnu.org/licenses/>.
*/

#include <cstddef>
#include <iterator>
#include <string>
#include <type_traits>
#include <vector>

#include <boost/algorithm/hex.hpp>
#include <boost/algorithm/string.hpp>

/// Compute the TEng hash of `src` and return it as a lowercase hex string.
///
/// TEng has to provide process_bytes(), get_digest() and a nested
/// `digest_type` that is an array type (or a reference to one).
/// TSrc has to provide data() and size().
template <typename TEng, typename TSrc>
std::string hexHashStr(const TSrc &src)
{
    // create hash engine and process the input
    TEng eng;
    eng.process_bytes(src.data(), src.size());

    // export the hash as an array of digest elements
    // FIXME: std::remove_reference is needed on el7 (boost-1.53)
    using TArr = typename std::remove_reference<
        typename TEng::digest_type>::type;
    TArr dst;
    eng.get_digest(dst);

    // Copy the digest using its own element type.  boost::algorithm::hex()
    // emits 2 * sizeof(element) hex digits per element, so widening e.g. a
    // byte-based digest to `unsigned` would pad each byte with extra zeros.
    using TElem = typename std::remove_extent<TArr>::type;
    static const std::size_t len = sizeof(dst) / sizeof(dst[0]);
    const std::vector<TElem> hash(dst, dst + len);

    // convert the hash to a hex string
    std::string result;
    boost::algorithm::hex(hash.begin(), hash.end(),
            std::back_inserter(result));

    // convert uppercase letters to lowercase
    boost::algorithm::to_lower(result);
    return result;
}
7 changes: 5 additions & 2 deletions src/lib/msg-filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,10 @@ std::string MsgFilter::filterMsg(
return filtered;
}

std::string MsgFilter::filterPath(const std::string &origPath) const
std::string MsgFilter::filterPath(
const std::string &origPath,
const bool forceFullPath)
const
{
std::string path = origPath;

Expand All @@ -254,7 +257,7 @@ std::string MsgFilter::filterPath(const std::string &origPath) const
}
}

if (d->ignorePath)
if (!forceFullPath && d->ignorePath)
return regexReplaceWrap(path, d->reDir, "");

if (boost::regex_match(path, d->reTmpPath)) {
Expand Down
5 changes: 4 additions & 1 deletion src/lib/msg-filter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ class MsgFilter {
std::string filterMsg(
const std::string &msg,
const std::string &checker) const;
std::string filterPath(const std::string &path) const;
std::string filterPath(
const std::string &path,
bool forceFullPath = false)
const;

private:
MsgFilter();
Expand Down
32 changes: 32 additions & 0 deletions src/lib/writer-json-sarif.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "writer-json-sarif.hh"

#include "finger-print.hh"
#include "regex.hh"
#include "version.hh"
#include "writer-json-common.hh"
Expand All @@ -42,6 +43,7 @@ struct SarifTreeEncoder::Private {

void initToolVersion();
void serializeRules();
void encodeFingerPrints(boost::json::object *, const Defect &);
};

SarifTreeEncoder::SarifTreeEncoder():
Expand Down Expand Up @@ -181,6 +183,33 @@ void SarifTreeEncoder::Private::serializeRules()
this->driver["rules"] = std::move(ruleList);
}

/// Compute the fingerprints of `def` and, if at least one could be computed,
/// store them as a "csdiff/v<N>" -> hash object under the "fingerprints" key
/// of `*pResult`.  `*pResult` is left untouched otherwise.
void SarifTreeEncoder::Private::encodeFingerPrints(
        object                     *pResult,
        const Defect               &def)
{
    // interface to compute fingerprints
    const FingerPrinter fp(def);

    // collect the object of fingerprints
    object fps;
    for (int v = 0; v < FPV_MAX; ++v) {
        // compute a fingerprint of version `v`
        const EFingerPrintVer fpv = static_cast<EFingerPrintVer>(v);
        const std::string fingerPrint = fp.getHash(fpv);
        if (fingerPrint.empty())
            // this fingerprint version is not available for `def`
            continue;

        // construct the "version" -> "value" pair
        // (`fingerPrint` is const, so it is copied; wrapping it in
        // std::move() would only be a misleading no-op)
        const std::string label = "csdiff/v" + std::to_string(v);
        fps[label] = fingerPrint;
    }

    // if the object is non-empty, append it to the result
    if (!fps.empty())
        (*pResult)["fingerprints"] = std::move(fps);
}

void SarifTreeEncoder::importScanProps(const TScanProps &scanProps)
{
d->scanProps = scanProps;
Expand Down Expand Up @@ -374,6 +403,9 @@ void SarifTreeEncoder::appendDef(const Defect &def)
// our stash for comments
result["relatedLocations"] = std::move(relatedLocs);

// SARIF fingerprints
d->encodeFingerPrints(&result, def);

// append the `result` object to the `results` array
d->results.push_back(std::move(result));
}
Expand Down
Loading

0 comments on commit 1c3d4cb

Please sign in to comment.