From 4ac456c2b13eb15ecdbbe8af223656ea0cb45a9c Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Mon, 12 Feb 2018 20:23:51 +0100
Subject: [PATCH 1/8] Refactoring: extract table name

Signed-off-by: Alexander Bezzubov
---
 src/main/go/README.md                           |  8 +++++++-
 src/main/go/query.go                            | 12 +++++++++---
 src/main/scala/tech/sourced/gemini/Gemini.scala | 12 +++++++-----
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/main/go/README.md b/src/main/go/README.md
index 36e9c7ec..5bc07e12 100644
--- a/src/main/go/README.md
+++ b/src/main/go/README.md
@@ -8,5 +8,11 @@ Trivial example of client application in Golang to query for same files.
 Add path to the file you want to search duplicates for and
 
 ```
-go run query.go
+go run src/main/go/query.go
+```
+
+or, to test a connection to the DB, use
+
+```
+go run -tags="gocql_debug" src/main/go/connect.go
 ```
diff --git a/src/main/go/query.go b/src/main/go/query.go
index e6b34765..c4ab967b 100644
--- a/src/main/go/query.go
+++ b/src/main/go/query.go
@@ -14,12 +14,18 @@ import (
     "github.com/scylladb/gocqlx/qb"
 )
 
+// BlobHash is a single blob inside a repository
 type BlobHash struct {
     BlobHash string
     Repo     string
     FilePath string
 }
 
+const (
+    defaultKeyspace = "hashes"
+    defaultTable    = "blob_hash_files"
+)
+
 func main() {
     flag.Parse()
     args := flag.Args()
@@ -35,7 +41,7 @@ func main() {
     session := connect()
     defer session.Close()
 
-    stmt, names := qb.Select("hashes.blob_hash_files").
+    stmt, names := qb.Select(fmt.Sprintf("%s.%s", defaultKeyspace, defaultTable)).
         Where(qb.In("blob_hash")).
         ToCql()
 
@@ -58,7 +64,7 @@ func main() {
 func connect() *gocql.Session {
     node := "127.0.0.1"
     cluster := gocql.NewCluster(node)
-    cluster.Keyspace = "hashes"
+    cluster.Keyspace = defaultKeyspace
     session, err := cluster.CreateSession()
     if err != nil {
         log.Fatalf("Can not create connection to %s, %v", node, err)
@@ -75,7 +81,7 @@ func sha1hash(file string) string {
     f, err := os.Open(file)
     if err != nil {
-        log.Fatal("Can not open a file %s", file, err)
+        log.Fatalf("Can not open a file %s, err: %+v", file, err)
     }
     defer f.Close()
 
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 20bcb8dc..581451b2 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -49,10 +49,11 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
       .withColumnRenamed("path", "file_path")
 
   def save(files: DataFrame): Unit = {
-    log.info(s"Writing ${files.rdd.countApprox(10000L)} files to DB")
+    val approxFileCount = files.rdd.countApprox(10000L)
+    log.info(s"Writing ${approxFileCount} files to DB")
     files.write
       .mode("append")
-      .cassandraFormat("blob_hash_files", keyspace)
+      .cassandraFormat(Gemini.defaultTable, keyspace)
       .save()
   }
 
@@ -171,6 +172,7 @@ object Gemini {
   val defaultCassandraPort: Int = 9042
   val defaultSchemaFile: String = "src/main/resources/schema.cql"
   val defautKeyspace: String = "hashes"
+  val defaultTable: String = "blob_hash_files"
 
   val formatter = new ObjectInserter.Formatter
 
@@ -193,7 +195,7 @@ object Gemini {
    * @return
    */
   def findAllDuplicateBlobHashes(conn: Session, keyspace: String): Iterable[DuplicateBlobHash] = {
-    val duplicatesCountCql = s"SELECT blob_hash, COUNT(*) as count FROM ${keyspace}.blob_hash_files GROUP BY blob_hash"
+    val duplicatesCountCql = s"SELECT blob_hash, COUNT(*) as count FROM ${keyspace}.${defaultTable} GROUP BY blob_hash"
     conn
       .execute(new SimpleStatement(duplicatesCountCql))
       .asScala
@@ -211,7 +213,7 @@ object Gemini {
    * @return
    */
   def findAllDuplicateItems(conn: Session, keyspace: String): Iterable[Iterable[RepoFile]] = {
-    val distinctBlobHash = s"SELECT distinct blob_hash FROM ${keyspace}.blob_hash_files"
+    val distinctBlobHash = s"SELECT distinct blob_hash FROM ${keyspace}.${defaultTable}"
     conn
       .execute(new SimpleStatement(distinctBlobHash))
       .asScala
@@ -235,7 +237,7 @@ object Gemini {
   }
 
   def findDuplicateItemForBlobHash(sha: String, conn: Session, keyspace: String): Iterable[RepoFile] = {
-    val query = QueryBuilder.select().all().from(keyspace, "blob_hash_files")
+    val query = QueryBuilder.select().all().from(keyspace, defaultTable)
       .where(QueryBuilder.eq("blob_hash", sha))
 
     conn.execute(query).asScala.map { row =>

From acadc562276155e7f8e49d9b90513d91fb3cde5a Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 14 Feb 2018 16:37:09 +0100
Subject: [PATCH 2/8] Refactoring: extract column names

Signed-off-by: Alexander Bezzubov
---
 .../scala/tech/sourced/gemini/Gemini.scala  | 31 ++++++++++++-------
 .../sourced/gemini/CassandraSparkSpec.scala |  2 +-
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 581451b2..d95ff935 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -16,6 +16,7 @@ import scala.io.Source
 
 class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.defautKeyspace) {
 
+  import Gemini._
   import session.implicits._
 
   def hash(reposPath: String, limit: Int = 0, format: String = "siva"): DataFrame = {
@@ -43,10 +44,10 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
       .getTreeEntries
       .getBlobs
       .select("blob_id", "repository_id", "commit_hash", "path")
-      .withColumnRenamed("blob_id", "blob_hash")
-      .withColumnRenamed("repository_id", "repo")
-      .withColumnRenamed("commit_hash", "ref_hash")
-      .withColumnRenamed("path", "file_path")
+      .withColumnRenamed("blob_id", meta.sha)
+      .withColumnRenamed("repository_id", meta.repo)
+      .withColumnRenamed("commit_hash", meta.commit)
+      .withColumnRenamed("path", meta.path)
 
   def save(files: DataFrame): Unit = {
     val approxFileCount = files.rdd.countApprox(10000L)
@@ -158,6 +159,8 @@ object URLFormatter {
   }
 }
 
+case class Meta(sha: String, repo: String, commit: String, path: String)
+
 case class RepoFile(repo: String, ref_hash: String, file: String, sha: String) {
   override def toString(): String = URLFormatter.format(repo, ref_hash, file)
 }
@@ -174,6 +177,9 @@ object Gemini {
   val defautKeyspace: String = "hashes"
   val defaultTable: String = "blob_hash_files"
 
+  //TODO(bzz): switch to `tables("meta")`
+  val meta = Meta("blob_hash", "repo", "ref_hash", "file_path")
+
   val formatter = new ObjectInserter.Formatter
 
   def apply(ss: SparkSession, log: Slf4jLogger = Logger("gemini"), keyspace: String = defautKeyspace): Gemini =
@@ -195,13 +201,15 @@ object Gemini {
    * @return
    */
   def findAllDuplicateBlobHashes(conn: Session, keyspace: String): Iterable[DuplicateBlobHash] = {
-    val duplicatesCountCql = s"SELECT blob_hash, COUNT(*) as count FROM ${keyspace}.${defaultTable} GROUP BY blob_hash"
+    val hash = meta.sha
+    val dupCount = "count"
+    val duplicatesCountCql = s"SELECT ${hash}, COUNT(*) as ${dupCount} FROM ${keyspace}.${defaultTable} GROUP BY ${hash}"
     conn
       .execute(new SimpleStatement(duplicatesCountCql))
       .asScala
-      .filter(_.getLong("count") > 1)
+      .filter(_.getLong(dupCount) > 1)
       .map { r =>
-        DuplicateBlobHash(r.getString("blob_hash"), r.getLong("count"))
+        DuplicateBlobHash(r.getString(meta.sha), r.getLong(dupCount))
       }
   }
 
@@ -213,12 +221,13 @@ object Gemini {
    * @return
    */
   def findAllDuplicateItems(conn: Session, keyspace: String): Iterable[Iterable[RepoFile]] = {
-    val distinctBlobHash = s"SELECT distinct blob_hash FROM ${keyspace}.${defaultTable}"
+    val hash = meta.sha
+    val distinctBlobHash = s"SELECT distinct ${hash} FROM ${keyspace}.${defaultTable}"
     conn
       .execute(new SimpleStatement(distinctBlobHash))
       .asScala
       .flatMap { r =>
-        val dupes = findDuplicateItemForBlobHash(r.getString("blob_hash"), conn, keyspace)
+        val dupes = findDuplicateItemForBlobHash(r.getString(hash), conn, keyspace)
         if (dupes.size > 1) {
           List(dupes)
         } else {
@@ -238,10 +247,10 @@ object Gemini {
 
   def findDuplicateItemForBlobHash(sha: String, conn: Session, keyspace: String): Iterable[RepoFile] = {
     val query = QueryBuilder.select().all().from(keyspace, defaultTable)
-      .where(QueryBuilder.eq("blob_hash", sha))
+      .where(QueryBuilder.eq(meta.sha, sha))
 
     conn.execute(query).asScala.map { row =>
-      RepoFile(row.getString("repo"), row.getString("ref_hash"), row.getString("file_path"), row.getString("blob_hash"))
+      RepoFile(row.getString(meta.repo), row.getString(meta.commit), row.getString(meta.path), row.getString(meta.sha))
     }
   }
 
diff --git a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala
index 6e015f89..1616b85f 100644
--- a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala
+++ b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala
@@ -140,7 +140,7 @@ class CassandraSparkSpec extends FlatSpec
 
   "Hash with limit" should "collect files only from limit repos" in {
     val gemini = Gemini(sparkSession)
-    val repos = gemini.hash("src/test/resources/siva", 1).select("repo").distinct().count()
+    val repos = gemini.hash("src/test/resources/siva", 1).select(Gemini.meta.repo).distinct().count()
     repos should be(1)
   }
 
From 568eead7fbf4f15b9fea63091d682863ed7370d5 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 16 Feb 2018 00:32:50 +0100
Subject: [PATCH 3/8] Refactoring: use companion object import

Signed-off-by: Alexander Bezzubov
---
 src/main/scala/tech/sourced/gemini/Gemini.scala | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index d95ff935..a3e4a165 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -54,7 +54,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
     log.info(s"Writing ${approxFileCount} files to DB")
     files.write
       .mode("append")
-      .cassandraFormat(Gemini.defaultTable, keyspace)
+      .cassandraFormat(defaultTable, keyspace)
       .save()
   }
 
@@ -73,11 +73,11 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
   def query(inPath: String, conn: Session): ReportByLine = {
     val path = new File(inPath)
     if (path.isDirectory) {
-      ReportByLine(Gemini.findDuplicateProjects(path, conn, keyspace))
+      ReportByLine(findDuplicateProjects(path, conn, keyspace))
       //TODO: implement based on Apolo
       //findSimilarProjects(path)
     } else {
-      ReportByLine(Gemini.findDuplicateItemForFile(path, conn, keyspace))
+      ReportByLine(findDuplicateItemForFile(path, conn, keyspace))
       //TODO: implement based on Apolo
       //findSimilarFiles(path)
     }
@@ -91,7 +91,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
    * @return
    */
   def report(conn: Session): ReportExpandedGroup = {
-    ReportExpandedGroup(Gemini.findAllDuplicateItems(conn, keyspace))
+    ReportExpandedGroup(findAllDuplicateItems(conn, keyspace))
   }
 
   /**
@@ -103,7 +103,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
    * @return
    */
   def reportCassandraCondensed(conn: Session): ReportGrouped = {
-    ReportGrouped(Gemini.findAllDuplicateBlobHashes(conn, keyspace))
+    ReportGrouped(findAllDuplicateBlobHashes(conn, keyspace))
   }
 
   /**
@@ -117,7 +117,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
   def reportCassandraGroupBy(conn: Session): ReportExpandedGroup = {
     val duplicates = reportCassandraCondensed(conn).v
       .map { item =>
-        Gemini.findDuplicateItemForBlobHash(item.sha, conn, keyspace)
+        findDuplicateItemForBlobHash(item.sha, conn, keyspace)
       }
     ReportExpandedGroup(duplicates)
   }
@@ -125,7 +125,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
   def applySchema(session: Session): Unit = {
     log.debug("CQL: creating schema")
     Source
-      .fromFile(Gemini.defaultSchemaFile)
+      .fromFile(defaultSchemaFile)
       .getLines
       .map(_.trim)
       .filter(!_.isEmpty)
@@ -242,7 +242,7 @@ object Gemini {
   }
 
   def findDuplicateItemForFile(file: File, conn: Session, keyspace: String): Iterable[RepoFile] = {
-    findDuplicateItemForBlobHash(Gemini.computeSha1(file), conn, keyspace)
+    findDuplicateItemForBlobHash(computeSha1(file), conn, keyspace)
   }
 
   def findDuplicateItemForBlobHash(sha: String, conn: Session, keyspace: String): Iterable[RepoFile] = {

From 21e8f83a4cc60c8ebc9b7e28a8e7175b9e5503fa Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 16 Feb 2018 01:48:00 +0100
Subject: [PATCH 4/8] Refactoring: cleanup the warnings

Signed-off-by: Alexander Bezzubov
---
 src/main/scala/tech/sourced/gemini/Gemini.scala    | 14 +++++++-------
 .../scala/tech/sourced/gemini/HashSparkApp.scala   |  2 +-
 src/main/scala/tech/sourced/gemini/Logger.scala    |  2 +-
 src/main/scala/tech/sourced/gemini/QueryApp.scala  |  4 ++--
 .../scala/tech/sourced/gemini/ReportSparkApp.scala |  7 +++----
 5 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index a3e4a165..300b810d 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -51,7 +51,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
 
   def save(files: DataFrame): Unit = {
     val approxFileCount = files.rdd.countApprox(10000L)
-    log.info(s"Writing ${approxFileCount} files to DB")
+    log.info(s"Writing $approxFileCount files to DB")
     files.write
       .mode("append")
       .cassandraFormat(defaultTable, keyspace)
       .save()
   }
 
@@ -144,12 +144,12 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
 }
 
 object URLFormatter {
-  val services = Map(
+  private val services = Map(
     "github.com" -> "https://%s/blob/%s/%s",
     "bitbucket.org" -> "https://%s/src/%s/%s",
     "gitlab.com" -> "https://%s/blob/%s/%s"
   )
-  val default = ("", "repo: %s ref_hash: %s file: %s")
+  private val default = ("", "repo: %s ref_hash: %s file: %s")
 
   def format(repo: String, ref_hash: String, file: String): String = {
     val urlTemplateByRepo = services.find { case (h, _) => repo.startsWith(h) }.getOrElse(default)._2
@@ -162,11 +162,11 @@ object URLFormatter {
 case class Meta(sha: String, repo: String, commit: String, path: String)
 
 case class RepoFile(repo: String, ref_hash: String, file: String, sha: String) {
-  override def toString(): String = URLFormatter.format(repo, ref_hash, file)
+  override def toString: String = URLFormatter.format(repo, ref_hash, file)
 }
 
 case class DuplicateBlobHash(sha: String, count: Long) {
-  override def toString(): String = s"$sha ($count duplicates)"
+  override def toString: String = s"$sha ($count duplicates)"
 }
 
 object Gemini {
@@ -203,7 +203,7 @@ object Gemini {
   def findAllDuplicateBlobHashes(conn: Session, keyspace: String): Iterable[DuplicateBlobHash] = {
     val hash = meta.sha
     val dupCount = "count"
-    val duplicatesCountCql = s"SELECT ${hash}, COUNT(*) as ${dupCount} FROM ${keyspace}.${defaultTable} GROUP BY ${hash}"
+    val duplicatesCountCql = s"SELECT $hash, COUNT(*) as $dupCount FROM $keyspace.$defaultTable GROUP BY $hash"
     conn
       .execute(new SimpleStatement(duplicatesCountCql))
       .asScala
@@ -222,7 +222,7 @@ object Gemini {
   def findAllDuplicateItems(conn: Session, keyspace: String): Iterable[Iterable[RepoFile]] = {
     val hash = meta.sha
-    val distinctBlobHash = s"SELECT distinct ${hash} FROM ${keyspace}.${defaultTable}"
+    val distinctBlobHash = s"SELECT distinct $hash FROM $keyspace.$defaultTable"
     conn
       .execute(new SimpleStatement(distinctBlobHash))
       .asScala
diff --git a/src/main/scala/tech/sourced/gemini/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/HashSparkApp.scala
index 88bd9ff0..14a27ea0 100644
--- a/src/main/scala/tech/sourced/gemini/HashSparkApp.scala
+++ b/src/main/scala/tech/sourced/gemini/HashSparkApp.scala
@@ -66,7 +66,7 @@ object HashSparkApp extends App with Logging {
     .getOrCreate()
 
   if (config.verbose) {
-    LogManager.getRootLogger().setLevel(Level.INFO)
+    LogManager.getRootLogger.setLevel(Level.INFO)
   }
 
   val repos = listRepositories(reposPath, config.format, spark.sparkContext.hadoopConfiguration, config.limit)
diff --git a/src/main/scala/tech/sourced/gemini/Logger.scala b/src/main/scala/tech/sourced/gemini/Logger.scala
index aaaac0bb..8c5d2f81 100644
--- a/src/main/scala/tech/sourced/gemini/Logger.scala
+++ b/src/main/scala/tech/sourced/gemini/Logger.scala
@@ -7,7 +7,7 @@ import org.slf4j.{Logger => Slf4jLogger}
 object Logger {
   def apply(name: String, verbose: Boolean = false): Slf4jLogger = {
     if (verbose) {
-      LogManager.getRootLogger().setLevel(Level.INFO)
+      LogManager.getRootLogger.setLevel(Level.INFO)
     }
     LoggerFactory.getLogger(name)
   }
diff --git a/src/main/scala/tech/sourced/gemini/QueryApp.scala b/src/main/scala/tech/sourced/gemini/QueryApp.scala
index d78807e9..7ca9299d 100644
--- a/src/main/scala/tech/sourced/gemini/QueryApp.scala
+++ b/src/main/scala/tech/sourced/gemini/QueryApp.scala
@@ -48,8 +48,8 @@ object QueryApp extends App {
 
     val similar = gemini.query(file, cassandra).v
 
-    cassandra.close
-    cluster.close
+    cassandra.close()
+    cluster.close()
 
     if (similar.isEmpty) {
       println(s"No duplicates of $file found.")
diff --git a/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala b/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala
index 4b0ccf1a..03e7609a 100644
--- a/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala
+++ b/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala
@@ -53,8 +53,8 @@
         case `groupByMode` => gemini.reportCassandraGroupBy(cassandra)
       }
 
-      cassandra.close
-      cluster.close
+      cassandra.close()
+      cluster.close()
 
       print(report)
     case None =>
@@ -65,12 +65,11 @@
   report match {
     case e if e.empty() => println(s"No duplicates found.")
     case ReportGrouped(v) => println(s"Duplicates found:\n\t" + (v mkString "\n\t"))
-    case ReportExpandedGroup(v) => {
+    case ReportExpandedGroup(v) =>
       v.foreach { item =>
         val count = item.size
        println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n")
       }
-    }
   }
 }

From 9fe7b99f0cca4fcc0eeca40c45195883267d1b3e Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 2 Mar 2018 17:42:26 +0100
Subject: [PATCH 5/8] Refactoring: Scala - column names match Apollo

Signed-off-by: Alexander Bezzubov
---
 src/main/resources/schema.cql                   | 2 +-
 src/main/scala/tech/sourced/gemini/Gemini.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/resources/schema.cql b/src/main/resources/schema.cql
index 2ed367d2..e540ee02 100644
--- a/src/main/resources/schema.cql
+++ b/src/main/resources/schema.cql
@@ -1,3 +1,3 @@
 CREATE KEYSPACE IF NOT EXISTS __KEYSPACE__ WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
 USE __KEYSPACE__;
-CREATE TABLE IF NOT EXISTS __KEYSPACE__.blob_hash_files (blob_hash ascii, repo text, ref_hash ascii, file_path text, PRIMARY KEY (blob_hash, repo, ref_hash, file_path));
+CREATE TABLE IF NOT EXISTS __KEYSPACE__.blob_hash_files (sha1 ascii, repo text, commit ascii, path text, PRIMARY KEY (sha1, repo, commit, path));
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 300b810d..9bcf0f36 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -178,7 +178,7 @@ object Gemini {
   val defaultTable: String = "blob_hash_files"
 
   //TODO(bzz): switch to `tables("meta")`
-  val meta = Meta("blob_hash", "repo", "ref_hash", "file_path")
+  val meta = Meta("sha1", "repo", "commit", "path")
 
   val formatter = new ObjectInserter.Formatter
 
From 1ca19128b228129a2a416e16015d97d288ab5d41 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 2 Mar 2018 17:57:03 +0100
Subject: [PATCH 6/8] Refactoring: change var names to follow the schema

Signed-off-by: Alexander Bezzubov
---
 .gitignore                                             |  1 +
 src/main/scala/tech/sourced/gemini/Gemini.scala        | 10 +++++-----
 .../scala/tech/sourced/gemini/CassandraSparkSpec.scala |  6 +++---
 .../scala/tech/sourced/gemini/URLFormatterSpec.scala   |  8 +++++---
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2b8e7391..94c64753 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 ~*
 target/
 .idea/
+.vscode
 metastore_db/
 scylla
 
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 9bcf0f36..4c7e86a0 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -149,20 +149,20 @@ object URLFormatter {
     "bitbucket.org" -> "https://%s/src/%s/%s",
     "gitlab.com" -> "https://%s/blob/%s/%s"
   )
-  private val default = ("", "repo: %s ref_hash: %s file: %s")
file: %s") + private val default = ("", "repo: %s commit: %s path: %s") - def format(repo: String, ref_hash: String, file: String): String = { + def format(repo: String, commit: String, path: String): String = { val urlTemplateByRepo = services.find { case (h, _) => repo.startsWith(h) }.getOrElse(default)._2 val repoWithoutSuffix = repo.replaceFirst("\\.git$", "") - urlTemplateByRepo.format(repoWithoutSuffix, ref_hash, file) + urlTemplateByRepo.format(repoWithoutSuffix, commit, path) } } case class Meta(sha: String, repo: String, commit: String, path: String) -case class RepoFile(repo: String, ref_hash: String, file: String, sha: String) { - override def toString: String = URLFormatter.format(repo, ref_hash, file) +case class RepoFile(repo: String, commit: String, path: String, sha: String) { + override def toString: String = URLFormatter.format(repo, commit, path) } case class DuplicateBlobHash(sha: String, count: Long) { diff --git a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala index 1616b85f..342266b2 100644 --- a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala +++ b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala @@ -82,7 +82,7 @@ class CassandraSparkSpec extends FlatSpec sha1.v should not be empty sha1.v.head.sha should be("097f4a292c384e002c5b5ce8e15d746849af7b37") // git hash-object -w LICENSE sha1.v.head.repo should be("null/Users/alex/src-d/gemini") - sha1.v.head.ref_hash should be("4aa29ac236c55ebbfbef149fef7054d25832717f") + sha1.v.head.commit should be("4aa29ac236c55ebbfbef149fef7054d25832717f") } "Query for duplicates in single repository" should "return 2 files" in { @@ -113,7 +113,7 @@ class CassandraSparkSpec extends FlatSpec val detailedReport = gemini.reportCassandraGroupBy(session).v println("Done") - val duplicatedFileNames = detailedReport map (_.head.file) + val duplicatedFileNames = detailedReport map (_.head.path) duplicatedFileNames.toSeq should contain theSameElementsAs expectedDuplicateFiles } @@ -124,7 +124,7 @@ class CassandraSparkSpec extends FlatSpec val detailedReport = gemini.report(session).v println("Done") - val duplicatedFileNames = detailedReport map (_.head.file) + val duplicatedFileNames = detailedReport map (_.head.path) duplicatedFileNames.toSeq should contain theSameElementsAs expectedDuplicateFiles } diff --git a/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala b/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala index 218659d8..d05138fa 100644 --- a/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala +++ b/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala @@ -6,16 +6,18 @@ class URLFormatterSpec extends FlatSpec with Matchers { "URLFormatter" should "format correctly" in { - case class Input(repo: String, ref_hash: String, file: String) + case class Input(repo: String, commit: String, path: String) val cases = Map( Input("github.com/src-d/test", "sha1", "path/file") -> "https://github.com/src-d/test/blob/sha1/path/file", Input("gitlab.com/src-d/test", "sha1", "path/file") -> "https://gitlab.com/src-d/test/blob/sha1/path/file", Input("bitbucket.org/src-d/test", "sha1", "path/file") -> "https://bitbucket.org/src-d/test/src/sha1/path/file", Input("github.com/src-d/test.git", "sha1", "path/file") -> "https://github.com/src-d/test/blob/sha1/path/file", - Input("unknown", "sha1", "path/file") -> "repo: unknown ref_hash: sha1 file: path/file" + Input("unknown", "sha1", "path/file") -> "repo: unknown commit: sha1 path: 
     )
 
-    for ((i, expected) <- cases) URLFormatter.format(i.repo, i.ref_hash, i.file) should be(expected)
+    for ((i, expected) <- cases) {
+      URLFormatter.format(i.repo, i.commit, i.path) should be(expected)
+    }
   }
 }

From 557f8c167ebc4d30e2910a339c4425efe068ea78 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 6 Mar 2018 22:42:58 +0100
Subject: [PATCH 7/8] Refactoring: Go - column names match Apollo

Signed-off-by: Alexander Bezzubov
---
 src/main/go/query.go | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/main/go/query.go b/src/main/go/query.go
index c4ab967b..bc5093a7 100644
--- a/src/main/go/query.go
+++ b/src/main/go/query.go
@@ -16,9 +16,10 @@ import (
 
 // BlobHash is a single blob inside a repository
 type BlobHash struct {
-    BlobHash string
-    Repo     string
-    FilePath string
+    Sha1   string
+    Commit string
+    Repo   string
+    Path   string
 }
 
 const (
@@ -42,11 +43,11 @@ func main() {
     defer session.Close()
 
     stmt, names := qb.Select(fmt.Sprintf("%s.%s", defaultKeyspace, defaultTable)).
-        Where(qb.In("blob_hash")).
+        Where(qb.In("sha1")).
         ToCql()
 
     q := gocqlx.Query(session.Query(stmt), names).BindMap(qb.M{
-        "blob_hash": []string{hash},
+        "sha1": []string{hash},
     })
     defer q.Release()
 
From 8f734c50d3736f403384a03eccdd88d585dc9256 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 6 Mar 2018 22:43:28 +0100
Subject: [PATCH 8/8] Go - add query to CI integration test

Signed-off-by: Alexander Bezzubov
---
 .travis.yml          | 2 ++
 src/main/go/query.go | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 12f56590..f845a048 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,6 +28,8 @@ matrix:
       - ./hash src/test/resources/siva || travis_terminate 1
      - ./query ./LICENSE
       - ./report
+      - go get ./src/main/go/... || true
+      - go run ./src/main/go/query.go ./LICENSE
 before_deploy:
   - VERSION=$TRAVIS_TAG ./scripts/release.sh
 deploy:
diff --git a/src/main/go/query.go b/src/main/go/query.go
index bc5093a7..e2e1d5d0 100644
--- a/src/main/go/query.go
+++ b/src/main/go/query.go
@@ -53,12 +53,16 @@ func main() {
 
     var similarHashes []BlobHash
     if err := gocqlx.Select(&similarHashes, q.Query); err != nil {
-        log.Fatal("select:", err)
+        log.Fatalf("select: %v in %s", err, q.Query)
     }
 
     for _, hash := range similarHashes {
         fmt.Printf("\t%+v\n", hash)
     }
+
+    if len(similarHashes) == 0 {
+        os.Exit(2)
+    }
 }
 
 // connect to the cluster
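
---

For reference, the end state of this series can also be exercised without gocqlx. Below is a minimal sketch in plain gocql, assuming a local Cassandra or ScyllaDB node with the schema from PATCH 5 applied and a populated `hashes` keyspace; the example sha1 is the LICENSE hash used in CassandraSparkSpec, and the file path is hypothetical for any other setup:

```
package main

import (
	"fmt"
	"log"

	"github.com/gocql/gocql"
)

func main() {
	// Assumes a local node with src/main/resources/schema.cql applied
	// (keyspace "hashes", table blob_hash_files keyed by sha1).
	cluster := gocql.NewCluster("127.0.0.1")
	cluster.Keyspace = "hashes"
	session, err := cluster.CreateSession()
	if err != nil {
		log.Fatalf("can not connect: %v", err)
	}
	defer session.Close()

	// sha1 of LICENSE from the test fixtures (git hash-object -w LICENSE).
	sha1 := "097f4a292c384e002c5b5ce8e15d746849af7b37"

	// blob_hash_files is partitioned by sha1, so this reads one partition.
	iter := session.Query(
		`SELECT repo, commit, path FROM blob_hash_files WHERE sha1 = ?`,
		sha1,
	).Iter()

	var repo, commit, path string
	for iter.Scan(&repo, &commit, &path) {
		fmt.Printf("%s %s %s\n", repo, commit, path)
	}
	if err := iter.Close(); err != nil {
		log.Fatalf("query failed: %v", err)
	}
}
```

Since the table's partition key is `sha1`, this single-partition lookup is the same access pattern used by the Go client above and by `Gemini.findDuplicateItemForBlobHash`; like the CI step added in PATCH 8, an empty result can be turned into a non-zero exit code to signal that no duplicates were found.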