From 4ac456c2b13eb15ecdbbe8af223656ea0cb45a9c Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Mon, 12 Feb 2018 20:23:51 +0100
Subject: [PATCH 1/8] Refactoring: extract table name

Signed-off-by: Alexander Bezzubov
---
 src/main/go/README.md                           |  8 +++++++-
 src/main/go/query.go                            | 12 +++++++++---
 src/main/scala/tech/sourced/gemini/Gemini.scala | 12 +++++++-----
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/main/go/README.md b/src/main/go/README.md
index 36e9c7ec..5bc07e12 100644
--- a/src/main/go/README.md
+++ b/src/main/go/README.md
@@ -8,5 +8,11 @@ Trivial example of client application in Golang to query for same files.
 Add path to the file you want to search duplicates for and
 
 ```
-go run query.go
+go run src/main/go/query.go
+```
+
+or, to test a connection to the DB, use
+
+```
+go run -tags="gocql_debug" src/main/go/connect.go
 ```
diff --git a/src/main/go/query.go b/src/main/go/query.go
index e6b34765..c4ab967b 100644
--- a/src/main/go/query.go
+++ b/src/main/go/query.go
@@ -14,12 +14,18 @@ import (
     "github.com/scylladb/gocqlx/qb"
 )
 
+// BlobHash is a single blob inside a repository
 type BlobHash struct {
     BlobHash string
     Repo     string
     FilePath string
 }
 
+const (
+    defaultKeyspace = "hashes"
+    defaultTable    = "blob_hash_files"
+)
+
 func main() {
     flag.Parse()
     args := flag.Args()
@@ -35,7 +41,7 @@ func main() {
     session := connect()
     defer session.Close()
 
-    stmt, names := qb.Select("hashes.blob_hash_files").
+    stmt, names := qb.Select(fmt.Sprintf("%s.%s", defaultKeyspace, defaultTable)).
         Where(qb.In("blob_hash")).
         ToCql()
 
@@ -58,7 +64,7 @@ func main() {
 func connect() *gocql.Session {
     node := "127.0.0.1"
     cluster := gocql.NewCluster(node)
-    cluster.Keyspace = "hashes"
+    cluster.Keyspace = defaultKeyspace
     session, err := cluster.CreateSession()
     if err != nil {
         log.Fatalf("Can not create connection to %s, %v", node, err)
@@ -75,7 +81,7 @@ func sha1hash(file string) string {
     f, err := os.Open(file)
     if err != nil {
-        log.Fatal("Can not open a file %s", file, err)
+        log.Fatalf("Can not open a file %s, err: %+v", file, err)
     }
     defer f.Close()
 
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 20bcb8dc..581451b2 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -49,10 +49,11 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
       .withColumnRenamed("path", "file_path")
 
   def save(files: DataFrame): Unit = {
-    log.info(s"Writing ${files.rdd.countApprox(10000L)} files to DB")
+    val approxFileCount = files.rdd.countApprox(10000L)
+    log.info(s"Writing ${approxFileCount} files to DB")
     files.write
       .mode("append")
-      .cassandraFormat("blob_hash_files", keyspace)
+      .cassandraFormat(Gemini.defaultTable, keyspace)
       .save()
   }
 
@@ -171,6 +172,7 @@ object Gemini {
   val defaultCassandraPort: Int = 9042
   val defaultSchemaFile: String = "src/main/resources/schema.cql"
   val defautKeyspace: String = "hashes"
+  val defaultTable: String = "blob_hash_files"
 
   val formatter = new ObjectInserter.Formatter
 
@@ -193,7 +195,7 @@ object Gemini {
    * @return
    */
   def findAllDuplicateBlobHashes(conn: Session, keyspace: String): Iterable[DuplicateBlobHash] = {
-    val duplicatesCountCql = s"SELECT blob_hash, COUNT(*) as count FROM ${keyspace}.blob_hash_files GROUP BY blob_hash"
+    val duplicatesCountCql = s"SELECT blob_hash, COUNT(*) as count FROM ${keyspace}.${defaultTable} GROUP BY blob_hash"
     conn
       .execute(new SimpleStatement(duplicatesCountCql))
       .asScala
@@ -211,7 +213,7 @@ object Gemini {
    * @return
    */
   def findAllDuplicateItems(conn: Session, keyspace: String): Iterable[Iterable[RepoFile]] = {
-    val distinctBlobHash = s"SELECT distinct blob_hash FROM ${keyspace}.blob_hash_files"
+    val distinctBlobHash = s"SELECT distinct blob_hash FROM ${keyspace}.${defaultTable}"
     conn
       .execute(new SimpleStatement(distinctBlobHash))
       .asScala
@@ -235,7 +237,7 @@ object Gemini {
   }
 
   def findDuplicateItemForBlobHash(sha: String, conn: Session, keyspace: String): Iterable[RepoFile] = {
-    val query = QueryBuilder.select().all().from(keyspace, "blob_hash_files")
+    val query = QueryBuilder.select().all().from(keyspace, defaultTable)
       .where(QueryBuilder.eq("blob_hash", sha))
 
     conn.execute(query).asScala.map { row =>

From acadc562276155e7f8e49d9b90513d91fb3cde5a Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Wed, 14 Feb 2018 16:37:09 +0100
Subject: [PATCH 2/8] Refactoring: extract column names

Signed-off-by: Alexander Bezzubov
---
 .../scala/tech/sourced/gemini/Gemini.scala  | 31 ++++++++++++-------
 .../sourced/gemini/CassandraSparkSpec.scala |  2 +-
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 581451b2..d95ff935 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -16,6 +16,7 @@ import scala.io.Source
 
 class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.defautKeyspace) {
 
+  import Gemini._
   import session.implicits._
 
   def hash(reposPath: String, limit: Int = 0, format: String = "siva"): DataFrame = {
@@ -43,10 +44,10 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
       .getTreeEntries
       .getBlobs
       .select("blob_id", "repository_id", "commit_hash", "path")
-      .withColumnRenamed("blob_id", "blob_hash")
-      .withColumnRenamed("repository_id", "repo")
-      .withColumnRenamed("commit_hash", "ref_hash")
-      .withColumnRenamed("path", "file_path")
+      .withColumnRenamed("blob_id", meta.sha)
+      .withColumnRenamed("repository_id", meta.repo)
+      .withColumnRenamed("commit_hash", meta.commit)
+      .withColumnRenamed("path", meta.path)
 
   def save(files: DataFrame): Unit = {
     val approxFileCount = files.rdd.countApprox(10000L)
@@ -158,6 +159,8 @@ object URLFormatter {
   }
 }
 
+case class Meta(sha: String, repo: String, commit: String, path: String)
+
 case class RepoFile(repo: String, ref_hash: String, file: String, sha: String) {
   override def toString(): String = URLFormatter.format(repo, ref_hash, file)
 }
@@ -174,6 +177,9 @@ object Gemini {
   val defautKeyspace: String = "hashes"
   val defaultTable: String = "blob_hash_files"
 
+  //TODO(bzz): switch to `tables("meta")`
+  val meta = Meta("blob_hash", "repo", "ref_hash", "file_path")
+
   val formatter = new ObjectInserter.Formatter
 
   def apply(ss: SparkSession, log: Slf4jLogger = Logger("gemini"), keyspace: String = defautKeyspace): Gemini =
@@ -195,13 +201,15 @@ object Gemini {
    * @return
    */
   def findAllDuplicateBlobHashes(conn: Session, keyspace: String): Iterable[DuplicateBlobHash] = {
-    val duplicatesCountCql = s"SELECT blob_hash, COUNT(*) as count FROM ${keyspace}.${defaultTable} GROUP BY blob_hash"
+    val hash = meta.sha
+    val dupCount = "count"
+    val duplicatesCountCql = s"SELECT ${hash}, COUNT(*) as ${dupCount} FROM ${keyspace}.${defaultTable} GROUP BY ${hash}"
     conn
       .execute(new SimpleStatement(duplicatesCountCql))
       .asScala
-      .filter(_.getLong("count") > 1)
+      .filter(_.getLong(dupCount) > 1)
       .map { r =>
-        DuplicateBlobHash(r.getString("blob_hash"), r.getLong("count"))
+        DuplicateBlobHash(r.getString(meta.sha), r.getLong(dupCount))
       }
   }
 
@@ -213,12 +221,13 @@ object Gemini {
    * @return
    */
   def findAllDuplicateItems(conn: Session, keyspace: String): Iterable[Iterable[RepoFile]] = {
-    val distinctBlobHash = s"SELECT distinct blob_hash FROM ${keyspace}.${defaultTable}"
+    val hash = meta.sha
+    val distinctBlobHash = s"SELECT distinct ${hash} FROM ${keyspace}.${defaultTable}"
     conn
       .execute(new SimpleStatement(distinctBlobHash))
       .asScala
       .flatMap { r =>
-        val dupes = findDuplicateItemForBlobHash(r.getString("blob_hash"), conn, keyspace)
+        val dupes = findDuplicateItemForBlobHash(r.getString(hash), conn, keyspace)
         if (dupes.size > 1) {
           List(dupes)
         } else {
@@ -238,10 +247,10 @@ object Gemini {
 
   def findDuplicateItemForBlobHash(sha: String, conn: Session, keyspace: String): Iterable[RepoFile] = {
     val query = QueryBuilder.select().all().from(keyspace, defaultTable)
-      .where(QueryBuilder.eq("blob_hash", sha))
+      .where(QueryBuilder.eq(meta.sha, sha))
 
     conn.execute(query).asScala.map { row =>
-      RepoFile(row.getString("repo"), row.getString("ref_hash"), row.getString("file_path"), row.getString("blob_hash"))
+      RepoFile(row.getString(meta.repo), row.getString(meta.commit), row.getString(meta.path), row.getString(meta.sha))
     }
   }
 
diff --git a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala
index 6e015f89..1616b85f 100644
--- a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala
+++ b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala
@@ -140,7 +140,7 @@ class CassandraSparkSpec extends FlatSpec
 
   "Hash with limit" should "collect files only from limit repos" in {
     val gemini = Gemini(sparkSession)
-    val repos = gemini.hash("src/test/resources/siva", 1).select("repo").distinct().count()
+    val repos = gemini.hash("src/test/resources/siva", 1).select(Gemini.meta.repo).distinct().count()
     repos should be(1)
   }
 
From 568eead7fbf4f15b9fea63091d682863ed7370d5 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 16 Feb 2018 00:32:50 +0100
Subject: [PATCH 3/8] Refactoring: use companion object import

Signed-off-by: Alexander Bezzubov
---
 src/main/scala/tech/sourced/gemini/Gemini.scala | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index d95ff935..a3e4a165 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -54,7 +54,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
     log.info(s"Writing ${approxFileCount} files to DB")
     files.write
       .mode("append")
-      .cassandraFormat(Gemini.defaultTable, keyspace)
+      .cassandraFormat(defaultTable, keyspace)
       .save()
   }
 
@@ -73,11 +73,11 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
   def query(inPath: String, conn: Session): ReportByLine = {
     val path = new File(inPath)
     if (path.isDirectory) {
-      ReportByLine(Gemini.findDuplicateProjects(path, conn, keyspace))
+      ReportByLine(findDuplicateProjects(path, conn, keyspace))
       //TODO: implement based on Apolo
       //findSimilarProjects(path)
     } else {
-      ReportByLine(Gemini.findDuplicateItemForFile(path, conn, keyspace))
+      ReportByLine(findDuplicateItemForFile(path, conn, keyspace))
       //TODO: implement based on Apolo
       //findSimilarFiles(path)
     }
@@ -91,7 +91,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
    * @return
    */
   def report(conn: Session): ReportExpandedGroup = {
-    ReportExpandedGroup(Gemini.findAllDuplicateItems(conn, keyspace))
+    ReportExpandedGroup(findAllDuplicateItems(conn, keyspace))
   }
 
   /**
@@ -103,7 +103,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
    * @return
    */
   def reportCassandraCondensed(conn: Session): ReportGrouped = {
-    ReportGrouped(Gemini.findAllDuplicateBlobHashes(conn, keyspace))
+    ReportGrouped(findAllDuplicateBlobHashes(conn, keyspace))
   }
 
   /**
@@ -117,7 +117,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
   def reportCassandraGroupBy(conn: Session): ReportExpandedGroup = {
     val duplicates = reportCassandraCondensed(conn).v
       .map { item =>
-        Gemini.findDuplicateItemForBlobHash(item.sha, conn, keyspace)
+        findDuplicateItemForBlobHash(item.sha, conn, keyspace)
       }
     ReportExpandedGroup(duplicates)
   }
@@ -125,7 +125,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
   def applySchema(session: Session): Unit = {
     log.debug("CQL: creating schema")
     Source
-      .fromFile(Gemini.defaultSchemaFile)
+      .fromFile(defaultSchemaFile)
       .getLines
       .map(_.trim)
       .filter(!_.isEmpty)
@@ -242,7 +242,7 @@ object Gemini {
   }
 
   def findDuplicateItemForFile(file: File, conn: Session, keyspace: String): Iterable[RepoFile] = {
-    findDuplicateItemForBlobHash(Gemini.computeSha1(file), conn, keyspace)
+    findDuplicateItemForBlobHash(computeSha1(file), conn, keyspace)
   }
 
   def findDuplicateItemForBlobHash(sha: String, conn: Session, keyspace: String): Iterable[RepoFile] = {

From 21e8f83a4cc60c8ebc9b7e28a8e7175b9e5503fa Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 16 Feb 2018 01:48:00 +0100
Subject: [PATCH 4/8] Refactoring: cleanup the warnings

Signed-off-by: Alexander Bezzubov
---
 src/main/scala/tech/sourced/gemini/Gemini.scala    | 14 +++++++-------
 .../scala/tech/sourced/gemini/HashSparkApp.scala   |  2 +-
 src/main/scala/tech/sourced/gemini/Logger.scala    |  2 +-
 src/main/scala/tech/sourced/gemini/QueryApp.scala  |  4 ++--
 .../scala/tech/sourced/gemini/ReportSparkApp.scala |  7 +++----
 5 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index a3e4a165..300b810d 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -51,7 +51,7 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
 
   def save(files: DataFrame): Unit = {
     val approxFileCount = files.rdd.countApprox(10000L)
-    log.info(s"Writing ${approxFileCount} files to DB")
+    log.info(s"Writing $approxFileCount files to DB")
     files.write
       .mode("append")
       .cassandraFormat(defaultTable, keyspace)
       .save()
   }
 
@@ -144,12 +144,12 @@ class Gemini(session: SparkSession, log: Slf4jLogger, keyspace: String = Gemini.
 }
 
 object URLFormatter {
-  val services = Map(
+  private val services = Map(
     "github.com" -> "https://%s/blob/%s/%s",
     "bitbucket.org" -> "https://%s/src/%s/%s",
     "gitlab.com" -> "https://%s/blob/%s/%s"
   )
-  val default = ("", "repo: %s ref_hash: %s file: %s")
+  private val default = ("", "repo: %s ref_hash: %s file: %s")
 
   def format(repo: String, ref_hash: String, file: String): String = {
     val urlTemplateByRepo = services.find { case (h, _) => repo.startsWith(h) }.getOrElse(default)._2
@@ -162,11 +162,11 @@ object URLFormatter {
 case class Meta(sha: String, repo: String, commit: String, path: String)
 
 case class RepoFile(repo: String, ref_hash: String, file: String, sha: String) {
-  override def toString(): String = URLFormatter.format(repo, ref_hash, file)
+  override def toString: String = URLFormatter.format(repo, ref_hash, file)
 }
 
 case class DuplicateBlobHash(sha: String, count: Long) {
-  override def toString(): String = s"$sha ($count duplicates)"
+  override def toString: String = s"$sha ($count duplicates)"
 }
 
 object Gemini {
@@ -203,7 +203,7 @@ object Gemini {
   def findAllDuplicateBlobHashes(conn: Session, keyspace: String): Iterable[DuplicateBlobHash] = {
     val hash = meta.sha
     val dupCount = "count"
-    val duplicatesCountCql = s"SELECT ${hash}, COUNT(*) as ${dupCount} FROM ${keyspace}.${defaultTable} GROUP BY ${hash}"
+    val duplicatesCountCql = s"SELECT $hash, COUNT(*) as $dupCount FROM $keyspace.$defaultTable GROUP BY $hash"
     conn
       .execute(new SimpleStatement(duplicatesCountCql))
       .asScala
@@ -222,7 +222,7 @@ object Gemini {
   def findAllDuplicateItems(conn: Session, keyspace: String): Iterable[Iterable[RepoFile]] = {
     val hash = meta.sha
-    val distinctBlobHash = s"SELECT distinct ${hash} FROM ${keyspace}.${defaultTable}"
+    val distinctBlobHash = s"SELECT distinct $hash FROM $keyspace.$defaultTable"
     conn
       .execute(new SimpleStatement(distinctBlobHash))
       .asScala
diff --git a/src/main/scala/tech/sourced/gemini/HashSparkApp.scala b/src/main/scala/tech/sourced/gemini/HashSparkApp.scala
index 88bd9ff0..14a27ea0 100644
--- a/src/main/scala/tech/sourced/gemini/HashSparkApp.scala
+++ b/src/main/scala/tech/sourced/gemini/HashSparkApp.scala
@@ -66,7 +66,7 @@ object HashSparkApp extends App with Logging {
     .getOrCreate()
 
   if (config.verbose) {
-    LogManager.getRootLogger().setLevel(Level.INFO)
+    LogManager.getRootLogger.setLevel(Level.INFO)
   }
 
   val repos = listRepositories(reposPath, config.format, spark.sparkContext.hadoopConfiguration, config.limit)
diff --git a/src/main/scala/tech/sourced/gemini/Logger.scala b/src/main/scala/tech/sourced/gemini/Logger.scala
index aaaac0bb..8c5d2f81 100644
--- a/src/main/scala/tech/sourced/gemini/Logger.scala
+++ b/src/main/scala/tech/sourced/gemini/Logger.scala
@@ -7,7 +7,7 @@ import org.slf4j.{Logger => Slf4jLogger}
 object Logger {
   def apply(name: String, verbose: Boolean = false): Slf4jLogger = {
     if (verbose) {
-      LogManager.getRootLogger().setLevel(Level.INFO)
+      LogManager.getRootLogger.setLevel(Level.INFO)
     }
     LoggerFactory.getLogger(name)
   }
diff --git a/src/main/scala/tech/sourced/gemini/QueryApp.scala b/src/main/scala/tech/sourced/gemini/QueryApp.scala
index d78807e9..7ca9299d 100644
--- a/src/main/scala/tech/sourced/gemini/QueryApp.scala
+++ b/src/main/scala/tech/sourced/gemini/QueryApp.scala
@@ -48,8 +48,8 @@ object QueryApp extends App {
 
     val similar = gemini.query(file, cassandra).v
 
-    cassandra.close
-    cluster.close
+    cassandra.close()
+    cluster.close()
 
     if (similar.isEmpty) {
       println(s"No duplicates of $file found.")
diff --git a/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala b/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala
index 4b0ccf1a..03e7609a 100644
--- a/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala
+++ b/src/main/scala/tech/sourced/gemini/ReportSparkApp.scala
@@ -53,8 +53,8 @@
         case `groupByMode` => gemini.reportCassandraGroupBy(cassandra)
       }
 
-      cassandra.close
-      cluster.close
+      cassandra.close()
+      cluster.close()
 
       print(report)
     case None =>
@@ -65,12 +65,11 @@
   report match {
     case e if e.empty() => println(s"No duplicates found.")
     case ReportGrouped(v) => println(s"Duplicates found:\n\t" + (v mkString "\n\t"))
-    case ReportExpandedGroup(v) => {
+    case ReportExpandedGroup(v) =>
       v.foreach { item =>
         val count = item.size
        println(s"$count duplicates:\n\t" + (item mkString "\n\t") + "\n")
       }
-    }
   }
 }

From 9fe7b99f0cca4fcc0eeca40c45195883267d1b3e Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 2 Mar 2018 17:42:26 +0100
Subject: [PATCH 5/8] Refactoring: Scala - column names match Apollo

Signed-off-by: Alexander Bezzubov
---
 src/main/resources/schema.cql                   | 2 +-
 src/main/scala/tech/sourced/gemini/Gemini.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main/resources/schema.cql b/src/main/resources/schema.cql
index 2ed367d2..e540ee02 100644
--- a/src/main/resources/schema.cql
+++ b/src/main/resources/schema.cql
@@ -1,3 +1,3 @@
 CREATE KEYSPACE IF NOT EXISTS __KEYSPACE__ WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
 USE __KEYSPACE__;
-CREATE TABLE IF NOT EXISTS __KEYSPACE__.blob_hash_files (blob_hash ascii, repo text, ref_hash ascii, file_path text, PRIMARY KEY (blob_hash, repo, ref_hash, file_path));
+CREATE TABLE IF NOT EXISTS __KEYSPACE__.blob_hash_files (sha1 ascii, repo text, commit ascii, path text, PRIMARY KEY (sha1, repo, commit, path));
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 300b810d..9bcf0f36 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -178,7 +178,7 @@ object Gemini {
   val defaultTable: String = "blob_hash_files"
 
   //TODO(bzz): switch to `tables("meta")`
-  val meta = Meta("blob_hash", "repo", "ref_hash", "file_path")
+  val meta = Meta("sha1", "repo", "commit", "path")
 
   val formatter = new ObjectInserter.Formatter
 
From 1ca19128b228129a2a416e16015d97d288ab5d41 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Fri, 2 Mar 2018 17:57:03 +0100
Subject: [PATCH 6/8] Refactoring: change var names to follow the schema

Signed-off-by: Alexander Bezzubov
---
 .gitignore                                             |  1 +
 src/main/scala/tech/sourced/gemini/Gemini.scala        | 10 +++++-----
 .../scala/tech/sourced/gemini/CassandraSparkSpec.scala |  6 +++---
 .../scala/tech/sourced/gemini/URLFormatterSpec.scala   |  8 +++++---
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2b8e7391..94c64753 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@
 ~*
 target/
 .idea/
+.vscode
 metastore_db/
 scylla
 
diff --git a/src/main/scala/tech/sourced/gemini/Gemini.scala b/src/main/scala/tech/sourced/gemini/Gemini.scala
index 9bcf0f36..4c7e86a0 100644
--- a/src/main/scala/tech/sourced/gemini/Gemini.scala
+++ b/src/main/scala/tech/sourced/gemini/Gemini.scala
@@ -149,20 +149,20 @@ object URLFormatter {
     "bitbucket.org" -> "https://%s/src/%s/%s",
     "gitlab.com" -> "https://%s/blob/%s/%s"
   )
-  private val default = ("", "repo: %s ref_hash: %s file: %s")
file: %s") + private val default = ("", "repo: %s commit: %s path: %s") - def format(repo: String, ref_hash: String, file: String): String = { + def format(repo: String, commit: String, path: String): String = { val urlTemplateByRepo = services.find { case (h, _) => repo.startsWith(h) }.getOrElse(default)._2 val repoWithoutSuffix = repo.replaceFirst("\\.git$", "") - urlTemplateByRepo.format(repoWithoutSuffix, ref_hash, file) + urlTemplateByRepo.format(repoWithoutSuffix, commit, path) } } case class Meta(sha: String, repo: String, commit: String, path: String) -case class RepoFile(repo: String, ref_hash: String, file: String, sha: String) { - override def toString: String = URLFormatter.format(repo, ref_hash, file) +case class RepoFile(repo: String, commit: String, path: String, sha: String) { + override def toString: String = URLFormatter.format(repo, commit, path) } case class DuplicateBlobHash(sha: String, count: Long) { diff --git a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala index 1616b85f..342266b2 100644 --- a/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala +++ b/src/test/scala/tech/sourced/gemini/CassandraSparkSpec.scala @@ -82,7 +82,7 @@ class CassandraSparkSpec extends FlatSpec sha1.v should not be empty sha1.v.head.sha should be("097f4a292c384e002c5b5ce8e15d746849af7b37") // git hash-object -w LICENSE sha1.v.head.repo should be("null/Users/alex/src-d/gemini") - sha1.v.head.ref_hash should be("4aa29ac236c55ebbfbef149fef7054d25832717f") + sha1.v.head.commit should be("4aa29ac236c55ebbfbef149fef7054d25832717f") } "Query for duplicates in single repository" should "return 2 files" in { @@ -113,7 +113,7 @@ class CassandraSparkSpec extends FlatSpec val detailedReport = gemini.reportCassandraGroupBy(session).v println("Done") - val duplicatedFileNames = detailedReport map (_.head.file) + val duplicatedFileNames = detailedReport map (_.head.path) duplicatedFileNames.toSeq should contain theSameElementsAs expectedDuplicateFiles } @@ -124,7 +124,7 @@ class CassandraSparkSpec extends FlatSpec val detailedReport = gemini.report(session).v println("Done") - val duplicatedFileNames = detailedReport map (_.head.file) + val duplicatedFileNames = detailedReport map (_.head.path) duplicatedFileNames.toSeq should contain theSameElementsAs expectedDuplicateFiles } diff --git a/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala b/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala index 218659d8..d05138fa 100644 --- a/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala +++ b/src/test/scala/tech/sourced/gemini/URLFormatterSpec.scala @@ -6,16 +6,18 @@ class URLFormatterSpec extends FlatSpec with Matchers { "URLFormatter" should "format correctly" in { - case class Input(repo: String, ref_hash: String, file: String) + case class Input(repo: String, commit: String, path: String) val cases = Map( Input("github.com/src-d/test", "sha1", "path/file") -> "https://github.com/src-d/test/blob/sha1/path/file", Input("gitlab.com/src-d/test", "sha1", "path/file") -> "https://gitlab.com/src-d/test/blob/sha1/path/file", Input("bitbucket.org/src-d/test", "sha1", "path/file") -> "https://bitbucket.org/src-d/test/src/sha1/path/file", Input("github.com/src-d/test.git", "sha1", "path/file") -> "https://github.com/src-d/test/blob/sha1/path/file", - Input("unknown", "sha1", "path/file") -> "repo: unknown ref_hash: sha1 file: path/file" + Input("unknown", "sha1", "path/file") -> "repo: unknown commit: sha1 path: 
     )
 
-    for ((i, expected) <- cases) URLFormatter.format(i.repo, i.ref_hash, i.file) should be(expected)
+    for ((i, expected) <- cases) {
+      URLFormatter.format(i.repo, i.commit, i.path) should be(expected)
+    }
   }
 }

From 557f8c167ebc4d30e2910a339c4425efe068ea78 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 6 Mar 2018 22:42:58 +0100
Subject: [PATCH 7/8] Refactoring: Go - column names match Apollo

Signed-off-by: Alexander Bezzubov
---
 src/main/go/query.go | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/main/go/query.go b/src/main/go/query.go
index c4ab967b..bc5093a7 100644
--- a/src/main/go/query.go
+++ b/src/main/go/query.go
@@ -16,9 +16,10 @@ import (
 
 // BlobHash is a single blob inside a repository
 type BlobHash struct {
-    BlobHash string
-    Repo     string
-    FilePath string
+    Sha1   string
+    Commit string
+    Repo   string
+    Path   string
 }
 
 const (
@@ -42,11 +43,11 @@ func main() {
     defer session.Close()
 
     stmt, names := qb.Select(fmt.Sprintf("%s.%s", defaultKeyspace, defaultTable)).
-        Where(qb.In("blob_hash")).
+        Where(qb.In("sha1")).
         ToCql()
 
     q := gocqlx.Query(session.Query(stmt), names).BindMap(qb.M{
-        "blob_hash": []string{hash},
+        "sha1": []string{hash},
     })
     defer q.Release()
 
From 8f734c50d3736f403384a03eccdd88d585dc9256 Mon Sep 17 00:00:00 2001
From: Alexander Bezzubov
Date: Tue, 6 Mar 2018 22:43:28 +0100
Subject: [PATCH 8/8] Go - add query to CI integration test

Signed-off-by: Alexander Bezzubov
---
 .travis.yml          | 2 ++
 src/main/go/query.go | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 12f56590..f845a048 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,6 +28,8 @@ matrix:
       - ./hash src/test/resources/siva || travis_terminate 1
      - ./query ./LICENSE
       - ./report
+      - go get ./src/main/go/... || true
+      - go run ./src/main/go/query.go ./LICENSE
 before_deploy:
   - VERSION=$TRAVIS_TAG ./scripts/release.sh
 deploy:
diff --git a/src/main/go/query.go b/src/main/go/query.go
index bc5093a7..e2e1d5d0 100644
--- a/src/main/go/query.go
+++ b/src/main/go/query.go
@@ -53,12 +53,16 @@ func main() {
 
     var similarHashes []BlobHash
     if err := gocqlx.Select(&similarHashes, q.Query); err != nil {
-        log.Fatal("select:", err)
+        log.Fatalf("select: %v in %s", err, q.Query)
     }
 
     for _, hash := range similarHashes {
         fmt.Printf("\t%+v\n", hash)
     }
+
+    if len(similarHashes) == 0 {
+        os.Exit(2)
+    }
 }
 
 // connect to the cluster
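
---

For reference, the end state of this series can also be exercised without gocqlx. Below is a minimal sketch in plain gocql, assuming a local Cassandra or ScyllaDB node with the schema from PATCH 5 applied and a populated `hashes` keyspace; the example sha1 is the LICENSE hash used in CassandraSparkSpec, and the file path is hypothetical for any other setup:

```
package main

import (
	"fmt"
	"log"

	"github.com/gocql/gocql"
)

func main() {
	// Assumes a local node with src/main/resources/schema.cql applied
	// (keyspace "hashes", table blob_hash_files keyed by sha1).
	cluster := gocql.NewCluster("127.0.0.1")
	cluster.Keyspace = "hashes"
	session, err := cluster.CreateSession()
	if err != nil {
		log.Fatalf("can not connect: %v", err)
	}
	defer session.Close()

	// sha1 of LICENSE from the test fixtures (git hash-object -w LICENSE).
	sha1 := "097f4a292c384e002c5b5ce8e15d746849af7b37"

	// blob_hash_files is partitioned by sha1, so this reads one partition.
	iter := session.Query(
		`SELECT repo, commit, path FROM blob_hash_files WHERE sha1 = ?`,
		sha1,
	).Iter()

	var repo, commit, path string
	for iter.Scan(&repo, &commit, &path) {
		fmt.Printf("%s %s %s\n", repo, commit, path)
	}
	if err := iter.Close(); err != nil {
		log.Fatalf("query failed: %v", err)
	}
}
```

Since the table's partition key is `sha1`, this single-partition lookup is the same access pattern used by the Go client above and by `Gemini.findDuplicateItemForBlobHash`; like the CI step added in PATCH 8, an empty result can be turned into a non-zero exit code to signal that no duplicates were found.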