Overhaul database schema #215
base: master
Changes from all commits: c59f4ce, dbca269, 1e1cbb2, f299518, 6a1a8f7, a57e571, a8777c6
@@ -0,0 +1,103 @@
require 'fileutils'

class CleanInconsistencies < ActiveRecord::Migration
  def backup_dir
    return @backup_dir if @backup_dir
    @backup_dir = Rails.root.join('db', 'backup', Time.now.strftime('%Y-%m-%d_%H-%M-%S'))
    FileUtils.mkdir_p(@backup_dir)
    @backup_dir
  end

  def backup(name, query, header: true)
    say_with_time("backup(#{name.inspect}, #{query.inspect})") do
      copy_query = "COPY (#{query}) TO STDOUT WITH DELIMITER ',' CSV #{header ? 'HEADER' : ''}"

      File.open(backup_dir.join("#{name}.csv"), 'a') do |f|
        connection.raw_connection.copy_data(copy_query) do
          while line = connection.raw_connection.get_copy_data
            f.write(line)
          end
        end
      end
    end
  end

  def backup_and_delete_missing(table, exists_query)
    backup(table, "SELECT * FROM \"#{table}\" WHERE NOT EXISTS(#{exists_query})")
    execute "DELETE FROM \"#{table}\" WHERE NOT EXISTS(#{exists_query})"
  end

  def up
    say "WARNING: destructive migration necessary. Deleted data will be backed up to #{backup_dir}"

    # Unset project reference for repositories with non-existing projects
    execute <<-SQL
      UPDATE repositories AS r
      SET project_id = NULL
      WHERE project_id = 0 OR NOT EXISTS (
        SELECT 1 FROM projects AS p WHERE p.id = r.project_id
      )
    SQL

Review comment (on the UPDATE above): Could this generate inconsistencies in ownerships of Prezento? Should we do something similar there?

Reply: I don't see how it could matter, considering this only unsets references to records that do not exist. If Prezento had those same references they would already be broken if anyone tried to actually use them for anything. IMO we should have proper database constraints everywhere, but adding them after a long time is quite a bit harder, and I don't know whether it's worth the extra work. I won't have the free time to do the same for the other services.

    # Delete processings with non-existing repositories
    backup_and_delete_missing("processings",
      "SELECT 1 FROM repositories AS r WHERE r.id = processings.repository_id")

    # Delete process times with non-existing processings
    backup_and_delete_missing("process_times",
      "SELECT 1 FROM processings AS p WHERE p.id = process_times.processing_id")

    # Delete module results with non-existing processings
    backup_and_delete_missing("module_results",
      "SELECT 1 FROM processings AS p WHERE p.id = module_results.processing_id")

    # Delete kalibro modules with non-existing module results
    backup_and_delete_missing("kalibro_modules",
      "SELECT 1 FROM module_results AS m WHERE m.id = kalibro_modules.module_result_id")

    # Fix up metric results type, even before backing up, so the backup is cleaner
    execute <<-SQL
      UPDATE metric_results SET "type" = 'TreeMetricResult' WHERE "type" = 'MetricResult'
    SQL

    # Delete metric results with non-existing module results
    backup_and_delete_missing("metric_results",
      "SELECT 1 FROM module_results AS m WHERE m.id = metric_results.module_result_id")

    # Delete duplicate metric_results. Group them by (module_result_id, metric_configuration_id),
    # then delete all but the one with the highest ID. The double wrapping on the inner query is
    # necessary because window functions cannot be used in WHERE in PostgreSQL.
    repeated_metric_result_query = exec_query <<-SQL
      SELECT t.id FROM (
        SELECT metric_results.*, ROW_NUMBER() OVER (
          PARTITION BY module_result_id, metric_configuration_id, "type"
          ORDER BY id DESC) AS rnum
        FROM metric_results
        WHERE "type" = 'TreeMetricResult'
      ) AS t
      WHERE t.rnum > 1
    SQL

    unless repeated_metric_result_query.empty?
      repeated_metric_result_ids = repeated_metric_result_query.rows.flat_map(&:first).join(',')

      # Replace default messages with custom ones to avoid flooding the screen with the huge query
      say_with_time('backup("metric_results", "SELECT * FROM metric_results WHERE id IN (...)")') do
        suppress_messages do
          backup('metric_results',
            "SELECT * FROM metric_results WHERE id IN (#{repeated_metric_result_ids})",
            header: false)
        end
      end

      say_with_time('execute("DELETE FROM metric_results WHERE id IN (...)")') do
        suppress_messages do
          execute "DELETE FROM metric_results WHERE id IN (#{repeated_metric_result_ids})"
        end
      end
    end
  end

  def down
    raise ActiveRecord::IrreversibleMigration
  end
end
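
Since the deleted rows only live on in the CSV backups, here is a minimal sketch of how one such backup could be loaded back using the pg gem's COPY support. The helper, table name, and path are illustrative only; nothing like this is part of the PR.

# Hypothetical restore helper (not part of this PR). It streams a backup CSV
# produced by the migration back into its table via COPY FROM STDIN.
def restore(connection, table, csv_path)
  raw = connection.raw_connection
  # HEADER matches the default header: true used by backup_and_delete_missing.
  copy_query = "COPY \"#{table}\" FROM STDIN WITH DELIMITER ',' CSV HEADER"

  raw.copy_data(copy_query) do
    File.foreach(csv_path) { |line| raw.put_copy_data(line) }
  end
end

# Example usage (path is illustrative):
# restore(ActiveRecord::Base.connection, 'process_times',
#         Rails.root.join('db', 'backup', '2016-01-01_00-00-00', 'process_times.csv'))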

@@ -0,0 +1,6 @@

class AddIndexesToKalibroModules < ActiveRecord::Migration
  def change
    add_foreign_key :kalibro_modules, :module_results, on_delete: :cascade
    add_index :kalibro_modules, [:long_name, :granularity]
  end
end

@@ -0,0 +1,6 @@

class AddIndexesToModuleResults < ActiveRecord::Migration
  def change
    add_foreign_key :module_results, :module_results, column: 'parent_id'
    add_foreign_key :module_results, :processings, on_delete: :cascade
  end
end

@@ -0,0 +1,11 @@

class AddIndexesToMetricResults < ActiveRecord::Migration
  def change
    add_foreign_key :metric_results, :module_results, on_delete: :cascade
    add_index :metric_results, :type
    add_index :metric_results, :module_result_id
    add_index :metric_results, :metric_configuration_id
    add_index :metric_results, [:module_result_id, :metric_configuration_id],
              unique: true, where: "type = 'TreeMetricResult'",
              name: 'metric_results_module_res_metric_cfg_uniq_idx'
  end
end

Review comment (on the partial unique index): What is this line doing?

Reply: It's the database counterpart to the unique validation in
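
The model-side validation referred to above is not included in this diff. Purely as an illustration (the class and attribute names are assumed from the index definition, not taken from the codebase), it would be something along the lines of:

# Illustrative only: the corresponding model code is not part of this diff.
class TreeMetricResult < MetricResult
  validates :metric_configuration_id, uniqueness: { scope: :module_result_id }
end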

@@ -0,0 +1,6 @@

class AddIndexesToProcessings < ActiveRecord::Migration
  def change
    add_foreign_key :processings, :repositories
    add_foreign_key :processings, :module_results, column: 'root_module_result_id'
  end
end

@@ -0,0 +1,5 @@

class AddIndexesToProcessTimes < ActiveRecord::Migration
  def change
    add_foreign_key :process_times, :processings, on_delete: :cascade
  end
end

@@ -0,0 +1,5 @@

class AddIndexesToRepositories < ActiveRecord::Migration
  def change
    add_foreign_key :repositories, :projects
  end
end

@@ -0,0 +1,9 @@

class AddIndexesOfForeignKeys < ActiveRecord::Migration
  def change
    add_index :module_results, :processing_id
    add_index :kalibro_modules, :module_result_id
    add_index :process_times, :processing_id
    add_index :processings, :repository_id
    add_index :repositories, :project_id
  end
end

Comment: Should this script be a migration? We are not changing the database structure, just removing records.

Comment: We are semantically changing the database structure, from allowing inconsistencies to not allowing them. We can run this manually, but that means anyone using the software will have to do the same, otherwise the next migrations won't work.

Comment: I don't think this script is preventing us from creating inconsistencies. It is merely cleaning things up so other scripts (migrations) change the structure (by adding indexes and foreign keys, for example).

Comment: Yes, the next scripts add the constraints to prevent the inconsistencies, but they would fail if the inconsistencies already exist. If you can suggest a better method to handle it I'll be happy to use it :)
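
To make that failure mode concrete, here is a small check (not part of the PR; it reuses one of the migration's own EXISTS queries) showing why the later foreign-key migrations would abort: PostgreSQL validates existing rows when a constraint is added, so any non-zero count here would make add_foreign_key raise.

# Illustration only: count orphaned rows that would make a later
# `add_foreign_key :process_times, :processings` fail while they still exist.
orphans = ActiveRecord::Base.connection.select_value(<<-SQL).to_i
  SELECT COUNT(*) FROM process_times
  WHERE NOT EXISTS (
    SELECT 1 FROM processings AS p WHERE p.id = process_times.processing_id
  )
SQL
puts "#{orphans} orphaned process_times rows would violate the new foreign key"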

Comment: Maybe a rake task can achieve similar goals, with warning messages for those running the migrations. Skipping them if they should fail would be good too. That way, we don't force anyone to erase data without warning them first.
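
As a sketch of that suggestion (the task name and wording are hypothetical, not code from this PR), such a rake task might look like:

# Hypothetical rake task wrapping the same cleanup, with an explicit warning.
namespace :db do
  desc 'Back up and delete records that violate the intended foreign keys'
  task clean_inconsistencies: :environment do
    puts 'WARNING: this task deletes orphaned records after backing them up under db/backup.'
    puts 'Press Enter to continue or Ctrl-C to abort.'
    STDIN.gets

    # The actual cleanup would run the same backup/DELETE statements
    # used by the CleanInconsistencies migration.
  end
end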

Comment: That's true from KalibroClient's point of view. If we were mining our database for statistics or some kind of study, suddenly losing data would make us desperate! xD Notice that in those cases it may make sense not to erase a processing just because it is not associated with a repository, for example.
I think that because we are changing data, not structure, this should not be a migration. Adding the extra step is surely more bureaucratic, but I think it's good practice to at least warn anyone who uses this software before erasing anything. Maybe a rake task is not the best option, but I don't think a migration is either (even though the Rails guides say migrations can be made to add or modify data).
Anyway, if I'm the only one not happy with this, we can move on. 😄

Comment: It's true from the point of view of the model that the database is an expression of. If we had made a more faithful conversion, we would have had foreign keys from the start, and the data being deleted now would never have existed. The only reason it does exist is that we failed to keep garbage out; the fact that it got in doesn't make it not garbage.
What meaningful statistics can you gather from data that violates the semantic constraints of your model and that should never exist? In the particular cases of the things this migration deletes, you can't even reference the context that would allow you to make any sense of them.
What worth are process times if you can't even know what was processed? What worth is a kalibro module with no associated module result? What worth is a metric result that would never be looked at, and that is possibly complete garbage because it's associated with the wrong module result because we generated the wrong name?
I don't see why that would be the case. Migrations are a mechanism for evolving a database progressively without throwing it away; nothing makes them specific to structure. Also, changing structure often entails changing data to do it properly; this is just another case of that, and it's ugly because the situation is ugly.
The migration is the least bad option, as it at least runs in a controlled environment and is part of the known upgrade steps that any admin must run. How would someone upgrading find out that they need to run the rake task, and how would they run it? Could we make it work in an automated process without requiring manual intervention?

Comment: If you processed a repository and then deleted it, all the relevant data would still be there. Suppose we had 500 repositories processed and then all of them were deleted. If you looked at their process times and saw that, say, the aggregation time was surprisingly high, would you not consider the data just because you can't see the associated repository?
I agree that changing structure many times entails changing data. But I think it's just wrong to delete data without even notifying the users. It's not a good policy, because the data is not yours, even though you may think it's garbage. In my opinion, we have to make users aware of it even if it means the process is not entirely automatic. I would be happier if we could at least make the migration ask for some user input, like "This migration is going to delete this kind of data, ok?".
I've come to the conclusion that the migration is not a bad option. However, deleting data without informing the users, in my opinion, definitely is.
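
A minimal sketch of that kind of prompt, assuming the migration is run from an interactive terminal (the wording is made up, and this is not code from the PR; see the reply below for why interactivity can't be assumed):

# Hypothetical confirmation step inside the migration's up method.
# Assumes an interactive run; automated deploys would hang or fail here.
def up
  say 'This migration deletes orphaned records (they are backed up first). Continue? [y/N]'
  raise 'Aborted by user' unless STDIN.gets.to_s.strip.downcase.start_with?('y')

  # ... the destructive statements from CleanInconsistencies would follow ...
end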

Comment: @diegoamc I understand your concerns, but I am confident that everything this migration deletes is data that could not be accessed at all, or that doesn't even make sense. In your particular example, it's hard to judge whether a processing took long when the repository used is unknown. The fact that a processing does not make sense without its repository is encoded in the model: these changes just enforce, at the database level, what was already supposed to happen.
If there is any data being deleted in a way that is not specified in the model, we should absolutely fix the migration or the model (whichever is wrong).
We have to keep in mind that we had the responsibility to create valid data and failed at it. We have the knowledge to separate what is garbage (according to the model) from what is not, so that the remaining data can be trusted from now into the future. I'm not proposing to delete old data or anything like that, but data that should never have existed at all. No judgements of value can be made about its contents, because they can be absolute nonsense.
As for asking for confirmation, I don't know of a good way to do it, since we have no guarantee that the migration is interactive; it can be run in an automated way.

Comment: I understand your point too, but I can't accept the PR. I wish I had the time right now to sit down for a while, study, and propose another solution.
If nobody else is unhappy about it, please accept it :)