diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ce9f27..6947b52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ fetch_if_not_exists(minizip find_package(TBB REQUIRED) -add_executable(search src/search.cpp) +add_executable(search src/Search.cc) target_link_libraries(search PRIVATE minizip TBB::tbb) diff --git a/src/Search.cc b/src/Search.cc new file mode 100644 index 0000000..008f921 --- /dev/null +++ b/src/Search.cc @@ -0,0 +1,397 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "unzip.h" +#include "zlib.h" + +namespace Search +{ + + bool case_insensitive = false; + constexpr size_t kChunkSize = 1024 * 1024; // 1 MB + constexpr size_t kMinFileSizeForMmap = 10 * 1024 * 1024; // 10 MB + constexpr int kContextSize = 20; + + struct FileInfo + { + size_t offset; + size_t size; + }; + + struct SearchResult + { + std::string filename; + std::vector> occurrences; + }; + + using ZipIndex = std::map; + + void PrintHeader() + { + const char *kAsciiArt = R"( + ____ ___ ____ _ ____ __ ___ _ _ ____ ___ ____ _ _ +| _ \ / _ \ / ___| |/ /\ \ / // _ \| | | |___ \ / _ \___ \| || | +| |_) | | | | | | ' / \ V /| | | | | | | __) | | | |__) | || |_ +| _ <| |_| | |___| . \ | | | |_| | |_| |/ __/| |_| / __/|__ _| +|_| \_\\___/ \____|_|\_\ |_| \___/ \___/|_____|\___/_____| |_| + +© 2024 Volker Schwaberow +Based on rockyou2024 cpp by Mike Madden + +)"; + + std::cout << "\033[1;34m"; // Set text color to bright blue + for (size_t i = 0; kAsciiArt[i] != '\0'; ++i) + { + std::cout << kAsciiArt[i] << std::flush; + std::this_thread::sleep_for(std::chrono::milliseconds(3)); + } + std::cout << "\033[0m"; + std::cout << "Press Enter to continue..."; + std::cin.get(); + } + + void PrintUsage(const char *program_name) + { + std::cout << "Usage: " << program_name << " [-i]\n" + << " or: " << program_name << " --interactive\n\n" + << "Options:\n" + << " --interactive Run in interactive mode\n" + << " -i Perform case-insensitive search\n" + << " --help Display this help message\n"; + } + + ZipIndex CreateZipIndex(const std::string &filename) + { + ZipIndex index; + unzFile zip_file = unzOpen(filename.c_str()); + if (!zip_file) + { + throw std::runtime_error("Error opening zip file: " + filename); + } + + unz_global_info global_info; + if (unzGetGlobalInfo(zip_file, &global_info) != UNZ_OK) + { + unzClose(zip_file); + throw std::runtime_error("Error reading zip file info"); + } + + for (uLong i = 0; i < global_info.number_entry; ++i) + { + char filename_inzip[256]; + unz_file_info file_info; + if (unzGetCurrentFileInfo(zip_file, &file_info, filename_inzip, + sizeof(filename_inzip), nullptr, 0, nullptr, + 0) != UNZ_OK) + { + unzClose(zip_file); + throw std::runtime_error("Error getting file info"); + } + + uLong file_offset = unzGetOffset(zip_file); + if (file_offset == 0) + { + std::cerr << "Warning: Unable to get offset for file: " << filename_inzip + << std::endl; + } + + index[filename_inzip] = {file_offset, file_info.uncompressed_size}; + + if (i + 1 < global_info.number_entry && unzGoToNextFile(zip_file) != UNZ_OK) + { + unzClose(zip_file); + throw std::runtime_error("Error moving to next file in zip"); + } + } + + unzClose(zip_file); + return index; + } + + std::vector BoyerMoore(std::string_view text, std::string_view pattern) + { + std::vector results; + int m = pattern.length(); + int n = text.length(); + + if (m == 0 || n == 0 || m > n) + return results; + + std::array bad_char; + bad_char.fill(-1); + + for (int i = 0; i < m; i++) + bad_char[pattern[i]] = i; + + for (int s = 0; s <= (n - m);) + { + int j = m - 1; + while (j >= 0 && pattern[j] == text[s + j]) + j--; + + if (j < 0) + { + results.push_back(s); + s += (s + m < n) ? m - bad_char[text[s + m]] : 1; + } + else + { + s += std::max(1, j - bad_char[text[s + j]]); + } + } + + return results; + } + + SearchResult SearchInFile(const std::string &zip_filename, + const std::string &file_name, + const FileInfo &file_info, + const std::string &keyword) + { + unzFile zip_file = unzOpen(zip_filename.c_str()); + if (!zip_file) + { + throw std::runtime_error("Error opening zip file: " + zip_filename); + } + + auto cleanup = [&]() + { + unzCloseCurrentFile(zip_file); + unzClose(zip_file); + }; + + if (file_info.offset != 0 && unzSetOffset(zip_file, file_info.offset) != UNZ_OK) + { + std::cerr << "Warning: Unable to set offset for file: " << file_name + << ". Trying to locate file by name." << std::endl; + if (unzLocateFile(zip_file, file_name.c_str(), 0) != UNZ_OK) + { + cleanup(); + throw std::runtime_error("Error locating file in zip: " + file_name); + } + } + + if (unzOpenCurrentFile(zip_file) != UNZ_OK) + { + cleanup(); + throw std::runtime_error("Error opening file in zip: " + file_name); + } + + SearchResult result; + result.filename = file_name; + + if (file_info.size >= kMinFileSizeForMmap) + { + std::vector buffer(file_info.size); + if (unzReadCurrentFile(zip_file, buffer.data(), + static_cast(buffer.size())) != + static_cast(file_info.size)) + { + cleanup(); + throw std::runtime_error("Error reading file content"); + } + std::string_view content(buffer.data(), buffer.size()); + auto positions = BoyerMoore(content, keyword); + + int line = 1, col = 1; + for (size_t pos : positions) + { + size_t line_start = pos; + while (line_start > 0 && content[line_start - 1] != '\n') + { + line_start--; + col++; + } + + size_t context_start = (pos > kContextSize) ? pos - kContextSize : 0; + size_t context_end = std::min(pos + keyword.length() + kContextSize, content.length()); + std::string context(content.substr(context_start, context_end - context_start)); + + result.occurrences.emplace_back(line, col, std::move(context)); + + line += std::count(content.begin() + line_start, content.begin() + pos, '\n'); + col = pos - line_start + 1; + } + } + else + { + std::vector buffer(kChunkSize); + std::string overlap; + int bytes_read; + size_t total_read = 0; + int line = 1, col = 1; + while ((bytes_read = unzReadCurrentFile(zip_file, buffer.data(), + static_cast(buffer.size()))) > 0) + { + std::string_view chunk(buffer.data(), static_cast(bytes_read)); + std::string search_text = overlap + std::string(chunk); + + auto positions = BoyerMoore(search_text, keyword); + for (size_t pos : positions) + { + size_t line_start = pos; + while (line_start > 0 && search_text[line_start - 1] != '\n') + { + line_start--; + } + + size_t context_start = (pos > kContextSize) ? pos - kContextSize : 0; + size_t context_end = std::min(pos + keyword.length() + kContextSize, search_text.length()); + std::string context = search_text.substr(context_start, context_end - context_start); + + result.occurrences.emplace_back(line, pos - line_start + 1, std::move(context)); + + line += std::count(search_text.begin() + line_start, search_text.begin() + pos, '\n'); + } + + total_read += bytes_read; + overlap = (search_text.length() >= keyword.length()) + ? search_text.substr(search_text.length() - keyword.length() + 1) + : ""; + } + } + + cleanup(); + return result; + } + + void SearchInZip(const std::string &filename, const std::string &keyword) + { + auto index = CreateZipIndex(filename); + + const auto start_time = std::chrono::high_resolution_clock::now(); + + std::atomic total_count = 0; + std::mutex cout_mutex; + + unsigned int num_threads = std::thread::hardware_concurrency(); + if (num_threads == 0) + { + num_threads = 4; // Default to 4 threads if hardware_concurrency() fails + } + + std::vector threads; + std::vector results; + results.reserve(index.size()); + + std::atomic next_file_index(0); + + auto worker = [&]() + { + while (true) + { + size_t i = next_file_index.fetch_add(1, std::memory_order_relaxed); + if (i >= index.size()) + break; + + auto it = std::next(index.begin(), i); + const auto &[file_name, file_info] = *it; + + try + { + SearchResult result = SearchInFile(filename, file_name, file_info, keyword); + total_count += result.occurrences.size(); + + std::lock_guard lock(cout_mutex); + std::cout << "Occurrences in \"" << file_name << "\": " << result.occurrences.size() << '\n'; + for (const auto &[line, col, context] : result.occurrences) + { + std::cout << " Line " << line << ", Column " << col << ": " << context << '\n'; + } + + results.push_back(std::move(result)); + } + catch (const std::exception &e) + { + std::lock_guard lock(cout_mutex); + std::cerr << "Error processing file \"" << file_name << "\": " << e.what() << '\n'; + } + } + }; + + for (unsigned int i = 0; i < num_threads; ++i) + { + threads.emplace_back(worker); + } + + for (auto &thread : threads) + { + thread.join(); + } + + const auto end_time = std::chrono::high_resolution_clock::now(); + const std::chrono::duration cpu_time_used = end_time - start_time; + + std::cout << "Search complete. Total occurrences: " << total_count << '\n'; + std::cout << "Time taken: " << cpu_time_used.count() << " seconds\n"; + } +} + +int main(int argc, char *argv[]) +{ + try + { + Search::PrintHeader(); + + std::string keyword; + std::string filename; + + if (argc == 2 && (std::strcmp(argv[1], "--interactive") == 0)) + { + std::cout << "Enter the keyword to search: "; + std::getline(std::cin, keyword); + + std::cout << "Enter the zip filename to search in: "; + std::getline(std::cin, filename); + + std::cout << "Case-insensitive search? (y/n): "; + std::string response; + std::getline(std::cin, response); + Search::case_insensitive = (response == "y" || response == "Y"); + } + else if (argc >= 3 && argc <= 4) + { + filename = argv[1]; + keyword = argv[2]; + if (argc == 4 && std::strcmp(argv[3], "-i") == 0) + { + Search::case_insensitive = true; + } + } + else + { + Search::PrintUsage(argv[0]); + return 1; + } + if (!std::filesystem::exists(filename)) + { + throw std::runtime_error("File does not exist: " + filename); + } + + Search::SearchInZip(filename, keyword); + } + catch (const std::exception &e) + { + std::cerr << "Error: " << e.what() << '\n'; + return 1; + } + + return 0; +} \ No newline at end of file diff --git a/src/search.cpp b/src/search.cpp deleted file mode 100644 index 1f8e94e..0000000 --- a/src/search.cpp +++ /dev/null @@ -1,346 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "unzip.h" -#include "zlib.h" - -constexpr size_t CHUNK_SIZE = 1024 * 1024; // 1 MB -constexpr size_t MIN_FILE_SIZE_FOR_MMAP = 10 * 1024 * 1024; // 10 MB - -struct FileInfo -{ - size_t offset; - size_t size; -}; - -struct SearchResult -{ - std::string filename; - std::vector> occurrences; -}; - -using ZipIndex = std::map; - -void print_header() -{ - std::string_view ascii_art = R"( - ____ ___ ____ _ ____ __ ___ _ _ ____ ___ ____ _ _ -| _ \ / _ \ / ___| |/ /\ \ / // _ \| | | |___ \ / _ \___ \| || | -| |_) | | | | | | ' / \ V /| | | | | | | __) | | | |__) | || |_ -| _ <| |_| | |___| . \ | | | |_| | |_| |/ __/| |_| / __/|__ _| -|_| \_\\___/ \____|_|\_\ |_| \___/ \___/|_____|\___/_____| |_| - -© 2024 Volker Schwaberow -Based on rockyou2024 cpp by Mike Madden - -)"; - - std::cout << "\033[1;34m"; // Set text color to bright blue - - for (char c : ascii_art) - { - std::cout << c << std::flush; - std::this_thread::sleep_for(std::chrono::milliseconds(3)); - } - - std::cout << "\033[0m"; - std::cout << "Press Enter to continue..."; - std::cin.get(); -} - -void print_usage(const char *program_name) -{ - std::cout << "Usage: " << program_name << " \n" - << " or: " << program_name << " --interactive\n\n" - << "Options:\n" - << " --interactive Run in interactive mode\n" - << " --help Display this help message\n"; -} - -ZipIndex create_zip_index(const std::string &filename) -{ - ZipIndex index; - unzFile zip_file = unzOpen(filename.c_str()); - if (!zip_file) - { - throw std::runtime_error("Error opening zip file: " + filename); - } - - unz_global_info global_info; - if (unzGetGlobalInfo(zip_file, &global_info) != UNZ_OK) - { - unzClose(zip_file); - throw std::runtime_error("Error reading zip file info"); - } - - for (uLong i = 0; i < global_info.number_entry; ++i) - { - char filename_inzip[256]; - unz_file_info file_info; - if (unzGetCurrentFileInfo(zip_file, &file_info, filename_inzip, sizeof(filename_inzip), nullptr, 0, nullptr, 0) != UNZ_OK) - { - unzClose(zip_file); - throw std::runtime_error("Error getting file info"); - } - - uLong file_offset = unzGetOffset(zip_file); - if (file_offset == 0) - { - std::cerr << "Warning: Unable to get offset for file: " << filename_inzip << std::endl; - } - - index[filename_inzip] = {file_offset, file_info.uncompressed_size}; - - if (i + 1 < global_info.number_entry && unzGoToNextFile(zip_file) != UNZ_OK) - { - unzClose(zip_file); - throw std::runtime_error("Error moving to next file in zip"); - } - } - - unzClose(zip_file); - return index; -} - -std::vector boyer_moore(std::string_view text, std::string_view pattern) -{ - std::vector results; - int m = pattern.length(); - int n = text.length(); - - if (m == 0 || n == 0 || m > n) - return results; - - std::array badChar; - badChar.fill(-1); - - for (int i = 0; i < m; i++) - badChar[pattern[i]] = i; - - for (int s = 0; s <= (n - m); s += std::max(1, [&]() - { - for (int j = m - 1; j >= 0; --j) - { - if (pattern[j] != text[s + j]) - return j - badChar[text[s + j]]; - } - results.push_back(s); - return m - ((s + m < n) ? badChar[text[s + m]] : -1); }())) - ; - - return results; -} - -SearchResult search_in_file(const std::string &zip_filename, const std::string &file_name, const FileInfo &file_info, const std::string &keyword) -{ - unzFile zip_file = unzOpen(zip_filename.c_str()); - if (!zip_file) - { - throw std::runtime_error("Error opening zip file: " + zip_filename); - } - - auto cleanup = [&]() - { - unzCloseCurrentFile(zip_file); - unzClose(zip_file); - }; - - if (file_info.offset != 0 && unzSetOffset(zip_file, file_info.offset) != UNZ_OK) - { - std::cerr << "Warning: Unable to set offset for file: " << file_name << ". Trying to locate file by name." << std::endl; - if (unzLocateFile(zip_file, file_name.c_str(), 0) != UNZ_OK) - { - cleanup(); - throw std::runtime_error("Error locating file in zip: " + file_name); - } - } - - if (unzOpenCurrentFile(zip_file) != UNZ_OK) - { - cleanup(); - throw std::runtime_error("Error opening file in zip: " + file_name); - } - - SearchResult result; - result.filename = file_name; - - const int context_size = 20; - - if (file_info.size >= MIN_FILE_SIZE_FOR_MMAP) - { - std::vector buffer(file_info.size); - if (unzReadCurrentFile(zip_file, buffer.data(), static_cast(buffer.size())) != static_cast(file_info.size)) - { - cleanup(); - throw std::runtime_error("Error reading file content"); - } - std::string_view content(buffer.data(), buffer.size()); - auto positions = boyer_moore(content, keyword); - - int line = 1, col = 1; - for (size_t pos : positions) - { - size_t line_start = pos; - while (line_start > 0 && content[line_start - 1] != '\n') - { - line_start--; - col++; - } - - size_t context_start = (pos > context_size) ? pos - context_size : 0; - size_t context_end = std::min(pos + keyword.length() + context_size, content.length()); - std::string context = std::string(content.substr(context_start, context_end - context_start)); - - result.occurrences.emplace_back(line, col, context); - - line += std::count(content.begin() + line_start, content.begin() + pos, '\n'); - col = pos - line_start + 1; - } - } - else - { - std::vector buffer(CHUNK_SIZE); - std::string overlap; - int bytes_read; - size_t total_read = 0; - int line = 1, col = 1; - while ((bytes_read = unzReadCurrentFile(zip_file, buffer.data(), static_cast(buffer.size()))) > 0) - { - std::string_view chunk(buffer.data(), static_cast(bytes_read)); - std::string search_text = overlap + std::string(chunk); - - auto positions = boyer_moore(search_text, keyword); - for (size_t pos : positions) - { - size_t line_start = pos; - while (line_start > 0 && search_text[line_start - 1] != '\n') - { - line_start--; - } - - size_t context_start = (pos > context_size) ? pos - context_size : 0; - size_t context_end = std::min(pos + keyword.length() + context_size, search_text.length()); - std::string context = search_text.substr(context_start, context_end - context_start); - - result.occurrences.emplace_back(line, pos - line_start + 1, context); - - line += std::count(search_text.begin() + line_start, search_text.begin() + pos, '\n'); - } - - total_read += bytes_read; - overlap = (search_text.length() >= keyword.length()) - ? search_text.substr(search_text.length() - keyword.length() + 1) - : ""; - } - } - - cleanup(); - return result; -} - -void search_in_zip(const std::string &filename, const std::string &keyword) -{ - auto index = create_zip_index(filename); - - const auto start_time = std::chrono::high_resolution_clock::now(); - - std::atomic total_count = 0; - std::mutex cout_mutex; - - std::vector threads; - std::vector results(index.size()); - int i = 0; - for (const auto &[file_name, file_info] : index) - { - threads.emplace_back([&, i, file_name](const FileInfo &fi) - { - try { - results[i] = search_in_file(filename, file_name, fi, keyword); - total_count += results[i].occurrences.size(); - - std::lock_guard lock(cout_mutex); - std::cout << "Occurrences in \"" << file_name << "\": " << results[i].occurrences.size() << '\n'; - for (const auto &[line, col, context] : results[i].occurrences) { - std::cout << " Line " << line << ", Column " << col << ": " << context << '\n'; - } - } catch (const std::exception& e) { - std::lock_guard lock(cout_mutex); - std::cerr << "Error processing file \"" << file_name << "\": " << e.what() << '\n'; - } }, file_info); - i++; - } - - for (auto &thread : threads) - { - thread.join(); - } - - const auto end_time = std::chrono::high_resolution_clock::now(); - const std::chrono::duration cpu_time_used = end_time - start_time; - - std::cout << "Search complete. Total occurrences: " << total_count << '\n'; - std::cout << "Time taken: " << cpu_time_used.count() << " seconds\n"; -} - -int main(int argc, char *argv[]) -{ - try - { - print_header(); - - std::string keyword; - std::string filename; - - if (argc == 2 && (std::strcmp(argv[1], "--help") == 0)) - { - print_usage(argv[0]); - return 0; - } - - if (argc == 2 && (std::strcmp(argv[1], "--interactive") == 0)) - { - std::cout << "Enter the keyword to search: "; - std::getline(std::cin, keyword); - - std::cout << "Enter the zip filename to search in: "; - std::getline(std::cin, filename); - } - else if (argc == 3) - { - filename = argv[1]; - keyword = argv[2]; - } - else - { - print_usage(argv[0]); - return 1; - } - - if (!std::filesystem::exists(filename)) - { - throw std::runtime_error("File does not exist: " + filename); - } - - search_in_zip(filename, keyword); - } - catch (const std::exception &e) - { - std::cerr << "Error: " << e.what() << '\n'; - return 1; - } - - return 0; -} \ No newline at end of file