Skip to content

Commit

Permalink
Add HTML parsing and CSS Selectors (#48)
Browse files Browse the repository at this point in the history
* initial

* plain text

* add lexbor

* fix win32 cmake

* fix headers

* lint

* cleanup
  • Loading branch information
royshil authored Nov 5, 2023
1 parent 3f5479e commit 8b4efbf
Show file tree
Hide file tree
Showing 11 changed files with 1,089 additions and 373 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ if(USE_SYSTEM_PUGIXML)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE pugixml)
else()
include(cmake/BuildPugiXML.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE libpugixml)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE libpugixml_internal)
endif()

include(cmake/BuildJSONCONS.cmake)
Expand All @@ -65,6 +65,9 @@ target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE jsoncons)
include(cmake/BuildInja.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE inja)

include(cmake/BuildLexbor.cmake)
target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE liblexbor_internal)

target_include_directories(${CMAKE_PROJECT_NAME} PRIVATE vendor/nlohmann-json)

target_sources(${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c src/url-source.cpp src/ui/RequestBuilder.cpp
Expand Down
47 changes: 47 additions & 0 deletions cmake/BuildLexbor.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Builds the lexbor HTML/CSS library as a static library through ExternalProject and exposes it
# to the rest of the build as the `liblexbor_internal` INTERFACE target. Linking that target
# provides the static archive, the lexbor include directory and (on Windows) LEXBOR_STATIC.
include(ExternalProject)

# Extra arguments forwarded to lexbor's own CMake configure step, chosen per platform.
if(APPLE)
  # $<SEMICOLON> is used instead of a literal ';' -- presumably so the two architectures stay a
  # single argument when forwarded to the external configure step (NOTE(review): confirm).
  set(LEXBOR_CMAKE_PLATFORM_OPTIONS -DCMAKE_OSX_ARCHITECTURES=x86_64$<SEMICOLON>arm64)
elseif(WIN32)
  set(LEXBOR_CMAKE_PLATFORM_OPTIONS "-DCMAKE_C_FLAGS=/W3 /utf-8 /MP" "-DCMAKE_CXX_FLAGS=/W3 /utf-8 /MP")
else()
  set(LEXBOR_CMAKE_PLATFORM_OPTIONS -DCMAKE_SYSTEM_NAME=Linux)
endif()

# File name of the static archive that the external build is expected to install.
set(lexbor_lib_filename ${CMAKE_STATIC_LIBRARY_PREFIX}lexbor_static${CMAKE_STATIC_LIBRARY_SUFFIX})

ExternalProject_Add(
  lexbor_build
  GIT_REPOSITORY https://github.com/lexbor/lexbor.git
  GIT_TAG v2.3.0
  CMAKE_GENERATOR ${CMAKE_GENERATOR}
  # Declares the archive as an output of the install step so the generator knows which step
  # produces the file the imported target below points at.
  INSTALL_BYPRODUCTS <INSTALL_DIR>/lib/${lexbor_lib_filename}
  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
             -DLEXBOR_BUILD_SHARED=OFF
             -DLEXBOR_BUILD_STATIC=ON
             -DLEXBOR_BUILD_TESTS_CPP=OFF
             -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
             -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
             -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
             -DCMAKE_LINKER=${CMAKE_LINKER}
             ${LEXBOR_CMAKE_PLATFORM_OPTIONS})

ExternalProject_Get_Property(lexbor_build INSTALL_DIR)

message(STATUS "lexbor will be installed to ${INSTALL_DIR}")

# Location of the static archive inside the external project's install prefix.
set(lexbor_lib_location "${INSTALL_DIR}/lib/${lexbor_lib_filename}")

message(STATUS "lexbor library expected at ${lexbor_lib_location}")

# Imported target that points at the archive produced by the external build.
add_library(lexbor_internal STATIC IMPORTED)
set_target_properties(lexbor_internal PROPERTIES IMPORTED_LOCATION "${lexbor_lib_location}")

# Wrapper target that consumers link against. It orders the external build before its users and
# carries the usage requirements (archive, include directory, compile definitions).
add_library(liblexbor_internal INTERFACE)
add_dependencies(liblexbor_internal lexbor_build)
target_link_libraries(liblexbor_internal INTERFACE lexbor_internal)
target_include_directories(liblexbor_internal INTERFACE "${INSTALL_DIR}/include")

if(WIN32)
  # Scoped to consumers of liblexbor_internal instead of add_compile_definitions(), which would
  # apply LEXBOR_STATIC to every target in this directory whether or not it uses lexbor.
  target_compile_definitions(liblexbor_internal INTERFACE LEXBOR_STATIC=1)
endif()
14 changes: 7 additions & 7 deletions cmake/BuildPugiXML.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ExternalProject_Add(
CMAKE_GENERATOR ${CMAKE_GENERATOR}
INSTALL_BYPRODUCTS <INSTALL_DIR>/lib/${CMAKE_STATIC_LIBRARY_PREFIX}pugixml${CMAKE_STATIC_LIBRARY_SUFFIX}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DBUILD_SHARED_LIBS=OFF -DPUGIXML_BUILD_TESTS=OFF
${PUGIXML_CMAKE_PLATFORM_OPTIONS})
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ${PUGIXML_CMAKE_PLATFORM_OPTIONS})

ExternalProject_Get_Property(pugixml_build INSTALL_DIR)

Expand All @@ -25,10 +25,10 @@ set(pugixml_lib_location ${INSTALL_DIR}/lib/${pugixml_lib_filename})

message(STATUS "pugixml library expected at ${pugixml_lib_location}")

add_library(pugixml STATIC IMPORTED)
set_target_properties(pugixml PROPERTIES IMPORTED_LOCATION ${pugixml_lib_location})
add_library(pugixml_internal STATIC IMPORTED)
set_target_properties(pugixml_internal PROPERTIES IMPORTED_LOCATION ${pugixml_lib_location})

add_library(libpugixml INTERFACE)
add_dependencies(libpugixml pugixml_build)
target_link_libraries(libpugixml INTERFACE pugixml)
set_target_properties(libpugixml PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)
add_library(libpugixml_internal INTERFACE)
add_dependencies(libpugixml_internal pugixml_build)
target_link_libraries(libpugixml_internal INTERFACE pugixml_internal)
set_target_properties(libpugixml_internal PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)
2 changes: 1 addition & 1 deletion src/parsers/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
target_sources(${CMAKE_PROJECT_NAME} PRIVATE jsonpointer.cpp jsonpath.cpp regex.cpp xml.cpp errors.cpp)
target_sources(${CMAKE_PROJECT_NAME} PRIVATE jsonpointer.cpp jsonpath.cpp regex.cpp xml.cpp errors.cpp html.cpp)

# on linux, disable conversion errors
if(UNIX AND NOT APPLE)
Expand Down
118 changes: 118 additions & 0 deletions src/parsers/html.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#include "request-data.h"
#include "plugin-support.h"
#include "errors.h"

#include <lexbor/html/parser.h>
#include <lexbor/html/html.h>
#include <lexbor/dom/interfaces/element.h>
#include <lexbor/css/css.h>
#include <lexbor/selectors/selectors.h>

#include <obs-module.h>

/*
 * Serialization sink handed to lexbor: `ctx` is treated as a std::string* and every
 * chunk of `len` bytes starting at `data` is appended to it. Always reports success.
 */
lxb_inline lxb_status_t serializer_callback(const lxb_char_t *data, size_t len, void *ctx)
{
	std::string *output = static_cast<std::string *>(ctx);
	const char *chunk = reinterpret_cast<const char *>(data);
	output->append(chunk, len);
	return LXB_STATUS_OK;
}

/*
 * Per-match callback passed to lxb_selectors_find(): serializes the matched node
 * (via serializer_callback) into a string and appends that string to the
 * std::vector<std::string> passed through `data`. The specificity argument is unused
 * and the serialization status is deliberately ignored; the callback always returns
 * LXB_STATUS_OK.
 */
lxb_status_t find_callback(lxb_dom_node_t *node, lxb_css_selector_specificity_t spec, void *data)
{
	UNUSED_PARAMETER(spec);

	std::vector<std::string> *matches = static_cast<std::vector<std::string> *>(data);

	std::string serialized;
	(void)lxb_html_serialize_deep_cb(node, serializer_callback, &serialized);

	matches->push_back(serialized);
	return LXB_STATUS_OK;
}

/*
 * Runs the CSS selector expression `slctrs` against the <body> of `document` and
 * appends the serialized form of every matching node to `found` (through find_callback).
 *
 * Returns LXB_STATUS_OK on success and a non-OK status (LXB_STATUS_ERROR) on any failure;
 * the reason for a failure is written to the OBS log. Every lexbor object created here is
 * released before returning, on both the success and the failure paths.
 */
lxb_status_t find_with_selectors(const std::string &slctrs, lxb_html_document_t *document,
				 std::vector<std::string> &found)
{
	lxb_css_parser_t *parser = NULL;
	lxb_selectors_t *selectors = NULL;
	lxb_css_selector_list_t *list = NULL;
	lxb_status_t result = LXB_STATUS_ERROR;

	/* Single-pass block: `break` jumps to the shared clean-up code below. */
	do {
		/* Create CSS parser. */
		parser = lxb_css_parser_create();
		if (lxb_css_parser_init(parser, NULL) != LXB_STATUS_OK) {
			obs_log(LOG_ERROR, "Failed to setup CSS parser");
			break;
		}

		/* Selectors. */
		selectors = lxb_selectors_create();
		if (lxb_selectors_init(selectors) != LXB_STATUS_OK) {
			obs_log(LOG_ERROR, "Failed to setup Selectors");
			break;
		}

		/* Parse the selector expression; the outcome is read from the parser status. */
		list = lxb_css_selectors_parse(parser, (const lxb_char_t *)slctrs.c_str(),
					       slctrs.length());
		if (parser->status != LXB_STATUS_OK) {
			obs_log(LOG_ERROR, "Failed to parse CSS selectors");
			break;
		}

		/* Find HTML nodes by CSS Selectors, starting from the document body. */
		lxb_dom_node_t *body =
			lxb_dom_interface_node(lxb_html_document_body_element(document));
		if (body == NULL) {
			obs_log(LOG_ERROR, "Failed to find HTML nodes by CSS Selectors");
			break;
		}

		if (lxb_selectors_find(selectors, body, list, find_callback, &found) !=
		    LXB_STATUS_OK) {
			obs_log(LOG_ERROR, "Failed to find HTML nodes by CSS Selectors");
			break;
		}

		result = LXB_STATUS_OK;
	} while (false);

	/* Clean-up runs for every outcome so early failures no longer leak. */

	/* Destroy Selectors object. */
	if (selectors != NULL) {
		(void)lxb_selectors_destroy(selectors, true);
	}

	/* Destroy resources for CSS Parser. */
	if (parser != NULL) {
		(void)lxb_css_parser_destroy(parser, true);
	}

	/* Destroy all object for all CSS Selector List. */
	if (list != NULL) {
		lxb_css_selector_list_destroy_memory(list);
	}

	return result;
}

struct request_data_handler_response parse_html(struct request_data_handler_response response,
const url_source_request_data *request_data)
{
lxb_status_t status;
lxb_html_document_t *document;

document = lxb_html_document_create();
if (document == NULL) {
return make_fail_parse_response("Failed to setup HTML parser");
}

status = lxb_html_document_parse(document, (const lxb_char_t *)response.body.c_str(),
response.body.length());
if (status != LXB_STATUS_OK) {
return make_fail_parse_response("Failed to parse HTML");
}

std::string parsed_output = response.body;
// Get the output value
if (request_data->output_cssselector != "") {
std::vector<std::string> found;
if (find_with_selectors(request_data->output_cssselector, document, found) !=
LXB_STATUS_OK) {
return make_fail_parse_response("Failed to find element with CSS selector");
} else {
if (found.size() > 0) {
std::copy(found.begin(), found.end(),
std::back_inserter(response.body_parts_parsed));
}
}
} else {
// Return the whole HTML object
response.body_parts_parsed.push_back(parsed_output);
}

return response;
}
3 changes: 3 additions & 0 deletions src/parsers/parsers.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ struct request_data_handler_response parse_regex(struct request_data_handler_res
struct request_data_handler_response parse_xml(struct request_data_handler_response response,
const url_source_request_data *request_data);

struct request_data_handler_response parse_html(struct request_data_handler_response response,
const url_source_request_data *request_data);

struct request_data_handler_response
parse_xml_by_xquery(struct request_data_handler_response response,
const url_source_request_data *request_data);
Expand Down
9 changes: 7 additions & 2 deletions src/request-data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,12 @@ struct request_data_handler_response request_data_handler(url_source_request_dat
// attempt to parse as json and return the whole object
response = parse_json(response, request_data);
}
} else if (request_data->output_type == "XML (XPath)" ||
request_data->output_type == "HTML") {
} else if (request_data->output_type == "XML (XPath)") {
response = parse_xml(response, request_data);
} else if (request_data->output_type == "XML (XQuery)") {
response = parse_xml_by_xquery(response, request_data);
} else if (request_data->output_type == "HTML") {
response = parse_html(response, request_data);
} else if (request_data->output_type == "Text") {
response = parse_regex(response, request_data);
} else {
Expand Down Expand Up @@ -295,6 +296,7 @@ std::string serialize_request_data(url_source_request_data *request_data)
json["output_regex"] = request_data->output_regex;
json["output_regex_flags"] = request_data->output_regex_flags;
json["output_regex_group"] = request_data->output_regex_group;
json["output_cssselector"] = request_data->output_cssselector;
// postprocess options
json["post_process_regex"] = request_data->post_process_regex;
json["post_process_regex_is_replace"] = request_data->post_process_regex_is_replace;
Expand Down Expand Up @@ -395,6 +397,9 @@ url_source_request_data unserialize_request_data(std::string serialized_request_
request_data.output_regex = json["output_regex"].get<std::string>();
request_data.output_regex_flags = json["output_regex_flags"].get<std::string>();
request_data.output_regex_group = json["output_regex_group"].get<std::string>();
if (json.contains("output_cssselector"))
request_data.output_cssselector =
json["output_cssselector"].get<std::string>();

// postprocess options
if (json.contains("post_process_regex")) {
Expand Down
2 changes: 2 additions & 0 deletions src/request-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct url_source_request_data {
std::string output_regex;
std::string output_regex_flags;
std::string output_regex_group;
std::string output_cssselector;
// post process options
std::string post_process_regex;
bool post_process_regex_is_replace;
Expand All @@ -60,6 +61,7 @@ struct url_source_request_data {
output_regex = std::string("");
output_regex_flags = std::string("");
output_regex_group = std::string("0");
output_cssselector = std::string("");
}
};

Expand Down
Loading

0 comments on commit 8b4efbf

Please sign in to comment.