Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

geckodriver directly in execution #26

Merged
merged 1 commit into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"description": "Scrapping French 'Drive' Supermarket wWbsite",
"description": "Scrapping French 'Drive' Supermarket Website",
"license": "MIT",
"name": "chrismile0/scrapper",
"autoload": {
Expand Down
40 changes: 30 additions & 10 deletions src/scrapper_auchan.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
use Facebook\WebDriver\WebDriverBy as WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition as WebDriverExpectedCondition;
use Facebook\WebDriver\WebDriverKeys as WebDriverKeys;
use Facebook\WebDriver\Firefox\FirefoxDriver as FirefoxDriver;
use Facebook\WebDriver\Firefox\FirefoxProfile as FirefoxProfile;
require __DIR__ . '/../../../autoload.php'; // EXPORT
//require __DIR__ . '/../vendor/autoload.php'; // DEV

Expand All @@ -71,23 +73,40 @@ function change_quantity_a(string $libelle) : string {
* [BRIEF] generate an instance of a firefox driver with 'geckodriver' server
* (localhost:4444)
* @param int $p port
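* @param bool $web_server false to connect to a geckodriver server already running on localhost:$p,
* true to start geckodriver locally through FirefoxDriver::start ($p is then unused)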
* @example generate_driver_a()
* @author chriSmile0
* @return /
*/
function generate_driver_a(int $p) {
function generate_driver_a(int $p, bool $web_server) {
//-----------------Remote with geckodriver in terminal--------------------//
$host = 'http://localhost:'.$p.'/';
if(!$web_server) {
$host = 'http://localhost:'.$p.'/';

$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['-headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
return NULL;
}
}
//------------FirefoxDriver, geckodriver directly on this process--------//
//shell_exec("kill -s kill `ps -e | grep -e geckodriver | grep -Eo '[0-9]{1,10}' | head -n 1`");sleep(1);
$firefoxOptions = new FirefoxOptions();
$firefoxOptions->setProfile(new FirefoxProfile());
$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['--headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
return FirefoxDriver::start($capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
echo "ERRRRRR : ".$e->getMessage()."\n";
return NULL;
}
}
Expand Down Expand Up @@ -337,13 +356,14 @@ function extract_source_auchan(string $url,$driver, string $town, string $target
* @param string $target_product the target product
* @param string $town the research area
* @param int $p port
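* @param bool $web_server forwarded to generate_driver_a(): false to use the geckodriver server
* on localhost:$p, true to start geckodriver locally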
* @example content_scrap_auchan((@see URL1),"lardons","Paris")
* @author chriSmile0
* @return array array of all product with specific information that we needed
*/
function content_scrap_auchan(string $target_product, string $town, int $p) : array {
function content_scrap_auchan(string $target_product, string $town, int $p, bool $web_server) : array {
$url = "https://www.auchan.fr/";
$driver = generate_driver_a($p);
$driver = generate_driver_a($p,$web_server);
if($driver === NULL)
return array();

Expand Down Expand Up @@ -400,15 +420,15 @@ function content_scrap_auchan(string $target_product, string $town, int $p) : ar
* test or if the scrapping failed
*/
function main_a($argc, $argv) : bool {
if($argc == 5) {
if(empty(content_scrap_auchan($argv[1],$argv[2],$argv[3]))) {
if($argc == 6) {
if(empty(content_scrap_auchan($argv[1],$argv[2],$argv[3],strtolower($argv[4])==="true"))) {
echo "NO CORRESPONDENCE FOUND \n";
return 0;
}
return 1;
}
else {
echo "ERROR : format : ". $argv[0] . "[research_product_type] [town] [port] --with-openssl\n";
echo "ERROR : format : ". $argv[0] . "[research_product_type] [town] [port] [?webserver] --with-openssl\n";
return 0;
}
echo "EXECUTION FINISH WITH SUCCESS \n";
Expand Down
40 changes: 30 additions & 10 deletions src/scrapper_carrefour.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
use Facebook\WebDriver\Remote\RemoteWebDriver;
use Facebook\WebDriver\WebDriverBy as WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition as WebDriverExpectedCondition;
use Facebook\WebDriver\Firefox\FirefoxDriver as FirefoxDriver;
use Facebook\WebDriver\Firefox\FirefoxProfile as FirefoxProfile;
require __DIR__ . '/../../../autoload.php'; // EXPORT
//require __DIR__ . '/../vendor/autoload.php'; // DEV

Expand Down Expand Up @@ -107,23 +109,40 @@ function change_quantity_c($libelle) {
* [BRIEF] generate an instance of a firefox driver with 'geckodriver' server
* (localhost:4444)
* @param int $p port
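* @param bool $web_server false to connect to a geckodriver server already running on localhost:$p,
* true to start geckodriver locally through FirefoxDriver::start ($p is then unused)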
* @example generate_driver_c()
* @author chriSmile0
* @return /
*/
function generate_driver_c(int $p) {
function generate_driver_c(int $p, bool $web_server) {
//-----------------Remote with geckodriver in terminal--------------------//
$host = 'http://localhost:'.$p.'/';
if(!$web_server) {
$host = 'http://localhost:'.$p.'/';

$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['-headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
return NULL;
}
}
//------------FirefoxDriver, geckodriver directly on this process--------//
//shell_exec("kill -s kill `ps -e | grep -e geckodriver | grep -Eo '[0-9]{1,10}' | head -n 1`");sleep(1);
$firefoxOptions = new FirefoxOptions();
$firefoxOptions->setProfile(new FirefoxProfile());
$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['--headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
return FirefoxDriver::start($capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
echo "ERRRRRR : ".$e->getMessage()."\n";
return NULL;
}
}
Expand Down Expand Up @@ -472,14 +491,15 @@ function extract_info_for_all_products_c(array $tab_json, array $needed_key) : a
* @param string $target_product the target product
* @param string $city the city to target
* @param int $p port
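* @param bool $web_server forwarded to generate_driver_c(): false to use the geckodriver server
* on localhost:$p, true to start geckodriver locally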
* @example content_scrap_carrefour((@see URL1),"lardons")
* @author chriSmile0
* @return array array of all product with specific information that we needed
*/
function content_scrap_carrefour(string $target_product, string $city, int $p) : array {
function content_scrap_carrefour(string $target_product, string $city, int $p, bool $web_server) : array {
$url = "https://www.carrefour.fr/courses";
$rtn = array();
$driver = generate_driver_c($p);
$driver = generate_driver_c($p,$web_server);
if($driver !== NULL) {

$product_needed_key = [ // On ATTRIBUTES
Expand Down Expand Up @@ -551,14 +571,14 @@ function content_scrap_carrefour(string $target_product, string $city, int $p) :
* test or if the scrapping failed
*/
function main_c($argc, $argv) : bool {
if($argc == 5) {
if(empty(content_scrap_carrefour($argv[1],$argv[2],$argv[3]))) {
if($argc == 6) {
if(empty(content_scrap_carrefour($argv[1],$argv[2],$argv[3],strtolower($argv[4])==="true"))) {
echo "NO CORRESPONDENCE FOUND \n";
return 0;
}
}
else {
echo "ERROR : format : ". $argv[0] . " [research_product_type] [city] [port] --with-openssl\n";
echo "ERROR : format : ". $argv[0] . " [research_product_type] [city] [port] [?webserver] --with-openssl\n";
return 0;
}
echo "EXECUTION FINISH WITH SUCCESS \n";
Expand Down
40 changes: 30 additions & 10 deletions src/scrapper_intermarche.php
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
use Facebook\WebDriver\WebDriverBy as WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition as WebDriverExpectedCondition;
use Facebook\WebDriver\WebDriverKeys as WebDriverKeys;
use Facebook\WebDriver\Firefox\FirefoxDriver as FirefoxDriver;
use Facebook\WebDriver\Firefox\FirefoxProfile as FirefoxProfile;
require __DIR__ . '/../../../autoload.php'; // EXPORT
//require __DIR__ . '/../vendor/autoload.php'; // DEV

Expand Down Expand Up @@ -113,23 +115,40 @@ function change_quantity_i(string $libelle, $kg_price, $price) : string {
* [BRIEF] generate an instance of a firefox driver with 'geckodriver' server
* (localhost:4444)
* @param int $p port
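* @param bool $web_server false to connect to a geckodriver server already running on localhost:$p,
* true to start geckodriver locally through FirefoxDriver::start ($p is then unused)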
* @example generate_driver_i()
* @author chriSmile0
* @return /
*/
function generate_driver_i(int $p) {
function generate_driver_i(int $p, bool $web_server) {
//-----------------Remote with geckodriver in terminal--------------------//
$host = 'http://localhost:'.$p.'/';
if(!$web_server) {
$host = 'http://localhost:'.$p.'/';

$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['-headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
return NULL;
}
}
//------------FirefoxDriver, geckodriver directly on this process--------//
//shell_exec("kill -s kill `ps -e | grep -e geckodriver | grep -Eo '[0-9]{1,10}' | head -n 1`");sleep(1);
$firefoxOptions = new FirefoxOptions();
$firefoxOptions->setProfile(new FirefoxProfile());
$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['--headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
return FirefoxDriver::start($capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
echo "ERRRRRR : ".$e->getMessage()."\n";
return NULL;
}
}
Expand Down Expand Up @@ -532,14 +551,15 @@ function extract_needed_information_pro_i(array $json, array $needed_key) : arra
* @param string $target_product the target product
* @param string $town the town
* @param int $p port
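* @param bool $web_server forwarded to generate_driver_i(): false to use the geckodriver server
* on localhost:$p, true to start geckodriver locally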
* @example content_scrap_intermarche((@see URL1),"lardons")
* @author chriSmile0
* @return array array of all product with specific information that we needed
*/
function content_scrap_intermarche(string $target_product, string $town, int $p) : array {
function content_scrap_intermarche(string $target_product, string $town, int $p, bool $web_server) : array {
$url = "https://www.intermarche.com/";
$rtn = array();
$driver = generate_driver_i($p);
$driver = generate_driver_i($p,$web_server);
if($driver !== NULL) {

$product_needed_key = [ // On ATTRIBUTES
Expand Down Expand Up @@ -606,14 +626,14 @@ function content_scrap_intermarche(string $target_product, string $town, int $p)
* test or if the scrapping failed
*/
function main_i($argc, $argv) : bool {
if($argc == 5) {
if(empty(content_scrap_intermarche($argv[1],$argv[2],$argv[3]))) {
if($argc == 6) {
if(empty(content_scrap_intermarche($argv[1],$argv[2],$argv[3],strtolower($argv[4])==="true"))) {
echo "NO CORRESPONDENCE FOUND \n";
return 0;
}
}
else {
echo "ERROR : format : ". $argv[0] . "[research_product_type] [town] [port] --with-openssl\n";
echo "ERROR : format : ". $argv[0] . "[research_product_type] [town] [port] [?webserver]--with-openssl\n";
return 0;
}
echo "EXECUTION FINISH WITH SUCCESS \n";
Expand Down
51 changes: 37 additions & 14 deletions src/scrapper_monoprix.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
use Facebook\WebDriver\Firefox\FirefoxOptions as FirefoxOptions;
use Facebook\WebDriver\Remote\DesiredCapabilities as DesiredCapabilities;
use Facebook\WebDriver\Remote\RemoteWebDriver as RemoteWebDriver;
use Facebook\WebDriver\Firefox\FirefoxDriver as FirefoxDriver;
use Facebook\WebDriver\Firefox\FirefoxProfile as FirefoxProfile;
require __DIR__ . '/../../../autoload.php'; // EXPORT
//require __DIR__ . '/../vendor/autoload.php'; // DEV

Expand Down Expand Up @@ -100,24 +102,42 @@ function change_quantity_m(string $libelle) : string {
/**
* [BRIEF] generate an instance of a firefox driver with 'geckodriver' server
* (localhost:4444)
* @param int $o port
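* @param int $p port
* @param bool $web_server false to connect to a geckodriver server already running on localhost:$p,
* true to start geckodriver locally through FirefoxDriver::start ($p is then unused)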
* @example generate_driver_m()
* @author chriSmile0
* @return /
*/
function generate_driver_m(int $p) {
function generate_driver_m(int $p, bool $web_server) {
//-----------------Remote with geckodriver in terminal--------------------//
$host = 'http://localhost:'.$p.'/';
var_dump($web_server);
if(!$web_server) {
$host = 'http://localhost:'.$p.'/';

$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['-headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
return NULL;
}
}
//------------FirefoxDriver, geckodriver directly on this process--------//
//shell_exec("kill -s kill `ps -e | grep -e geckodriver | grep -Eo '[0-9]{1,10}' | head -n 1`");sleep(1);
$firefoxOptions = new FirefoxOptions();
$firefoxOptions->setProfile(new FirefoxProfile());
$capabilities = DesiredCapabilities::firefox();
$firefoxOptions = new FirefoxOptions;
$firefoxOptions->addArguments(['-headless']);
$firefoxOptions->addArguments(['--headless']);
$capabilities->setCapability(FirefoxOptions::CAPABILITY, $firefoxOptions);
try {
return RemoteWebDriver::create($host,$capabilities);
return FirefoxDriver::start($capabilities);
}
catch (Exception $e) {
echo "ERRRRRR_REMOTE : ".$e->getMessage()."\n";
echo "ERRRRRR : ".$e->getMessage()."\n";
return NULL;
}
}
Expand All @@ -128,13 +148,15 @@ function generate_driver_m(int $p) {
* @param string $url the url to get in the browser
* @param int $js_or_selenium 0 for js 1 for sele
* @param int $p port
* @param bool $web_server true or false
* @example extract_source_monoprix((@see URL1),1)
* @author chriSmile0
* @return string the source code
*/
function extract_source_monoprix(string $url, int $js_or_selenium, int $p) : string {
function extract_source_monoprix(string $url, int $js_or_selenium, int $p,
bool $web_server) : string {
if($js_or_selenium == 1) {
$driver = generate_driver_m($p);
$driver = generate_driver_m($p,$web_server);
if($driver == NULL)
return "";
$driver->get($url);
Expand Down Expand Up @@ -321,15 +343,16 @@ function extract_needed_information_pro_m(array $json, array $needed_key) : arra
*
* @param string $target_product the target product
* @param int $p port
* @param bool $web_server true or false
* @example content_scrap_monoprix((@see URL1),"lardons")
* @author chriSmile0
* @return array array of all product with specific information that we needed
*/
function content_scrap_monoprix(string $target_product, int $p) : array {
function content_scrap_monoprix(string $target_product, int $p, bool $web_server) : array {
$url = "https://courses.monoprix.fr/products/search?q=";
$rtn = array();
//check if $target_product is in the list of product (lardons,oeufs , etc)
$script = extract_source_monoprix($url.$target_product,1,$p);
$script = extract_source_monoprix($url.$target_product,1,$p,$web_server);
if(empty($prods = all_subcontent_with_trunk_v21_m($script,"{\"productId\":",[",\"retailerFinancingPlanIds\""],false,0,"}")))
return $rtn;

Expand All @@ -356,14 +379,14 @@ function content_scrap_monoprix(string $target_product, int $p) : array {
* test or if the scrapping failed
*/
function main_m($argc, $argv) : bool {
if($argc == 4) {
if(empty(content_scrap_monoprix($argv[1],$argv[2]))) {
if($argc == 5) {
if(empty(content_scrap_monoprix($argv[1],$argv[2],strtolower($argv[3])==="true"))) {
echo "NO CORRESPONDENCE FOUND \n";
return 0;
}
}
else {
echo "ERROR : format : ". $argv[0] . "[research_product_type] [port] --with-openssl\n";
echo "ERROR : format : ". $argv[0] . "[research_product_type] [port] [?webserver] --with-openssl\n";
return 0;
}
echo "EXECUTION FINISH WITH SUCCESS \n";
Expand Down
Loading
Loading