diff --git a/hearchco_example.yaml b/hearchco_example.yaml index c8b90692..d0920956 100644 --- a/hearchco_example.yaml +++ b/hearchco_example.yaml @@ -1,19 +1,4 @@ -categories: - science: - engines: - startpage: - enabled: true - duckduckgo: - enabled: false - ranking: - engines: - mojeek: - mul: 1 - general: - engines: - google: - enabled: true - server: + frontendurl: http://localhost:5173 cache: - type: none \ No newline at end of file + type: none diff --git a/src/cli/climode.go b/src/cli/climode.go index 6f692048..95756d2c 100644 --- a/src/cli/climode.go +++ b/src/cli/climode.go @@ -17,7 +17,7 @@ import ( func printResults(results []result.Result) { fmt.Print("\n\tThe Search Results:\n\n") for _, r := range results { - fmt.Printf("%v (%.2f) -----\n\t\"%v\"\n\t\"%v\"\n\t\"%v\"\n\t-", r.Rank, r.Score, r.Title, r.URL, r.Description) + fmt.Printf("%v (%.2f) -----\n\t%q\n\t%q\n\t%q\n\n%+v\n\t-", r.Rank, r.Score, r.Title, r.URL, r.Description, r) for seInd := uint8(0); seInd < r.TimesReturned; seInd++ { fmt.Printf("%v", r.EngineRanks[seInd].SearchEngine.ToLower()) if seInd != r.TimesReturned-1 { diff --git a/src/config/defaults.go b/src/config/defaults.go index ecd5b60a..3a55e4b9 100644 --- a/src/config/defaults.go +++ b/src/config/defaults.go @@ -44,6 +44,9 @@ func NewSettings() map[engines.Name]Settings { engines.BING: { Shortcut: "bi", }, + engines.BINGIMAGES: { + Shortcut: "biimg", + }, engines.BRAVE: { Shortcut: "br", }, @@ -56,6 +59,9 @@ func NewSettings() map[engines.Name]Settings { engines.GOOGLE: { Shortcut: "g", }, + engines.GOOGLEIMAGES: { + Shortcut: "gimg", + }, engines.GOOGLESCHOLAR: { Shortcut: "gs", }, @@ -114,6 +120,13 @@ func NewGeneral() []engines.Name { } } +func NewImage() []engines.Name { + return []engines.Name{ + engines.BINGIMAGES, + engines.GOOGLEIMAGES, + } +} + func NewInfo() []engines.Name { return []engines.Name{ engines.BING, @@ -158,6 +171,14 @@ func New() *Config { PageTimeout: 1000 * time.Millisecond, }, }, + category.IMAGES: { + Engines: NewImage(), + Ranking: NewRanking(), + Timings: Timings{ + Timeout: 1500 * time.Millisecond, + PageTimeout: 1500 * time.Millisecond, + }, + }, category.INFO: { Engines: NewInfo(), Ranking: NewRanking(), diff --git a/src/router/search.go b/src/router/search.go index d88e8e56..694af3a7 100644 --- a/src/router/search.go +++ b/src/router/search.go @@ -50,37 +50,37 @@ func Search(c *gin.Context, conf *config.Config, db cache.DB) error { } else { maxPages, pageserr := strconv.Atoi(pages) if pageserr != nil { - c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert pages value (\"%v\") to int", pages)) - return fmt.Errorf("router.Search(): cannot convert pages value \"%v\" to int: %w", pages, pageserr) + c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert pages value (%q) to int", pages)) + return fmt.Errorf("router.Search(): cannot convert pages value %q to int: %w", pages, pageserr) } visitPages, deeperr := strconv.ParseBool(deepSearch) if deeperr != nil { - c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert deep value (\"%v\") to bool", deepSearch)) - return fmt.Errorf("router.Search(): cannot convert deep value \"%v\" to int: %w", deepSearch, deeperr) + c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert deep value (%q) to bool", deepSearch)) + return fmt.Errorf("router.Search(): cannot convert deep value %q to int: %w", deepSearch, deeperr) } if lerr := engines.ValidateLocale(locale); lerr != nil { - c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Invalid locale value (\"%v\"), should be of the form \"en_US\"", locale)) - return fmt.Errorf("router.Search(): invalid locale value \"%v\": %w", locale, lerr) + c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Invalid locale value (%q), should be of the form \"en_US\"", locale)) + return fmt.Errorf("router.Search(): invalid locale value %q: %w", locale, lerr) } ccateg = category.SafeFromString(categ) if ccateg == category.UNDEFINED { - c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Invalid category value (\"%v\")", categ)) - return fmt.Errorf("router.Search(): invalid category value \"%v\"", categ) + c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Invalid category value (%q)", categ)) + return fmt.Errorf("router.Search(): invalid category value %q", categ) } safeSearchB, safeerr := strconv.ParseBool(safesearch) if safeerr != nil { - c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert safesearch value (\"%v\") to bool", safesearch)) - return fmt.Errorf("router.Search(): cannot convert safesearch value \"%v\" to bool: %w", safesearch, safeerr) + c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert safesearch value (%q) to bool", safesearch)) + return fmt.Errorf("router.Search(): cannot convert safesearch value %q to bool: %w", safesearch, safeerr) } isMobile, mobileerr := strconv.ParseBool(mobile) if mobileerr != nil { - c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert mobile value (\"%v\") to bool", mobile)) - return fmt.Errorf("router.Search(): cannot convert mobile value \"%v\" to bool: %w", mobile, mobileerr) + c.String(http.StatusUnprocessableEntity, fmt.Sprintf("Cannot convert mobile value (%q) to bool", mobile)) + return fmt.Errorf("router.Search(): cannot convert mobile value %q to bool: %w", mobile, mobileerr) } options := engines.Options{ diff --git a/src/search/bucket/addresult.go b/src/search/bucket/addresult.go index 442ee88b..7b489096 100644 --- a/src/search/bucket/addresult.go +++ b/src/search/bucket/addresult.go @@ -29,7 +29,7 @@ func AddSEResult(seResult *result.RetrievedResult, seName engines.Name, relay *R Description: seResult.Description, EngineRanks: engineRanks, TimesReturned: 1, - Response: nil, + ImageResult: seResult.ImageResult, } relay.Mutex.Lock() diff --git a/src/search/bucket/makeresult.go b/src/search/bucket/makeresult.go index ceca5038..163da446 100644 --- a/src/search/bucket/makeresult.go +++ b/src/search/bucket/makeresult.go @@ -5,18 +5,61 @@ import ( "github.com/hearchco/hearchco/src/search/result" ) -func MakeSEResult(urll string, title string, description string, searchEngineName engines.Name, sePage int, seOnPageRank int) *result.RetrievedResult { +func MakeSEResult( + urll, title, desc string, + seName engines.Name, sePage, seOnPageRank int, +) *result.RetrievedResult { + ser := result.RetrievedRank{ - SearchEngine: searchEngineName, + SearchEngine: seName, Rank: 0, Page: uint(sePage), OnPageRank: uint(seOnPageRank), } + res := result.RetrievedResult{ URL: urll, Title: title, - Description: description, + Description: desc, Rank: ser, } + + return &res +} + +func MakeSEImageResult( + urll, title, desc string, + src, srcUrl, thmbUrl string, + origH, origW, thmbH, thmbW int, + seName engines.Name, sePage, seOnPageRank int, +) *result.RetrievedResult { + + ser := result.RetrievedRank{ + SearchEngine: seName, + Rank: 0, + Page: uint(sePage), + OnPageRank: uint(seOnPageRank), + } + + res := result.RetrievedResult{ + URL: urll, + Title: title, + Description: desc, + ImageResult: result.ImageResult{ + Original: result.ImageFormat{ + Height: uint(origH), + Width: uint(origW), + }, + Thumbnail: result.ImageFormat{ + Height: uint(thmbH), + Width: uint(thmbW), + }, + ThumbnailURL: thmbUrl, + Source: src, + SourceURL: srcUrl, + }, + Rank: ser, + } + return &res } diff --git a/src/search/category/category.go b/src/search/category/category.go index bfa1ed54..d74057e1 100644 --- a/src/search/category/category.go +++ b/src/search/category/category.go @@ -7,6 +7,7 @@ import ( var FromString = map[string]Name{ //main "general": GENERAL, + "images": IMAGES, "info": INFO, "science": SCIENCE, "news": NEWS, diff --git a/src/search/category/name.go b/src/search/category/name.go index 2fad0509..36da6c46 100644 --- a/src/search/category/name.go +++ b/src/search/category/name.go @@ -6,6 +6,7 @@ type Name string const ( UNDEFINED Name = "undefined" GENERAL Name = "general" + IMAGES Name = "images" INFO Name = "info" SCIENCE Name = "science" NEWS Name = "news" diff --git a/src/search/engines/_engines_test/structs.go b/src/search/engines/_engines_test/structs.go index 24ec18bd..9ea569e7 100644 --- a/src/search/engines/_engines_test/structs.go +++ b/src/search/engines/_engines_test/structs.go @@ -36,6 +36,13 @@ func NewConfig(engineName engines.Name) *config.Config { Timeout: 10000 * time.Millisecond, // colly default }, }, + category.IMAGE: { + Engines: []engines.Name{engineName}, + Ranking: config.NewRanking(), + Timings: config.Timings{ + Timeout: 10000 * time.Millisecond, // colly default + }, + }, }, } } diff --git a/src/search/engines/_engines_test/tester.go b/src/search/engines/_engines_test/tester.go index 68ffd489..b68a49be 100644 --- a/src/search/engines/_engines_test/tester.go +++ b/src/search/engines/_engines_test/tester.go @@ -14,7 +14,7 @@ func CheckTestCases(tchar []TestCaseHasAnyResults, tccr []TestCaseContainsResult // TestCaseHasAnyResults for _, tc := range tchar { if results := search.PerformSearch(tc.Query, tc.Options, conf); len(results) == 0 { - defer t.Errorf("Got no results for %v", tc.Query) + defer t.Errorf("Got no results for %q", tc.Query) } } @@ -22,7 +22,7 @@ func CheckTestCases(tchar []TestCaseHasAnyResults, tccr []TestCaseContainsResult for _, tc := range tccr { results := search.PerformSearch(tc.Query, tc.Options, conf) if len(results) == 0 { - defer t.Errorf("Got no results for %v", tc.Query) + defer t.Errorf("Got no results for %q", tc.Query) } else { for _, rURL := range tc.ResultURL { found := false @@ -35,7 +35,7 @@ func CheckTestCases(tchar []TestCaseHasAnyResults, tccr []TestCaseContainsResult } if !found { - defer t.Errorf("Couldn't find %v (%v).\nThe results: %v", rURL, tc.Query, results) + defer t.Errorf("Couldn't find %q (%q).\nThe results: %q", rURL, tc.Query, results) } } } @@ -45,13 +45,13 @@ func CheckTestCases(tchar []TestCaseHasAnyResults, tccr []TestCaseContainsResult for _, tc := range tcrr { results := search.PerformSearch(tc.Query, tc.Options, conf) if len(results) == 0 { - defer t.Errorf("Got no results for %v", tc.Query) + defer t.Errorf("Got no results for %q", tc.Query) } else if len(results) < len(tc.ResultURL) { defer t.Errorf("Number of results is less than test case URLs.") } else { for i, rURL := range tc.ResultURL { if !strings.Contains(results[i].URL, rURL) { - defer t.Errorf("Wrong result on rank %v: %v (%v).\nThe results: %v", i+1, rURL, tc.Query, results) + defer t.Errorf("Wrong result on rank %q: %q (%q).\nThe results: %q", i+1, rURL, tc.Query, results) } } } diff --git a/src/search/engines/bingimages/bingimages.go b/src/search/engines/bingimages/bingimages.go new file mode 100644 index 00000000..242a237d --- /dev/null +++ b/src/search/engines/bingimages/bingimages.go @@ -0,0 +1,227 @@ +package bingimages + +import ( + "context" + "strconv" + "strings" + + "github.com/goccy/go-json" + "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" + "github.com/hearchco/hearchco/src/config" + "github.com/hearchco/hearchco/src/search/bucket" + "github.com/hearchco/hearchco/src/search/engines" + "github.com/hearchco/hearchco/src/search/engines/_sedefaults" + "github.com/rs/zerolog/log" +) + +func Search(ctx context.Context, query string, relay *bucket.Relay, options engines.Options, settings config.Settings, timings config.Timings) error { + if err := _sedefaults.Prepare(Info.Name, &options, &settings, &Support, &Info, &ctx); err != nil { + return err + } + + var col *colly.Collector + var pagesCol *colly.Collector + var retError error + + _sedefaults.InitializeCollectors(&col, &pagesCol, &settings, &options, &timings) + + _sedefaults.PagesColRequest(Info.Name, pagesCol, ctx) + _sedefaults.PagesColError(Info.Name, pagesCol) + _sedefaults.PagesColResponse(Info.Name, pagesCol, relay) + + _sedefaults.ColRequest(Info.Name, col, ctx) + _sedefaults.ColError(Info.Name, col) + + var pageRankCounter []int = make([]int, options.MaxPages*Info.ResultsPerPage) + + col.OnHTML(dompaths.Result, func(e *colly.HTMLElement) { + dom := e.DOM + + var jsonMetadata JsonMetadata + metadataS, metadataExists := dom.Find(dompaths.Metadata.Path).Attr(dompaths.Metadata.Attr) + if !metadataExists { + log.Error(). + Str("engine", Info.Name.String()). + Msg("Matched result, but couldn't retrieve data") + return + } + + if err := json.Unmarshal([]byte(metadataS), &jsonMetadata); err != nil { + log.Error(). + Err(err). + Str("jsonMetadata", metadataS). + Msg("bingimages.Search() -> onHTML: failed to unmarshal metadata") + return + } + + if jsonMetadata.ImageURL == "" || jsonMetadata.PageURL == "" || jsonMetadata.ThumbnailURL == "" { + log.Error(). + Str("engine", Info.Name.String()). + Str("jsonMetadata", metadataS). + Str("url", jsonMetadata.PageURL). + Str("original", jsonMetadata.ImageURL). + Str("thumbnail", jsonMetadata.ThumbnailURL). + Msg("bingimages.Search() -> onHTML: Couldn't find image, thumbnail, or page URL") + return + } + + titleText := strings.TrimSpace(dom.Find(dompaths.Title).Text()) + if titleText == "" { + // could also use the json data ("t" field), it seems to include weird/erroneous characters though (particularly '\ue000' and '\ue001') + log.Error(). + Str("engine", Info.Name.String()). + Str("jsonMetadata", metadataS). + Msg("bingimages.Search() -> onHTML: Couldn't find title") + return + } + + // this returns "2000 x 1500 · jpeg" + imgFormatS := strings.TrimSpace(dom.Find(dompaths.ImgFormatStr).Text()) + if imgFormatS == "" { + log.Trace(). + Str("engine", Info.Name.String()). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Msg("bingimages.Search() -> onHTML: Couldn't find image format (probably a video)") + return + } + + // convert to "2000x1500·jpeg" + imgFormatS = strings.ReplaceAll(imgFormatS, " ", "") + // remove everything after 2000x1500 + imgFormatS = strings.Split(imgFormatS, "·")[0] + // create height and width + imgFormat := strings.Split(imgFormatS, "x") + + imgH, err := strconv.Atoi(imgFormat[0]) + if err != nil { + log.Error(). + Err(err). + Str("engine", Info.Name.String()). + Str("height", imgFormat[0]). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Str("imgFormatS", imgFormatS). + Msg("bingimages.Search() -> onHTML: Failed to convert original height to int") + return + } + + imgW, err := strconv.Atoi(imgFormat[1]) + if err != nil { + log.Error(). + Err(err). + Str("engine", Info.Name.String()). + Str("width", imgFormat[1]). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Str("imgFormatS", imgFormatS). + Msg("bingimages.Search() -> onHTML: Failed to convert original width to int") + return + } + + found := false + var thmbHS, thmbWS string + for _, thmb := range dompaths.Thumbnail { + var thmbHExists, thmbWExists bool + thmbHS, thmbHExists = dom.Find(thmb.Path).Attr(thmb.Height) + thmbWS, thmbWExists = dom.Find(thmb.Path).Attr(thmb.Width) + if thmbHExists && thmbWExists { + found = true + break + } + } + + if !found { + log.Error(). + Str("engine", Info.Name.String()). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Str("height", thmbHS). + Str("width", thmbWS). + Msg("bingimages.Search() -> onHTML: Couldn't find thumbnail format") + return + } + + thmbH, err := strconv.Atoi(thmbHS) + if err != nil { + log.Error(). + Err(err). + Str("engine", Info.Name.String()). + Str("height", thmbHS). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Msg("bingimages.Search() -> onHTML: Failed to convert thumbnail height to int") + return + } + + thmbW, err := strconv.Atoi(thmbWS) + if err != nil { + log.Error(). + Err(err). + Str("engine", Info.Name.String()). + Str("width", thmbWS). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Msg("bingimages.Search() -> onHTML: Failed to convert thumbnail width to int") + return + } + + source := strings.TrimSpace(dom.Find(dompaths.Source).Text()) + if source == "" { + log.Error(). + Str("engine", Info.Name.String()). + Str("jsonMetadata", metadataS). + Str("title", titleText). + Msg("bingimages.Search() -> onHTML: Couldn't find source") + return + } + + page := _sedefaults.PageFromContext(e.Request.Ctx, Info.Name) + + res := bucket.MakeSEImageResult( + jsonMetadata.ImageURL, titleText, jsonMetadata.Desc, + source, jsonMetadata.PageURL, jsonMetadata.ThumbnailURL, + imgH, imgW, thmbH, thmbW, + Info.Name, page, pageRankCounter[page]+1, + ) + bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) + pageRankCounter[page]++ + }) + + col.OnResponse(func(r *colly.Response) { + if len(r.Body) == 0 { + log.Trace(). + Str("engine", Info.Name.String()). + Msg("Got empty response, probably too many requests") + } + }) + + localeParam := getLocale(&options) + + colCtx := colly.NewContext() + colCtx.Put("page", strconv.Itoa(1)) + + urll := Info.URL + query + params[0] + "&first=1" + params[1] + localeParam + anonUrll := Info.URL + anonymize.String(query) + params[0] + "&first=1" + params[1] + localeParam + _sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) + + for i := 1; i < options.MaxPages; i++ { + colCtx = colly.NewContext() + colCtx.Put("page", strconv.Itoa(i+1)) + + urll := Info.URL + query + params[0] + "&first=" + strconv.Itoa(i*10+1) + params[1] + localeParam + anonUrll := Info.URL + anonymize.String(query) + params[0] + "&first=" + strconv.Itoa(i*10+1) + params[1] + localeParam + _sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) + } + + col.Wait() + pagesCol.Wait() + + return retError +} + +func getLocale(options *engines.Options) string { + spl := strings.SplitN(strings.ToLower(options.Locale), "_", 2) + return "&setlang=" + spl[0] + "&cc=" + spl[1] +} diff --git a/src/search/engines/bingimages/bingimages_test.go b/src/search/engines/bingimages/bingimages_test.go new file mode 100644 index 00000000..fe1b734d --- /dev/null +++ b/src/search/engines/bingimages/bingimages_test.go @@ -0,0 +1,44 @@ +package bingimages_test + +import ( + "testing" + + "github.com/hearchco/hearchco/src/search/engines" + "github.com/hearchco/hearchco/src/search/engines/_engines_test" +) + +func TestSearch(t *testing.T) { + engineName := engines.BINGIMAGES + + // testing config + conf := _engines_test.NewConfig(engineName) + + // test cases + tchar := [...]_engines_test.TestCaseHasAnyResults{{ + Query: "ping", + Options: engines.Options{ + MaxPages: 1, + VisitPages: false, + }, + }} + + tccr := [...]_engines_test.TestCaseContainsResults{{ + Query: "wikipedia logo", + ResultURL: []string{"upload.wikimedia.org"}, + Options: engines.Options{ + MaxPages: 1, + VisitPages: false, + }, + }} + + tcrr := [...]_engines_test.TestCaseRankedResults{{ + Query: "linux logo wikipedia", + ResultURL: []string{"logos-world.net"}, + Options: engines.Options{ + MaxPages: 1, + VisitPages: false, + }, + }} + + _engines_test.CheckTestCases(tchar[:], tccr[:], tcrr[:], t, conf) +} diff --git a/src/search/engines/bingimages/dom.go b/src/search/engines/bingimages/dom.go new file mode 100644 index 00000000..dbd551f7 --- /dev/null +++ b/src/search/engines/bingimages/dom.go @@ -0,0 +1,50 @@ +package bingimages + +type thumbnailDomPaths struct { + Path string + Height string + Width string +} + +type metadataDomPaths struct { + Path string + Attr string +} + +type bingImagesDomPaths struct { + Result string + Metadata metadataDomPaths + Title string + ImgFormatStr string + Thumbnail []thumbnailDomPaths + Source string +} + +var dompaths = bingImagesDomPaths{ + // aria-live is also a possible attribute for not() + Result: "ul.dgControl_list > li[data-idx] > div.iuscp:not([vrhatt])", + Metadata: metadataDomPaths{ + Path: "a.iusc", + Attr: "m", + }, + Title: "div.infnmpt > div > ul > li > a", + ImgFormatStr: "div.imgpt > div > span", + Thumbnail: []thumbnailDomPaths{ + { + Path: "a.iusc > div > img.mimg", + Height: "height", + Width: "width", + }, + { + Path: "a.iusc > div > div > div.mimg > div", + Height: "data-height", + Width: "data-width", + }, + { + Path: "a.iusc > div > div > div.mimg > img", + Height: "height", + Width: "width", + }, + }, + Source: "div.imgpt > div > div.lnkw > a", +} diff --git a/src/search/engines/bingimages/json.go b/src/search/engines/bingimages/json.go new file mode 100644 index 00000000..3af5e86e --- /dev/null +++ b/src/search/engines/bingimages/json.go @@ -0,0 +1,8 @@ +package bingimages + +type JsonMetadata struct { + PageURL string `json:"purl"` + ThumbnailURL string `json:"turl"` + ImageURL string `json:"murl"` + Desc string `json:"desc"` +} diff --git a/src/search/engines/bingimages/options.go b/src/search/engines/bingimages/options.go new file mode 100644 index 00000000..39c29cd1 --- /dev/null +++ b/src/search/engines/bingimages/options.go @@ -0,0 +1,18 @@ +package bingimages + +import ( + "github.com/hearchco/hearchco/src/search/engines" +) + +var params = []string{"&async=1", "&count=35"} + +var Info = engines.Info{ + Domain: "www.bing.com", + Name: engines.BINGIMAGES, + URL: "https://www.bing.com/images/async?q=", + ResultsPerPage: 35, +} + +var Support = engines.SupportedSettings{ + Locale: true, +} diff --git a/src/search/engines/googleimages/googleimages.go b/src/search/engines/googleimages/googleimages.go new file mode 100644 index 00000000..0e1f1552 --- /dev/null +++ b/src/search/engines/googleimages/googleimages.go @@ -0,0 +1,110 @@ +package googleimages + +import ( + "context" + "fmt" + "strconv" + "strings" + + "github.com/goccy/go-json" + "github.com/gocolly/colly/v2" + "github.com/hearchco/hearchco/src/anonymize" + "github.com/hearchco/hearchco/src/config" + "github.com/hearchco/hearchco/src/search/bucket" + "github.com/hearchco/hearchco/src/search/engines" + "github.com/hearchco/hearchco/src/search/engines/_sedefaults" + "github.com/rs/zerolog/log" +) + +func Search(ctx context.Context, query string, relay *bucket.Relay, options engines.Options, settings config.Settings, timings config.Timings) error { + if err := _sedefaults.Prepare(Info.Name, &options, &settings, &Support, &Info, &ctx); err != nil { + return err + } + + var col *colly.Collector + var pagesCol *colly.Collector + var retError error + + _sedefaults.InitializeCollectors(&col, &pagesCol, &settings, &options, &timings) + // disable User Agent since Google Images responds with fake data if UA is correct + col.UserAgent = "" + + _sedefaults.PagesColRequest(Info.Name, pagesCol, ctx) + _sedefaults.PagesColError(Info.Name, pagesCol) + _sedefaults.PagesColResponse(Info.Name, pagesCol, relay) + + _sedefaults.ColRequest(Info.Name, col, ctx) + _sedefaults.ColError(Info.Name, col) + + var pageRankCounter = make([]int, options.MaxPages*Info.ResultsPerPage) + + col.OnResponse(func(e *colly.Response) { + body := string(e.Body) + index := strings.Index(body, "{\"ischj\":") + + if index == -1 { + log.Error(). + Str("body", body). + Msg("googleimages.Search() -> col.OnResponse: failed parsing response: failed finding start of JSON") + return + } + + body = body[index:] + var jsonResponse JsonResponse + if err := json.Unmarshal([]byte(body), &jsonResponse); err != nil { + log.Error(). + Str("body", body). + Msg("googleimages.Search() -> col.OnResponse: failed parsing response: failed unmarshalling JSON") + return + } + + page := _sedefaults.PageFromContext(e.Request.Ctx, Info.Name) + + for _, metadata := range jsonResponse.ISCHJ.Metadata { + origImg := metadata.OriginalImage + thmbImg := metadata.Thumbnail + resultJson := metadata.Result + textInGridJson := metadata.TextInGrid + + if resultJson.ReferrerUrl != "" && origImg.Url != "" && thmbImg.Url != "" { + res := bucket.MakeSEImageResult( + origImg.Url, resultJson.PageTitle, textInGridJson.Snippet, + resultJson.SiteTitle, resultJson.ReferrerUrl, thmbImg.Url, + origImg.Height, origImg.Width, thmbImg.Height, thmbImg.Width, + Info.Name, page, pageRankCounter[page]+1, + ) + bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) + pageRankCounter[page]++ + } else { + log.Error(). + Str("engine", Info.Name.String()). + Str("jsonMetadata", fmt.Sprintf("%v", metadata)). + Str("url", resultJson.ReferrerUrl). + Str("original", origImg.Url). + Str("thumbnail", thmbImg.Url). + Msg("googleimages.Search() -> col.OnResponse: Couldn't find image URL") + } + } + }) + + colCtx := colly.NewContext() + colCtx.Put("page", strconv.Itoa(1)) + + urll := Info.URL + query + params + "1" + anonUrll := Info.URL + anonymize.String(query) + params + "1" + _sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) + + for i := 1; i < options.MaxPages; i++ { + colCtx = colly.NewContext() + colCtx.Put("page", strconv.Itoa(i+1)) + + urll := Info.URL + query + params + strconv.Itoa(i*10) + anonUrll := Info.URL + anonymize.String(query) + params + strconv.Itoa(i*10) + _sedefaults.DoGetRequest(urll, anonUrll, colCtx, col, Info.Name, &retError) + } + + col.Wait() + pagesCol.Wait() + + return retError +} diff --git a/src/search/engines/googleimages/googleimages_test.go b/src/search/engines/googleimages/googleimages_test.go new file mode 100644 index 00000000..8a3a05dd --- /dev/null +++ b/src/search/engines/googleimages/googleimages_test.go @@ -0,0 +1,44 @@ +package googleimages_test + +import ( + "testing" + + "github.com/hearchco/hearchco/src/search/engines" + "github.com/hearchco/hearchco/src/search/engines/_engines_test" +) + +func TestSearch(t *testing.T) { + engineName := engines.GOOGLEIMAGES + + // testing config + conf := _engines_test.NewConfig(engineName) + + // test cases + tchar := [...]_engines_test.TestCaseHasAnyResults{{ + Query: "ping", + Options: engines.Options{ + MaxPages: 1, + VisitPages: false, + }, + }} + + tccr := [...]_engines_test.TestCaseContainsResults{{ + Query: "wikipedia logo", + ResultURL: []string{"upload.wikimedia.org"}, + Options: engines.Options{ + MaxPages: 1, + VisitPages: false, + }, + }} + + tcrr := [...]_engines_test.TestCaseRankedResults{{ + Query: "linux logo wikipedia", + ResultURL: []string{"upload.wikimedia.org"}, + Options: engines.Options{ + MaxPages: 1, + VisitPages: false, + }, + }} + + _engines_test.CheckTestCases(tchar[:], tccr[:], tcrr[:], t, conf) +} diff --git a/src/search/engines/googleimages/json.go b/src/search/engines/googleimages/json.go new file mode 100644 index 00000000..9f243fa1 --- /dev/null +++ b/src/search/engines/googleimages/json.go @@ -0,0 +1,32 @@ +package googleimages + +type Result struct { + ReferrerUrl string `json:"referrer_url"` + PageTitle string `json:"page_title"` + SiteTitle string `json:"site_title"` +} + +type TextInGrid struct { + Snippet string `json:"snippet"` +} + +type Image struct { + Url string `json:"url"` + Height int `json:"height"` + Width int `json:"width"` +} + +type Metadata struct { + Result Result `json:"result"` + TextInGrid TextInGrid `json:"text_in_grid"` + OriginalImage Image `json:"original_image"` + Thumbnail Image `json:"thumbnail"` +} + +type ISCHJ struct { + Metadata []Metadata `json:"metadata"` +} + +type JsonResponse struct { + ISCHJ ISCHJ `json:"ischj"` +} diff --git a/src/search/engines/googleimages/options.go b/src/search/engines/googleimages/options.go new file mode 100644 index 00000000..78108090 --- /dev/null +++ b/src/search/engines/googleimages/options.go @@ -0,0 +1,16 @@ +package googleimages + +import ( + "github.com/hearchco/hearchco/src/search/engines" +) + +var params = "&tbm=isch&asearch=isch&async=_fmt:json,p:1,ijn:" + +var Info engines.Info = engines.Info{ + Domain: "images.google.com", + Name: engines.GOOGLEIMAGES, + URL: "https://www.google.com/search?q=", + ResultsPerPage: 10, +} + +var Support engines.SupportedSettings = engines.SupportedSettings{} diff --git a/src/search/engines/googlescholar/googlescholar.go b/src/search/engines/googlescholar/googlescholar.go index 5476526c..97afa902 100644 --- a/src/search/engines/googlescholar/googlescholar.go +++ b/src/search/engines/googlescholar/googlescholar.go @@ -81,7 +81,10 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi func removeTelemetry(link string) string { parsedURL, err := url.Parse(link) if err != nil { - log.Error().Err(err).Msgf("error parsing url: %#v", link) + log.Error(). + Err(err). + Str("link", link). + Msg("error parsing link") return link } diff --git a/src/search/engines/name.go b/src/search/engines/name.go index d90edd88..9e3eb2f0 100644 --- a/src/search/engines/name.go +++ b/src/search/engines/name.go @@ -9,10 +9,12 @@ type Name uint8 const ( UNDEFINED Name = iota BING + BINGIMAGES BRAVE DUCKDUCKGO ETOOLS GOOGLE + GOOGLEIMAGES GOOGLESCHOLAR MOJEEK PRESEARCH diff --git a/src/search/result/result.go b/src/search/result/result.go index 36905f34..155b63cc 100644 --- a/src/search/result/result.go +++ b/src/search/result/result.go @@ -4,6 +4,19 @@ import ( "github.com/gocolly/colly/v2" ) +type ImageFormat struct { + Height uint + Width uint +} + +type ImageResult struct { + Original ImageFormat + Thumbnail ImageFormat + ThumbnailURL string + Source string + SourceURL string +} + // Everything about some Result, calculated and compiled from multiple search engines // The URL is the primary key type Result struct { @@ -14,6 +27,7 @@ type Result struct { Description string EngineRanks []RetrievedRank TimesReturned uint8 + ImageResult ImageResult Response *colly.Response } diff --git a/src/search/result/retrieved.go b/src/search/result/retrieved.go index 093814b1..d5101b5a 100644 --- a/src/search/result/retrieved.go +++ b/src/search/result/retrieved.go @@ -16,5 +16,6 @@ type RetrievedResult struct { URL string Title string Description string + ImageResult ImageResult Rank RetrievedRank }