Skip to content

Commit

Permalink
Fix and add html extractor (#201)
Browse files Browse the repository at this point in the history
* Init super class in html doc extractor

* Add inscriptis html extractor

* Add integration tests

Co-authored-by: Augustin Godinot <[email protected]>
  • Loading branch information
grodino and Augustin Godinot authored Oct 19, 2022
1 parent 6db2d52 commit e3d8e1c
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 1 deletion.
7 changes: 7 additions & 0 deletions ir_datasets/lazy_libs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ def bs4():
return _cache['bs4']


def inscriptis():
    """Return the ``inscriptis`` module, importing it only on first use.

    The imported module is stored in the shared ``_cache`` mapping under the
    key ``'inscriptis'`` so later calls return the same object without
    running the import statement again.
    """
    key = 'inscriptis'
    if key in _cache:
        return _cache[key]
    # Deferred import: the dependency is only loaded when actually requested.
    import inscriptis as module
    _cache[key] = module
    return module


def yaml():
if 'yaml' not in _cache:
import yaml
Expand Down
8 changes: 7 additions & 1 deletion ir_datasets/wrappers/html_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ def bs4_extract(html):
output += '{} '.format(t)
return output

def inscriptis_extract(html):
    """Convert raw HTML into plain text using inscriptis.

    ``html`` is decoded as UTF-8 with the ``'ignore'`` error handler (so
    undecodable bytes are dropped rather than raising) and the resulting
    string is passed to ``inscriptis.get_text``, whose return value is
    returned unchanged.
    """
    # Resolve the lazily imported library before touching the input.
    render_text = ir_datasets.lazy_libs.inscriptis().get_text
    decoded_html = html.decode("utf-8", 'ignore')
    return render_text(decoded_html)


class HtmlDocIter:
def __init__(self, it, extractor):
Expand Down Expand Up @@ -73,6 +77,7 @@ def docs_store(self):

class HtmlDocExtractorDocStoreWrapper(ir_datasets.indices.Docstore):
def __init__(self, docstore, extractor):
    """Wrap ``docstore`` so that ``extractor`` can be applied to its documents.

    The base class is initialised with the wrapped docstore's ``_doc_cls``
    and ``_id_field``; the wrapped docstore and the extractor are then kept
    on the instance as ``self.docstore`` and ``self.extractor``.
    """
    # Mirror the wrapped store's document class and id field in the base class.
    doc_cls, id_field = docstore._doc_cls, docstore._id_field
    super().__init__(doc_cls, id_field)
    self.docstore, self.extractor = docstore, extractor

Expand Down Expand Up @@ -118,7 +123,8 @@ def it_out():
def _doc_map(args):
doc, field_content_type, extractor, docs_cls = args
extractor = {
'bs4': bs4_extract
'bs4': bs4_extract,
'inscriptis': inscriptis_extract,
}[extractor]
result = list(doc)
any_updates = False
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
beautifulsoup4>=4.4.1
inscriptis>=2.2.0
lxml>=4.5.2
numpy>=1.18.1
pyyaml>=5.3.1
Expand Down
12 changes: 12 additions & 0 deletions test/integration/clueweb12.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@ def test_clueweb12_docs_html(self):
1000: WarcDoc('clueweb12-0000tw-00-01002', 'http://beanpotscastiron.waffleironshapes.com/le-creuset-enameled-cast-iron-7-14-quart-round-french-oven-cherry-red-save-price-shopping-online/', '2012-02-10T21:55:43Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:55:42 GMT\\\r\nServer: Apache\\\r\nX\\-Pingback: http://beanpotscas.{70}waffleironshapes\\.com/\\?p=5>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile('^Le Creuset Enameled Cast\\-Iron 7\\-1/4\\-Quart Round French Oven, Cherry Red Save Price Shopping Online \\|.{4936}sites to earn advertising fees by advertising and linking to amazon\\.com Web Toolbar by Wibiya \n \n \n $', flags=48), 'text/plain'),
})

extracted_text = {
'clueweb12-0000tw-00-00000': ' rss § atom § rdf\n\n Photos aggregator\n\n dynamic content\n\n Search:\n * \n * Add album/Contact us\n * News\n * Reviews\n\n shaggyshoo has added a photo to the pool:\n\n annecy. france.\n\n France image Pool 2012-02-10 16:22:52\n\n * February 10th, 2012\n\n * Tags cloud\n\n WP Cumulus Flash tag cloud by Roy Tanck and Luke Morton requires Flash Player 9 or better.\n\n * Twits from \'photobabble\'\n\n + No public Twitter messages.\n\n Based on Ocular Professor § Powered by WordPress\n',
'clueweb12-0000tw-00-00009': ' * Home\n * About me\n * Contact me\n Feb 10, 2012\n\n Posted by clay in Christ and culture, Writing | 0 Comments\n\n Lessons learned from a week on vacation\n\n It’s been quite a week. We have one more night here at Walt Disney World, then we are headed home. I’ll give you a bit of a preview for next week though. There are a few things I’ve noticed here that develop into great analogies for life lessons. Anything from a large, burly fellow with a pink Disney Princess backpack, to using the bathroom at the dinner table, to the lack of wisdom in crowds. I think you’ll enjoy these.\n\n However, today I’d like to share with a you a few lessons that have been forcefully taught to me this week. Some painful, some really painful, and some rather enjoyable.\n\n I love that I’ve been able to spend a week in a single hotel room with my wife and kids. However, I’m going crazy spending a week in a single hotel room with my kids. Bedtime is the absolute craziest time of day. I don’t know if it is the tired children, the tired parents, or the insanity of being in one room, but little ones do not go to sleep well this way, at least not ours. Now, this is a very first world problem, and I know that I should be thankful that I’m able to take trips like this with my children. Don’t misunderstand me, because I am very thankful for that. The simple fact remains that it is so far out of the ordinary for us as to be uncomfortable and even more tiring.\n\n The second thing I’ve learned is that you can tell how many days people have been here just by the look on their faces. There is an unspoken “I’ve sat through ;It’s A Small World’ so many times that i might just twist off at the next puppet I see” look to some faces. These are the people that are not hardcore Disney fans, but came mostly for their children. (An aside: you know what would make “Small World” so much better? Bucket o’ softballs. 
You’ll have to create a queue that would stretch all the way across the park for it. So many childhood doll nightmares erased in a hail of tossed balls. It’s a goldmine. I expect my royalty checks to pour in soon.)\n\n Too many people come here with the expectation that they can accomplish everything at WDW in a week. This is so far from the truth. My wife and I have been coming here since 1996, and we’ve still not seen or done it all. The likelihood is pretty slim that it will happen too. Too many things changing or disappearing to be able to conquer that mountain.\n\n The one lasting memory I have from our trips to WDW is the sense of awe that my children feel. Heck, I feel it too. Everything here is so much bigger, so much more perfect than the real world. Of course, if it weren’t that way, we wouldn’t be paying to visit. If someone built a Computer Progeammer World, I’d have to be drugged to be taken in to it. I don’t want my real life, I want a different reality. That might be good sometimes, but it definitely isn’t a way to live.\n\n Finally, it appears to me that while they are here, people become a clearer, more exact picture of themselves due to the money and time spent, and emotional and physical investment paid to be here. Vacations, especially big ones like WDW have a way of refining our personalities in a way that other experiences don’t. I’ve seen adults act like complete children and complete jerks. I’ve seen children be perfect angels and ceaseless terrors (and you should have seen everyone else’s kids, too). I’m sure that I exhibit some of this too. We are who we are, and exhaustion makes the pretensions go away.\n\n All in all, it’s been a great time here. I”m anxious to start home and return to my own reality though. Too many exciting things coming up in the next few weeks that I don’t want to miss. 
I hope you come back again next week for my Observations on the World series.\n\n Related Posts:\n\n * Next year’s model\n * Flummoxed: a week in review\n * What is bi-vocational ministry?\n * What’s in your back yard?\n * Where to go from here\n * Powered by Contextual Related Posts\n\n Leave a Reply\n\n Your email address will not be published. Required fields are marked *\n\n Name *\n\n Email *\n\n Website\n\n Comment\n\n You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>\n\n Notify me of follow-up comments by email.\n\n Notify me of new posts by email.\n\n Subscribe to claywginn.com via email!\n\n Don\'t want to come check out the site every day? Subscribe here and I\'ll send you an email every time a new post goes up.\n\n Join 7 other subscribers\n\n Popular Posts\n\n * “What kind of God wants you to be poor and miserable?” (33)\n * Humble pride (33)\n * Humble pride: a follow up (31)\n * The beginning (20)\n * Why do I write? (19)\n * Starting over… (18)\n * Lost in the woods (18)\n * Expectations (18)\n * What is your mission? (18)\n * What is bi-vocational ministry? (17)\n\n Categories\n\n * Apologetics (6)\n * Bi-vocational Ministry (1)\n * Christ and culture (15)\n * Discipleship (18)\n * Featured (3)\n * Mission (2)\n * Writing (19)\n\n Friends\n\n Sitemeter\n\n AdSense\n\n Word Count Statistics\n\n Total: 21,977 Words\n Posts: 21,458 Words (511 Avg.)\n\n Powered by WordPress | Designed by Elegant Themes\n',
'clueweb12-0000tw-00-01002': ' Bean Pots Cast Iron\n\n « Joyce Chen J90-0704 6-Quart Stoneware Chinese Cooking Pot, Black Cheapest Prices\n Le Creuset Heritage Collection Enameled Cast-Iron 2-Quart Legumier, Cobalt Online Shopping »\n\n Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Save Price Shopping Online\n\n ►►►Save Big on Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red\n\n Find Deals on Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red. We Offer Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red For Shopping Deals\n\n I read a lot of the Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red affordable price reviews before I bought this. Don’t spend more than you have to! I already done the research for you. Read Here where to buy Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red at The CRAZY Prices!\n\n Visit the bestsellers in Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red list for authoritative information on this product – current rank.\n\n read more\n\n Since 1925, Le Creuset has been handcrafting Enameled Cast Iron cookware, and particularly Round French Ovens (or Dutch Ovens), in Northern France. While this popular shape has been around for many centuries before that, the basic design has changed very little thus endorsing the cooking qualities that it provides. Generation after generation has come to cherish the Le Creuset Round French Oven’s quality, durability, and versatility, and it easily becomes the core piece in any well-equipped kitchen. The cast iron provides superb heat retention and distribution, and the enamel is hard-wearing and non-reactive, making the number of recipes that you can do in this pot endless: anything from savory rice to braised chicken to mouth-watering cake. 
Your imagination is the only limit!.\n\n » See More Images «\n\n Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red | Save Price Shopping Online at Shop Online\n\n Price List: $375.00\n GET DISCOUNT / FREE SHIPPING\n » See Best Price « \n \n\n Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Product Feature\n\n * 7-1/4-quart round-shaped French oven made of enameled cast iron\n * Cast-iron loop side handles; black, phenolic, stay-cool lid knob\n * Heavy, tight-fitting lid helps lock in heat, moisture, and flavor\n * Washing by hand recommended; oven-safe to 350 degrees F\n * Measures 11-3/5 by 14 by 7 inches; limited lifetime warranty\n\n I have tried to find information for Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red on the Internet. There are a lot affordable price as well. You can purchase a Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red in the best reasonable price of the web site, Amazon, you will not hesitate a moment.\n\n ★★★ HOT ITEM LIKE THIS TEND TO SELL OUT VERY QUICKLY ★★★\n\n\n\n Title\n\n\n 4\n 29\n\n\n Relate keywords : bean pots cast iron,ebay cast iron,cookware cast iron,crocks cast iron,bed bath and beyond cast iron,kohls cast iron,bean pots brass,bean pots gas,bean pots metal,bean pots porcelain,bean pots copper,bean pot recipes cast iron,walmart cast iron,bean pots cast aluminum,bean pots dutch oven,bean pots stainless steel,pottery cast iron,williams sonoma cast iron,bean pots steel,bean pots cast iron camp,bean pots cast iron covered\n\n Comments\n\n Tags: 714Quart, CastIron, Cherry, Creuset, Enameled, French, French CastIron, Online, Shopping, Shopping 714Quart\n\n Comments are closed.\n\n Recent Posts\n * Nordic Ware Grilling Essentials Cast Bean Pot Save Price with Promotion Today\n * Le Creuset Heritage Collection Enameled Cast-Iron 2-Quart Legumier, Cobalt Online Shopping\n * Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry 
Red Save Price Shopping Online\n * Joyce Chen J90-0704 6-Quart Stoneware Chinese Cooking Pot, Black Cheapest Prices\n * LE CREUSET Enameled Cast Iron 4-1/4 Quart Soup Pot Blue Online Shopping\n * Bayou Classic 7448, 2.5-Qt. Cast Iron Bean Pot with Lid Online Shopping\n * Le Creuset Enameled Cast-Iron 7-1/4-Quart Round French Oven, Cherry Red Get Discount\n Tags\n 714Quart CastIron CastIron 714Quart Cherry Creuset Discount Enameled French Online Shopping\n\n Powered by WordPress and Created for Bean Pots Cast Iron | Waffle Iron Shapes\n\n http://beanpotscastiron.waffleironshapes.com is a participant in the Amazon Services LLC Associates Program, an affiliate advertising program designed to provide a means for sites to earn advertising fees by advertising and linking to amazon.com\n\n Web Toolbar by Wibiya',
}
self._test_docs(ir_datasets.wrappers.HtmlDocExtractor(ir_datasets.load('clueweb12'), extractor='inscriptis'), items={
0: WarcDoc('clueweb12-0000tw-00-00000', 'http://tsawer.net/2012/02/10/france-image-pool-2012-02-10-162252/', '2012-02-10T22:50:41Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 22:50:40 GMT\\\r\nServer: Apache/2\\.2\\.21 \\(Unix\\) mod_ssl/2\\.2\\.21 Op.{338}ortlink\\\r\nVary: Accept\\-Encoding,User\\-Agent\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(re.escape(extracted_text['clueweb12-0000tw-00-00000']), flags=48), 'text/plain'),
9: WarcDoc('clueweb12-0000tw-00-00009', 'http://claywginn.com/2012/02/10/lessons-learned-from-a-week-on-vacation/', '2012-02-10T21:47:35Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:47:36 GMT\\\r\nServer: Apache\\\r\nX\\-Powered\\-By: PHP/5\\.2\\.17\\\r\nX\\-Pi.{45}: <http://wp\\.me/p1zQki\\-AT>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(re.escape(extracted_text['clueweb12-0000tw-00-00009']), flags=48), 'text/plain'),
1000: WarcDoc('clueweb12-0000tw-00-01002', 'http://beanpotscastiron.waffleironshapes.com/le-creuset-enameled-cast-iron-7-14-quart-round-french-oven-cherry-red-save-price-shopping-online/', '2012-02-10T21:55:43Z', re.compile(b'^HTTP/1\\.1 200 OK\\\r\nDate: Fri, 10 Feb 2012 21:55:42 GMT\\\r\nServer: Apache\\\r\nX\\-Pingback: http://beanpotscas.{70}waffleironshapes\\.com/\\?p=5>; rel=shortlink\\\r\nConnection: close\\\r\nContent\\-Type: text/html; charset=UTF\\-8$', flags=16), re.compile(re.escape(extracted_text['clueweb12-0000tw-00-01002']), flags=48), 'text/plain'),
})


def test_clueweb12_docstore(self):
docstore = ir_datasets.load('clueweb12').docs_store()
docstore.clear_cache()
Expand Down

0 comments on commit e3d8e1c

Please sign in to comment.