Scraping news data at scale, with optimal quality in 40+ languages, has never been easier.
Insight-ready schemas minimize the time to value from news data by up to 90% compared to custom manual extraction. They are delivered by a reliable, always-on service with 99.9% uptime, available 24/7, 365 days a year.
Zyte Automatic Extraction supports a comprehensive list of article and news data fields. The output is delivered directly to your AWS S3 bucket or anywhere else you need it.
[ { "article": { "headline": "Article headline", "datePublished": "2019-06-19T00:00:00", "datePublishedRaw": "June 19, 2019", "dateModified": "2019-06-21T00:00:00", "dateModifiedRaw": "June 21, 2019", "author": "Article author", "authorsList": [ "Article author" ], "inLanguage": "en", "breadcrumbs": [ { "name": "Level 1", "link": "http://example.com" } ], "mainImage": "http://example.com/image.png", "images": [ "http://example.com/image.png" ], "description": "Article summary", "articleBody": "Article body ...", "articleBodyHtml": "<article><p>Article body ... </p> ... </article>", "articleBodyRaw": "<div id=\"an-article\">Article body ...", "videoUrls": [ "https://example.com/video.mp4" ], "audioUrls": [ "https://example.com/audio.mp3" ], "probability": 0.95, "canonicalUrl": "https://example.com/article/article-about-something", "url": "https://example.com/article?id=24" }, "webPage": { "inLanguages": [ {"code": "en"}, {"code": "es"} ] }, "query": { "id": "1564747029122-9e02a1868d70b7a3", "domain": "example.com", "userQuery": { "pageType": "article", "url": "http://example.com/article?id=24" } }, "algorithmVersion": "20.8.1" } ]