Scraping news data at scale, with optimal quality in 40+ languages, has never been easier.
Insight-ready schemas reduce the time to value from news data by up to 90% compared to custom manual extraction. They are delivered by a reliable, always-on service with 99.9% uptime, 24/7, 365 days a year.
Zyte Automatic Extraction supports a comprehensive list of article and news data fields. The output is delivered directly to your AWS S3 bucket or anywhere else you need it.
[
{
"article": {
"headline": "Article headline",
"datePublished": "2019-06-19T00:00:00",
"datePublishedRaw": "June 19, 2019",
"dateModified": "2019-06-21T00:00:00",
"dateModifiedRaw": "June 21, 2019",
"author": "Article author",
"authorsList": [
"Article author"
],
"inLanguage": "en",
"breadcrumbs": [
{
"name": "Level 1",
"link": "http://example.com"
}
],
"mainImage": "http://example.com/image.png",
"images": [
"http://example.com/image.png"
],
"description": "Article summary",
"articleBody": "Article body ...",
"articleBodyHtml": "<article><p>Article body ... </p> ... </article>",
"articleBodyRaw": "<div id=\"an-article\">Article body ...",
"videoUrls": [
"https://example.com/video.mp4"
],
"audioUrls": [
"https://example.com/audio.mp3"
],
"probability": 0.95,
"canonicalUrl": "https://example.com/article/article-about-something",
"url": "https://example.com/article?id=24"
},
"webPage": {
"inLanguages": [
{"code": "en"},
{"code": "es"}
]
},
"query": {
"id": "1564747029122-9e02a1868d70b7a3",
"domain": "example.com",
"userQuery": {
"pageType": "article",
"url": "http://example.com/article?id=24"
}
},
"algorithmVersion": "20.8.1"
}
]