Webscrape CDC Website with Python

Web Scraping CDC Website using lxml

Note: The website URL has changed a couple of times during the last month, so you may have to update the URL as well.

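Import the required packages. `requests` and `lxml` are third-party packages (install them with pip if you don’t already have them); `datetime` is part of the standard library.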

import requests
import datetime
from lxml import html

# Page to scrape. If the request below starts failing, check whether the
# page has moved and update this URL.
url = 'https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/cases-in-us.html'

# Absolute XPath of the <ul> element that holds the case summary.
# Absolute paths are brittle: any change to the page layout breaks them.
path = '/html/body/div[7]/main/div[3]/div/div[3]/div[2]/div/div[1]/div/div[2]/ul'

# Fetch the page. Without a timeout requests waits indefinitely on a stalled
# connection, so give up after 30 seconds instead of hanging.
response = requests.get(url, timeout=30)

# Fail here with a clear HTTPError on a 4xx/5xx response rather than parsing
# an error page and failing later on an empty XPath result.
response.raise_for_status()

# Raw bytes of the response body; lxml detects the encoding itself.
byte_data = response.content

# Parse the bytes into an lxml element tree.
source_code = html.fromstring(byte_data)

# List of elements matching `path` (empty if nothing matches).
tree = source_code.xpath(path)

# The second element lives on the same page, so the document parsed above
# is reused instead of being downloaded again.
url2 = url

# Absolute XPath of the <span> that holds the "Updated ..." line.
path2 = '/html/body/div[7]/main/div[3]/div/div[3]/div[1]/div/p[1]/span'

# List of elements matching `path2` (empty if nothing matches).
tree2 = source_code.xpath(path2)


  
# Print a header stamped with the local date of this run. This is the date
# on the machine running the script, not the date published on the page.
d = datetime.datetime.today()
d = d.strftime('%m-%d-%Y')
print('As of ' + d + ', the coronavirus impact on the US is:')

## As of 03-20-2020, the coronavirus impact on the US is:

# xpath() returns an empty list when nothing matches, so check before
# indexing instead of failing with a bare IndexError.
if tree:
    print(tree[0].text_content())
else:
    print('No element matched `path`; the page URL or layout may have changed.')

## 
## Total cases: 15,219
## Total deaths: 201
## Jurisdictions reporting cases: 54 (50 states, District of Columbia, Puerto Rico, Guam, and US Virgin Islands)

if tree2:
    print(tree2[0].text_content())
else:
    print('No element matched `path2`; the page URL or layout may have changed.')

## Updated March 20, 2020