Loading in the required packages

from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
from geopy.geocoders import ArcGIS
import os

Using the requests and beautifulsoup packages to load in the url and convert the webpage to ‘soup’

# Download the store-locations page and parse it into a BeautifulSoup tree.
url = 'https://buc-ees.com/locations/'
# Without a timeout, requests waits indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of scraping an error page.
page.raise_for_status()
# Name the parser explicitly: the bare 'html' feature makes BeautifulSoup pick
# whichever HTML parser happens to be installed, so results can differ between
# machines. 'html.parser' ships with the standard library.
soup = bs(page.text, 'html.parser')

Searching for address and location info by using specific html tags from the webpage

# Every <div class="bucees-location-address"> element on the page
# (the address text is pulled out of these further down).
locations = soup.find_all('div', attrs={'class': 'bucees-location-address'})
# Every <h4> element on the page; presumably the "City, ST" headings --
# confirm against the live markup.
cities = soup.find_all(name='h4')

Looping through specified div and class tags on the webpage to extract and clean location data

# Text of every heading tag, with surrounding whitespace removed.
cities_list = [city.text.strip() for city in cities]

# Text content of every address <div>.
address = [location.text for location in locations]

# Matches the "Ethanol-Free:<n> Octane" / "Ethanol-Free:<n> <word> <n> Octane"
# fuel note embedded in the address text. Written as a raw string so that
# \d, \s and \w reach the regex engine instead of being treated as (invalid)
# Python string escapes.
_ethanol_note_re = re.compile(
    r'(Ethanol-Free:\d+\s\w+\s\d+\sOctane|Ethanol-Free:\d+\sOctane)'
)

# Address text with the fuel note removed.
address_clean = [_ethanol_note_re.sub('', entry) for entry in address]

Further cleaning the data and stripping whitespace from the entries to isolate city and state values for each location

# Notebook-style echo of the cleaned address strings.
address_clean

# One row per heading; '#' holds the row position and 'raw' the heading text.
city_state_df = (
    pd.DataFrame(cities_list)
    .reset_index()
    .rename(columns={0: 'raw', 'index': '#'})
)
heading_text = city_state_df['raw']
# State: the final run of word characters in the heading.
city_state_df['state'] = heading_text.str.extract(r'(\w+$)', expand=False)
# City: one or two words directly before a comma; the comma is then dropped.
city_state_df['city'] = (
    heading_text
    .str.extract(r'([A-Za-z]+,|[A-Za-z]+\s\w+,)', expand=False)
    .str.replace(',', '')
)
city_state = city_state_df[['#', 'city', 'state']]
city_state.head().style
  # city state
0 0 Athens AL
1 1 Auburn AL
2 2 Leeds AL
3 3 Loxley AL
4 4 Daytona Beach FL

Extracting zip codes and street numbers for each location using the regex package

# Empty frame with the target column layout; it is not referenced again in
# the visible part of the script.
table = pd.DataFrame(columns=['raw', 'city', 'state', 'zip', 'street_address'])

# One row per cleaned address string; '#' holds the row position.
df = (
    pd.DataFrame(address_clean)
    .reset_index()
    .rename(columns={0: 'raw', 'index': '#'})
)
address_text = df['raw']
# Zip code: the digits that end the string.
df['zip_code'] = address_text.str.extract(r'(\d+$)', expand=False)
# State: take the non-digit run after the first ", ", then keep its first
# one or two words.
df['state'] = (
    address_text
    .str.extract(r'(,\s\D+)', expand=False)
    .str.extract(r'(\w+\s\w+|\w+)', expand=False)
)
# Street number: the first run of digits in the string.
df['street_num'] = address_text.str.extract(r'(\d+)', expand=False)
# City: one or two words directly before a comma (comma included).
df['city'] = address_text.str.extract(r'(\w+,|\w+\s\w+,)', expand=False)

zip_df = df[['#', 'zip_code', 'street_num']]
zip_df.head()
# zip_code street_num
0 0 35613 2328
1 1 36832 2500
2 2 35094 6900
3 3 36567 20403
4 4 32117 2330

Combining street numbers with street names to form full addresses for each location

# For each address <div>, grab the first ">...<" span of its HTML source,
# i.e. the first piece of text sitting between two tags on one line.
address_list = []

for location in locations:
    first_span = re.search(r'(>(.*?)<)', str(location))
    # re.search returns None when the markup has no such span; indexing None
    # would raise TypeError. Keep an empty placeholder instead so the row
    # positions still line up with the other per-location tables.
    address_list.append(first_span[0] if first_span is not None else '')

# Drop the '>' and '<' delimiters captured along with the text.
add_list_clean = [re.sub('>|<', '', str(row)) for row in address_list]

add_list_clean

# One row per location; '#' holds the row position.
addy = (
    pd.DataFrame(add_list_clean)
    .reset_index()
    .rename(columns={0: 'street_address', 'index': '#'})
)
addy.head()
# street_address
0 0 2328 Lindsay Lane South
1 1 2500 Buc-ee’s Blvd
2 2 6900 Buc-ee’s Blvd.
3 3 20403 County Rd. 68
4 4 2330 Gateway North Drive

Merging all 3 dataframes together to create a full dataset

# No `on=` is given, so pandas joins on whatever columns the two frames have
# in common; how='right' keeps every row of the right-hand frame.
half_bucee = pd.merge(addy, zip_df, how='right')
full_bucee = pd.merge(city_state, half_bucee, how='right')

# Street name = street address with the leading "<digits> " removed.
full_bucee['street_name'] = full_bucee['street_address'].replace(
    to_replace=r'(^\d+\s)', value='', regex=True
)
# "<street address>, <city>, <state> <zip>"
full_bucee['full_addy'] = (
    full_bucee['street_address'] + ', '
    + full_bucee['city'] + ', '
    + full_bucee['state'] + ' '
    + full_bucee['zip_code']
)
# Column selection whose result is not assigned; full_bucee is left unchanged.
full_bucee[['#', 'full_addy', 'street_num', 'street_name', 'city', 'state', 'zip_code']]
# Store code: "<row number>_<city>".
full_bucee['store_code'] = full_bucee['#'].map(str) + '_' + full_bucee['city']
full_bucee.head()
# city state street_address zip_code street_num street_name full_addy store_code
0 0 Athens AL 2328 Lindsay Lane South 35613 2328 Lindsay Lane South 2328 Lindsay Lane South, Athens, AL 35613 0_Athens
1 1 Auburn AL 2500 Buc-ee’s Blvd 36832 2500 Buc-ee’s Blvd 2500 Buc-ee’s Blvd, Auburn, AL 36832 1_Auburn
2 2 Leeds AL 6900 Buc-ee’s Blvd. 35094 6900 Buc-ee’s Blvd. 6900 Buc-ee’s Blvd., Leeds, AL 35094 2_Leeds
3 3 Loxley AL 20403 County Rd. 68 36567 20403 County Rd. 68 20403 County Rd. 68, Loxley, AL 36567 3_Loxley
4 4 Daytona Beach FL 2330 Gateway North Drive 32117 2330 Gateway North Drive 2330 Gateway North Drive, Daytona Beach, FL 32117 4_Daytona Beach

Assigning an ArcGIS geocoder instance to the ‘nom’ variable, which is used to look up latitude/longitude data for each Buc-ee’s location

# ArcGIS geocoder used to turn each full address into coordinates.
# An explicit timeout (seconds) is passed because geopy's default is short
# (1 second per its documentation -- confirm for the installed version) and
# slow responses would otherwise raise a timeout error.
nom = ArcGIS(timeout=10)

Creating a dummy column in the dataset to hold GIS data

from geopy.extra.rate_limiter import RateLimiter

# Wrap the geocoder so consecutive lookups are spaced at least one second
# apart. Per the geopy documentation the wrapper also retries geocoding
# service errors and, by default, returns None for a lookup that still fails,
# so one bad request no longer aborts the whole column.
_geocode = RateLimiter(nom.geocode, min_delay_seconds=1)

# 'full_addy' is NaN whenever one of its concatenated parts is missing; skip
# those rows (None) rather than sending a non-string query to the service.
full_bucee['coord'] = full_bucee['full_addy'].apply(
    lambda addr: _geocode(addr) if isinstance(addr, str) else None
)

Splitting the ‘coord’ column into separate latitude and longitude data

# Pull latitude and longitude out of each geocoding result. Rows whose
# 'coord' entry is None get None for both values. The None check uses an
# identity test (`is not None`) rather than `!= None`.
full_bucee['lat'] = full_bucee['coord'].apply(
    lambda loc: loc.latitude if loc is not None else None
)
full_bucee['lon'] = full_bucee['coord'].apply(
    lambda loc: loc.longitude if loc is not None else None
)
# Final table: just the columns that get exported.
clean_beaver = full_bucee[['city', 'state', 'store_code', 'full_addy', 'lat', 'lon']]
clean_beaver.head()
city state store_code full_addy lat lon
0 Athens AL 0_Athens 2328 Lindsay Lane South, Athens, AL 35613 34.730718 -86.930749
1 Auburn AL 1_Auburn 2500 Buc-ee’s Blvd, Auburn, AL 36832 32.610800 -85.494635
2 Leeds AL 2_Leeds 6900 Buc-ee’s Blvd., Leeds, AL 35094 33.543646 -86.587598
3 Loxley AL 3_Loxley 20403 County Rd. 68, Loxley, AL 36567 30.634522 -87.676438
4 Daytona Beach FL 4_Daytona Beach 2330 Gateway North Drive, Daytona Beach, FL 32117 29.224078 -81.099183

Exporting dataframe to csv and saving to local drive

# Create the output directory if needed, then write the CSV into it.
# The directory is named once so both calls use the same path: previously
# makedirs created '/User/...' while to_csv wrote under '/Users/...'.
output_dir = '/Users/rieke/R FILES'
os.makedirs(output_dir, exist_ok=True)
clean_beaver.to_csv(f'{output_dir}/bucees.csv')