Web Scraping
Quick win
import pandas as pd

# Quick win: let pandas download the page and parse its HTML tables for us.
# read_html hands back a list of DataFrames.
df_list = pd.read_html(io="https://www.goldtraders.or.th/")

# Trailing expression so the notebook echoes how many tables were parsed.
len(df_list)
Request HTML
import requests

# Start with the simplest possible request: a plain-text robots.txt file.
website_url = "https://www.google.com/robots.txt"

# requests applies no timeout by default, so a stalled server would hang this
# cell indefinitely; bound the wait explicitly (value is in seconds).
res = requests.get(website_url, timeout=10)

# Trailing expression so the notebook echoes the decoded response body.
res.text
import requests

url = "https://www.goldtraders.or.th/"

# Identify as a desktop Chrome browser.
# NOTE(review): presumably the site rejects the library's default User-Agent - confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}

# requests applies no timeout by default; bound the wait (seconds) so a
# stalled server cannot hang the cell forever.
res = requests.get(url, headers=headers, timeout=10)

# Echo the Response object's attributes (same mapping as res.__dict__).
vars(res)
Beautiful Soup
from bs4 import BeautifulSoup
# display/HTML are used in this section, so import them here rather than
# relying on an import that only happens further down the notebook.
from IPython.display import HTML, display

# `res` is the response produced by the preceding requests cell.
html_doc = res.content
soup = BeautifulSoup(html_doc, 'html.parser')

# Render the re-indented markup inside the notebook.
display(HTML(soup.prettify()))

# First <div class="main-panel">, then the first <h3> inside it and its text.
main_elem = soup.find("div", {"class": "main-panel"})
main_elem.find("h3")
main_elem.find("h3").text

# Every <div class="bannerlist-panel"> on the page.
bannerlist = soup.find_all("div", {"class": "bannerlist-panel"})

# The original read img_elem['src'] without ever defining img_elem (NameError).
# NOTE(review): presumably the intent was the first <img> inside the first
# banner panel - confirm against the original notebook.
img_elem = bannerlist[0].find("img") if bannerlist else None

# Echo the image URL, or None when the page has no banner image.
img_elem['src'] if img_elem is not None else None
Selenium
# Notebook shell escapes ("!" runs the rest of the line as a shell command).
# Install the Selenium Python bindings.
! pip install selenium
# Refresh the apt package index before installing system packages.
! apt-get update
# Install the chromium-chromedriver system package via apt.
! apt install chromium-chromedriver
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Build the Chrome launch options: no visible window, sandbox disabled.
options = webdriver.ChromeOptions()
for _chrome_flag in ('--headless', '--no-sandbox'):
    options.add_argument(_chrome_flag)

# Start the browser session used by the cells that follow.
driver = webdriver.Chrome(options=options)
url = "https://datawarehouse.dbd.go.th/biz/partner/info"
driver.get(url)

# Look for the element whose text is 'ปิด' (Thai for "close").
# NOTE(review): presumably this dismisses a notice shown on first load - confirm.
# An immediate find_element raises NoSuchElementException when the element has
# not been rendered yet, so wait (up to 10 s) until it is actually clickable.
button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//*[text()='ปิด']"))
)
button.click()
# Reload the search page so the form starts from a clean state.
driver.get("https://datawarehouse.dbd.go.th/biz/partner/info")

# Locate the search box, empty it, then type the query and press Enter.
input_field = driver.find_element(
    By.CSS_SELECTOR,
    'input#textSearch.form-control',
)
input_field.clear()
for _keystrokes in ("ws", Keys.RETURN):
    input_field.send_keys(_keystrokes)
from IPython.display import display, HTML

# Snapshot the markup of whatever page the browser is currently showing.
# NOTE(review): this is read right after the search is submitted with no wait;
# confirm the results have loaded by the time it runs.
html_doc = driver.page_source

# The snapshot is a plain string, so the browser is no longer needed: end the
# session so the headless Chrome/chromedriver processes do not linger.
# Remove this call if later cells are added that keep using `driver`.
driver.quit()

# Render the captured page inside the notebook.
display(HTML(html_doc))
Last updated