Web Scraping

Quick win

import pandas as pd

# read_html fetches the page and returns a list with one DataFrame per
# <table> element it manages to parse.
df_list = pd.read_html("https://www.goldtraders.or.th/")

# Number of tables found on the page.
len(df_list)

Request HTML

import requests

# robots.txt is served as plain text, which makes it a simple first target.
website_url = "https://www.google.com/robots.txt"

# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; bound the wait explicitly.
res = requests.get(website_url, timeout=10)

# Raise on 4xx/5xx responses instead of silently displaying an error body.
res.raise_for_status()

# Decoded text body of the response.
res.text

import requests

url = "https://www.goldtraders.or.th/"

# Send a desktop-browser User-Agent in place of the default
# "python-requests/x.y" value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# requests applies no timeout by default; bound the wait so a stalled server
# cannot hang the notebook.
res = requests.get(url, headers=headers, timeout=10)

# Dump the raw attributes of the Response object for inspection.
res.__dict__

Beautiful Soup

from bs4 import BeautifulSoup
# HTML is not a builtin name; it has to be imported before the display call
# below, otherwise a fresh kernel raises NameError.
from IPython.display import HTML, display

# Parse the raw response bytes with Python's built-in HTML parser.
html_doc = res.content
soup = BeautifulSoup(html_doc, 'html.parser')

# Render the re-indented markup inline in the notebook.
display(HTML(soup.prettify()))

# First <div class="main-panel"> in the document (find returns None when
# nothing matches).
main_elem = soup.find("div", {"class": "main-panel"})
main_elem.find("h3")

# .text strips the tags and keeps only the heading's text content.
main_elem.find("h3").text

# find_all returns every matching element, not just the first.
bannerlist = soup.find_all("div", {"class": "bannerlist-panel"})

# img_elem was indexed without ever being assigned, which raised NameError.
# NOTE(review): assumed to be the first <img> inside the first banner panel;
# confirm against the original notebook.
img_elem = bannerlist[0].find("img")

# Tag attributes are read with dictionary-style indexing.
img_elem['src']

Selenium

# Notebook shell commands (the leading "!" hands the line to the system shell):
# install the Selenium Python package, refresh the apt package index, then
# install the chromium-chromedriver package.
! pip install selenium
! apt-get update
! apt install chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Chrome is started without a visible window and with its sandbox disabled.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox'):
    options.add_argument(chrome_flag)

# Launch the browser session used by the cells that follow.
driver = webdriver.Chrome(options=options)

url = "https://datawarehouse.dbd.go.th/biz/partner/info"
driver.get(url)

# find_element right after get() fails if the element has not been rendered
# yet; poll for up to 10 seconds until the element whose text is 'ปิด'
# (Thai for "close") can be clicked.
# NOTE(review): presumably this dismisses a pop-up shown on first load; confirm.
button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//*[text()='ปิด']"))
)
button.click()

driver.get("https://datawarehouse.dbd.go.th/biz/partner/info")

# Wait (up to 10 seconds) for the search box to become interactable instead
# of looking it up immediately, which races against page rendering.
input_field = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, 'input#textSearch.form-control'))
)

# Empty the field, type the query, then press Enter to submit it.
input_field.clear()
input_field.send_keys("ws")
input_field.send_keys(Keys.RETURN)

from IPython.display import display, HTML

# Snapshot the page markup as the browser currently holds it.
html_doc = driver.page_source

# The snapshot is a plain string, so the browser is no longer needed; quit it
# so the headless Chrome process is not left running.
driver.quit()

# Render the captured markup inline in the notebook.
display(HTML(html_doc))

Previous: Firebase Firestore | Next: Streamlit

Last updated 1 year ago

Web Scraping

Quick win

import pandas as pd

# read_html fetches the page and returns a list with one DataFrame per
# <table> element it manages to parse.
df_list = pd.read_html("https://www.goldtraders.or.th/")

# Number of tables found on the page.
len(df_list)

Request HTML

import requests

# robots.txt is served as plain text, which makes it a simple first target.
website_url = "https://www.google.com/robots.txt"

# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; bound the wait explicitly.
res = requests.get(website_url, timeout=10)

# Raise on 4xx/5xx responses instead of silently displaying an error body.
res.raise_for_status()

# Decoded text body of the response.
res.text

import requests

url = "https://www.goldtraders.or.th/"

# Send a desktop-browser User-Agent in place of the default
# "python-requests/x.y" value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# requests applies no timeout by default; bound the wait so a stalled server
# cannot hang the notebook.
res = requests.get(url, headers=headers, timeout=10)

# Dump the raw attributes of the Response object for inspection.
res.__dict__

Beautiful Soup

from bs4 import BeautifulSoup
# HTML is not a builtin name; it has to be imported before the display call
# below, otherwise a fresh kernel raises NameError.
from IPython.display import HTML, display

# Parse the raw response bytes with Python's built-in HTML parser.
html_doc = res.content
soup = BeautifulSoup(html_doc, 'html.parser')

# Render the re-indented markup inline in the notebook.
display(HTML(soup.prettify()))

# First <div class="main-panel"> in the document (find returns None when
# nothing matches).
main_elem = soup.find("div", {"class": "main-panel"})
main_elem.find("h3")

# .text strips the tags and keeps only the heading's text content.
main_elem.find("h3").text

# find_all returns every matching element, not just the first.
bannerlist = soup.find_all("div", {"class": "bannerlist-panel"})

# img_elem was indexed without ever being assigned, which raised NameError.
# NOTE(review): assumed to be the first <img> inside the first banner panel;
# confirm against the original notebook.
img_elem = bannerlist[0].find("img")

# Tag attributes are read with dictionary-style indexing.
img_elem['src']

Selenium

# Notebook shell commands (the leading "!" hands the line to the system shell):
# install the Selenium Python package, refresh the apt package index, then
# install the chromium-chromedriver package.
! pip install selenium
! apt-get update
! apt install chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Chrome is started without a visible window and with its sandbox disabled.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox'):
    options.add_argument(chrome_flag)

# Launch the browser session used by the cells that follow.
driver = webdriver.Chrome(options=options)

url = "https://datawarehouse.dbd.go.th/biz/partner/info"
driver.get(url)

# find_element right after get() fails if the element has not been rendered
# yet; poll for up to 10 seconds until the element whose text is 'ปิด'
# (Thai for "close") can be clicked.
# NOTE(review): presumably this dismisses a pop-up shown on first load; confirm.
button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//*[text()='ปิด']"))
)
button.click()

driver.get("https://datawarehouse.dbd.go.th/biz/partner/info")

# Wait (up to 10 seconds) for the search box to become interactable instead
# of looking it up immediately, which races against page rendering.
input_field = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, 'input#textSearch.form-control'))
)

# Empty the field, type the query, then press Enter to submit it.
input_field.clear()
input_field.send_keys("ws")
input_field.send_keys(Keys.RETURN)

from IPython.display import display, HTML

# Snapshot the page markup as the browser currently holds it.
html_doc = driver.page_source

# The snapshot is a plain string, so the browser is no longer needed; quit it
# so the headless Chrome process is not left running.
driver.quit()

# Render the captured markup inline in the notebook.
display(HTML(html_doc))

Previous: Firebase Firestore | Next: Streamlit

Last updated 1 year ago