Databricks
diamonds = (spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv")
)
diamonds.write.format("delta").mode("overwrite").save("/mnt/delta/diamonds")

DROP TABLE IF EXISTS diamonds;
CREATE TABLE diamonds USING DELTA LOCATION '/mnt/delta/diamonds/'
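With the Delta table registered, it can be read back either through SQL or directly from the Delta path. A quick sketch (the column selection and row limit are illustrative):

display(spark.sql("SELECT carat, cut, price FROM diamonds LIMIT 5"))

# Equivalent read directly from the Delta location
df = spark.read.format("delta").load("/mnt/delta/diamonds")
display(df.limit(5))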
Read CSV
csvFile = "/mnt/training/wikipedia/pageviews/pageviews_by_second.tsv"
df = (spark.read                    # The DataFrameReader
  .option("header", "true")         # Use first line of all files as header
  .option("sep", "\t")              # Use tab delimiter (default is comma-separator)
  .option("inferSchema", "true")    # Automatically infer data types
  .csv(csvFile)                     # Creates a DataFrame from CSV after reading in the file
)
df.printSchema()
Inferring the schema requires an extra pass over the data. To avoid that cost, declare the schema up front and pass it to the reader:

from pyspark.sql.types import *

csvSchema = StructType([
  StructField("timestamp", StringType(), False),
  StructField("site", StringType(), False),
  StructField("requests", IntegerType(), False)
])
df = (spark.read              # The DataFrameReader
  .option('header', 'true')   # Ignore line #1 - it's a header
  .option('sep', "\t")        # Use tab delimiter (default is comma-separator)
  .schema(csvSchema)          # Use the specified schema, skipping inference
  .csv(csvFile)               # Creates a DataFrame from CSV after reading in the file
)
df.printSchema()

Read JSON
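Reading JSON follows the same DataFrameReader pattern. A minimal sketch; the path below is a hypothetical placeholder, and Spark expects line-delimited JSON (one record per line) by default:

jsonFile = "/mnt/training/sample.json"   # hypothetical path for illustration

df = (spark.read          # The DataFrameReader
  .json(jsonFile)         # Creates a DataFrame from JSON; the schema is inferred automatically
)
df.printSchema()

As with CSV, a StructType can be passed via .schema(...) to skip the inference pass.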
Read Parquet
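Parquet files embed their schema in the file metadata, so no inference pass is needed. A minimal sketch with a hypothetical path:

parquetFile = "/mnt/training/sample.parquet"   # hypothetical path for illustration

df = (spark.read             # The DataFrameReader
  .parquet(parquetFile)      # Creates a DataFrame from Parquet; schema comes from file metadata
)
df.printSchema()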