library(tidyverse)
library(DBI)
What is a database? It is what google says
a structured set of data held in a computer, especially one that is accessible in various ways.
A relational database is a type of database that stores and provides access to data points that are related to one another. Relation databases are administrated by a Relational Database Management System (RDBMS). The data in RDBMS is stored in database objects called tables. A table is a collection of related data entries and it consists of columns and rows.
There are many RDBMS - MySQL (owned by Oracle) - MariaDB (open source version MySQL) - PostgreSQL (open source, ANSI) - SQL Server (microsoft) - SQLite (open source, single file)
What is SQL? Structured Query Language (or SQL) is a standard language for accessing and manipulating relational databases. However, each RDMBS may have their own extension of the SQL language and their implmentation may vary too.
We are going to use a popular database called Sakila https://www.jooq.org/sakila.
The Sakila database is a nicely normalised schema modelling a DVD rental store, featuring things like films, actors, film-actor relationships, and a central inventory table that connects films, stores, and rentals.
In the following, we are going to use sqlite
, postgresql
and MySql
.
The database is called sakila.sqlite
. You could either git clone from lectures repo or download using the code
if (!file.exists("sakila.sqlite") || file.size("sakila.sqlite") == 0) {
download.file(
"https://github.com/ucdavis-sta141b-2021-winter/sta141b-lectures/raw/master/ch02-sql/sakila.sqlite",
destfile = "sakila.sqlite")
}
The file format is .sqlite
which is one of the very common relational database formats, especially for simple problems.
sakila_lite <- dbConnect(RSQLite::SQLite(), dbname = "sakila.sqlite")
sakila_lite %>% dbListTables()
## [1] "actor" "address" "category"
## [4] "city" "country" "customer"
## [7] "customer_list" "film" "film_actor"
## [10] "film_category" "film_list" "film_text"
## [13] "inventory" "language" "payment"
## [16] "rental" "sales_by_film_category" "sales_by_store"
## [19] "sqlite_sequence" "staff" "staff_list"
## [22] "store"
sakila_lite %>% dbListFields("actor") # column names
## [1] "actor_id" "first_name" "last_name" "last_update"
I have also uploaded the Sakila database to a postgres server owned by the department. (You’ll need to either on the campus or over UCD vpn to connect to it)
sakila_psql <- dbConnect(RPostgres::Postgres(),
dbname = "sakila",
user = "psqluser", password = "secret", host = "alan.ucdavis.edu"
)
sakila_psql %>% dbListTables()
## [1] "inventory" "staff"
## [3] "payment_p2017_05" "payment_p2017_06"
## [5] "payment" "actor"
## [7] "store" "payment_p2017_02"
## [9] "rental" "language"
## [11] "payment_p2017_03" "address"
## [13] "payment_p2017_01" "film"
## [15] "country" "city"
## [17] "payment_p2017_04" "film_actor"
## [19] "category" "film_category"
## [21] "customer" "actor_info"
## [23] "customer_list" "film_list"
## [25] "nicer_but_slower_film_list" "sales_by_film_category"
## [27] "sales_by_store" "staff_list"
MySQL (MariaDB) is another popular RDBMS.
sakila_mysql <- dbConnect(RMariaDB::MariaDB(),
dbname = "DATA",
user = "student", password = "141b@2020stats", host = "alan.ucdavis.edu"
)
sakila_mysql %>% dbListTables()
## [1] "actor" "address"
## [3] "category" "city"
## [5] "country" "customer"
## [7] "customer_list" "film"
## [9] "film_actor" "film_category"
## [11] "film_list" "film_text"
## [13] "inventory" "language"
## [15] "nicer_but_slower_film_list" "payment"
## [17] "rental" "sales_by_film_category"
## [19] "sales_by_store" "staff"
## [21] "staff_list" "store"
dplyr
provides an excellent interface for users without any SQL background to query databases.
film <- sakila_lite %>%
tbl("film")
# to download all observations
film2 <- film %>%
collect()
film
relies on the connection
# dbDisconnect(sakila_lite)
sakila_lite %>% tbl("film")
creates a virtual table rather loading the whole table into memory.
class(film)
## [1] "tbl_SQLiteConnection" "tbl_dbi" "tbl_sql"
## [4] "tbl_lazy" "tbl"
class(film2)
## [1] "tbl_df" "tbl" "data.frame"
sakila_lite %>% dbListFields("film")
## [1] "film_id" "title" "description"
## [4] "release_year" "language_id" "original_language_id"
## [7] "rental_duration" "rental_rate" "length"
## [10] "replacement_cost" "rating" "special_features"
## [13] "last_update"
film90 <- film %>%
filter(length > 90) %>%
collect()
class(film90)
## [1] "tbl_df" "tbl" "data.frame"
film90
## # A tibble: 675 x 13
## film_id title description release_year language_id original_langua…
## <int> <chr> <chr> <chr> <int> <int>
## 1 4 AFFA… A Fanciful… 2006 1 NA
## 2 5 AFRI… A Fast-Pac… 2006 1 NA
## 3 6 AGEN… A Intrepid… 2006 1 NA
## 4 9 ALAB… A Thoughtf… 2006 1 NA
## 5 11 ALAM… A Boring E… 2006 1 NA
## 6 12 ALAS… A Fanciful… 2006 1 NA
## 7 13 ALI … A Action-P… 2006 1 NA
## 8 14 ALIC… A Emotiona… 2006 1 NA
## 9 16 ALLE… A Fast-Pac… 2006 1 NA
## 10 19 AMAD… A Emotiona… 2006 1 NA
## # … with 665 more rows, and 7 more variables: rental_duration <int>,
## # rental_rate <dbl>, length <int>, replacement_cost <dbl>, rating <chr>,
## # special_features <chr>, last_update <chr>
https://datamastery.gitlab.io/exercises/sakila-queries.html
scarlett <- sakila_lite %>%
tbl("actor") %>%
filter(str_to_lower(first_name) == str_to_lower("Scarlett")) %>%
collect()
Suppose we want to make the result a bit more beautiful.
scarlett %>%
mutate(
first_name = str_to_title(first_name),
last_name = str_to_title(last_name))
## # A tibble: 2 x 4
## actor_id first_name last_name last_update
## <int> <chr> <chr> <chr>
## 1 81 Scarlett Damon 2019-04-11 18:11:48
## 2 124 Scarlett Bening 2019-04-11 18:11:48
Note: mutation is done locally in R.
It is possible to do mutation on the remote server (Postgresql sevrer)
sakila_psql %>%
tbl("actor") %>%
filter(str_to_lower(first_name) == str_to_lower("Scarlett")) %>%
mutate(
first_name = str_to_title(first_name),
last_name = str_to_title(last_name)) %>%
collect()
## # A tibble: 2 x 4
## actor_id first_name last_name last_update
## <int> <chr> <chr> <dttm>
## 1 81 Scarlett Damon 2017-02-15 09:34:33
## 2 124 Scarlett Bening 2017-02-15 09:34:33
Note: SQLite and Mysql don’t support transforming title case but Postgresql does.
sakila_lite %>%
tbl("actor") %>%
filter(str_to_lower(last_name) == "johansson") %>%
collect()
## # A tibble: 3 x 4
## actor_id first_name last_name last_update
## <int> <chr> <chr> <chr>
## 1 8 MATTHEW JOHANSSON 2019-04-11 18:11:48
## 2 64 RAY JOHANSSON 2019-04-11 18:11:48
## 3 146 ALBERT JOHANSSON 2019-04-11 18:11:48
sakila_lite %>%
tbl("actor") %>%
summarize(n = n_distinct(last_name)) %>%
collect()
## # A tibble: 1 x 1
## n
## <int>
## 1 121
sakila_lite %>%
tbl("actor") %>%
count(last_name) %>%
filter(n == 1) %>%
collect()
## # A tibble: 66 x 2
## last_name n
## <chr> <int>
## 1 ASTAIRE 1
## 2 BACALL 1
## 3 BALE 1
## 4 BALL 1
## 5 BARRYMORE 1
## 6 BASINGER 1
## 7 BERGEN 1
## 8 BERGMAN 1
## 9 BIRCH 1
## 10 BLOOM 1
## # … with 56 more rows
sakila_lite %>%
tbl("actor") %>%
count(last_name) %>%
filter(n > 1) %>%
collect()
## # A tibble: 55 x 2
## last_name n
## <chr> <int>
## 1 AKROYD 3
## 2 ALLEN 3
## 3 BAILEY 2
## 4 BENING 2
## 5 BERRY 3
## 6 BOLGER 2
## 7 BRODY 2
## 8 CAGE 2
## 9 CHASE 2
## 10 CRAWFORD 2
## # … with 45 more rows
actor <- sakila_lite %>% tbl("actor")
sakila_lite %>%
tbl("film_actor") %>%
count(actor_id) %>%
arrange(desc(n)) %>%
head(1) %>%
inner_join(actor, by = "actor_id") %>%
collect()
## # A tibble: 1 x 5
## actor_id n first_name last_name last_update
## <int> <int> <chr> <chr> <chr>
## 1 107 42 GINA DEGENERES 2019-04-11 18:11:48
sakila_lite %>%
tbl("film") %>%
summarize(m = mean(length)) %>%
collect()
## Warning: Missing values are always removed in SQL.
## Use `mean(x, na.rm = TRUE)` to silence this warning
## This warning is displayed only once per session.
## # A tibble: 1 x 1
## m
## <dbl>
## 1 115.
sakila_lite %>%
tbl("film") %>%
left_join(tbl(sakila_lite, "film_category"), by = "film_id") %>%
group_by(category_id) %>%
summarize(mean_length = mean(length)) %>%
left_join(tbl(sakila_lite, "category"), by = "category_id") %>%
select(name, mean_length) %>%
collect()
## # A tibble: 16 x 2
## name mean_length
## <chr> <dbl>
## 1 Action 112.
## 2 Animation 111.
## 3 Children 110.
## 4 Classics 112.
## 5 Comedy 116.
## 6 Documentary 109.
## 7 Drama 121.
## 8 Family 115.
## 9 Foreign 122.
## 10 Games 128.
## 11 Horror 112.
## 12 Music 114.
## 13 New 111.
## 14 Sci-Fi 108.
## 15 Sports 128.
## 16 Travel 113.
uz <- sakila_lite %>%
tbl("film") %>%
filter(str_to_lower(title) == str_to_lower("Unforgiven Zoolander")) %>%
select(film_id)
all_inventories_of_store1 <- sakila_lite %>%
tbl("inventory") %>%
filter(store_id == 1) %>%
select(film_id, inventory_id)
not_yet_returned <- sakila_lite %>%
tbl("rental") %>%
filter(is.na(return_date)) %>%
select(inventory_id)
uz %>%
inner_join(all_inventories_of_store1, by = "film_id") %>%
anti_join(not_yet_returned, by = "inventory_id") %>%
count() %>%
collect()
## # A tibble: 1 x 1
## n
## <int>
## 1 2
We just see some example queries of a relational database. Behind the scene, we are using a language called SQL. For example, in the last query, the SQL used is
uz %>%
inner_join(all_inventories_of_store1) %>%
anti_join(not_yet_returned) %>%
count() %>%
show_query()
## Joining, by = "film_id"
## Joining, by = "inventory_id"
## <SQL>
## SELECT COUNT() AS `n`
## FROM (SELECT * FROM (SELECT `LHS`.`film_id` AS `film_id`, `RHS`.`inventory_id` AS `inventory_id`
## FROM (SELECT `film_id`
## FROM `film`
## WHERE (LOWER(`title`) = LOWER('Unforgiven Zoolander'))) AS `LHS`
## INNER JOIN (SELECT `film_id`, `inventory_id`
## FROM `inventory`
## WHERE (`store_id` = 1.0)) AS `RHS`
## ON (`LHS`.`film_id` = `RHS`.`film_id`)
## ) AS `LHS`
## WHERE NOT EXISTS (
## SELECT 1 FROM (SELECT `inventory_id`
## FROM `rental`
## WHERE (((`return_date`) IS NULL))) AS `RHS`
## WHERE (`LHS`.`inventory_id` = `RHS`.`inventory_id`)
## ))
Why learning SQL when there is dplyr?
In R, a SQL query can be made by using dbGetQuery
which counts number of rows in rental
.
sakila_lite %>%
dbGetQuery("SELECT COUNT(*) AS `n` FROM `rental`")
## n
## 1 16044
We could also make SQL query by sql block. In here, we are using the connection sakila_lite
. The result will be printed directly.
SELECT COUNT(*) AS `n` FROM `rental`
n |
---|
16044 |
In we need the output, set output.var
to rental_count
SELECT COUNT(*) AS `n` FROM `rental`;
The output could be later used in R blocks
rental_count
## n
## 1 16044
For comparison, in Python, we use
import sqlite3
sakila = sqlite3.connect('sakila.sqlite')
c = sakila.cursor()
c.execute("SELECT COUNT(*) AS `n` FROM `rental`;")
c.fetchall()
SQLite supports both double quotes (which is the standard) and backticks to quote identifiers (table and column names). Backticks are used in another popular database MySQL. Double quotes are used in Postgresql. It is always a good practice to quote the identifiers.
-- MySQL
SELECT COUNT(*) AS `n` from `actor`;
n |
---|
200 |
-- PostgresQL (ANSI standard)
SELECT COUNT(*) AS "n" from "actor";
n |
---|
200 |
SQLite supports both styles.
SELECT COUNT(*) AS `n` from `actor`;
n |
---|
200 |
The SELECT statement is pretty much the select()
function in dplyr
.
SELECT * FROM "rental";
rental_id | rental_date | inventory_id | customer_id | return_date | staff_id | last_update |
---|---|---|---|---|---|---|
1 | 2005-05-24 22:53:30.000 | 367 | 130 | 2005-05-26 22:04:30.000 | 1 | 2019-04-11 18:11:49 |
2 | 2005-05-24 22:54:33.000 | 1525 | 459 | 2005-05-28 19:40:33.000 | 1 | 2019-04-11 18:11:49 |
3 | 2005-05-24 23:03:39.000 | 1711 | 408 | 2005-06-01 22:12:39.000 | 1 | 2019-04-11 18:11:49 |
4 | 2005-05-24 23:04:41.000 | 2452 | 333 | 2005-06-03 01:43:41.000 | 2 | 2019-04-11 18:11:49 |
5 | 2005-05-24 23:05:21.000 | 2079 | 222 | 2005-06-02 04:33:21.000 | 1 | 2019-04-11 18:11:49 |
6 | 2005-05-24 23:08:07.000 | 2792 | 549 | 2005-05-27 01:32:07.000 | 1 | 2019-04-11 18:11:49 |
7 | 2005-05-24 23:11:53.000 | 3995 | 269 | 2005-05-29 20:34:53.000 | 2 | 2019-04-11 18:11:49 |
8 | 2005-05-24 23:31:46.000 | 2346 | 239 | 2005-05-27 23:33:46.000 | 2 | 2019-04-11 18:11:49 |
9 | 2005-05-25 00:00:40.000 | 2580 | 126 | 2005-05-28 00:22:40.000 | 1 | 2019-04-11 18:11:49 |
10 | 2005-05-25 00:02:21.000 | 1824 | 399 | 2005-05-31 22:44:21.000 | 2 | 2019-04-11 18:11:49 |
SELECT LOWER("last_name") AS "family_name" FROM "actor";
family_name |
---|
akroyd |
akroyd |
akroyd |
allen |
allen |
allen |
astaire |
bacall |
bailey |
bailey |
For comparison,
sakila_lite %>%
tbl("actor") %>%
transmute(family_name = str_to_lower(last_name)) %>%
collect()
## # A tibble: 200 x 1
## family_name
## <chr>
## 1 akroyd
## 2 akroyd
## 3 akroyd
## 4 allen
## 5 allen
## 6 allen
## 7 astaire
## 8 bacall
## 9 bailey
## 10 bailey
## # … with 190 more rows
sakila_lite %>%
tbl("actor") %>%
transmute(family_name = str_to_lower(last_name)) %>%
show_query()
## <SQL>
## SELECT LOWER(`last_name`) AS `family_name`
## FROM `actor`
To select every column or some columns,
SELECT * FROM "actor";
actor_id | first_name | last_name | last_update |
---|---|---|---|
1 | PENELOPE | GUINESS | 2019-04-11 18:11:48 |
2 | NICK | WAHLBERG | 2019-04-11 18:11:48 |
3 | ED | CHASE | 2019-04-11 18:11:48 |
4 | JENNIFER | DAVIS | 2019-04-11 18:11:48 |
5 | JOHNNY | LOLLOBRIGIDA | 2019-04-11 18:11:48 |
6 | BETTE | NICHOLSON | 2019-04-11 18:11:48 |
7 | GRACE | MOSTEL | 2019-04-11 18:11:48 |
8 | MATTHEW | JOHANSSON | 2019-04-11 18:11:48 |
9 | JOE | SWANK | 2019-04-11 18:11:48 |
10 | CHRISTIAN | GABLE | 2019-04-11 18:11:48 |
SELECT "rental_id", "last_update" AS "last_update_time" FROM "rental";
rental_id | last_update_time |
---|---|
1 | 2019-04-11 18:11:49 |
2 | 2019-04-11 18:11:49 |
3 | 2019-04-11 18:11:49 |
4 | 2019-04-11 18:11:49 |
5 | 2019-04-11 18:11:49 |
6 | 2019-04-11 18:11:49 |
7 | 2019-04-11 18:11:49 |
8 | 2019-04-11 18:11:49 |
9 | 2019-04-11 18:11:49 |
10 | 2019-04-11 18:11:49 |
It is equivalent to arrange()
in dplyr
SELECT "film_id", "title" FROM "film" ORDER BY "title";
film_id | title |
---|---|
1 | ACADEMY DINOSAUR |
2 | ACE GOLDFINGER |
3 | ADAPTATION HOLES |
4 | AFFAIR PREJUDICE |
5 | AFRICAN EGG |
6 | AGENT TRUMAN |
7 | AIRPLANE SIERRA |
8 | AIRPORT POLLOCK |
9 | ALABAMA DEVIL |
10 | ALADDIN CALENDAR |
SELECT "film_id", "title" FROM "film" ORDER BY "title" DESC;
film_id | title |
---|---|
1000 | ZORRO ARK |
999 | ZOOLANDER FICTION |
998 | ZHIVAGO CORE |
997 | YOUTH KICK |
996 | YOUNG LANGUAGE |
995 | YENTL IDAHO |
994 | WYOMING STORM |
993 | WRONG BEHAVIOR |
992 | WRATH MILE |
991 | WORST BANGER |
DISTINCT operator to remove duplicates from a result set. It is equivalent to distinct()
function in dplyr
.
SELECT DISTINCT "last_name" FROM "actor";
last_name |
---|
AKROYD |
ALLEN |
ASTAIRE |
BACALL |
BAILEY |
BALE |
BALL |
BARRYMORE |
BASINGER |
BENING |
sakila_lite %>%
tbl("actor") %>%
distinct(last_name) %>%
show_query()
## <SQL>
## SELECT DISTINCT `last_name`
## FROM `actor`
SELECT * FROM "actor" LIMIT 2;
actor_id | first_name | last_name | last_update |
---|---|---|---|
1 | PENELOPE | GUINESS | 2019-04-11 18:11:48 |
2 | NICK | WAHLBERG | 2019-04-11 18:11:48 |
sakila_lite %>%
tbl("actor") %>%
head(2) %>%
show_query()
## <SQL>
## SELECT *
## FROM `actor`
## LIMIT 2
It is equivalent to filter()
in dplyr
.
SELECT * FROM "film" WHERE `rating` = "PG" AND "length" = 90;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
776 | SECRET GROUNDHOG | A Astounding Story of a Cat And a Database Administrator who must Build a Technical Writer in New Orleans | 2006 | 1 | NA | 6 | 4.99 | 90 | 11.99 | PG | Commentaries,Deleted Scenes | 2019-04-11 18:11:48 |
SELECT * FROM "film" WHERE "rating" = 'PG' AND "length" = 90;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | last_update | special_features | fulltext |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
776 | SECRET GROUNDHOG | A Astounding Story of a Cat And a Database Administrator who must Build a Technical Writer in New Orleans | 2006 | 1 | NA | 6 | 4.99 | 90 | 11.99 | PG | 2017-09-10 17:46:03 | {Commentaries,“Deleted Scenes”} | ‘administr’:12 ‘astound’:4 ‘build’:15 ‘cat’:8 ‘databas’:11 ‘groundhog’:2 ‘must’:14 ‘new’:20 ‘orlean’:21 ‘secret’:1 ‘stori’:5 ‘technic’:17 ‘writer’:18 |
SELECT * FROM `film` WHERE `rating` = "PG" OR `length` = 90;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | ACADEMY DINOSAUR | An Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 86 | 20.99 | PG | Deleted Scenes,Behind the Scenes | 2006-02-15 13:03:42 |
6 | AGENT TRUMAN | A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China | 2006 | 1 | NA | 3 | 2.99 | 169 | 17.99 | PG | Deleted Scenes | 2006-02-15 13:03:42 |
12 | ALASKA PHANTOM | A Fanciful Saga of a Hunter And a Pastry Chef who must Vanquish a Boy in Australia | 2006 | 1 | NA | 6 | 0.99 | 136 | 22.99 | PG | Commentaries,Deleted Scenes | 2006-02-15 13:03:42 |
13 | ALI FOREVER | A Action-Packed Drama of a Dentist And a Crocodile who must Battle a Feminist in The Canadian Rockies | 2006 | 1 | NA | 4 | 4.99 | 150 | 21.99 | PG | Deleted Scenes,Behind the Scenes | 2006-02-15 13:03:42 |
19 | AMADEUS HOLY | A Emotional Display of a Pioneer And a Technical Writer who must Battle a Man in A Baloon | 2006 | 1 | NA | 6 | 0.99 | 113 | 20.99 | PG | Commentaries,Deleted Scenes,Behind the Scenes | 2006-02-15 13:03:42 |
37 | ARIZONA BANG | A Brilliant Panorama of a Mad Scientist And a Mad Cow who must Meet a Pioneer in A Monastery | 2006 | 1 | NA | 3 | 2.99 | 121 | 28.99 | PG | Trailers,Deleted Scenes | 2006-02-15 13:03:42 |
41 | ARSENIC INDEPENDENCE | A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin | 2006 | 1 | NA | 4 | 0.99 | 137 | 17.99 | PG | Trailers,Deleted Scenes,Behind the Scenes | 2006-02-15 13:03:42 |
57 | BASIC EASY | A Stunning Epistle of a Man And a Husband who must Reach a Mad Scientist in A Jet Boat | 2006 | 1 | NA | 4 | 2.99 | 90 | 18.99 | PG-13 | Deleted Scenes | 2006-02-15 13:03:42 |
63 | BEDAZZLED MARRIED | A Astounding Character Study of a Madman And a Robot who must Meet a Mad Scientist in An Abandoned Fun House | 2006 | 1 | NA | 6 | 0.99 | 73 | 21.99 | PG | Trailers,Deleted Scenes,Behind the Scenes | 2006-02-15 13:03:42 |
65 | BEHAVIOR RUNAWAY | A Unbelieveable Drama of a Student And a Husband who must Outrace a Sumo Wrestler in Berlin | 2006 | 1 | NA | 3 | 4.99 | 100 | 20.99 | PG | Trailers,Deleted Scenes,Behind the Scenes | 2006-02-15 13:03:42 |
Remarks:
=
for comparison, but not ==
.NOT
and OR
.dplyr equivalent
sakila_lite %>%
tbl("film") %>%
filter(rating == "PG", length == 90)
## # Source: lazy query [?? x 13]
## # Database: sqlite 3.30.1
## # [/Users/Randy/Dropbox/Winter2021/sta141b/sta141b-lectures/ch02-sql/sakila.sqlite]
## film_id title description release_year language_id original_langua…
## <int> <chr> <chr> <chr> <int> <int>
## 1 776 SECR… A Astoundi… 2006 1 NA
## # … with 7 more variables: rental_duration <int>, rental_rate <dbl>,
## # length <int>, replacement_cost <dbl>, rating <chr>, special_features <chr>,
## # last_update <chr>
IN
operatorSELECT * FROM "film" WHERE "rating" IN ('PG', 'PG-13');
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 86 | 20.99 | PG | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
6 | AGENT TRUMAN | A Intrepid Panorama of a Robot And a Boy who must Escape a Sumo Wrestler in Ancient China | 2006 | 1 | NA | 3 | 2.99 | 169 | 17.99 | PG | Deleted Scenes | 2019-04-11 18:11:48 |
7 | AIRPLANE SIERRA | A Touching Saga of a Hunter And a Butler who must Discover a Butler in A Jet Boat | 2006 | 1 | NA | 6 | 4.99 | 62 | 28.99 | PG-13 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
9 | ALABAMA DEVIL | A Thoughtful Panorama of a Database Administrator And a Mad Scientist who must Outgun a Mad Scientist in A Jet Boat | 2006 | 1 | NA | 3 | 2.99 | 114 | 21.99 | PG-13 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
12 | ALASKA PHANTOM | A Fanciful Saga of a Hunter And a Pastry Chef who must Vanquish a Boy in Australia | 2006 | 1 | NA | 6 | 0.99 | 136 | 22.99 | PG | Commentaries,Deleted Scenes | 2019-04-11 18:11:48 |
13 | ALI FOREVER | A Action-Packed Drama of a Dentist And a Crocodile who must Battle a Feminist in The Canadian Rockies | 2006 | 1 | NA | 4 | 4.99 | 150 | 21.99 | PG | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
18 | ALTER VICTORY | A Thoughtful Drama of a Composer And a Feminist who must Meet a Secret Agent in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 57 | 27.99 | PG-13 | Trailers,Behind the Scenes | 2019-04-11 18:11:48 |
19 | AMADEUS HOLY | A Emotional Display of a Pioneer And a Technical Writer who must Battle a Man in A Baloon | 2006 | 1 | NA | 6 | 0.99 | 113 | 20.99 | PG | Commentaries,Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
28 | ANTHEM LUKE | A Touching Panorama of a Waitress And a Woman who must Outrace a Dog in An Abandoned Amusement Park | 2006 | 1 | NA | 5 | 4.99 | 91 | 16.99 | PG-13 | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
33 | APOLLO TEEN | A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft | 2006 | 1 | NA | 5 | 2.99 | 153 | 15.99 | PG-13 | Trailers,Commentaries,Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
sakila_lite %>%
tbl("film") %>%
filter(rating %in% c("PG", "PG-13")) %>%
collect()
## # A tibble: 417 x 13
## film_id title description release_year language_id original_langua…
## <int> <chr> <chr> <chr> <int> <int>
## 1 1 ACAD… A Epic Dra… 2006 1 NA
## 2 6 AGEN… A Intrepid… 2006 1 NA
## 3 7 AIRP… A Touching… 2006 1 NA
## 4 9 ALAB… A Thoughtf… 2006 1 NA
## 5 12 ALAS… A Fanciful… 2006 1 NA
## 6 13 ALI … A Action-P… 2006 1 NA
## 7 18 ALTE… A Thoughtf… 2006 1 NA
## 8 19 AMAD… A Emotiona… 2006 1 NA
## 9 28 ANTH… A Touching… 2006 1 NA
## 10 33 APOL… A Action-P… 2006 1 NA
## # … with 407 more rows, and 7 more variables: rental_duration <int>,
## # rental_rate <dbl>, length <int>, replacement_cost <dbl>, rating <chr>,
## # special_features <chr>, last_update <chr>
LIKE
operatorSee https://www.w3schools.com/sql/sql_like.asp
SELECT "title" FROM "film" WHERE "title" LIKE '%victory%';
title |
---|
ALTER VICTORY |
VICTORY ACADEMY |
Remark: in SQLite, the LIKE
operator is case insensitive. However, it is not the case for other DBs.
SELECT "title" FROM "film" WHERE "title" LIKE '%victory%';
title |
---|
In Postgres, there is a ILIKE
(case insensitive LIKE
) operator
SELECT "title" FROM "film" WHERE "title" ILIKE '%victory%';
title |
---|
ALTER VICTORY |
VICTORY ACADEMY |
In MySQL (depends on the configuration),
SELECT `title` FROM `film` WHERE `title` LIKE '%victory%';
title |
---|
ALTER VICTORY |
VICTORY ACADEMY |
dplyr equivalent
sakila_lite %>%
tbl("film") %>%
filter(title %LIKE% "%victory%") %>%
select(title) %>%
show_query()
## <SQL>
## SELECT `title`
## FROM `film`
## WHERE (`title` LIKE '%victory%')
A slightly universal solution
SELECT "title" FROM "film" WHERE LOWER("title") LIKE '%victory%';
title |
---|
ALTER VICTORY |
VICTORY ACADEMY |
Different servers use different operators to match regular expression. For MySQL, it is the REGEXP operator. For Postgresql, it is SIMILAR TO. For SQLite, it simiply doesn’t support regex.
Though, Postgresql’s implementation of regular expression is a bit different from the standard regex, see for example https://www.postgresql.org/docs/9.0/functions-matching.html
Similar to case_when()
in dplyr
.
SELECT
"film_id",
"title",
CASE
WHEN "length" < 60 THEN 'short'
WHEN "length" < 90 THEN 'mid'
ELSE 'long'
END "length"
FROM "film";
SELECT *
FROM "inventory" a JOIN "rental" b
ON a."inventory_id" = b."inventory_id";
inventory_id | film_id | store_id | last_update | rental_id | rental_date | inventory_id | customer_id | return_date | staff_id | last_update |
---|---|---|---|---|---|---|---|---|---|---|
367 | 80 | 1 | 2019-04-11 18:11:48 | 1 | 2005-05-24 22:53:30.000 | 367 | 130 | 2005-05-26 22:04:30.000 | 1 | 2019-04-11 18:11:49 |
1525 | 333 | 2 | 2019-04-11 18:11:48 | 2 | 2005-05-24 22:54:33.000 | 1525 | 459 | 2005-05-28 19:40:33.000 | 1 | 2019-04-11 18:11:49 |
1711 | 373 | 2 | 2019-04-11 18:11:48 | 3 | 2005-05-24 23:03:39.000 | 1711 | 408 | 2005-06-01 22:12:39.000 | 1 | 2019-04-11 18:11:49 |
2452 | 535 | 1 | 2019-04-11 18:11:48 | 4 | 2005-05-24 23:04:41.000 | 2452 | 333 | 2005-06-03 01:43:41.000 | 2 | 2019-04-11 18:11:49 |
2079 | 450 | 2 | 2019-04-11 18:11:48 | 5 | 2005-05-24 23:05:21.000 | 2079 | 222 | 2005-06-02 04:33:21.000 | 1 | 2019-04-11 18:11:49 |
2792 | 613 | 1 | 2019-04-11 18:11:48 | 6 | 2005-05-24 23:08:07.000 | 2792 | 549 | 2005-05-27 01:32:07.000 | 1 | 2019-04-11 18:11:49 |
3995 | 870 | 2 | 2019-04-11 18:11:48 | 7 | 2005-05-24 23:11:53.000 | 3995 | 269 | 2005-05-29 20:34:53.000 | 2 | 2019-04-11 18:11:49 |
2346 | 510 | 1 | 2019-04-11 18:11:48 | 8 | 2005-05-24 23:31:46.000 | 2346 | 239 | 2005-05-27 23:33:46.000 | 2 | 2019-04-11 18:11:49 |
2580 | 565 | 1 | 2019-04-11 18:11:48 | 9 | 2005-05-25 00:00:40.000 | 2580 | 126 | 2005-05-28 00:22:40.000 | 1 | 2019-04-11 18:11:49 |
1824 | 396 | 2 | 2019-04-11 18:11:48 | 10 | 2005-05-25 00:02:21.000 | 1824 | 399 | 2005-05-31 22:44:21.000 | 2 | 2019-04-11 18:11:49 |
If we only want store_id
and customer_id
,
SELECT a."store_id", b."customer_id"
FROM "inventory" a JOIN "rental" b
ON a."inventory_id" = b."inventory_id"
store_id | customer_id |
---|---|
1 | 130 |
2 | 459 |
2 | 408 |
1 | 333 |
2 | 222 |
1 | 549 |
2 | 269 |
1 | 239 |
1 | 126 |
2 | 399 |
inner_join(
tbl(sakila_lite, "inventory"),
tbl(sakila_lite, "rental"),
by = "inventory_id") %>%
select(store_id, customer_id) %>%
collect()
## # A tibble: 16,044 x 2
## store_id customer_id
## <int> <int>
## 1 1 130
## 2 2 459
## 3 2 408
## 4 1 333
## 5 2 222
## 6 1 549
## 7 2 269
## 8 1 239
## 9 1 126
## 10 2 399
## # … with 16,034 more rows
We could also join a more sophisticated subquery.
SELECT a."store_id", a."inventory_id", b."customer_id"
FROM "inventory" a
JOIN (
SELECT * FROM "rental" WHERE "return_date" IS NULL
) b ON a."inventory_id" = b."inventory_id"
store_id | inventory_id | customer_id |
---|---|---|
1 | 2047 | 155 |
2 | 2026 | 335 |
2 | 1545 | 83 |
2 | 4106 | 219 |
1 | 817 | 99 |
2 | 1857 | 192 |
2 | 478 | 11 |
1 | 1622 | 597 |
2 | 3043 | 53 |
1 | 3947 | 521 |
SELECT a."store_id", a."inventory_id", b."customer_id"
FROM "inventory" a
LEFT JOIN (
SELECT * FROM "rental" WHERE "return_date" IS NULL
) b ON a."inventory_id" = b."inventory_id"
store_id | inventory_id | customer_id |
---|---|---|
1 | 1 | NA |
1 | 2 | NA |
1 | 3 | NA |
1 | 4 | NA |
2 | 5 | NA |
2 | 6 | 554 |
2 | 7 | NA |
2 | 8 | NA |
2 | 9 | 366 |
2 | 10 | NA |
SELECT a."store_id", a."inventory_id", b."customer_id"
FROM "inventory" a
RIGHT JOIN (
SELECT * FROM "rental" WHERE "return_date" IS NULL
) b ON a."inventory_id" = b."inventory_id"
store_id | inventory_id | customer_id |
---|---|---|
2 | 6 | 554 |
2 | 9 | 366 |
2 | 21 | 111 |
2 | 25 | 590 |
2 | 70 | 108 |
1 | 81 | 236 |
2 | 97 | 512 |
2 | 106 | 44 |
2 | 112 | 349 |
2 | 177 | 317 |
Remark: SQLite doesn’t support Right join
SELECT a."store_id", a."inventory_id", b."customer_id"
FROM "inventory" a
FULL JOIN (
SELECT * FROM "rental" WHERE "return_date" IS NULL
) b ON a."inventory_id" = b."inventory_id"
store_id | inventory_id | customer_id |
---|---|---|
1 | 1 | NA |
1 | 2 | NA |
1 | 3 | NA |
1 | 4 | NA |
2 | 5 | NA |
2 | 6 | 554 |
2 | 7 | NA |
2 | 8 | NA |
2 | 9 | 366 |
2 | 10 | NA |
Remark: SQLite doesn’t support full join.
SELECT *
FROM "inventory" a
WHERE EXISTS (
SELECT * FROM "rental" b
WHERE "return_date" IS NULL AND b."inventory_id" = a."inventory_id"
)
inventory_id | film_id | store_id | last_update |
---|---|---|---|
6 | 1 | 2 | 2019-04-11 18:11:48 |
9 | 2 | 2 | 2019-04-11 18:11:48 |
21 | 4 | 2 | 2019-04-11 18:11:48 |
25 | 5 | 2 | 2019-04-11 18:11:48 |
70 | 13 | 2 | 2019-04-11 18:11:48 |
81 | 17 | 1 | 2019-04-11 18:11:48 |
97 | 19 | 2 | 2019-04-11 18:11:48 |
106 | 21 | 2 | 2019-04-11 18:11:48 |
112 | 22 | 2 | 2019-04-11 18:11:48 |
177 | 39 | 2 | 2019-04-11 18:11:48 |
tbl(sakila_lite, "inventory") %>%
semi_join(
tbl(sakila_lite, "rental") %>% filter(is.na(return_date)),
by = "inventory_id") %>%
show_query()
## <SQL>
## SELECT * FROM `inventory` AS `LHS`
## WHERE EXISTS (
## SELECT 1 FROM (SELECT *
## FROM `rental`
## WHERE (((`return_date`) IS NULL))) AS `RHS`
## WHERE (`LHS`.`inventory_id` = `RHS`.`inventory_id`)
## )
SELECT *
FROM "inventory" a
WHERE NOT EXISTS (
SELECT * FROM "rental" b
WHERE b."return_date" IS NULL AND a."inventory_id" = b."inventory_id"
)
inventory_id | film_id | store_id | last_update |
---|---|---|---|
1 | 1 | 1 | 2019-04-11 18:11:48 |
2 | 1 | 1 | 2019-04-11 18:11:48 |
3 | 1 | 1 | 2019-04-11 18:11:48 |
4 | 1 | 1 | 2019-04-11 18:11:48 |
5 | 1 | 2 | 2019-04-11 18:11:48 |
7 | 1 | 2 | 2019-04-11 18:11:48 |
8 | 1 | 2 | 2019-04-11 18:11:48 |
10 | 2 | 2 | 2019-04-11 18:11:48 |
11 | 2 | 2 | 2019-04-11 18:11:48 |
12 | 3 | 2 | 2019-04-11 18:11:48 |
tbl(sakila_lite, "inventory") %>%
anti_join(
tbl(sakila_lite, "rental") %>% filter(is.na(return_date)),
by = "inventory_id") %>%
collect()
## # A tibble: 4,398 x 4
## inventory_id film_id store_id last_update
## <int> <int> <int> <chr>
## 1 1 1 1 2019-04-11 18:11:48
## 2 2 1 1 2019-04-11 18:11:48
## 3 3 1 1 2019-04-11 18:11:48
## 4 4 1 1 2019-04-11 18:11:48
## 5 5 1 2 2019-04-11 18:11:48
## 6 7 1 2 2019-04-11 18:11:48
## 7 8 1 2 2019-04-11 18:11:48
## 8 10 2 2 2019-04-11 18:11:48
## 9 11 2 2 2019-04-11 18:11:48
## 10 12 3 2 2019-04-11 18:11:48
## # … with 4,388 more rows
SELECT AVG("length") as "avg_length" FROM "film"
avg_length |
---|
115.272 |
sakila_lite %>%
tbl("film") %>%
summarize(avg_length = mean(length)) %>%
collect()
## # A tibble: 1 x 1
## avg_length
## <dbl>
## 1 115.
SELECT "rating", AVG("length") AS "avg_length"
FROM "film" GROUP BY "rating";
rating | avg_length |
---|---|
G | 111.0506 |
NC-17 | 113.2286 |
PG | 112.0052 |
PG-13 | 120.4439 |
R | 118.6615 |
sakila_lite %>%
tbl("film") %>%
group_by(rating) %>%
summarize(avg_length = mean(length)) %>%
collect()
## # A tibble: 5 x 2
## rating avg_length
## <chr> <dbl>
## 1 G 111.
## 2 NC-17 113.
## 3 PG 112.
## 4 PG-13 120.
## 5 R 119.
SELECT "rating", "rental_duration", AVG("length") as "avg_length"
FROM "film" GROUP BY "rating", "rental_duration"
rating | rental_duration | avg_length |
---|---|---|
G | 3 | 100.4286 |
G | 4 | 102.2857 |
G | 5 | 109.5758 |
G | 6 | 128.0000 |
G | 7 | 116.3448 |
NC-17 | 3 | 113.3243 |
NC-17 | 4 | 119.1515 |
NC-17 | 5 | 105.4186 |
NC-17 | 6 | 111.7895 |
NC-17 | 7 | 118.7000 |
sakila_lite %>%
tbl("film") %>%
group_by(rating, rental_duration) %>%
summarize(avg_length = mean(length)) %>%
collect()
## # A tibble: 25 x 3
## # Groups: rating [5]
## rating rental_duration avg_length
## <chr> <int> <dbl>
## 1 G 3 100.
## 2 G 4 102.
## 3 G 5 110.
## 4 G 6 128
## 5 G 7 116.
## 6 NC-17 3 113.
## 7 NC-17 4 119.
## 8 NC-17 5 105.
## 9 NC-17 6 112.
## 10 NC-17 7 119.
## # … with 15 more rows
SELECT "rating", AVG("length") AS "avg_length"
FROM "film"
GROUP BY "rating"
HAVING "avg_length" > 115;
rating | avg_length |
---|---|
PG-13 | 120.4439 |
R | 118.6615 |
We could simulate the HAVING clause by a subquery.
SELECT *
FROM
(
SELECT "rating", AVG("length") AS "avg_length"
FROM "film"
GROUP BY "rating"
)
WHERE "avg_length" > 115;
rating | avg_length |
---|---|
PG-13 | 120.4439 |
R | 118.6615 |
Remark: Postgres requires an alias name for the subquery.
We could use subquery to perform more complicated tasks. Eg, get the rating with maximum average length.
-- this does not work
SELECT *
FROM
(
SELECT "rating", AVG("length") AS "avg_length"
FROM "film"
GROUP BY "rating"
) "Temp"
WHERE "avg_length" = MAX("avg_length");
SELECT "rating", "avg_length"
FROM (
SELECT "rating", AVG("length") AS "avg_length"
FROM "film" GROUP BY "rating"
) "Temp"
WHERE "avg_length" = (
SELECT MAX("avg_length") FROM (
SELECT "rating", AVG("length") AS "avg_length"
FROM "film" GROUP BY "rating"
) "Temp2"
)
rating | avg_length |
---|---|
PG-13 | 120.4439 |
THe above repeats the subquery twice.
The WITH
clause allows us to refer to th temp table in WHERE statement.
WITH "Temp" AS(
SELECT "rating", AVG("length") AS "avg_length"
FROM "film" GROUP BY "rating"
)
SELECT "rating", "avg_length"
FROM "Temp"
WHERE "avg_length" = (SELECT MAX("avg_length") FROM "Temp")
rating | avg_length |
---|---|
PG-13 | 120.4439 |
Calculate the sum of “amount” for each store.
SELECT "store_id", "amount"
FROM "rental" r
JOIN "payment" p
on r."rental_id" = p."rental_id"
JOIN "inventory" i
on r."inventory_id" = i."inventory_id"
store_id | amount |
---|---|
2 | 1.99 |
1 | 0.99 |
1 | 6.99 |
2 | 0.99 |
2 | 4.99 |
1 | 2.99 |
2 | 1.99 |
1 | 4.99 |
1 | 8.99 |
1 | 0.99 |
SELECT "store_id", SUM("amount")
FROM "rental" r
JOIN "payment" p
on r."rental_id" = p."rental_id"
JOIN "inventory" i
on r."inventory_id" = i."inventory_id"
GROUP BY "store_id"
store_id | sum |
---|---|
2 | 33726.77 |
1 | 33689.74 |
Instead of only showing the sum, we want to show show the amounts aside the sum.
SELECT "store_id", "amount", SUM("amount") OVER (PARTITION BY "store_id")
FROM
(SELECT * FROM "rental" r
JOIN "payment" p
on r."rental_id" = p."rental_id"
JOIN "inventory" i
on r."inventory_id" = i."inventory_id") "table2"
store_id | amount | sum |
---|---|---|
1 | 0.99 | 33689.74 |
1 | 4.99 | 33689.74 |
1 | 8.99 | 33689.74 |
1 | 2.99 | 33689.74 |
1 | 1.99 | 33689.74 |
1 | 0.99 | 33689.74 |
1 | 8.99 | 33689.74 |
1 | 6.99 | 33689.74 |
1 | 4.99 | 33689.74 |
1 | 3.99 | 33689.74 |
Remark: the OVER clause could also be used to calculate moving average.
SELECT * FROM "film" where "film_id" <= 3
UNION
SELECT * FROM "film" where "film_id" <= 4;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 86 | 20.99 | PG | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
2 | ACE GOLDFINGER | A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China | 2006 | 1 | NA | 3 | 4.99 | 48 | 12.99 | G | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
3 | ADAPTATION HOLES | A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory | 2006 | 1 | NA | 7 | 2.99 | 50 | 18.99 | NC-17 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
4 | AFFAIR PREJUDICE | A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank | 2006 | 1 | NA | 5 | 2.99 | 117 | 26.99 | G | Commentaries,Behind the Scenes | 2019-04-11 18:11:48 |
UNION ALL
doesn’t remove repeated rows
SELECT * FROM "film" where "film_id" <= 3
UNION ALL
SELECT * FROM "film" where "film_id" <= 4;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 86 | 20.99 | PG | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
2 | ACE GOLDFINGER | A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China | 2006 | 1 | NA | 3 | 4.99 | 48 | 12.99 | G | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
3 | ADAPTATION HOLES | A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory | 2006 | 1 | NA | 7 | 2.99 | 50 | 18.99 | NC-17 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
1 | ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 86 | 20.99 | PG | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
2 | ACE GOLDFINGER | A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China | 2006 | 1 | NA | 3 | 4.99 | 48 | 12.99 | G | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
3 | ADAPTATION HOLES | A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory | 2006 | 1 | NA | 7 | 2.99 | 50 | 18.99 | NC-17 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
4 | AFFAIR PREJUDICE | A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank | 2006 | 1 | NA | 5 | 2.99 | 117 | 26.99 | G | Commentaries,Behind the Scenes | 2019-04-11 18:11:48 |
SELECT * FROM "film" where "film_id" <= 5
INTERSECT
SELECT * FROM "film" where "film_id" >= 2;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | ACE GOLDFINGER | A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China | 2006 | 1 | NA | 3 | 4.99 | 48 | 12.99 | G | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
3 | ADAPTATION HOLES | A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory | 2006 | 1 | NA | 7 | 2.99 | 50 | 18.99 | NC-17 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
4 | AFFAIR PREJUDICE | A Fanciful Documentary of a Frisbee And a Lumberjack who must Chase a Monkey in A Shark Tank | 2006 | 1 | NA | 5 | 2.99 | 117 | 26.99 | G | Commentaries,Behind the Scenes | 2019-04-11 18:11:48 |
5 | AFRICAN EGG | A Fast-Paced Documentary of a Pastry Chef And a Dentist who must Pursue a Forensic Psychologist in The Gulf of Mexico | 2006 | 1 | NA | 6 | 2.99 | 130 | 22.99 | G | Deleted Scenes | 2019-04-11 18:11:48 |
SELECT * FROM "film" where "film_id" <= 5
EXCEPT
SELECT * FROM "film" where "film_id" >= 4;
film_id | title | description | release_year | language_id | original_language_id | rental_duration | rental_rate | length | replacement_cost | rating | special_features | last_update |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies | 2006 | 1 | NA | 6 | 0.99 | 86 | 20.99 | PG | Deleted Scenes,Behind the Scenes | 2019-04-11 18:11:48 |
2 | ACE GOLDFINGER | A Astounding Epistle of a Database Administrator And a Explorer who must Find a Car in Ancient China | 2006 | 1 | NA | 3 | 4.99 | 48 | 12.99 | G | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
3 | ADAPTATION HOLES | A Astounding Reflection of a Lumberjack And a Car who must Sink a Lumberjack in A Baloon Factory | 2006 | 1 | NA | 7 | 2.99 | 50 | 18.99 | NC-17 | Trailers,Deleted Scenes | 2019-04-11 18:11:48 |
Remark: anti-join allows different column names in the tables.
# create a local empty SQLite database called mydb.sqlite
mydb <- dbConnect(RSQLite::SQLite(), dbname = "mydb.sqlite")
mydb %>% dbCreateTable(
"table1",
tibble(fruit = character(0), count = integer(0))
)
mydb %>% dbReadTable("table1")
CREATE TABLE "table2" (
"id" int NOT NULL,
"last_name" varchar(255) NOT NULL,
"first_name" varchar(255),
"age" int,
PRIMARY KEY ("id")
);
There is also dbWriteTable
which exports the whole data frame as a table of the database.
DROP TABLE "table2";
mydb %>% dbExecute('DROP TABLE "table2";')
drop_table <- function(db, tname) {
drop_sql <- sqlInterpolate(db, "DROP TABLE ?tablename;", tablename = tname)
db %>% dbExecute(drop_sql)
}
mydb %>% drop_table("table1")
Remark: the use of sqlInterpolate is to avoid SQL injection attack. Computerphile has a great video on this topic: https://www.youtube.com/watch?v=ciNHn38EyRc
mydb %>% dbAppendTable(
"table1",
tibble(fruit = "apple", count = 2))
# alternatively
insert_sql <- mydb %>% sqlAppendTable(
"table1",
tibble(fruit = "apple", count = 2), row.names = FALSE)
mydb %>% dbExecute(insert_sql)
mydb %>% dbReadTable("table1")
INSERT INTO 'table2' (id, last_name, first_name, age)
VALUES (1, "Lai", "Randy", 16);
INSERT INTO 'table2' (id, last_name, first_name)
VALUES (2, "Lai", "Natalie");
mydb %>% dbReadTable("table2")
mydb %>% dbExecute('UPDATE "table2" SET "age" = 33 WHERE "id" = 1;')
UPDATE "table2" SET "age" = 33 WHERE "id" = 1;
mydb %>% dbReadTable("table2")
mydb %>% dbExecute('DELETE FROM "table2" WHERE "id" = 1;')
DELETE FROM "table2" WHERE "id" = 1;
mydb %>% dbReadTable("table2")