Rational DBs vs Nosql Databases

The most common type of nosql databases is document database.

Types of NoSQL DBs.

Some references:

Airbnb sample data

MongoDB is the most popular NOSQL database.

https://docs.atlas.mongodb.com/sample-data/sample-airbnb/

To connect to this MongoDB, you need to either on the campus network or connect via UCDavis VPN.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.1     ✔ purrr   0.3.4
## ✔ tibble  3.0.3     ✔ dplyr   1.0.2
## ✔ tidyr   1.1.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(mongolite)

m <- mongo("airbnb", db = "data", url = "mongodb://mongouser:secret@alan.ucdavis.edu/data")
# total number of documents
m$count()
## [1] 5555
# total number of documents satisfying a condition
m$count('{"price": {"$lt": 20}}')
## [1] 63

Query Documents https://docs.mongodb.com/manual/tutorial/query-documents/

# find the first document
first_doc <- m$find("{}", limit = 1)
first_doc %>% glimpse()
## Rows: 1
## Columns: 38
## $ listing_url           <chr> "https://www.airbnb.com/rooms/10006546"
## $ name                  <chr> "Ribeira Charming Duplex"
## $ summary               <chr> "Fantastic duplex apartment with three bedrooms…
## $ space                 <chr> "Privileged views of the Douro River and Ribeir…
## $ description           <chr> "Fantastic duplex apartment with three bedrooms…
## $ neighborhood_overview <chr> "In the neighborhood of the river, you can find…
## $ notes                 <chr> "Lose yourself in the narrow streets and stairc…
## $ transit               <chr> "Transport: • Metro station and S. Bento railwa…
## $ access                <chr> "We are always available to help guests. The ho…
## $ interaction           <chr> "Cot - 10 € / night Dog - € 7,5 / night"
## $ house_rules           <chr> "Make the house your home..."
## $ property_type         <chr> "House"
## $ room_type             <chr> "Entire home/apt"
## $ bed_type              <chr> "Real Bed"
## $ minimum_nights        <chr> "2"
## $ maximum_nights        <chr> "30"
## $ cancellation_policy   <chr> "moderate"
## $ last_scraped          <dttm> 2019-02-15 21:00:00
## $ calendar_last_scraped <dttm> 2019-02-15 21:00:00
## $ first_review          <dttm> 2016-01-02 21:00:00
## $ last_review           <dttm> 2019-01-19 21:00:00
## $ accommodates          <int> 8
## $ bedrooms              <int> 3
## $ beds                  <int> 5
## $ number_of_reviews     <int> 51
## $ bathrooms             <dbl> 1
## $ amenities             <list> [<"TV", "Cable TV", "Wifi", "Kitchen", "Paid p…
## $ price                 <dbl> 80
## $ security_deposit      <dbl> 200
## $ cleaning_fee          <dbl> 35
## $ extra_people          <dbl> 15
## $ guests_included       <dbl> 6
## $ images                <df[,4]> <data.frame[1 x 4]>
## $ host                  <df[,16]> <data.frame[1 x 16]>
## $ address               <df[,7]> <data.frame[1 x 7]>
## $ availability          <df[,4]> <data.frame[1 x 4]>
## $ review_scores         <df[,7]> <data.frame[1 x 7]>
## $ reviews               <list> [<data.frame[51 x 6]>]
# filter fields
m$find("{}", fields = '{"name": true}')
# filter keys
m$find('{"price": {"$lt": 20}}', fields = '{"name": true, "price": true}')

Check for all the operators: https://docs.mongodb.com/manual/reference/operator/query-comparison/#query-selectors-comparison

m$find('{"price": {"$lt": 50}, "bedrooms": {"$gte": 3}, "amenities": "Wifi"}',
  fields = '{"name": true, "price": true, "bedrooms": true}'
)

Remark: As amenities is an array, "amenities": "Wifi" means Wifi is one of the amenities. On the other hand, "amenities": ["Wifi"] means Wifi is the only amenity.

# sort
m$find('{"bedrooms": {"$gte": 4}}',
  fields = '{"name": true, "price": true}',
  sort = '{"price": -1}',
  limit = 5
)

Query nested fields

countrydf <- m$find(
  '{"address.country_code": "US"}',
  fields = '{"name": true, "address.country": true}'
)
countrydf
countrydf$address$country
##   [1] "United States" "United States" "United States" "United States"
##   [5] "United States" "United States" "United States" "United States"
##   [9] "United States" "United States" "United States" "United States"
##  [13] "United States" "United States" "United States" "United States"
##  [17] "United States" "United States" "United States" "United States"
##  [21] "United States" "United States" "United States" "United States"
##  [25] "United States" "United States" "United States" "United States"
##  [29] "United States" "United States" "United States" "United States"
##  [33] "United States" "United States" "United States" "United States"
##  [37] "United States" "United States" "United States" "United States"
##  [41] "United States" "United States" "United States" "United States"
##  [45] "United States" "United States" "United States" "United States"
##  [49] "United States" "United States" "United States" "United States"
##  [53] "United States" "United States" "United States" "United States"
##  [57] "United States" "United States" "United States" "United States"
##  [61] "United States" "United States" "United States" "United States"
##  [65] "United States" "United States" "United States" "United States"
##  [69] "United States" "United States" "United States" "United States"
##  [73] "United States" "United States" "United States" "United States"
##  [77] "United States" "United States" "United States" "United States"
##  [81] "United States" "United States" "United States" "United States"
##  [85] "United States" "United States" "United States" "United States"
##  [89] "United States" "United States" "United States" "United States"
##  [93] "United States" "United States" "United States" "United States"
##  [97] "United States" "United States" "United States" "United States"
## [101] "United States" "United States" "United States" "United States"
## [105] "United States" "United States" "United States" "United States"
## [109] "United States" "United States" "United States" "United States"
## [113] "United States" "United States" "United States" "United States"
## [117] "United States" "United States" "United States" "United States"
## [121] "United States" "United States" "United States" "United States"
## [125] "United States" "United States" "United States" "United States"
## [129] "United States" "United States" "United States" "United States"
## [133] "United States" "United States" "United States" "United States"
## [137] "United States" "United States" "United States" "United States"
## [141] "United States" "United States" "United States" "United States"
## [145] "United States" "United States" "United States" "United States"
## [149] "United States" "United States" "United States" "United States"
## [153] "United States" "United States" "United States" "United States"
## [157] "United States" "United States" "United States" "United States"
## [161] "United States" "United States" "United States" "United States"
## [165] "United States" "United States" "United States" "United States"
## [169] "United States" "United States" "United States" "United States"
## [173] "United States" "United States" "United States" "United States"
## [177] "United States" "United States" "United States" "United States"
## [181] "United States" "United States" "United States" "United States"
## [185] "United States" "United States" "United States" "United States"
## [189] "United States" "United States" "United States" "United States"
## [193] "United States" "United States" "United States" "United States"
## [197] "United States" "United States" "United States" "United States"
##  [ reached getOption("max.print") -- omitted 1022 entries ]
countrydf %>% pull(address) %>% pull(country)
##   [1] "United States" "United States" "United States" "United States"
##   [5] "United States" "United States" "United States" "United States"
##   [9] "United States" "United States" "United States" "United States"
##  [13] "United States" "United States" "United States" "United States"
##  [17] "United States" "United States" "United States" "United States"
##  [21] "United States" "United States" "United States" "United States"
##  [25] "United States" "United States" "United States" "United States"
##  [29] "United States" "United States" "United States" "United States"
##  [33] "United States" "United States" "United States" "United States"
##  [37] "United States" "United States" "United States" "United States"
##  [41] "United States" "United States" "United States" "United States"
##  [45] "United States" "United States" "United States" "United States"
##  [49] "United States" "United States" "United States" "United States"
##  [53] "United States" "United States" "United States" "United States"
##  [57] "United States" "United States" "United States" "United States"
##  [61] "United States" "United States" "United States" "United States"
##  [65] "United States" "United States" "United States" "United States"
##  [69] "United States" "United States" "United States" "United States"
##  [73] "United States" "United States" "United States" "United States"
##  [77] "United States" "United States" "United States" "United States"
##  [81] "United States" "United States" "United States" "United States"
##  [85] "United States" "United States" "United States" "United States"
##  [89] "United States" "United States" "United States" "United States"
##  [93] "United States" "United States" "United States" "United States"
##  [97] "United States" "United States" "United States" "United States"
## [101] "United States" "United States" "United States" "United States"
## [105] "United States" "United States" "United States" "United States"
## [109] "United States" "United States" "United States" "United States"
## [113] "United States" "United States" "United States" "United States"
## [117] "United States" "United States" "United States" "United States"
## [121] "United States" "United States" "United States" "United States"
## [125] "United States" "United States" "United States" "United States"
## [129] "United States" "United States" "United States" "United States"
## [133] "United States" "United States" "United States" "United States"
## [137] "United States" "United States" "United States" "United States"
## [141] "United States" "United States" "United States" "United States"
## [145] "United States" "United States" "United States" "United States"
## [149] "United States" "United States" "United States" "United States"
## [153] "United States" "United States" "United States" "United States"
## [157] "United States" "United States" "United States" "United States"
## [161] "United States" "United States" "United States" "United States"
## [165] "United States" "United States" "United States" "United States"
## [169] "United States" "United States" "United States" "United States"
## [173] "United States" "United States" "United States" "United States"
## [177] "United States" "United States" "United States" "United States"
## [181] "United States" "United States" "United States" "United States"
## [185] "United States" "United States" "United States" "United States"
## [189] "United States" "United States" "United States" "United States"
## [193] "United States" "United States" "United States" "United States"
## [197] "United States" "United States" "United States" "United States"
##  [ reached getOption("max.print") -- omitted 1022 entries ]
countrydf %>%
  mutate(country = address$country, address = NULL)
countrydf %>% 
  bind_cols(countrydf$address) %>% 
  select(-address)

Query an array

# find all documents where amenities contain both Oven and Refrigerator
m$find('{"amenities": {"$all": ["Oven", "Refrigerator"]}}',
       fields = '{"name": true, "amenities": true}')

PS: you could also use the $and operator.

# find all documents where amenities contain both Oven and Refrigerator
m$find('{"$and": [{"amenities": "Oven"}, {"amenities": "Refrigerator"}]}',
       fields = '{"name": true, "amenities": true}')
# find all documents where amenities contain Oven or Refrigerator or both
m$find('{"amenities": {"$in": ["Oven", "Refrigerator"]}}',
       fields = '{"name": true, "amenities": true}')

PS: you could also use the $or operator.

# find all documents where amenities contain both Oven and Refrigerator
m$find('{"$or": [{"amenities": "Oven"}, {"amenities": "Refrigerator"}]}',
       fields = '{"name": true, "amenities": true}')

Regular expressions (come back to this later)

m$find('{"name": {"$regex": "\\\\bsea\\\\b", "$options": "i"}}', fields = '{"name": true}')

Iteration

m$find('{"price": {"$lt": 12}}')
it <- m$iterate('{"price": {"$lt": 12}}')
while (TRUE) {
  x <- it$one()
  if (is.null(x)) break
  cat(x$name, "\n")
}
## Room on spacious appartment 
## Cómoda Habitación L'Eixample, Gracia 
## Private room with sunny terrace of 200m2. 6 
## Near the RAMBLA, the double room at  SEASIDE PORT1

By batch

it <- m$iterate('{"price": {"$lt": 12}}')
res <- it$batch(size = 2)

Examine raw JSON

it <- m$iterate('{"price": {"$lt": 12}}')
j <- it$json(size = 1)

Aggregate

The aggregation pipeline is a framework for data aggregation modeled on the concept of data processing pipelines.

https://docs.mongodb.com/manual/reference/operator/aggregation-pipeline/

Group

To illustrate the usage of a pipeline, consider the following three stages

  • filter the data
  • group by and summerize
m$aggregate('[
  {"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
  {"$group": { "_id": "$bedrooms", "avg_price": { "$avg": "$price" }}}
]')

User the $project stage to rename _id.

m$aggregate('[
  {"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
  {"$group": { "_id": "$bedrooms", "avg_price": { "$avg": "$price" }}},
  {"$project": { "_id": false, "bedrooms": "$_id", "avg_price": true }},
  {"$sort": {"bedrooms": 1}}
]')

Group by two variables

m$aggregate('[
  {"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
  {"$group": {
    "_id": {"property_type": "$property_type", "bedrooms": "$bedrooms"},
    "avg_price": { "$avg": "$price" }}},
  {"$project": {
    "_id": false,
    "property_type": "$_id.property_type",
    "bedrooms": "$_id.bedrooms",
    "avg_price": true
  }},
  {"$sort": {"property_type": 1, "bedrooms": 1}}
]')

Aggregation Pipeline Operators

Compute the average price per room grouped by property_type

See: https://docs.mongodb.com/manual/reference/operator/aggregation/

Remark: our mongo server is 3.x.

m$aggregate('[
  {"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
  {"$project": {
    "property_type": "$property_type",
    "bedrooms": "$bedrooms",
    "price_per_room": {"$divide": ["$price", "$bedrooms"]}
  }},
  {"$group": {
    "_id": "$property_type",
    "avg_bedrooms": { "$avg": "$bedrooms" },
    "price_per_room": { "$avg": "$price_per_room" }}
  },
  {"$project": {
    "_id": false,
    "property_type": "$_id",
    "avg_bedrooms": true,
    "price_per_room": true
  }},
  {"$sort": {"property_type": 1}}
]')

Unwind an array

$unwind has a similar effect as tidyverse’s unnest_longer.

df <- tibble(
  id = c("a", "b"),
  v = list(1:3, 5:6),
  w = c("apple", "banana")
)
df %>% unnest_longer(v)

To query all the reviews of “Ribeira Charming Duplex”.

m$aggregate('[
  {"$match": {"name": "Ribeira Charming Duplex"}},
  {"$project": {
    "name": true,
    "reviews": true
  }},
  {"$unwind": "$reviews"},
  {"$project": {
    "_id": false,
    "name": true,
    "reviewer_name": "$reviews.reviewer_name",
    "comments": "$reviews.comments"
  }}
]')

Iteration

it <- m$aggregate('[
  {"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
  {"$group": { "_id": "$bedrooms", "avg_price": { "$avg": "$price" }}},
  {"$project": { "_id": false, "bedrooms": "$_id", "avg_price": true }},
  {"$sort": {"bedrooms": 1}}
]', iterate = TRUE)
it$batch(size = 2)
## [[1]]
## [[1]]$avg_price
## [1] 208.7122
## 
## [[1]]$bedrooms
## [1] 1
## 
## 
## [[2]]
## [[2]]$avg_price
## [1] 315.5752
## 
## [[2]]$bedrooms
## [1] 2