The most common type of nosql databases is document database.
Some references:
MongoDB is the most popular NOSQL database.
https://docs.atlas.mongodb.com/sample-data/sample-airbnb/
To connect to this MongoDB, you need to either on the campus network or connect via UCDavis VPN.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.1 ✔ purrr 0.3.4
## ✔ tibble 3.0.3 ✔ dplyr 1.0.2
## ✔ tidyr 1.1.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(mongolite)
m <- mongo("airbnb", db = "data", url = "mongodb://mongouser:secret@alan.ucdavis.edu/data")
# total number of documents
m$count()
## [1] 5555
# total number of documents satisfying a condition
m$count('{"price": {"$lt": 20}}')
## [1] 63
Query Documents https://docs.mongodb.com/manual/tutorial/query-documents/
# find the first document
first_doc <- m$find("{}", limit = 1)
first_doc %>% glimpse()
## Rows: 1
## Columns: 38
## $ listing_url <chr> "https://www.airbnb.com/rooms/10006546"
## $ name <chr> "Ribeira Charming Duplex"
## $ summary <chr> "Fantastic duplex apartment with three bedrooms…
## $ space <chr> "Privileged views of the Douro River and Ribeir…
## $ description <chr> "Fantastic duplex apartment with three bedrooms…
## $ neighborhood_overview <chr> "In the neighborhood of the river, you can find…
## $ notes <chr> "Lose yourself in the narrow streets and stairc…
## $ transit <chr> "Transport: • Metro station and S. Bento railwa…
## $ access <chr> "We are always available to help guests. The ho…
## $ interaction <chr> "Cot - 10 € / night Dog - € 7,5 / night"
## $ house_rules <chr> "Make the house your home..."
## $ property_type <chr> "House"
## $ room_type <chr> "Entire home/apt"
## $ bed_type <chr> "Real Bed"
## $ minimum_nights <chr> "2"
## $ maximum_nights <chr> "30"
## $ cancellation_policy <chr> "moderate"
## $ last_scraped <dttm> 2019-02-15 21:00:00
## $ calendar_last_scraped <dttm> 2019-02-15 21:00:00
## $ first_review <dttm> 2016-01-02 21:00:00
## $ last_review <dttm> 2019-01-19 21:00:00
## $ accommodates <int> 8
## $ bedrooms <int> 3
## $ beds <int> 5
## $ number_of_reviews <int> 51
## $ bathrooms <dbl> 1
## $ amenities <list> [<"TV", "Cable TV", "Wifi", "Kitchen", "Paid p…
## $ price <dbl> 80
## $ security_deposit <dbl> 200
## $ cleaning_fee <dbl> 35
## $ extra_people <dbl> 15
## $ guests_included <dbl> 6
## $ images <df[,4]> <data.frame[1 x 4]>
## $ host <df[,16]> <data.frame[1 x 16]>
## $ address <df[,7]> <data.frame[1 x 7]>
## $ availability <df[,4]> <data.frame[1 x 4]>
## $ review_scores <df[,7]> <data.frame[1 x 7]>
## $ reviews <list> [<data.frame[51 x 6]>]
# filter fields
m$find("{}", fields = '{"name": true}')
# filter keys
m$find('{"price": {"$lt": 20}}', fields = '{"name": true, "price": true}')
Check for all the operators: https://docs.mongodb.com/manual/reference/operator/query-comparison/#query-selectors-comparison
m$find('{"price": {"$lt": 50}, "bedrooms": {"$gte": 3}, "amenities": "Wifi"}',
fields = '{"name": true, "price": true, "bedrooms": true}'
)
Remark: As amenities
is an array, "amenities": "Wifi"
means Wifi is one of the amenities. On the other hand, "amenities": ["Wifi"]
means Wifi is the only amenity.
# sort
m$find('{"bedrooms": {"$gte": 4}}',
fields = '{"name": true, "price": true}',
sort = '{"price": -1}',
limit = 5
)
countrydf <- m$find(
'{"address.country_code": "US"}',
fields = '{"name": true, "address.country": true}'
)
countrydf
countrydf$address$country
## [1] "United States" "United States" "United States" "United States"
## [5] "United States" "United States" "United States" "United States"
## [9] "United States" "United States" "United States" "United States"
## [13] "United States" "United States" "United States" "United States"
## [17] "United States" "United States" "United States" "United States"
## [21] "United States" "United States" "United States" "United States"
## [25] "United States" "United States" "United States" "United States"
## [29] "United States" "United States" "United States" "United States"
## [33] "United States" "United States" "United States" "United States"
## [37] "United States" "United States" "United States" "United States"
## [41] "United States" "United States" "United States" "United States"
## [45] "United States" "United States" "United States" "United States"
## [49] "United States" "United States" "United States" "United States"
## [53] "United States" "United States" "United States" "United States"
## [57] "United States" "United States" "United States" "United States"
## [61] "United States" "United States" "United States" "United States"
## [65] "United States" "United States" "United States" "United States"
## [69] "United States" "United States" "United States" "United States"
## [73] "United States" "United States" "United States" "United States"
## [77] "United States" "United States" "United States" "United States"
## [81] "United States" "United States" "United States" "United States"
## [85] "United States" "United States" "United States" "United States"
## [89] "United States" "United States" "United States" "United States"
## [93] "United States" "United States" "United States" "United States"
## [97] "United States" "United States" "United States" "United States"
## [101] "United States" "United States" "United States" "United States"
## [105] "United States" "United States" "United States" "United States"
## [109] "United States" "United States" "United States" "United States"
## [113] "United States" "United States" "United States" "United States"
## [117] "United States" "United States" "United States" "United States"
## [121] "United States" "United States" "United States" "United States"
## [125] "United States" "United States" "United States" "United States"
## [129] "United States" "United States" "United States" "United States"
## [133] "United States" "United States" "United States" "United States"
## [137] "United States" "United States" "United States" "United States"
## [141] "United States" "United States" "United States" "United States"
## [145] "United States" "United States" "United States" "United States"
## [149] "United States" "United States" "United States" "United States"
## [153] "United States" "United States" "United States" "United States"
## [157] "United States" "United States" "United States" "United States"
## [161] "United States" "United States" "United States" "United States"
## [165] "United States" "United States" "United States" "United States"
## [169] "United States" "United States" "United States" "United States"
## [173] "United States" "United States" "United States" "United States"
## [177] "United States" "United States" "United States" "United States"
## [181] "United States" "United States" "United States" "United States"
## [185] "United States" "United States" "United States" "United States"
## [189] "United States" "United States" "United States" "United States"
## [193] "United States" "United States" "United States" "United States"
## [197] "United States" "United States" "United States" "United States"
## [ reached getOption("max.print") -- omitted 1022 entries ]
countrydf %>% pull(address) %>% pull(country)
## [1] "United States" "United States" "United States" "United States"
## [5] "United States" "United States" "United States" "United States"
## [9] "United States" "United States" "United States" "United States"
## [13] "United States" "United States" "United States" "United States"
## [17] "United States" "United States" "United States" "United States"
## [21] "United States" "United States" "United States" "United States"
## [25] "United States" "United States" "United States" "United States"
## [29] "United States" "United States" "United States" "United States"
## [33] "United States" "United States" "United States" "United States"
## [37] "United States" "United States" "United States" "United States"
## [41] "United States" "United States" "United States" "United States"
## [45] "United States" "United States" "United States" "United States"
## [49] "United States" "United States" "United States" "United States"
## [53] "United States" "United States" "United States" "United States"
## [57] "United States" "United States" "United States" "United States"
## [61] "United States" "United States" "United States" "United States"
## [65] "United States" "United States" "United States" "United States"
## [69] "United States" "United States" "United States" "United States"
## [73] "United States" "United States" "United States" "United States"
## [77] "United States" "United States" "United States" "United States"
## [81] "United States" "United States" "United States" "United States"
## [85] "United States" "United States" "United States" "United States"
## [89] "United States" "United States" "United States" "United States"
## [93] "United States" "United States" "United States" "United States"
## [97] "United States" "United States" "United States" "United States"
## [101] "United States" "United States" "United States" "United States"
## [105] "United States" "United States" "United States" "United States"
## [109] "United States" "United States" "United States" "United States"
## [113] "United States" "United States" "United States" "United States"
## [117] "United States" "United States" "United States" "United States"
## [121] "United States" "United States" "United States" "United States"
## [125] "United States" "United States" "United States" "United States"
## [129] "United States" "United States" "United States" "United States"
## [133] "United States" "United States" "United States" "United States"
## [137] "United States" "United States" "United States" "United States"
## [141] "United States" "United States" "United States" "United States"
## [145] "United States" "United States" "United States" "United States"
## [149] "United States" "United States" "United States" "United States"
## [153] "United States" "United States" "United States" "United States"
## [157] "United States" "United States" "United States" "United States"
## [161] "United States" "United States" "United States" "United States"
## [165] "United States" "United States" "United States" "United States"
## [169] "United States" "United States" "United States" "United States"
## [173] "United States" "United States" "United States" "United States"
## [177] "United States" "United States" "United States" "United States"
## [181] "United States" "United States" "United States" "United States"
## [185] "United States" "United States" "United States" "United States"
## [189] "United States" "United States" "United States" "United States"
## [193] "United States" "United States" "United States" "United States"
## [197] "United States" "United States" "United States" "United States"
## [ reached getOption("max.print") -- omitted 1022 entries ]
countrydf %>%
mutate(country = address$country, address = NULL)
countrydf %>%
bind_cols(countrydf$address) %>%
select(-address)
# find all documents where amenities contain both Oven and Refrigerator
m$find('{"amenities": {"$all": ["Oven", "Refrigerator"]}}',
fields = '{"name": true, "amenities": true}')
PS: you could also use the $and
operator.
# find all documents where amenities contain both Oven and Refrigerator
m$find('{"$and": [{"amenities": "Oven"}, {"amenities": "Refrigerator"}]}',
fields = '{"name": true, "amenities": true}')
# find all documents where amenities contain Oven or Refrigerator or both
m$find('{"amenities": {"$in": ["Oven", "Refrigerator"]}}',
fields = '{"name": true, "amenities": true}')
PS: you could also use the $or
operator.
# find all documents where amenities contain both Oven and Refrigerator
m$find('{"$or": [{"amenities": "Oven"}, {"amenities": "Refrigerator"}]}',
fields = '{"name": true, "amenities": true}')
m$find('{"name": {"$regex": "\\\\bsea\\\\b", "$options": "i"}}', fields = '{"name": true}')
m$find('{"price": {"$lt": 12}}')
it <- m$iterate('{"price": {"$lt": 12}}')
while (TRUE) {
x <- it$one()
if (is.null(x)) break
cat(x$name, "\n")
}
## Room on spacious appartment
## Cómoda Habitación L'Eixample, Gracia
## Private room with sunny terrace of 200m2. 6
## Near the RAMBLA, the double room at SEASIDE PORT1
By batch
it <- m$iterate('{"price": {"$lt": 12}}')
res <- it$batch(size = 2)
Examine raw JSON
it <- m$iterate('{"price": {"$lt": 12}}')
j <- it$json(size = 1)
The aggregation pipeline is a framework for data aggregation modeled on the concept of data processing pipelines.
https://docs.mongodb.com/manual/reference/operator/aggregation-pipeline/
To illustrate the usage of a pipeline, consider the following three stages
m$aggregate('[
{"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
{"$group": { "_id": "$bedrooms", "avg_price": { "$avg": "$price" }}}
]')
User the $project
stage to rename _id
.
m$aggregate('[
{"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
{"$group": { "_id": "$bedrooms", "avg_price": { "$avg": "$price" }}},
{"$project": { "_id": false, "bedrooms": "$_id", "avg_price": true }},
{"$sort": {"bedrooms": 1}}
]')
Group by two variables
m$aggregate('[
{"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
{"$group": {
"_id": {"property_type": "$property_type", "bedrooms": "$bedrooms"},
"avg_price": { "$avg": "$price" }}},
{"$project": {
"_id": false,
"property_type": "$_id.property_type",
"bedrooms": "$_id.bedrooms",
"avg_price": true
}},
{"$sort": {"property_type": 1, "bedrooms": 1}}
]')
Compute the average price per room grouped by property_type
See: https://docs.mongodb.com/manual/reference/operator/aggregation/
Remark: our mongo server is 3.x.
m$aggregate('[
{"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
{"$project": {
"property_type": "$property_type",
"bedrooms": "$bedrooms",
"price_per_room": {"$divide": ["$price", "$bedrooms"]}
}},
{"$group": {
"_id": "$property_type",
"avg_bedrooms": { "$avg": "$bedrooms" },
"price_per_room": { "$avg": "$price_per_room" }}
},
{"$project": {
"_id": false,
"property_type": "$_id",
"avg_bedrooms": true,
"price_per_room": true
}},
{"$sort": {"property_type": 1}}
]')
$unwind
has a similar effect as tidyverse
’s unnest_longer
.
df <- tibble(
id = c("a", "b"),
v = list(1:3, 5:6),
w = c("apple", "banana")
)
df %>% unnest_longer(v)
To query all the reviews of “Ribeira Charming Duplex”.
m$aggregate('[
{"$match": {"name": "Ribeira Charming Duplex"}},
{"$project": {
"name": true,
"reviews": true
}},
{"$unwind": "$reviews"},
{"$project": {
"_id": false,
"name": true,
"reviewer_name": "$reviews.reviewer_name",
"comments": "$reviews.comments"
}}
]')
it <- m$aggregate('[
{"$match": {"bedrooms": {"$ne": null, "$ne": 0}}},
{"$group": { "_id": "$bedrooms", "avg_price": { "$avg": "$price" }}},
{"$project": { "_id": false, "bedrooms": "$_id", "avg_price": true }},
{"$sort": {"bedrooms": 1}}
]', iterate = TRUE)
it$batch(size = 2)
## [[1]]
## [[1]]$avg_price
## [1] 208.7122
##
## [[1]]$bedrooms
## [1] 1
##
##
## [[2]]
## [[2]]$avg_price
## [1] 315.5752
##
## [[2]]$bedrooms
## [1] 2