-
Notifications
You must be signed in to change notification settings - Fork 34
/
MovieLensSpec.scala
228 lines (194 loc) · 7.76 KB
/
MovieLensSpec.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import gremlin.scala._
import org.apache.tinkerpop.gremlin.structure.io.IoCore.gryo
import org.apache.tinkerpop.gremlin.tinkergraph.structure.TinkerGraph
import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.__
import org.apache.tinkerpop.gremlin.process.traversal.{Order, P, Scope}
import java.lang.{Long ⇒ JLong, Double ⇒ JDouble}
import java.util.{Map ⇒ JMap}
import scala.collection.JavaConversions._
import org.scalatest.{Matchers, WordSpec}
import shapeless.HNil
/**
* Examples to traverse the MovieLens graph.
* [Discussion in usergroup](https://groups.google.com/forum/#!msg/gremlin-users/jtWhwFpqnng/I-juFwyVCAAJ)
* Based on [presentation](http://www.slideshare.net/slidarko/the-gremlin-traversal-language) by Marko Rodriguez and Daniel Kuppitz
*
* Differences from the traversals in their presentation:
* 1) edge label 'category' is called 'hasGenre'
* 2) edge label 'occupation' is called 'hasOccupation'
*/
class MovieLensSpec extends WordSpec with Matchers {

  // Typed property keys shared by the traversals below.
  val Name = Key[String]("name")
  val Stars = Key[Int]("stars")
  val Year = Key[Int]("year")

  // Vertex and edge labels used by the traversals below.
  object Label {
    val Person = "person"
    val Movie = "movie"
    val Occupation = "occupation"
    val HasOccupation = "hasOccupation"
    val Genre = "genre"
    val HasGenre = "hasGenre"
    val Rated = "rated"
  }
  import Label._

  // Graph under test: an in-memory TinkerGraph filled from the Gryo file in the
  // test resources. The path is relative, so the working directory matters.
  val g: ScalaGraph = {
    val graph = TinkerGraph.open
    graph.io(gryo()).readGraph("src/test/resources/movie-lens.kryo")
    graph.asScala
  }

  "Get the vertex and edge counts for the graph" in {
    g.V.count.head shouldBe 9625
    g.E.count.head shouldBe 969719
  }

  "What is Die Hard's average rating?" in {
    val avgRating: JDouble =
      g.V.has(Movie, Name, "Die Hard")
        .inE(Rated)
        .value(Stars)
        .mean
        .head

    // Compare the value rounded to two decimals. Locale.ROOT pins the decimal
    // separator to '.', so the check does not depend on the JVM default locale.
    "%.2f".formatLocal(java.util.Locale.ROOT, avgRating) shouldBe "4.12"
  }

  "For each vertex, emit its label, then group and count each distinct label" in {
    val groupCount: JMap[String, JLong] =
      g.V.label.groupCount.head

    groupCount.get(Occupation) shouldBe 21
    groupCount.get(Movie) shouldBe 3546
    groupCount.get(Person) shouldBe 6040
    groupCount.get(Genre) shouldBe 18
  }

  "For each rated-edge, emit its stars property value and compute the average value" in {
    val meanStars: JDouble =
      g.E.hasLabel(Rated)
        .value(Stars).mean
        .head

    // Locale.ROOT keeps the formatted text independent of the JVM default locale.
    "%.2f".formatLocal(java.util.Locale.ROOT, meanStars) shouldBe "3.57"
  }

  "Get the maximum number of movies a single user rated" in {
    // One count of outgoing `rated` edges per person; the maximum is taken afterwards.
    val ratedCounts: GremlinScala[Long] =
      for {
        person <- g.V.hasLabel(Person)
        count <- person.outE(Rated).count
      } yield count

    ratedCounts.max.head shouldBe 2161
  }

  "What year was the oldest movie made?" in {
    val min: Integer =
      g.V.hasLabel(Movie)
        .value(Year).min
        .head

    min shouldBe 1919
  }

  "For each vertex that is labeled 'genre', emit the name property value of that vertex" in {
    val categories: Set[String] =
      g.V.hasLabel(Genre)
        .value(Name)
        .toSet

    categories should contain("Animation")
    categories should contain("Thriller")
    categories should have size 18
  }

  "For each genre vertex, emit a map of its name and the number of movies it represents" in {
    // Pairs each genre name with the number of incoming `hasGenre` edges.
    val traversal = for {
      genre <- g.V.hasLabel(Genre)
      count <- genre.start.inE(HasGenre).count
    } yield (genre.value2(Name), count)

    val genreMovieCounts = traversal.toMap
    genreMovieCounts should have size 18
    genreMovieCounts should contain("Animation" -> 99)
    genreMovieCounts should contain("Drama" -> 1405)
  }

  "For each movie, get its name and mean rating (or 0 if no ratings). Order by average rating and emit top 10." in {
    // NOTE: the title mentions ordering and a top 10, but this traversal only
    // collects (name, mean stars) pairs into a map and spot-checks two entries.
    val traversal = for {
      movie <- g.V.hasLabel(Movie)
      stars <- movie.start.inE(Rated).value(Stars).mean
    } yield (movie.value2(Name), stars)

    val avgRatings = traversal.toMap
    avgRatings should contain("Lured" -> 5)
    avgRatings should contain("Lamerica" -> 4.75)
  }

  "For each movie with at least 11 ratings, emit a map of its name and average rating. " +
    "Sort the maps in decreasing order by their average rating. Emit the first 10 maps (i.e. top 10)." in {
    // `P.gt(10)` on the rating count is the "at least 11 ratings" filter.
    val movieAndRatings = for {
      movie <- g.V.hasLabel(Movie)
        .where(_.inE(Rated).count().is(P.gt(10)))
      stars <- movie.start.inE(Rated).value(Stars).mean
    } yield (movie.value2(Name), stars)

    // Order by mean rating (second tuple element), highest first, and keep ten.
    val top10 = movieAndRatings
      .orderBy(_._2, Order.decr)
      .limit(10)
      .toMap

    top10("Sanjuro").toDouble shouldBe 4.61 +- 0.1d
    top10("Rear Window").toDouble shouldBe 4.48 +- 0.1d
  }

  "Which programmers like Die Hard and what other movies do they like?" +
    "Group and count the movies by their name. Sort the group count map in decreasing order by the count." in {
    val dieHard: Vertex = g.V.has(Movie, Name, "Die Hard").head

    // Programmers who gave Die Hard five stars, then every other movie those
    // same people also gave five stars.
    val popularMovies = for {
      programmer5Stars <- dieHard.start.inE(Rated).has(Stars, 5).outV
        .where(_.out(HasOccupation).has(Name, "programmer"))
      otherMovie <- programmer5Stars.start.outE(Rated).has(Stars, 5).inV
        .filter(_.is(P.neq(dieHard)))
    } yield otherMovie.value2(Name)

    // The assertions look movies up by name, so the counts are not sorted here.
    val counts: Map[String, JLong] = popularMovies.groupCount.head.toMap

    counts("Braveheart") shouldBe 24
    counts("Star Wars: Episode V - The Empire Strikes Back") shouldBe 36
  }

  "What 80's action movies do 30-something programmers like?" +
    "Group count the movies by their name and sort the group count map in decreasing order by value." in {
    // Pattern match: "a" is the movie, "b" a five-star `rated` edge pointing at it,
    // and "c" the person who gave that rating.
    val counts: Map[String, JLong] =
      g.V
        .`match`(
          __.as("a").hasLabel(Movie),
          __.as("a").out(HasGenre).has("name", "Action"),
          __.as("a").has("year", P.between(1980, 1990)),
          __.as("a").inE(Rated).as("b"),
          __.as("b").has("stars", 5),
          __.as("b").outV().as("c"),
          __.as("c").out(HasOccupation).has("name", "programmer"),
          __.as("c").has("age", P.between(30, 40))
        )
        .select[Vertex]("a")
        .map(_.value[String]("name"))
        .groupCount().head.toMap

    // Keep the ten movies with the highest counts.
    val top10 = counts.toList.sortBy(_._2).reverse.take(10).toMap

    top10("Raiders of the Lost Ark") shouldBe 26
    top10("Star Wars: Episode V - The Empire Strikes Back") shouldBe 26
    top10("Terminator, The") shouldBe 23
    top10("Star Wars: Episode VI - Return of the Jedi") shouldBe 22
    top10("Princess Bride, The") shouldBe 19
    top10("Aliens") shouldBe 18
    top10("Indiana Jones and the Last Crusade") shouldBe 11
    top10("Star Trek: The Wrath of Khan") shouldBe 10
    top10("Abyss, The") shouldBe 9
  }

  "What is the most liked movie in each decade?" in {
    // Maps the first year of each decade to the title of the movie with the
    // highest mean rating. Only movies with more than 10 ratings are considered.
    val mostLikedByDecade: JMap[Integer, String] = g.V()
      .hasLabel(Movie)
      .where(_.inE(Rated).count().is(P.gt(10)))
      .groupBy { movie =>
        // Integer division drops the last digit of the year, e.g. 1987 -> 1980.
        val year = movie.value2(Year)
        val decade = (year / 10)
        (decade * 10): Integer
      }
      .map { moviesByDecade =>
        val highestRatedByDecade = moviesByDecade.mapValues { movies =>
          movies.toList
            .sortBy { _.inE(Rated).value(Stars).mean().head }
            .reverse.head //get the movie with the highest mean rating
        }
        highestRatedByDecade.mapValues(_.value2(Name))
      }
      .head

    mostLikedByDecade.get(1910) shouldBe "Daddy Long Legs"
    mostLikedByDecade.get(1920) shouldBe "General, The"
    mostLikedByDecade.get(1930) shouldBe "City Lights"
    mostLikedByDecade.get(1940) shouldBe "Third Man, The"
    mostLikedByDecade.get(1960) shouldBe "Sanjuro"
    mostLikedByDecade.get(1970) shouldBe "Godfather, The"
    mostLikedByDecade.get(1980) shouldBe "Raiders of the Lost Ark"
    mostLikedByDecade.get(1990) shouldBe "Shawshank Redemption, The"
    mostLikedByDecade.get(2000) shouldBe "Almost Famous"
  }
}