Update codebase #84

Open
wants to merge 9 commits into develop
1 change: 1 addition & 0 deletions setup.py
@@ -17,6 +17,7 @@
"matplotlib>=3.6.0",
"plotly>=5.10.0",
"implicit==0.6.2",
"tqdm==4.65.0"
],
)

19 changes: 9 additions & 10 deletions src/rsdiv/aggregation/rank_product.py
@@ -1,5 +1,4 @@
from typing import Optional

import numpy as np
import pandas as pd

@@ -30,12 +29,12 @@ def get_rp_values(self, weights: Optional[np.ndarray] = None) -> np.ndarray:
The rank product scores for each item; the larger, the more preferable.
Laplacian smoothing is applied to avoid zeros.
"""
num_weights: int = len(self.multi_scores)
if weights:
power = weights.reshape(num_weights, -1)
else:
power = np.ones([num_weights, 1])
ranks: np.ndarray = np.argsort(np.argsort(self.multi_scores)) + 1
ranks = ranks**power
rp_scores = np.asarray(ranks.prod(axis=0) ** (1.0 / power.sum()))
return rp_scores
if weights is None:
weights = np.ones(len(self.multi_scores.columns))

ranks = self.multi_scores.rank(method="min") ** weights
rp_scores = np.prod(ranks, axis=1) ** (1.0 / weights.sum())

return rp_scores.values
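
A quick sanity check of the reworked computation, as a hedged sketch assuming multi_scores is an items-by-metrics DataFrame (the toy column names below are illustrative, not from the codebase):

import numpy as np
import pandas as pd

multi_scores = pd.DataFrame(
    {"relevance": [0.9, 0.4, 0.7], "novelty": [0.2, 0.8, 0.5]}  # toy metrics
)
weights = np.ones(len(multi_scores.columns))  # uniform weights, the default path

ranks = multi_scores.rank(method="min") ** weights  # per-metric ranks start at 1
rp_scores = np.prod(ranks, axis=1) ** (1.0 / weights.sum())  # geometric mean of ranks
print(rp_scores.values)  # ~[1.73 1.73 2.0]; the balanced item 2 scores highest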
23 changes: 11 additions & 12 deletions src/rsdiv/dataset/base.py
@@ -1,9 +1,8 @@
import os
import zipfile
from abc import ABCMeta
from pathlib import Path
from typing import Optional, Union
from urllib.request import urlretrieve
import zipfile


class BaseDownloader(metaclass=ABCMeta):
@@ -13,18 +12,18 @@ class BaseDownloader(metaclass=ABCMeta):
DEFAULT_PATH: str

def __init__(self, zip_path: Optional[Union[Path, str]] = None):
if zip_path is None:
zip_path = self.DEFAULT_PATH
else:
zip_path = zip_path
self.zip_path = Path(zip_path)
self.zip_path = Path(zip_path or self.DEFAULT_PATH)
if not self.zip_path.exists():
self._retrieve()

def _retrieve(self) -> None:
url: str = self.DOWNLOAD_URL
file_name: str = str(self.zip_path) + ".zip"
urlretrieve(url, filename=file_name)
with zipfile.ZipFile(file_name) as zf:
if self.zip_path.exists():
return

zip_file_name = self.zip_path.with_suffix(".zip")
urlretrieve(self.DOWNLOAD_URL, filename=zip_file_name)

with zipfile.ZipFile(zip_file_name) as zf:
zf.extractall(self.zip_path.parent)
os.remove(file_name)

zip_file_name.unlink()
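
For reviewers, a hedged sketch of how a concrete subclass exercises the refactored flow; DOWNLOAD_URL and DEFAULT_PATH here are hypothetical placeholders, not values from this repository:

from rsdiv.dataset.base import BaseDownloader

class ToyDownloader(BaseDownloader):
    DOWNLOAD_URL = "http://example.com/toy.zip"  # hypothetical archive URL
    DEFAULT_PATH = "toy"                         # extraction target directory

# ToyDownloader() checks ./toy first; only if it is missing does it fetch
# toy.zip, extract it into the parent directory, and unlink the archive.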
31 changes: 17 additions & 14 deletions src/rsdiv/dataset/movielens_100k.py
@@ -1,7 +1,7 @@
import os
from typing import List

import pandas as pd
from pathlib import Path
from typing import List
from functools import lru_cache

from .base import BaseDownloader

@@ -10,38 +10,39 @@ class MovieLens100KDownLoader(BaseDownloader):
"""MovieLens dataset downLoader for 100K interactions."""

DOWNLOAD_URL: str = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
DEFAULT_PATH: str = os.path.join(os.getcwd(), "ml-100k")
DEFAULT_PATH: str = str(Path.cwd() / "ml-100k")

@lru_cache(maxsize=1)
def read_ratings(self) -> pd.DataFrame:
ratings_path: str = os.path.join(self.DEFAULT_PATH, "u.data")
ratings_path: Path = Path(self.DEFAULT_PATH) / "u.data"
df_ratings: pd.DataFrame = pd.read_csv(
ratings_path, sep="\t", header=None, engine="python"
).copy()
)
df_ratings.columns = pd.Index(["userId", "movieId", "rating", "timestamp"])
df_ratings["timestamp"] = pd.to_datetime(df_ratings.timestamp, unit="s")

return df_ratings

@lru_cache(maxsize=1)
def read_users(self) -> pd.DataFrame:
users_path: str = os.path.join(self.DEFAULT_PATH, "u.user")
users_path: Path = Path(self.DEFAULT_PATH) / "u.user"
df_users: pd.DataFrame = pd.read_csv(
users_path,
sep="|",
header=None,
engine="python",
names=["userId", "age", "gender", "occupation", "zipcode"],
)

return df_users[["userId", "gender", "age", "occupation", "zipcode"]]
return df_users

def _read_genres(self) -> List[str]:
genres_path: str = os.path.join(self.DEFAULT_PATH, "u.genre")
with open(genres_path, "r") as outfile:
genres_path: Path = Path(self.DEFAULT_PATH) / "u.genre"
with genres_path.open() as outfile:
genres = outfile.read()
return [pair.split("|")[0] for pair in genres.split("\n")][:-2]

@lru_cache(maxsize=1)
def read_items(self) -> pd.DataFrame:
movies_path: str = os.path.join(self.DEFAULT_PATH, "u.item")
movies_path: Path = Path(self.DEFAULT_PATH) / "u.item"
genres: List[str] = self._read_genres()
df_items: pd.DataFrame = pd.read_csv(
movies_path,
@@ -53,7 +54,9 @@ def read_items(self) -> pd.DataFrame:
+ genres,
)
df_items["title"] = df_items["title"].str[:-7]
df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
df_items["title"] = df_items["title"].apply(
lambda x: x.split(",")[0] if "," in x else x
)
df_items["release_date"] = pd.to_datetime(df_items.release_date)
df_items["genres"] = df_items[genres] @ (df_items[genres].columns + "|")
df_items["genres"] = df_items["genres"].apply(lambda x: x[:-1].split("|"))
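
A short usage sketch of the updated loader, assuming the module path matches the file path in this diff:

from rsdiv.dataset.movielens_100k import MovieLens100KDownLoader

loader = MovieLens100KDownLoader()  # fetches and extracts ml-100k.zip on first use
ratings = loader.read_ratings()     # cached after the first call (lru_cache)
users = loader.read_users()
items = loader.read_items()
print(ratings.dtypes)  # timestamp is already datetime64[ns]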
24 changes: 13 additions & 11 deletions src/rsdiv/dataset/movielens_1m.py
@@ -1,6 +1,6 @@
import os

import pandas as pd
from pathlib import Path
from functools import lru_cache

from .base import BaseDownloader

@@ -9,32 +9,33 @@ class MovieLens1MDownLoader(BaseDownloader):
"""MovieLens dataset downLoader for 1M interactions."""

DOWNLOAD_URL: str = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
DEFAULT_PATH: str = os.path.join(os.getcwd(), "ml-1m")
DEFAULT_PATH: str = str(Path.cwd() / "ml-1m")

@lru_cache(maxsize=1)
def read_ratings(self) -> pd.DataFrame:
ratings_path: str = os.path.join(self.DEFAULT_PATH, "ratings.dat")
ratings_path: Path = Path(self.DEFAULT_PATH) / "ratings.dat"
df_ratings: pd.DataFrame = pd.read_csv(
ratings_path, sep="::", header=None, engine="python"
).copy()
)
df_ratings.columns = pd.Index(["userId", "movieId", "rating", "timestamp"])
df_ratings["timestamp"] = pd.to_datetime(df_ratings.timestamp, unit="s")

return df_ratings

@lru_cache(maxsize=1)
def read_users(self) -> pd.DataFrame:
users_path: str = os.path.join(self.DEFAULT_PATH, "users.dat")
users_path: Path = Path(self.DEFAULT_PATH) / "users.dat"
df_users: pd.DataFrame = pd.read_csv(
users_path,
sep="::",
header=None,
engine="python",
names=["userId", "gender", "age", "occupation", "zipcode"],
)

return df_users

@lru_cache(maxsize=1)
def read_items(self) -> pd.DataFrame:
movies_path: str = os.path.join(self.DEFAULT_PATH, "movies.dat")
movies_path: Path = Path(self.DEFAULT_PATH) / "movies.dat"
df_items: pd.DataFrame = pd.read_csv(
movies_path,
sep="::",
@@ -45,7 +46,8 @@ def read_items(self) -> pd.DataFrame:
)
df_items["release_date"] = df_items["title"].str[-5:-1].astype("int")
df_items["title"] = df_items["title"].str[:-7]
df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
df_items["title"] = df_items["title"].apply(
lambda x: x.split(",")[0] if "," in x else x
)
df_items["genres"] = df_items["genres"].apply(lambda x: x.split("|"))

return df_items
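
One detail worth checking in review: the slice-based year extraction assumes every title ends with " (YYYY)", per the MovieLens dump convention. A minimal illustration:

import pandas as pd

titles = pd.Series(["Toy Story (1995)", "Heat (1995)"])
print(titles.str[-5:-1].astype("int").tolist())  # [1995, 1995]
print(titles.str[:-7].tolist())  # ['Toy Story', 'Heat']: " (YYYY)" stripped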
25 changes: 14 additions & 11 deletions src/rsdiv/diversity/mmr.py
@@ -23,27 +23,30 @@ def rerank(
) -> Sequence[int]:
assert k > 0, "k must be larger than 0!"
n = quality_scores.shape[0]
if k >= n:
return list(range(n))

k = min(k, n)
new_selection = np.argmax(quality_scores).item()
quality_scores = ma.array(quality_scores, mask=False)
new_selection = quality_scores.argmax().item()
selected_ind = [new_selection]
ma_similarity_scores = ma.array(similarity_scores, mask=True)

ma_similarity_scores = ma.array(similarity_scores, mask=True)
ma_similarity_scores.mask[:, new_selection] = False
ma_similarity_scores[new_selection, new_selection] = ma.masked

quality_scores = ma.array(quality_scores)
quality_scores[new_selection] = ma.masked

for _ in range(k - 1):
scores = self.lbd * quality_scores - (1.0 - self.lbd) * np.max(
ma_similarity_scores, axis=1
ma_similarity_scores[new_selection] = ma.masked
max_similarity_scores = ma_similarity_scores.max(axis=1)
scores = (
self.lbd * quality_scores - (1.0 - self.lbd) * max_similarity_scores
)
new_selection = np.argmax(scores).item()
quality_scores[new_selection] = ma.masked

new_selection = scores.argmax().item()
selected_ind.append(new_selection)

quality_scores[new_selection] = ma.masked
ma_similarity_scores.mask[:, new_selection] = False
ma_similarity_scores[new_selection, :] = ma.masked
ma_similarity_scores[selected_ind, new_selection] = ma.masked

selected_ind.append(new_selection)
return selected_ind
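
For reference, the selection rule the masked-array code implements is score(i) = lbd * quality[i] - (1 - lbd) * max over selected j of sim[i, j]. A plain-loop sketch (hedged; not the class API) that should agree with the rerank above:

import numpy as np

def mmr(quality, sim, k, lbd=0.5):
    selected = [int(np.argmax(quality))]  # seed with the highest-quality item
    candidates = set(range(len(quality))) - set(selected)
    while len(selected) < min(k, len(quality)):
        def score(i):  # relevance minus redundancy w.r.t. the current selection
            return lbd * quality[i] - (1 - lbd) * max(sim[i, j] for j in selected)
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

quality = np.array([0.9, 0.8, 0.7])
sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
print(mmr(quality, sim, k=2))  # [0, 2]: item 2 is dissimilar to item 0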
27 changes: 16 additions & 11 deletions src/rsdiv/diversity/ssd.py
@@ -14,6 +14,16 @@ def __init__(self, gamma: float):
assert gamma >= 0, "gamma should be >= 0!"
self.gamma = gamma

def _adjust_embeddings(self, embeddings: np.ndarray, selected_emb: np.ndarray):
selected_norm = norm(selected_emb)
if selected_norm > 1e-7: # treat new selection as 0 vector if it's too small
selected_emb /= selected_norm
np.subtract(
embeddings,
np.outer(embeddings @ selected_emb, selected_emb),
out=embeddings,
)

def rerank(
self,
quality_scores: np.ndarray,
@@ -24,22 +34,17 @@
inplace: bool = False
) -> Sequence[int]:
assert k > 0, "k must be larger than 0!"
if not inplace:
embeddings = embeddings.copy()
selection = np.argmax(quality_scores).item()
ret = [selection]
volume = self.gamma * norm(embeddings[selection])
norms = norm(embeddings, axis=1)
for _ in range(k - 1):
selected_emb = embeddings[selection]
selected_norm = norm(selected_emb)
if (
selected_norm > 1e-7
): # treat new selection as 0 vector if it's too small
selected_emb /= selected_norm
embeddings -= np.outer(embeddings @ selected_emb, selected_emb)
norms = norm(embeddings, axis=1)
norms *= volume
scores = norms + quality_scores
self._adjust_embeddings(embeddings, selected_emb)
norms = norm(embeddings, axis=1) # update norms after adjusting embeddings
scaled_norms = norms.copy()
scaled_norms *= volume
scores = scaled_norms + quality_scores
scores[ret] = -np.inf
selection = np.argmax(scores).item()
ret.append(selection)
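
The extracted helper performs a Gram-Schmidt-style projection: every remaining embedding loses its component along the newly selected (unit-normalized) embedding, so the residual norms measure the volume an item would still add. A numeric sketch:

import numpy as np
from numpy.linalg import norm

embeddings = np.array([[1.0, 1.0], [2.0, 0.0]])
selected = embeddings[1] / norm(embeddings[1])   # unit vector [1, 0]
embeddings -= np.outer(embeddings @ selected, selected)
print(embeddings)  # [[0. 1.] [0. 0.]]: only components orthogonal to [1, 0] remain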
21 changes: 3 additions & 18 deletions src/rsdiv/embedding/fasttext_embedding.py
@@ -16,19 +16,6 @@ class FastTextEmbedder(BaseEmbedder):
if EMB_PATH:
MAPPER: Dict[str, np.ndarray] = pkl.loads(EMB_PATH)

@classmethod
def _l2_norm(cls, vector: np.ndarray) -> float:
"""Internal method to calculate the normalize of a given vector.

Args:
vector (np.ndarray): target vector to be normalized.

Returns:
float: the l2 norm value for the given vector.
"""
norm_val: float = np.sqrt(np.sum(vector**2))
return norm_val

@classmethod
def embedding_norm(cls, org: str) -> np.ndarray:
"""Normalize a given vector.
@@ -40,9 +27,9 @@ def embedding_norm(cls, org: str) -> np.ndarray:
np.ndarray: normalized vector.
"""
vector: np.ndarray = cls.embedding_single(org)
norm_val: float = cls._l2_norm(vector)
norm_val: float = np.linalg.norm(vector)
if norm_val:
embed: np.ndarray = vector * (1.0 / norm_val)
embed: np.ndarray = vector / norm_val
return embed
else:
return vector
@@ -57,6 +44,4 @@ def embedding_list(cls, org: List[str]) -> np.ndarray:
Returns:
np.ndarray: normalized vector.
"""
emb_list: np.ndarray = np.asarray([cls.embedding_norm(item) for item in org])
emb_norm: np.ndarray = np.mean(emb_list, axis=0)
return emb_norm
return np.mean([cls.embedding_norm(item) for item in org], axis=0)
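
A small check of the normalize-then-average behavior, with numpy stand-ins for the fastText lookup (embedding_single is not invoked here):

import numpy as np

def embedding_norm(vector: np.ndarray) -> np.ndarray:
    norm_val = float(np.linalg.norm(vector))
    return vector / norm_val if norm_val else vector  # zero vectors pass through

vectors = [np.array([3.0, 4.0]), np.array([0.0, 2.0])]
print(np.mean([embedding_norm(v) for v in vectors], axis=0))  # [0.3 0.9]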
9 changes: 4 additions & 5 deletions src/rsdiv/encoding/geo_encoder.py
@@ -22,7 +22,7 @@ def __init__(self) -> None:
super().__init__()
self.encoder, self.geo_county_dict = self.read_source()
self.coord: List[np.ndarray] = self.encoder.coord.to_list()
self.index: pd.Index = pd.Index(self.encoder["index"])
self.tree = spatial.KDTree(self.coord)

def read_source(self) -> Tuple[pd.DataFrame, Dict]:
"""Parse the location information from `geojson-counties-fips.json`.
@@ -43,7 +43,7 @@ def read_source(self) -> Tuple[pd.DataFrame, Dict]:
geo_county_dict[id] = [coord, name, lsad]
dataframe = pd.DataFrame.from_dict(
geo_county_dict, orient="index", columns=["coord", "name", "lstd"]
).reset_index()
).rename_axis("index")
return (dataframe, geo_county_dict)

def encoding_single(self, org: Union[List, str]) -> Union[int, str]:
@@ -55,8 +55,7 @@
Returns:
Union[int, str]: the coding corresponds to the given target.
"""
tree = spatial.KDTree(self.coord)
return str(self.index[int(tree.query(org)[1])])
return str(self.encoder.index[int(self.tree.query(org)[1])])

def encoding_series(self, series: pd.Series) -> pd.Series:
"""Encoding for the series of locations.
@@ -67,7 +66,7 @@
Returns:
pd.Series: the corresponding series of locations.
"""
encodings = pd.Series(series.apply(lambda x: self.encoding_single(x)))
encodings = series.apply(self.encoding_single)
return encodings

def draw_geo_graph(
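
Building the KDTree once in __init__ and reusing it means each encoding_single call is a single nearest-neighbor query instead of a full tree rebuild. A toy illustration with made-up centroids (not real FIPS data):

import numpy as np
from scipy import spatial

coords = np.array([[40.0, -74.0], [34.0, -118.0], [41.9, -87.6]])  # toy centroids
tree = spatial.KDTree(coords)          # built once, as in the new __init__
dist, idx = tree.query([39.9, -75.2])  # nearest centroid to the query point
print(idx)  # 0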