Update codebase #84

Open
wants to merge 9 commits into develop
1 change: 1 addition & 0 deletions setup.py
@@ -17,6 +17,7 @@
"matplotlib>=3.6.0",
"plotly>=5.10.0",
"implicit==0.6.2",
"tqdm==4.65.0"
],
)

19 changes: 9 additions & 10 deletions src/rsdiv/aggregation/rank_product.py
@@ -1,5 +1,4 @@
from typing import Optional

import numpy as np
import pandas as pd

@@ -30,12 +29,12 @@ def get_rp_values(self, weights: Optional[np.ndarray] = None) -> np.ndarray:
The rank product scores for each item; the larger, the more preferable.
Laplacian smoothing is applied to avoid zeros.
"""
num_weights: int = len(self.multi_scores)
if weights:
power = weights.reshape(num_weights, -1)
else:
power = np.ones([num_weights, 1])
ranks: np.ndarray = np.argsort(np.argsort(self.multi_scores)) + 1
ranks = ranks**power
rp_scores = np.asarray(ranks.prod(axis=0) ** (1.0 / power.sum()))
return rp_scores
if weights is None:
weights = np.ones(len(self.multi_scores.columns))

ranks = self.multi_scores.rank(method="min") ** weights
rp_scores = np.prod(ranks, axis=1) ** (1.0 / weights.sum())

return rp_scores.values
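
A quick sanity check of the reworked computation, as a hedged sketch assuming multi_scores is an items-by-metrics DataFrame (the toy column names below are illustrative, not from the codebase):

import numpy as np
import pandas as pd

multi_scores = pd.DataFrame(
    {"relevance": [0.9, 0.4, 0.7], "novelty": [0.2, 0.8, 0.5]}  # toy metrics
)
weights = np.ones(len(multi_scores.columns))  # uniform weights, the default path

ranks = multi_scores.rank(method="min") ** weights  # per-metric ranks start at 1
rp_scores = np.prod(ranks, axis=1) ** (1.0 / weights.sum())  # geometric mean of ranks
print(rp_scores.values)  # ~[1.73 1.73 2.0]; the balanced item 2 scores highest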
23 changes: 11 additions & 12 deletions src/rsdiv/dataset/base.py
@@ -1,9 +1,8 @@
import os
import zipfile
from abc import ABCMeta
from pathlib import Path
from typing import Optional, Union
from urllib.request import urlretrieve
import zipfile


class BaseDownloader(metaclass=ABCMeta):
@@ -13,18 +12,18 @@ class BaseDownloader(metaclass=ABCMeta):
DEFAULT_PATH: str

def __init__(self, zip_path: Optional[Union[Path, str]] = None):
if zip_path is None:
zip_path = self.DEFAULT_PATH
else:
zip_path = zip_path
self.zip_path = Path(zip_path)
self.zip_path = Path(zip_path or self.DEFAULT_PATH)
if not self.zip_path.exists():
self._retrieve()

def _retrieve(self) -> None:
url: str = self.DOWNLOAD_URL
file_name: str = str(self.zip_path) + ".zip"
urlretrieve(url, filename=file_name)
with zipfile.ZipFile(file_name) as zf:
if self.zip_path.exists():
return

zip_file_name = self.zip_path.with_suffix(".zip")
urlretrieve(self.DOWNLOAD_URL, filename=zip_file_name)

with zipfile.ZipFile(zip_file_name) as zf:
zf.extractall(self.zip_path.parent)
os.remove(file_name)

zip_file_name.unlink()
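
For reviewers, a hedged sketch of how a concrete subclass exercises the refactored flow; DOWNLOAD_URL and DEFAULT_PATH here are hypothetical placeholders, not values from this repository:

from rsdiv.dataset.base import BaseDownloader

class ToyDownloader(BaseDownloader):
    DOWNLOAD_URL = "http://example.com/toy.zip"  # hypothetical archive URL
    DEFAULT_PATH = "toy"                         # extraction target directory

# ToyDownloader() checks ./toy first; only if it is missing does it fetch
# toy.zip, extract it into the parent directory, and unlink the archive.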
31 changes: 17 additions & 14 deletions src/rsdiv/dataset/movielens_100k.py
@@ -1,7 +1,7 @@
import os
from typing import List

import pandas as pd
from pathlib import Path
from typing import List
from functools import lru_cache

from .base import BaseDownloader

@@ -10,38 +10,39 @@ class MovieLens100KDownLoader(BaseDownloader):
"""MovieLens dataset downLoader for 100K interactions."""

DOWNLOAD_URL: str = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
DEFAULT_PATH: str = os.path.join(os.getcwd(), "ml-100k")
DEFAULT_PATH: str = str(Path.cwd() / "ml-100k")

@lru_cache(maxsize=1)
def read_ratings(self) -> pd.DataFrame:
ratings_path: str = os.path.join(self.DEFAULT_PATH, "u.data")
ratings_path: Path = Path(self.DEFAULT_PATH) / "u.data"
df_ratings: pd.DataFrame = pd.read_csv(
ratings_path, sep="\t", header=None, engine="python"
).copy()
)
df_ratings.columns = pd.Index(["userId", "movieId", "rating", "timestamp"])
df_ratings["timestamp"] = pd.to_datetime(df_ratings.timestamp, unit="s")

return df_ratings

@lru_cache(maxsize=1)
def read_users(self) -> pd.DataFrame:
users_path: str = os.path.join(self.DEFAULT_PATH, "u.user")
users_path: Path = Path(self.DEFAULT_PATH) / "u.user"
df_users: pd.DataFrame = pd.read_csv(
users_path,
sep="|",
header=None,
engine="python",
names=["userId", "age", "gender", "occupation", "zipcode"],
)

return df_users[["userId", "gender", "age", "occupation", "zipcode"]]
return df_users

def _read_genres(self) -> List[str]:
genres_path: str = os.path.join(self.DEFAULT_PATH, "u.genre")
with open(genres_path, "r") as outfile:
genres_path: Path = Path(self.DEFAULT_PATH) / "u.genre"
with genres_path.open() as outfile:
genres = outfile.read()
return [pair.split("|")[0] for pair in genres.split("\n")][:-2]

@lru_cache(maxsize=1)
def read_items(self) -> pd.DataFrame:
movies_path: str = os.path.join(self.DEFAULT_PATH, "u.item")
movies_path: Path = Path(self.DEFAULT_PATH) / "u.item"
genres: List[str] = self._read_genres()
df_items: pd.DataFrame = pd.read_csv(
movies_path,
@@ -53,7 +54,9 @@ def read_items(self) -> pd.DataFrame:
+ genres,
)
df_items["title"] = df_items["title"].str[:-7]
df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
df_items["title"] = df_items["title"].apply(
lambda x: x.split(",")[0] if "," in x else x
)
df_items["release_date"] = pd.to_datetime(df_items.release_date)
df_items["genres"] = df_items[genres] @ (df_items[genres].columns + "|")
df_items["genres"] = df_items["genres"].apply(lambda x: x[:-1].split("|"))
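
A short usage sketch of the updated loader, assuming the module path matches the file path in this diff:

from rsdiv.dataset.movielens_100k import MovieLens100KDownLoader

loader = MovieLens100KDownLoader()  # fetches and extracts ml-100k.zip on first use
ratings = loader.read_ratings()     # cached after the first call (lru_cache)
users = loader.read_users()
items = loader.read_items()
print(ratings.dtypes)  # timestamp is already datetime64[ns]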
24 changes: 13 additions & 11 deletions src/rsdiv/dataset/movielens_1m.py
@@ -1,6 +1,6 @@
import os

import pandas as pd
from pathlib import Path
from functools import lru_cache

from .base import BaseDownloader

@@ -9,32 +9,33 @@ class MovieLens1MDownLoader(BaseDownloader):
"""MovieLens dataset downLoader for 1M interactions."""

DOWNLOAD_URL: str = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
DEFAULT_PATH: str = os.path.join(os.getcwd(), "ml-1m")
DEFAULT_PATH: str = str(Path.cwd() / "ml-1m")

@lru_cache(maxsize=1)
def read_ratings(self) -> pd.DataFrame:
ratings_path: str = os.path.join(self.DEFAULT_PATH, "ratings.dat")
ratings_path: Path = Path(self.DEFAULT_PATH) / "ratings.dat"
df_ratings: pd.DataFrame = pd.read_csv(
ratings_path, sep="::", header=None, engine="python"
).copy()
)
df_ratings.columns = pd.Index(["userId", "movieId", "rating", "timestamp"])
df_ratings["timestamp"] = pd.to_datetime(df_ratings.timestamp, unit="s")

return df_ratings

@lru_cache(maxsize=1)
def read_users(self) -> pd.DataFrame:
users_path: str = os.path.join(self.DEFAULT_PATH, "users.dat")
users_path: Path = Path(self.DEFAULT_PATH) / "users.dat"
df_users: pd.DataFrame = pd.read_csv(
users_path,
sep="::",
header=None,
engine="python",
names=["userId", "gender", "age", "occupation", "zipcode"],
)

return df_users

@lru_cache(maxsize=1)
def read_items(self) -> pd.DataFrame:
movies_path: str = os.path.join(self.DEFAULT_PATH, "movies.dat")
movies_path: Path = Path(self.DEFAULT_PATH) / "movies.dat"
df_items: pd.DataFrame = pd.read_csv(
movies_path,
sep="::",
@@ -45,7 +46,8 @@ def read_items(self) -> pd.DataFrame:
)
df_items["release_date"] = df_items["title"].str[-5:-1].astype("int")
df_items["title"] = df_items["title"].str[:-7]
df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0])
df_items["title"] = df_items["title"].apply(
lambda x: x.split(",")[0] if "," in x else x
)
df_items["genres"] = df_items["genres"].apply(lambda x: x.split("|"))

return df_items
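
One detail worth checking in review: the slice-based year extraction assumes every title ends with " (YYYY)", per the MovieLens dump convention. A minimal illustration:

import pandas as pd

titles = pd.Series(["Toy Story (1995)", "Heat (1995)"])
print(titles.str[-5:-1].astype("int").tolist())  # [1995, 1995]
print(titles.str[:-7].tolist())  # ['Toy Story', 'Heat']: " (YYYY)" stripped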
25 changes: 14 additions & 11 deletions src/rsdiv/diversity/mmr.py
@@ -23,27 +23,30 @@ def rerank(
) -> Sequence[int]:
assert k > 0, "k must be larger than 0!"
n = quality_scores.shape[0]
if k >= n:
return list(range(n))

k = min(k, n)
new_selection = np.argmax(quality_scores).item()
quality_scores = ma.array(quality_scores, mask=False)
new_selection = quality_scores.argmax().item()
selected_ind = [new_selection]
ma_similarity_scores = ma.array(similarity_scores, mask=True)

ma_similarity_scores = ma.array(similarity_scores, mask=True)
ma_similarity_scores.mask[:, new_selection] = False
ma_similarity_scores[new_selection, new_selection] = ma.masked

quality_scores = ma.array(quality_scores)
quality_scores[new_selection] = ma.masked

for _ in range(k - 1):
scores = self.lbd * quality_scores - (1.0 - self.lbd) * np.max(
ma_similarity_scores, axis=1
ma_similarity_scores[new_selection] = ma.masked
max_similarity_scores = ma_similarity_scores.max(axis=1)
scores = (
self.lbd * quality_scores - (1.0 - self.lbd) * max_similarity_scores
)
new_selection = np.argmax(scores).item()
quality_scores[new_selection] = ma.masked

new_selection = scores.argmax().item()
selected_ind.append(new_selection)

quality_scores[new_selection] = ma.masked
ma_similarity_scores.mask[:, new_selection] = False
ma_similarity_scores[new_selection, :] = ma.masked
ma_similarity_scores[selected_ind, new_selection] = ma.masked

selected_ind.append(new_selection)
return selected_ind
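
For reference, the selection rule the masked-array code implements is score(i) = lbd * quality[i] - (1 - lbd) * max over selected j of sim[i, j]. A plain-loop sketch (hedged; not the class API) that should agree with the rerank above:

import numpy as np

def mmr(quality, sim, k, lbd=0.5):
    selected = [int(np.argmax(quality))]  # seed with the highest-quality item
    candidates = set(range(len(quality))) - set(selected)
    while len(selected) < min(k, len(quality)):
        def score(i):  # relevance minus redundancy w.r.t. the current selection
            return lbd * quality[i] - (1 - lbd) * max(sim[i, j] for j in selected)
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

quality = np.array([0.9, 0.8, 0.7])
sim = np.array([[1.0, 0.9, 0.1],
                [0.9, 1.0, 0.2],
                [0.1, 0.2, 1.0]])
print(mmr(quality, sim, k=2))  # [0, 2]: item 2 is dissimilar to item 0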
27 changes: 16 additions & 11 deletions src/rsdiv/diversity/ssd.py
@@ -14,6 +14,16 @@ def __init__(self, gamma: float):
assert gamma >= 0, "gamma should be >= 0!"
self.gamma = gamma

def _adjust_embeddings(self, embeddings: np.ndarray, selected_emb: np.ndarray):
selected_norm = norm(selected_emb)
if selected_norm > 1e-7: # treat new selection as 0 vector if it's too small
selected_emb /= selected_norm
np.subtract(
embeddings,
np.outer(embeddings @ selected_emb, selected_emb),
out=embeddings,
)

def rerank(
self,
quality_scores: np.ndarray,
@@ -24,22 +34,17 @@
inplace: bool = False
) -> Sequence[int]:
assert k > 0, "k must be larger than 0!"
if not inplace:
embeddings = embeddings.copy()
selection = np.argmax(quality_scores).item()
ret = [selection]
volume = self.gamma * norm(embeddings[selection])
norms = norm(embeddings, axis=1)
for _ in range(k - 1):
selected_emb = embeddings[selection]
selected_norm = norm(selected_emb)
if (
selected_norm > 1e-7
): # treat new selection as 0 vector if it's too small
selected_emb /= selected_norm
embeddings -= np.outer(embeddings @ selected_emb, selected_emb)
norms = norm(embeddings, axis=1)
norms *= volume
scores = norms + quality_scores
self._adjust_embeddings(embeddings, selected_emb)
norms = norm(embeddings, axis=1) # update norms after adjusting embeddings
scaled_norms = norms.copy()
scaled_norms *= volume
scores = scaled_norms + quality_scores
scores[ret] = -np.inf
selection = np.argmax(scores).item()
ret.append(selection)
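
The extracted helper performs a Gram-Schmidt-style projection: every remaining embedding loses its component along the newly selected (unit-normalized) embedding, so the residual norms measure the volume an item would still add. A numeric sketch:

import numpy as np
from numpy.linalg import norm

embeddings = np.array([[1.0, 1.0], [2.0, 0.0]])
selected = embeddings[1] / norm(embeddings[1])   # unit vector [1, 0]
embeddings -= np.outer(embeddings @ selected, selected)
print(embeddings)  # [[0. 1.] [0. 0.]]: only components orthogonal to [1, 0] remain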
21 changes: 3 additions & 18 deletions src/rsdiv/embedding/fasttext_embedding.py
@@ -16,19 +16,6 @@ class FastTextEmbedder(BaseEmbedder):
if EMB_PATH:
MAPPER: Dict[str, np.ndarray] = pkl.loads(EMB_PATH)

@classmethod
def _l2_norm(cls, vector: np.ndarray) -> float:
"""Internal method to calculate the normalize of a given vector.

Args:
vector (np.ndarray): target vector to be normalized.

Returns:
float: the l2 norm value for the given vector.
"""
norm_val: float = np.sqrt(np.sum(vector**2))
return norm_val

@classmethod
def embedding_norm(cls, org: str) -> np.ndarray:
"""Normalize a given vector.
@@ -40,9 +27,9 @@ def embedding_norm(cls, org: str) -> np.ndarray:
np.ndarray: normalized vector.
"""
vector: np.ndarray = cls.embedding_single(org)
norm_val: float = cls._l2_norm(vector)
norm_val: float = np.linalg.norm(vector)
if norm_val:
embed: np.ndarray = vector * (1.0 / norm_val)
embed: np.ndarray = vector / norm_val
return embed
else:
return vector
@@ -57,6 +44,4 @@ def embedding_list(cls, org: List[str]) -> np.ndarray:
Returns:
np.ndarray: normalized vector.
"""
emb_list: np.ndarray = np.asarray([cls.embedding_norm(item) for item in org])
emb_norm: np.ndarray = np.mean(emb_list, axis=0)
return emb_norm
return np.mean([cls.embedding_norm(item) for item in org], axis=0)
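
A small check of the normalize-then-average behavior, with numpy stand-ins for the fastText lookup (embedding_single is not invoked here):

import numpy as np

def embedding_norm(vector: np.ndarray) -> np.ndarray:
    norm_val = float(np.linalg.norm(vector))
    return vector / norm_val if norm_val else vector  # zero vectors pass through

vectors = [np.array([3.0, 4.0]), np.array([0.0, 2.0])]
print(np.mean([embedding_norm(v) for v in vectors], axis=0))  # [0.3 0.9]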
9 changes: 4 additions & 5 deletions src/rsdiv/encoding/geo_encoder.py
@@ -22,7 +22,7 @@ def __init__(self) -> None:
super().__init__()
self.encoder, self.geo_county_dict = self.read_source()
self.coord: List[np.ndarray] = self.encoder.coord.to_list()
self.index: pd.Index = pd.Index(self.encoder["index"])
self.tree = spatial.KDTree(self.coord)

def read_source(self) -> Tuple[pd.DataFrame, Dict]:
"""Parse the location information from `geojson-counties-fips.json`.
@@ -43,7 +43,7 @@ def read_source(self) -> Tuple[pd.DataFrame, Dict]:
geo_county_dict[id] = [coord, name, lsad]
dataframe = pd.DataFrame.from_dict(
geo_county_dict, orient="index", columns=["coord", "name", "lstd"]
).reset_index()
).rename_axis("index")
return (dataframe, geo_county_dict)

def encoding_single(self, org: Union[List, str]) -> Union[int, str]:
@@ -55,8 +55,7 @@
Returns:
Union[int, str]: the coding corresponds to the given target.
"""
tree = spatial.KDTree(self.coord)
return str(self.index[int(tree.query(org)[1])])
return str(self.encoder.index[int(self.tree.query(org)[1])])

def encoding_series(self, series: pd.Series) -> pd.Series:
"""Encoding for the series of locations.
@@ -67,7 +66,7 @@
Returns:
pd.Series: the corresponding series of locations.
"""
encodings = pd.Series(series.apply(lambda x: self.encoding_single(x)))
encodings = series.apply(self.encoding_single)
return encodings

def draw_geo_graph(
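
Building the KDTree once in __init__ and reusing it means each encoding_single call is a single nearest-neighbor query instead of a full tree rebuild. A toy illustration with made-up centroids (not real FIPS data):

import numpy as np
from scipy import spatial

coords = np.array([[40.0, -74.0], [34.0, -118.0], [41.9, -87.6]])  # toy centroids
tree = spatial.KDTree(coords)          # built once, as in the new __init__
dist, idx = tree.query([39.9, -75.2])  # nearest centroid to the query point
print(idx)  # 0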