example-exam-english.tex

%% example-exam-english.tex
%% Copyright 2021 Lukas Schmelzeisen
%
% This work may be distributed and/or modified under the
% conditions of the LaTeX Project Public License, either version 1.3
% of this license or (at your option) any later version.
% The latest version of this license is in
%   http://www.latex-project.org/lppl.txt
% and version 1.3 or later is part of all distributions of LaTeX
% version 2005/12/01 or later.
%
% This work has the LPPL maintenance status `maintained'.
% 
% The current maintainer of this work is Lukas Schmelzeisen.
%
% This work consists of the files acAssignment.cls and acAssignment.tex and
% bundled example files.


% Enable warnings about problematic code
\RequirePackage[l2tabu, orthodox]{nag}

\documentclass[exam, sectionseven]{acAssignment}

% Parse this file as UTF-8 (included in LaTeX by default since 2018 but included
% here for backwards-compatibility). If you use something else, change this.
\usepackage[utf8]{inputenc}

\acCourse[https://west.uni-koblenz.de/studying/ss17/webinformationretrieval]
    {Web Information Retrieval}
\acSemester{SS~17}
\acExamType{Main Exam} % Alternatively "Retake Exam" or "Trial Exam"
\acExamDate{26 July 2017}

\acAuthor[https://www.ipvs.uni-stuttgart.de/institute/team/Kumar-00006/]
    {Dr. Chandan Kumar}{chandan.kumar@ipvs.uni-stuttgart.de}
\acAuthor[https://www.ipvs.uni-stuttgart.de/institute/team/Schmelzeisen/]
    {Lukas Schmelzeisen}{lukas.schmelzeisen@ipvs.uni-stuttgart.de}

\begin{document}
\maketitle

\acExamForm[seat]

\begin{itemize}[itemsep=0pt]
    \item Check that your exam copy is complete (\acNumPages~pages, \acNumTasks~tasks).
    \item Use a non-erasable writing medium, preferably ballpoint pen.
        Do \emph{not} write in pencil.
    \item The use of aids of any kind is not permitted.
        This includes calculators, mobile phones, book, hand-written notes and the likes.
    \item If space permits, please provide your answers on the respective task page.
        If needed, you may request additional sheets (3 are already bundled with this exam at the end).
        Label these with your name, matriculation and task numbers and cross reference them from the respective task page.
    \item By taking this exam, you confirm that you understand and meet the eligibility requirements of this exam.
        If these are not fulfilled, the exam will be considered as not taken.
\end{itemize}

\vspace{0.1cm}
\textbf{Good luck!}

% ------------------------------------------------------------------------------
\section{Multiple Choice Questions}

Indicate for the following questions which options apply to which terms and which do not.
One correct answer will award one point.
One incorrect answer will subtract one point.
Thus, if you are uncertain about an answer it might be wiser not to answer it.
Overall points of this question can not be below zero.

\begin{enumerate}
    \item Which of the following operations are typically viewed as occurring during the pre-processing stage of an information retrieval system?
        \begin{acMultipleChoice}
            \acChoice{Stemming or Lemmatization}
            \acChoice*{Relevance Feedback}
            \acChoice{Tokenization}
            \acChoice{Stop Word Removal}
        \end{acMultipleChoice}
    
    \item Web search is different from the classical form of ad hoc information retrieval.
        Which of the follGowing properties characterize web search specifically, because they do not apply to the classical model?
        \begin{acMultipleChoice}
            \acChoice{The web contains lots of duplicate documents}
            \acChoice{The web contain lots of spam documents}
            \acChoice{The documents on the web are interlinked and connected}
            \acChoice*{The documents on the web are unstructured}
        \end{acMultipleChoice}
    
    \item What are desirable properties of a web crawler?
        \begin{acMultipleChoice}
            \acChoice{To respect robots.txt specifications}
            \acChoice{To avoid hitting any site too often}
            \acChoice{To be robust enough to avoid spider traps}
            \acChoice{To be capable of distributed operation}
        \end{acMultipleChoice}
    
    \item The query likelihood model in information retrieval\textellipsis
        \begin{acMultipleChoice}
            \acChoice{Estimates the probability of the query being generated from a document}
            \acChoice*{Estimates the probability of a document being generated from the query}
            \acChoice{Can include smoothing to avoid zero probability scores}
            \acChoice{Can utilize corpus frequencies for normalization}
        \end{acMultipleChoice}
    
    \item Indicate for the following relevancy measures whether a result list would have to be sorted in \emph{increasing}~(Inc.) or \emph{decreasing}~(Dec.) order on the relevancy measure such that the most relevant result is first, the second most relevant is second, and so forth.
        \begin{acMultipleChoice}
            \acChoice*[Inc.][Dec.]{Cosine Similarity}
            \acChoice[Inc.][Dec.]{Euclidean Distance}
            \acChoice*[Inc.][Dec.]{Query Likelihood Probability}
            \acChoice[Inc.][Dec.]{Kullback-Leibler Divergence}
        \end{acMultipleChoice}
    
    \clearpage
    \item Suppose that $A$, $B$, $C$ and $D$ are four different web pages; there exist a link from page $A$ to $B$, $B$ to $C$, and $C$ to $D$. Consider the following distinct scenarios, and decide weather it will \emph{increase}~(Inc.) or \emph{decrease}~(Dec.) the PageRank score of $C$.
        \begin{acMultipleChoice}
            \acChoice[Inc.][Dec.]{Adding a link from $A$ to $C$}
            \acChoice*[Inc.][Dec.]{Adding a link from $B$ to $D$}
            \acChoice[Inc.][Dec.]{Adding a link from $D$ to $B$}
            \acChoice*[Inc.][Dec.]{Deleting a link from $A$ to $B$}
        \end{acMultipleChoice}

    \begin{acSolution}
        \vspace{0.25cm}
        PageRank scores calculated with teleportation-rate of $0.2$ and $1000$ power iterations:
        \begin{center}
            \begin{tabular}{lcccc}
                \toprule
                PageRank Score & A & B & C & D \\
                \midrule
                Before any adjustments      & 0.12 & 0.22 & 0.30 & 0.36 \\
                Adding a link from A to C   & 0.13 & 0.18 & 0.32 & 0.38 \\
                Adding a link from B to D   & 0.13 & 0.24 & 0.23 & 0.40 \\
                Adding a link from C to D   & 0.05 & 0.10 & 0.45 & 0.41 \\
                Deleting a link from A to B & 0.16 & 0.16 & 0.28 & 0.39 \\
                \bottomrule
            \end{tabular}
            \renewcommand*\arraystretch{1}
        \end{center}
    \end{acSolution}
\end{enumerate}


% ------------------------------------------------------------------------------
\section{Evaluation}

\begin{enumerate}
    \item A way of evaluating the performance of an information retrieval system is by keeping track of all documents in a \emph{confusion matrix}:

        \begin{center}
            \let\oldarraystretch\arraystretch
            \renewcommand*\arraystretch{1.7}
            \begin{tabular}{c|c|c|}
                \cline{2-3} &
                \textbf{Relevant} &
                \textbf{Irrelevant} \\
                
                \hline
                \multicolumn{1}{|c|}{\textbf{In result set}} &
                \acInlineSolution{\textbf{TP}} &
                \acInlineSolution{\textbf{FP}} \\
                
                \hline
                \multicolumn{1}{|c|}{\textbf{Not in result set}} &
                \acInlineSolution{\textbf{FN}} &
                \acInlineSolution{\textbf{TN}} \\
                
                \hline
            \end{tabular}
            \let\arraystretch\oldarraystretch
        \end{center}
        
        \vspace{0.25cm}
        \acTask{2}:
        Assign the following labels each to one of the cells in the above confusion matrix.
        
        \begin{itemize}
            \item \textbf{TP} (\enquote{True Positives})
            \item \textbf{TN} (\enquote{True Negatives})
            \item \textbf{FP} (\enquote{False Positives})
            \item \textbf{FN} (\enquote{False Negatives})
        \end{itemize}
        \vspace{\fill}
    
    \item Define how to calculate \emph{precision}, \emph{recall}, and \emph{accuracy} only using the values \textbf{TP}, \textbf{TN}, \textbf{FP}, and \textbf{FN}, each denoting the number of documents that belonged to the corresponding category.
        
        \acTask{3}:
        Complete the formulas below by filling in nominator and denominator of the fractions.
        
        \newcommand*{\myFracField}[1]{%
            \acIfSolution%
                {\begin{minipage}{7cm}%
                    \centering\textcolor{acSolution}{#1}%
                    \end{minipage}}%
                {\hspace{7cm}}}
        
        \vspace{1cm}
        \begin{align*}
            \text{Precision} &= \dfrac
                {\myFracField{\textbf{TP}}}
                {\myFracField{\textbf{TP} + \textbf{FP}}}
            \\[2cm]
            \text{Recall} &= \dfrac
                {\myFracField{\textbf{TP}}}
                {\myFracField{\textbf{TP} + \textbf{FN}}}
            \\[2cm]
            \text{Accuracy} &= \dfrac
                {\myFracField{\textbf{TP} + \textbf{TN}}}
                {\myFracField{\textbf{TP} + \textbf{FP} + \textbf{FN} + \textbf{TN}}}
        \end{align*}
        \vspace{\fill}
    
    \clearpage
    \item For the evaluation of an information retrieval system, three queries have been performed against the system.
        The following tables indicates the results of the queries along with relevance judgments provided by human experts:
        
        \begin{center}
            \begin{tabular}{c@{\hspace{0.5cm}}c@{\hspace{0.5cm}}c}
                \begin{tabular}[t]{ccc}
                    \toprule
                    \multicolumn{3}{c}{Query $q_1$} \\
                    Rank & Doc-ID & Rel.? \\
                    \midrule
                    1 & $d_{1 1}$ & \\
                    2 & $d_{1 2}$ & $\checkmark$ \\
                    3 & $d_{1 3}$ & $\checkmark$ \\
                    4 & $d_{1 4}$ & \\
                    5 & $d_{1 5}$ & $\checkmark$ \\
                    6 & $d_{1 6}$ & \\
                    7 & $d_{1 7}$ & \\
                    \bottomrule
                \end{tabular}
                &
                \begin{tabular}[t]{ccc}
                    \toprule
                    \multicolumn{3}{c}{Query $q_2$} \\
                    Rank & Doc-ID & Rel.? \\
                    \midrule
                    1 & $d_{2 1}$ & $\checkmark$ \\
                    2 & $d_{2 2}$ & \\
                    3 & $d_{2 3}$ & $\checkmark$ \\
                    4 & $d_{2 4}$ & \\
                    5 & $d_{2 5}$ & \\
                    6 & $d_{2 6}$ & $\checkmark$ \\
                    7 & $d_{2 7}$ & \\
                    8 & $d_{2 8}$ & $\checkmark$ \\
                    \bottomrule
                \end{tabular}
                &
                \begin{tabular}[t]{ccc}
                    \toprule
                    \multicolumn{3}{c}{Query $q_3$} \\
                    Rank & Doc-ID & Rel.? \\
                    \midrule
                    1 & $d_{3 1}$ & \\
                    2 & $d_{3 2}$ & \\
                    3 & $d_{3 3}$ & $\checkmark$ \\
                    4 & $d_{3 4}$ & \\
                    \bottomrule
                    \end{tabular}
                \vspace{0.5cm}\\
                
                \parbox{3.5cm}{Total number of documents relevant to $q_1$:~4}
                &
                \parbox{3.5cm}{Total number of documents relevant to $q_2$:~8}
                &
                \parbox{3.5cm}{Total number of documents relevant to $q_3$:~1}
                \\
            \end{tabular}
        \end{center}
        
        \vspace{0.5cm}
        \acTask{9}:
        Calculate \emph{Precision}, \emph{Recall}, \emph{$F_1$-Measure}, \emph{Precision at $k$} (P@$k$) for $k=1$ and $k=5$, and \emph{R-Precision} for all three result sets.
        
        \begin{acSolution}
            \vspace{0.5cm}
            \begin{center}
                \let\oldarraystretch\arraystretch
                \renewcommand*\arraystretch{1.7}
                \begin{tabular}{lccc}
                    \toprule
                    & $q_1$ & $q_2$ & $q_3$ \\
                    \midrule
                    Precision &
                    $\frac{3}{7} \approx 0.43$ &
                    $\frac{4}{8} = 0.5$ &
                    $\frac{1}{4} = 0.25$ \\
                    Recall &
                    $\frac{3}{4} = 0.75$ &
                    $\frac{4}{8} = 0.5$ &
                    $\frac{1}{1} = 1$ \\
                    $F_1$-Measure &
                    $2 \cdot \frac{\frac{3}{7} \cdot \frac{3}{4}}{\frac{3}{7} + \frac{3}{4}} = \frac{6}{11} = 0.\overline{54}$ &
                    $2 \cdot \frac{\frac{4}{8} \cdot \frac{4}{8}}{\frac{4}{8} + \frac{4}{8}} = \frac{1}{2} = 0.5$ &
                    $2 \cdot \frac{\frac{1}{4} \cdot \frac{1}{1}}{\frac{1}{4} + \frac{1}{1}} = \frac{2}{5} = 0.4$ \\
                    Precision@$1$ &
                    $\frac{0}{1} = 0$ &
                    $\frac{1}{1} = 1$ &
                    $\frac{0}{1} = 0$ \\
                    Precision@$5$ &
                    $\frac{3}{5} = 0.6$ &
                    $\frac{2}{5} = 0.5$ &
                    $\frac{1}{5} = 0.2$ \\
                    R-Precision &
                    $\frac{2}{4} = 0.75$ &
                    $\frac{4}{8} = 0.5$ &
                    $\frac{1}{1} = 0$ \\
                    \bottomrule
                \end{tabular}
                \let\arraystretch\oldarraystretch
            \end{center}
        \end{acSolution}
\end{enumerate}


% ------------------------------------------------------------------------------
\section{PageRank}

The goal is to implement to function \texttt{calcPageRankScores()} on the next page.
The function is supposed to calculate and return an array of PageRank scores for a given \texttt{DocumentCollection}.

To obtain correct PageRank scores, your implementation will have to:
\begin{enumerate}
    \item \acTask{2}: Produce the adjacency matrix of the document collection.
    \item \acTask{2}: Calculate the stochastic transition matrix from the adjacency matrix using a given \texttt{teleportationRate}.
    \item \acTask{6}: Calculate the steady state distribution (the final page rank scores) of the transition matrix, using the \emph{power iteration} method with \texttt{numIterations} many iterations.
\end{enumerate}

You may use the following code fragments and helper methods that you can assume to be implemented:

\begin{itemize}
    \item To get the number of documents in the \texttt{DocumentCollection}:
    
        \begin{lstlisting}[language=Java]
int numDocuments = documents.size();
        \end{lstlisting}
    
    \item To obtain an adjacency matrix of a \texttt{DocumentCollection}:
    
        \begin{lstlisting}[language=Java]
boolean[][] makeAdjacencyMatrix(DocumentCollection documents)
        \end{lstlisting}
        
        The function returns a two dimensional, boolean array.
        The array has \texttt{numDocuments} many rows and columns.
        If entry \texttt{result[i][j]} of the array is \texttt{true} there was a link from document \texttt{i} to document \texttt{j} in the collection.
    
    \item To obtain a transition matrix from a adjacency matrix use:
    
        \begin{lstlisting}[language=Java]
double[][] makeTransitionMatrix(boolean[][] adjacencyMatrix,
                                double teleportationRate)
        \end{lstlisting}
        
        The function returns a two dimensional, double array.
        The array has \texttt{numDocuments} many rows and columns.
        Entry \texttt{result[i][j]} gives the probability that a random walker on the document graph travels to document \texttt{j} if he was at document \texttt{i} at the previous time step.
    
    \item To perform vector-matrix multiplication:
    
        \begin{lstlisting}[language=Java]
double[] vectorMatrixMult(double[] vector, double[][] matrix)
        \end{lstlisting}
        
        This function assumes \texttt{vector} to be a row vector and requires \texttt{matrix} to be square and have the same dimensionality as the vector.
\end{itemize}

Aim to write your solution as valid Java code.
If you unsure how to specify a certain step in Java formulate it as pseudo code.

\clearpage
\begin{lstlisting}[
    language=Java,
    escapechar=\&,
    moredelim={**[is][\color{acSolution}]{@}{@}}]
double[] calcPageRankScores(DocumentCollection documents,
                            double teleportationRate,
                            int numIterations) {&\acIfSolution{\iftrue}{\iffalse}&@
    int numDocuments = documents.size();
    
    boolean[][] adjacencyMatrix = makeAdjacencyMatrix(documents);
    double[][] transitionMatrix = makeTransitionMatrix(adjacencyMatrix,
                                                       teleportationRate);
    
    double[] result = new double[numDocuments];
    for (int i = 0; i != numDocuments; ++i)
        result[i] = 1.0 / numDocuments;
    for (int i = 0; i != numIterations; ++i)
        result = vectorMatrixMult(result, transitionMatrix);
    return result;@&\fi&&\vfill&
}
\end{lstlisting}


\acBlankPages{3}

\end{document}