Merge pull request #958 from EleutherAI/verbosity-rework
[Refactor] Verbosity rework
haileyschoelkopf committed Nov 17, 2023
2 parents 2c0c345 + 9b596e8 commit afda655
Showing 19 changed files with 83 additions and 42 deletions.
4 changes: 3 additions & 1 deletion docs/interface.md
@@ -59,6 +59,8 @@ my_model = initialize_my_model() # create your model (could be running finetunin
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`

lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.

results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
tasks=["taskname1", "taskname2"],
@@ -85,7 +87,7 @@ my_model = initialize_my_model() # create your model (could be running finetunin
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`


lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.

def evaluate(
lm=lm_obj,
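
Put together, the updated docs snippet describes the following library flow. A minimal sketch, assuming `initialize_my_model()` and `Your_LM` are the user-defined placeholders from docs/interface.md:

import lm_eval
import lm_eval.tasks

my_model = initialize_my_model()  # placeholder: however you build your model
lm_obj = Your_LM(model=my_model, batch_size=16)  # placeholder LM subclass

# New in this PR: task registration is explicit rather than an import side effect.
lm_eval.tasks.initialize_tasks()
# Or register only a custom directory of task configs:
# lm_eval.tasks.include_path("path/to/my/custom/task/configs")

results = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["taskname1", "taskname2"],  # placeholder task names from the docs snippet
)
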
31 changes: 20 additions & 11 deletions lm_eval/__main__.py
@@ -1,18 +1,18 @@
import os
import re
import sys
import json
import fnmatch
import argparse
import logging
from pathlib import Path
import argparse
import numpy as np
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_path

from pathlib import Path
from typing import Union

from lm_eval import evaluator, utils
from lm_eval.tasks import initialize_tasks, include_path
from lm_eval.api.registry import ALL_TASKS


def _handle_non_serializable(o):
if isinstance(o, np.int64) or isinstance(o, np.int32):
@@ -25,11 +25,11 @@ def _handle_non_serializable(o):

def parse_eval_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`")
parser.add_argument(
"--tasks",
default=None,
help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
help="To get full list of tasks, use the command lm-eval --tasks list",
)
parser.add_argument(
"--model_args",
@@ -119,9 +119,13 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# we allow for args to be passed externally, else we parse them ourselves
args = parse_eval_args()

eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

initialize_tasks(args.verbosity)

if args.limit:
eval_logger.warning(
" --limit SHOULD ONLY BE USED FOR TESTING."
@@ -133,6 +137,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:

if args.tasks is None:
task_names = ALL_TASKS
elif args.tasks == "list":
eval_logger.info(
"Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))
)
sys.exit()
else:
if os.path.isdir(args.tasks):
import glob
@@ -159,10 +168,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
missing = ", ".join(task_missing)
eval_logger.error(
f"Tasks were not found: {missing}\n"
f"{SPACING}Try `lm-eval -h` for list of available tasks",
f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
)
raise ValueError(
f"Tasks {missing} were not found. Try `lm-eval -h` for list of available tasks."
f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
)

if args.output_path:
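
A condensed sketch of the verbosity plumbing `cli_evaluate()` gains above, with a literal value standing in for `args.verbosity` (e.g. "INFO" or "DEBUG"). Note that `lm-eval --tasks list` now prints the sorted registry and exits, instead of dumping every task name into the `--tasks` help text:

import logging
from lm_eval import utils
from lm_eval.tasks import initialize_tasks

verbosity = "INFO"  # stand-in for args.verbosity

eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, verbosity))
eval_logger.info(f"Verbosity set to {verbosity}")

# Task registration now happens inside cli_evaluate(), at the same log level.
initialize_tasks(verbosity)
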
3 changes: 3 additions & 0 deletions lm_eval/api/metrics.py
@@ -9,6 +9,9 @@

from lm_eval.api.registry import register_metric, register_aggregation

import logging

eval_logger = logging.getLogger("lm-eval")

# Register Aggregations First
@register_aggregation("mean")
5 changes: 4 additions & 1 deletion lm_eval/api/model.py
@@ -10,7 +10,10 @@
from tqdm import tqdm

from lm_eval import utils
from lm_eval.logger import eval_logger

import logging

eval_logger = logging.getLogger("lm-eval")

T = TypeVar("T", bound="LM")

5 changes: 4 additions & 1 deletion lm_eval/api/registry.py
@@ -1,7 +1,10 @@
import os
import evaluate
from lm_eval.api.model import LM
from lm_eval.logger import eval_logger

import logging

eval_logger = logging.getLogger("lm-eval")

MODEL_REGISTRY = {}

5 changes: 4 additions & 1 deletion lm_eval/api/task.py
@@ -4,6 +4,7 @@
import re
import ast
import yaml
import logging
import evaluate
import random
import itertools
@@ -21,7 +22,6 @@
from lm_eval.api.instance import Instance
from lm_eval.api.filter import FilterEnsemble

from lm_eval.logger import eval_logger
from lm_eval.prompts import get_prompt
from lm_eval.filters import build_filter_ensemble
from lm_eval.api.metrics import (
@@ -48,6 +48,9 @@
]


eval_logger = logging.getLogger("lm-eval")


@dataclass
class TaskConfig(dict):
# task naming/registry
5 changes: 2 additions & 3 deletions lm_eval/evaluator.py
@@ -20,10 +20,9 @@
make_table,
create_iterator,
get_git_commit_hash,
eval_logger,
)

from lm_eval.logger import eval_logger


@positional_deprecated
def simple_evaluate(
@@ -256,7 +255,7 @@ def evaluate(

task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)

eval_logger.info(
eval_logger.debug(
f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
)

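
Since the per-task request count above now logs at DEBUG rather than INFO, a caller who still wants to see it would lower the shared logger's level (or pass a DEBUG verbosity on the CLI); a one-line sketch:

import logging

logging.getLogger("lm-eval").setLevel(logging.DEBUG)
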
10 changes: 0 additions & 10 deletions lm_eval/logger.py

This file was deleted.

4 changes: 3 additions & 1 deletion lm_eval/models/anthropic_llms.py
@@ -2,9 +2,11 @@
from lm_eval.api.registry import register_model
from tqdm import tqdm
import time
from lm_eval.logger import eval_logger
from lm_eval import utils
from typing import List, Any, Tuple

eval_logger = utils.eval_logger


def anthropic_completion(
client, #: anthropic.Anthropic,
3 changes: 2 additions & 1 deletion lm_eval/models/huggingface.py
@@ -16,7 +16,6 @@
import torch.nn.functional as F

from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model

@@ -25,6 +24,8 @@
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union

eval_logger = utils.eval_logger


def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
2 changes: 1 addition & 1 deletion lm_eval/prompts/__init__.py
@@ -3,7 +3,7 @@

from typing import Dict
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger

# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
15 changes: 11 additions & 4 deletions lm_eval/tasks/__init__.py
@@ -26,7 +26,7 @@
QMSum,
)

eval_logger = logging.getLogger("lm-eval")
eval_logger = utils.eval_logger


def register_configurable_task(config: Dict[str, str]) -> int:
@@ -152,8 +152,11 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
else:
if type(config["task"]) == list:
register_configurable_group(config, yaml_path)

# Log this silently and show it only when
# the user defines the appropriate verbosity.
except ModuleNotFoundError as e:
eval_logger.warning(
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
@@ -176,8 +179,12 @@ def include_path(task_dir):
return 0


task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
def initialize_tasks(verbosity="INFO"):

eval_logger.setLevel(getattr(logging, f"{verbosity}"))

task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)


def get_task(task_name, config):
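
Registration is now an explicit entry point instead of running at import time. A sketch of how a caller might use it; the custom config path is hypothetical:

from lm_eval.tasks import initialize_tasks, include_path

# Configs that fail to import (ModuleNotFoundError) are now logged at DEBUG,
# so pass a lower verbosity to surface them while developing task configs.
initialize_tasks(verbosity="DEBUG")

# Optionally register extra task configs from another directory:
# include_path("path/to/my/custom/task/configs")
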
2 changes: 1 addition & 1 deletion lm_eval/tasks/minerva_math/utils.py
@@ -1,7 +1,7 @@
import datasets
import re
import signal
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
from typing import Optional, List, Dict

try:
2 changes: 1 addition & 1 deletion lm_eval/tasks/realtoxicityprompts/metric.py
@@ -3,7 +3,7 @@
import requests
import numpy as np

from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger


def toxicity_perspective_api(references, predictions, **kwargs):
11 changes: 10 additions & 1 deletion lm_eval/utils.py
@@ -19,7 +19,16 @@
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice

from lm_eval.logger import eval_logger
import logging

logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")

SPACING = " " * 47


def escaped_split(text, sep_char, maxsplit=-1):
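
With lm_eval/logger.py deleted, the configured logger now lives in lm_eval.utils. Other modules either import the instance or fetch it by name, matching the two patterns used across this diff; a minimal sketch:

import logging

# Option 1: import the instance configured in lm_eval/utils.py.
from lm_eval.utils import eval_logger

# Option 2: fetch the same logger by name, as the lm_eval/api/ modules above do.
assert logging.getLogger("lm-eval") is eval_logger

eval_logger.warning("visible at the default INFO level; DEBUG messages need setLevel")
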
13 changes: 10 additions & 3 deletions scripts/write_out.py
@@ -4,9 +4,8 @@
import os
import random
from lm_eval import tasks
from lm_eval.utils import join_iters
from lm_eval.tasks import include_path
from lm_eval.logger import eval_logger
from lm_eval.utils import join_iters, eval_logger
from lm_eval.tasks import initialize_tasks, include_path

EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"

@@ -25,13 +24,21 @@ def parse_args():
default=None,
help="Additional path to include if there are external tasks to include.",
)
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Log error when tasks are not registered.",
)
return parser.parse_args()


def main():
args = parse_args()
np.random.seed(args.seed)

initialize_tasks(args.verbosity)

if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
2 changes: 2 additions & 0 deletions tests/models/test_huggingface.py
@@ -8,6 +8,8 @@
import sys
import torch

tasks.initialize_tasks()


class Test_HFLM:
torch.use_deterministic_algorithms(True)
1 change: 1 addition & 0 deletions tests/test_evaluator.py
@@ -11,6 +11,7 @@
import random
import pytest

tasks.initialize_tasks()

# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
2 changes: 1 addition & 1 deletion tests/test_tasks.py
@@ -4,7 +4,7 @@
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask


tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]

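
Each test module now has to trigger registration explicitly, since importing lm_eval.tasks no longer registers anything as a side effect. The minimal pattern, sketched:

import lm_eval.tasks as tasks

# Call once at module import time, before any task lookups or evaluations.
tasks.initialize_tasks()

# A non-default verbosity can be threaded through, as scripts/write_out.py does:
# tasks.initialize_tasks(verbosity="DEBUG")
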
