-
Notifications
You must be signed in to change notification settings - Fork 196
Adding request-timeout for recall metrics for so_vector #800
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,7 +11,9 @@ | |
TRUE_KNN_FILENAME_1K: str = "queries-recall-1k.json.bz2" | ||
|
||
|
||
async def extract_exact_neighbors(query_vector: List[float], index: str, max_size: int, vector_field: str, filter, client) -> List[str]: | ||
async def extract_exact_neighbors( | ||
query_vector: List[float], index: str, max_size: int, vector_field: str, request_timeout: float, filter, client | ||
) -> List[str]: | ||
if filter is None: | ||
raise ValueError("Filter must be provided for exact neighbors extraction.") | ||
script_query = { | ||
|
@@ -32,6 +34,7 @@ async def extract_exact_neighbors(query_vector: List[float], index: str, max_siz | |
index=index, | ||
request_cache=True, | ||
size=max_size, | ||
request_timeout=request_timeout, | ||
) | ||
return [hit["fields"]["questionId"][0] for hit in script_result["hits"]["hits"]] | ||
|
||
|
@@ -131,7 +134,7 @@ def __init__(self): | |
def get_query_vectors(self) -> List[List[float]]: | ||
return self._queries | ||
|
||
async def get_neighbors_for_query(self, index: str, query_id: int, size: int, filter, client) -> List[str]: | ||
async def get_neighbors_for_query(self, index: str, query_id: int, size: int, request_timeout: float, filter, client) -> List[str]: | ||
# For now, we must calculate the exact neighbors, maybe we should cache this? | ||
# it would have to be cached per query and filter | ||
if filter is not None: | ||
|
@@ -141,6 +144,7 @@ async def get_neighbors_for_query(self, index: str, query_id: int, size: int, fi | |
index=index, | ||
max_size=size, | ||
vector_field="titleVector", | ||
request_timeout=request_timeout, | ||
filter=filter, | ||
client=client, | ||
) | ||
|
@@ -200,6 +204,7 @@ async def __call__(self, es, params): | |
k = params["size"] | ||
num_candidates = params["num_candidates"] | ||
index = params["index"] | ||
request_timeout = params.get("request-timeout", -1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at https://www.elastic.co/docs/reference/elasticsearch/clients/python/configuration#timeouts it's not clear to me what's the behavior with a negative value. Did it work in your tests? I think it would be safer to either use Alternatively, to allow setting either
|
||
request_cache = params["cache"] | ||
filter = params["filter"] | ||
recall_total = 0 | ||
|
@@ -215,9 +220,10 @@ async def __call__(self, es, params): | |
index=index, | ||
request_cache=request_cache, | ||
size=k, | ||
request_timeout=request_timeout, | ||
) | ||
knn_hits = [hit["fields"]["questionId"][0] for hit in knn_result["hits"]["hits"]] | ||
true_neighbors = await knn_vector_store.get_neighbors_for_query(index, query_id, k, filter, es) | ||
true_neighbors = await knn_vector_store.get_neighbors_for_query(index, query_id, k, request_timeout, filter, es) | ||
current_recall = len(set(knn_hits).intersection(set(true_neighbors))) | ||
recall_total += current_recall | ||
exact_total += len(true_neighbors) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know the appropriate default value to indicate "no timeout". I think for these recall metrics only, that is what we care about as these types of queries (script & knn) are performance tested elsewhere. So if there is a significant regression, we will catch it in those other operations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, was looking at exactly the same line :)