KBEST_CLASSIF
SelectKBest keeps the features with the strongest univariate relationship to a classification target according to a scoring function. This wrapper fixes the score function to ANOVA F-values, which works well for dense numeric tabular inputs with categorical class labels.
The ANOVA F-value for a feature is calculated as:
F = \frac{\text{between-group variance}}{\text{within-group variance}}
This wrapper accepts rows as samples and columns as features, plus a target supplied as a single row, single column, or scalar when only one sample is present. It returns the filtered feature matrix together with feature scores, p-values, a boolean support mask, and the 1-based indices of the selected columns.
Excel Usage
=KBEST_CLASSIF(data, target, k)
- data (list[list], required): 2D array of numeric feature data with rows as samples and columns as features.
- target (list[list], required): Target labels as a single row, single column, or scalar when only one sample is present.
- k (int, optional, default: 1): Number of top-scoring features to keep.
Returns (dict): Excel data type containing the selected feature matrix, feature scores, p-values, and support mask.
Example 1: Keep the two strongest features for a binary numeric target
Inputs:
| data | target | k | ||
|---|---|---|---|---|
| 0.1 | 10 | 1 | 0 | 2 |
| 0 | 11 | 1.1 | 0 | |
| 0 | 12 | 0.9 | 0 | |
| 1 | 30 | 5 | 1 | |
| 1 | 31 | 4.9 | 1 | |
| 1 | 29 | 5.1 | 1 |
Excel formula:
=KBEST_CLASSIF({0.1,10,1;0,11,1.1;0,12,0.9;1,30,5;1,31,4.9;1,29,5.1}, {0;0;0;1;1;1}, 2)
Expected output:
{"type":"Double","basicValue":2,"properties":{"selected_feature_count":{"type":"Double","basicValue":2},"sample_count":{"type":"Double","basicValue":6},"feature_count":{"type":"Double","basicValue":3},"class_count":{"type":"Double","basicValue":2},"classes":{"type":"Array","elements":[[{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":1}]]},"selected_indices":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":3}]]},"support_mask":{"type":"Array","elements":[[{"type":"Boolean","basicValue":true}],[{"type":"Boolean","basicValue":false}],[{"type":"Boolean","basicValue":true}]]},"scores":{"type":"Array","elements":[[{"type":"Double","basicValue":841}],[{"type":"Double","basicValue":541.5}],[{"type":"Double","basicValue":2400}]]},"p_values":{"type":"Array","elements":[[{"type":"Double","basicValue":0.00000841636}],[{"type":"Double","basicValue":0.0000202128}],[{"type":"Double","basicValue":0.00000103878}]]},"selected_data":{"type":"Array","elements":[[{"type":"Double","basicValue":0.1},{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":1.1}],[{"type":"Double","basicValue":0},{"type":"Double","basicValue":0.9}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":5}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":4.9}],[{"type":"Double","basicValue":1},{"type":"Double","basicValue":5.1}]]}}}
Example 2: Flatten a single-row string target range for feature ranking
Inputs:
| data | target | k | |||||||
|---|---|---|---|---|---|---|---|---|---|
| 1 | 5 | 0 | cold | cold | cold | hot | hot | hot | 2 |
| 1.2 | 4.8 | 0.1 | |||||||
| 0.8 | 5.2 | 0 | |||||||
| 5 | 1 | 3 | |||||||
| 5.1 | 1.2 | 3.1 | |||||||
| 4.9 | 0.8 | 2.9 |
Excel formula:
=KBEST_CLASSIF({1,5,0;1.2,4.8,0.1;0.8,5.2,0;5,1,3;5.1,1.2,3.1;4.9,0.8,2.9}, {"cold","cold","cold","hot","hot","hot"}, 2)
Expected output:
{"type":"Double","basicValue":2,"properties":{"selected_feature_count":{"type":"Double","basicValue":2},"sample_count":{"type":"Double","basicValue":6},"feature_count":{"type":"Double","basicValue":3},"class_count":{"type":"Double","basicValue":2},"classes":{"type":"Array","elements":[[{"type":"String","basicValue":"cold"}],[{"type":"String","basicValue":"hot"}]]},"selected_indices":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":3}]]},"support_mask":{"type":"Array","elements":[[{"type":"Boolean","basicValue":true}],[{"type":"Boolean","basicValue":false}],[{"type":"Boolean","basicValue":true}]]},"scores":{"type":"Array","elements":[[{"type":"Double","basicValue":960}],[{"type":"Double","basicValue":600}],[{"type":"Double","basicValue":1980.25}]]},"p_values":{"type":"Array","elements":[[{"type":"Double","basicValue":0.00000646545}],[{"type":"Double","basicValue":0.0000164831}],[{"type":"Double","basicValue":0.00000152493}]]},"selected_data":{"type":"Array","elements":[[{"type":"Double","basicValue":1},{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":1.2},{"type":"Double","basicValue":0.1}],[{"type":"Double","basicValue":0.8},{"type":"Double","basicValue":0}],[{"type":"Double","basicValue":5},{"type":"Double","basicValue":3}],[{"type":"Double","basicValue":5.1},{"type":"Double","basicValue":3.1}],[{"type":"Double","basicValue":4.9},{"type":"Double","basicValue":2.9}]]}}}
Example 3: Select two features across three separated classes
Inputs:
| data | target | k | |||
|---|---|---|---|---|---|
| 0 | 0 | 10 | 1 | left | 2 |
| 0.1 | 0.2 | 9.8 | 1.2 | left | |
| 5 | 5 | 2 | 10 | center | |
| 5.2 | 4.8 | 2.1 | 9.7 | center | |
| 10 | 0 | 5 | 20 | right | |
| 10.1 | 0.2 | 5.2 | 19.8 | right |
Excel formula:
=KBEST_CLASSIF({0,0,10,1;0.1,0.2,9.8,1.2;5,5,2,10;5.2,4.8,2.1,9.7;10,0,5,20;10.1,0.2,5.2,19.8}, {"left";"left";"center";"center";"right";"right"}, 2)
Expected output:
{"type":"Double","basicValue":2,"properties":{"selected_feature_count":{"type":"Double","basicValue":2},"sample_count":{"type":"Double","basicValue":6},"feature_count":{"type":"Double","basicValue":4},"class_count":{"type":"Double","basicValue":3},"classes":{"type":"Array","elements":[[{"type":"String","basicValue":"left"}],[{"type":"String","basicValue":"center"}],[{"type":"String","basicValue":"right"}]]},"selected_indices":{"type":"Array","elements":[[{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":4}]]},"support_mask":{"type":"Array","elements":[[{"type":"Boolean","basicValue":true}],[{"type":"Boolean","basicValue":false}],[{"type":"Boolean","basicValue":false}],[{"type":"Boolean","basicValue":true}]]},"scores":{"type":"Array","elements":[[{"type":"Double","basicValue":5000.17}],[{"type":"Double","basicValue":768}],[{"type":"Double","basicValue":2088.11}],[{"type":"Double","basicValue":6247.12}]]},"p_values":{"type":"Array","elements":[[{"type":"Double","basicValue":0.00000519356}],[{"type":"Double","basicValue":0.0000860645}],[{"type":"Double","basicValue":0.0000192326}],[{"type":"Double","basicValue":0.0000037193}]]},"selected_data":{"type":"Array","elements":[[{"type":"Double","basicValue":0},{"type":"Double","basicValue":1}],[{"type":"Double","basicValue":0.1},{"type":"Double","basicValue":1.2}],[{"type":"Double","basicValue":5},{"type":"Double","basicValue":10}],[{"type":"Double","basicValue":5.2},{"type":"Double","basicValue":9.7}],[{"type":"Double","basicValue":10},{"type":"Double","basicValue":20}],[{"type":"Double","basicValue":10.1},{"type":"Double","basicValue":19.8}]]}}}
Example 4: Rank a single strong feature against a boolean target
Inputs:
| data | target | k |
|---|---|---|
| 0.1 | false | 1 |
| 0.2 | false | |
| 0.3 | false | |
| 1.2 | true | |
| 1.3 | true | |
| 1.4 | true |
Excel formula:
=KBEST_CLASSIF({0.1;0.2;0.3;1.2;1.3;1.4}, {FALSE;FALSE;FALSE;TRUE;TRUE;TRUE}, 1)
Expected output:
{"type":"Double","basicValue":1,"properties":{"selected_feature_count":{"type":"Double","basicValue":1},"sample_count":{"type":"Double","basicValue":6},"feature_count":{"type":"Double","basicValue":1},"class_count":{"type":"Double","basicValue":2},"classes":{"type":"Array","elements":[[{"type":"Boolean","basicValue":false}],[{"type":"Boolean","basicValue":true}]]},"selected_indices":{"type":"Array","elements":[[{"type":"Double","basicValue":1}]]},"support_mask":{"type":"Array","elements":[[{"type":"Boolean","basicValue":true}]]},"scores":{"type":"Array","elements":[[{"type":"Double","basicValue":181.5}]]},"p_values":{"type":"Array","elements":[[{"type":"Double","basicValue":0.000175635}]]},"selected_data":{"type":"Array","elements":[[{"type":"Double","basicValue":0.1}],[{"type":"Double","basicValue":0.2}],[{"type":"Double","basicValue":0.3}],[{"type":"Double","basicValue":1.2}],[{"type":"Double","basicValue":1.3}],[{"type":"Double","basicValue":1.4}]]}}}
Python Code
import numpy as np
from sklearn.feature_selection import SelectKBest as SklearnSelectKBest
from sklearn.feature_selection import f_classif as sklearn_f_classif
def kbest_classif(data, target, k=1):
    """
    Select the top-scoring classification features and return the filtered matrix with score summaries.
    See: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html
    This example function is provided as-is without any representation of accuracy.
    Args:
        data (list[list]): 2D array of numeric feature data with rows as samples and columns as features.
        target (list[list]): Target labels as a single row, single column, or scalar when only one sample is present.
        k (int, optional): Number of top-scoring features to keep. Default is 1.
    Returns:
        dict: Excel data type containing the selected feature matrix, feature scores, p-values, and support mask.
    """
    def _scalar(raw):
        # Unwrap numpy scalar types to their plain Python equivalents.
        return raw.item() if isinstance(raw, np.generic) else raw
    def _entity(raw):
        # Map a Python scalar onto the matching Excel entity cell.
        # bool must be checked before int/float since bool is an int subclass.
        raw = _scalar(raw)
        if isinstance(raw, bool):
            return {"type": "Boolean", "basicValue": bool(raw)}
        if isinstance(raw, (int, float)):
            return {"type": "Double", "basicValue": float(raw)}
        return {"type": "String", "basicValue": str(raw)}
    def _column(values):
        # Render a flat sequence as a one-column Excel array.
        return [[_entity(item)] for item in values]
    def _matrix(rows):
        # Render a 2D sequence as a row-major Excel array of arrays.
        return [[_entity(item) for item in row] for row in rows]
    def _read_features(raw):
        # Coerce the feature input into a finite 2D float matrix; a bare
        # scalar is promoted to a 1x1 grid. Returns (matrix, error).
        grid = raw if isinstance(raw, list) else [[raw]]
        if not grid or any(not isinstance(row, list) or not row for row in grid):
            return None, "Error: data must be a non-empty 2D list"
        if len({len(row) for row in grid}) > 1:
            return None, "Error: data must be a rectangular 2D list"
        matrix = np.array(grid, dtype=float)
        if matrix.ndim != 2 or matrix.size == 0:
            return None, "Error: data must be a non-empty 2D list"
        if not np.isfinite(matrix).all():
            return None, "Error: data must contain only finite numeric values"
        return matrix, None
    def _read_labels(raw, expected):
        # Flatten a scalar / single row / single column into a label vector and
        # collect the distinct classes in first-appearance order.
        # Returns (labels, classes, error).
        if isinstance(raw, list):
            if not raw:
                return None, None, "Error: target must be non-empty"
            if all(not isinstance(entry, list) for entry in raw):
                flat = raw
            elif len(raw) == 1:
                flat = raw[0]
            elif all(isinstance(row, list) and len(row) == 1 for row in raw):
                flat = [row[0] for row in raw]
            else:
                return None, None, "Error: target must be a single row or column"
        else:
            flat = [raw]
        if len(flat) != expected:
            return None, None, "Error: target length must match sample count"
        labels = []
        classes = []
        for entry in flat:
            entry = _scalar(entry)
            if isinstance(entry, bool):
                entry = bool(entry)
            elif isinstance(entry, str):
                if not entry.strip():
                    return None, None, "Error: target labels must not be blank"
            elif isinstance(entry, (int, float)):
                if not np.isfinite(float(entry)):
                    return None, None, "Error: target labels must be finite"
                entry = float(entry) if isinstance(entry, float) else int(entry)
            else:
                return None, None, "Error: target labels must be scalar string, boolean, or numeric values"
            labels.append(entry)
            # Exact type match keeps e.g. False and 0 as distinct classes.
            already_seen = any(type(known) is type(entry) and known == entry for known in classes)
            if not already_seen:
                classes.append(entry)
        if len(classes) < 2:
            return None, None, "Error: target must contain at least 2 classes"
        return np.asarray(labels, dtype=object), classes, None
    try:
        features, problem = _read_features(data)
        if problem:
            return problem
        labels, classes, problem = _read_labels(target, features.shape[0])
        if problem:
            return problem
        keep_count = int(k)
        if not 1 <= keep_count <= features.shape[1]:
            return f"Error: k must be between 1 and {features.shape[1]}"
        selector = SklearnSelectKBest(score_func=sklearn_f_classif, k=keep_count)
        kept = np.asarray(selector.fit_transform(features, labels), dtype=float)
        mask = selector.get_support().tolist()
        # Report 1-based column positions for the Excel audience.
        chosen = [position + 1 for position, flag in enumerate(mask) if flag]
        score_vector = np.atleast_1d(np.asarray(selector.scores_, dtype=float))
        p_vector = np.atleast_1d(np.asarray(selector.pvalues_, dtype=float))
        return {
            "type": "Double",
            "basicValue": float(len(chosen)),
            "properties": {
                "selected_feature_count": {"type": "Double", "basicValue": float(len(chosen))},
                "sample_count": {"type": "Double", "basicValue": float(features.shape[0])},
                "feature_count": {"type": "Double", "basicValue": float(features.shape[1])},
                "class_count": {"type": "Double", "basicValue": float(len(classes))},
                "classes": {"type": "Array", "elements": _column(classes)},
                "selected_indices": {"type": "Array", "elements": _column(chosen)},
                "support_mask": {"type": "Array", "elements": _column(mask)},
                "scores": {"type": "Array", "elements": _column(score_vector.tolist())},
                "p_values": {"type": "Array", "elements": _column(p_vector.tolist())},
                "selected_data": {"type": "Array", "elements": _matrix(kept.tolist())}
            }
        }
    except Exception as e:
        return f"Error: {str(e)}"