-
Notifications
You must be signed in to change notification settings - Fork 327
FeatureHasher #652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
FeatureHasher #652
Changes from 18 commits
Commits
Show all changes
29 commits
Select commit
Hold shift + click to select a range
c933914
FeatureHasher
0801108
tidy
ac26205
tidying comments
7e9e8cc
Merge branch 'master' into ml/FeatureHasher
imback82 3652b55
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
d6094e3
adding extra test for FeatureBase
19626f7
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
ffd0cfc
Trigger Build
a0356cc
missing file header
7921e04
comments
b5fcee2
naming better
01d40f8
indentation
89694dc
changes after feedback
2d8eaa1
test summary
bad829a
trigger build
012bd6b
Merge branch 'master' into ml/FeatureHasher
GoEddie cae76fc
Merge branch 'master' into ml/FeatureHasher
imback82 105d690
Merge branch 'master' into ml/FeatureHasher
GoEddie dd75d78
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
c2926b3
Merge branch 'master' into ml/FeatureHasher
suhsteve 02f06a9
changes after review
72741fe
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
ecb9e5f
formatting
GoEddie b28c1a7
formatting
GoEddie e94a601
Reverting change
GoEddie 88eec00
Merge branch 'master' into ml/FeatureHasher
GoEddie 752e48d
Merge branch 'master' into ml/FeatureHasher
GoEddie 54d55de
retrigger build
e2aeab7
Merge branch 'master' into ml/FeatureHasher
imback82 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
38 changes: 38 additions & 0 deletions
38
src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.Spark.ML.Feature; | ||
using Microsoft.Spark.ML.Feature.Param; | ||
using Xunit; | ||
|
||
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature | ||
{ | ||
public static class FeatureBaseTests<T> | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
/// <summary> | ||
/// Tests the common functionality across all ML.Feature classes. | ||
/// </summary> | ||
/// <param name="testObject">The object that implemented FeatureBase</param> | ||
/// <param name="paramName">The name of a parameter that can be set on this object</param> | ||
/// <param name="paramValue">A parameter value that can be set on this object</param> | ||
internal static void TestBase( | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
FeatureBase<T> testObject, | ||
string paramName, | ||
object paramValue) | ||
{ | ||
Assert.NotEmpty(testObject.ExplainParams()); | ||
|
||
Param param = testObject.GetParam(paramName); | ||
Assert.NotEmpty(param.Doc); | ||
Assert.NotEmpty(param.Name); | ||
Assert.Equal(param.Parent, testObject.Uid()); | ||
|
||
Assert.NotEmpty(testObject.ExplainParam(param)); | ||
testObject.Set(param, paramValue); | ||
Assert.IsAssignableFrom<Identifiable>(testObject.Clear(param)); | ||
|
||
Assert.IsType<string>(testObject.Uid()); | ||
} | ||
} | ||
} |
61 changes: 61 additions & 0 deletions
61
src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using Microsoft.Spark.ML.Feature; | ||
using Microsoft.Spark.Sql; | ||
using Microsoft.Spark.Sql.Types; | ||
using Xunit; | ||
|
||
namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature | ||
{ | ||
[Collection("Spark E2E Tests")] | ||
public class FeatureHasherTests | ||
{ | ||
private readonly SparkSession _spark; | ||
|
||
public FeatureHasherTests(SparkFixture fixture) | ||
{ | ||
_spark = fixture.Spark; | ||
} | ||
|
||
/// <summary> | ||
/// Create a <see cref="DataFrame"/>, create a <see cref="FeatureHasher"/> and test the | ||
/// available methods. Test the FeatureBase methods using <see cref="FeatureBaseTests{T}"/>. | ||
/// </summary> | ||
[Fact] | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
public void TestFeatureHasher() | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
DataFrame dataFrame = _spark.CreateDataFrame( | ||
new List<GenericRow> | ||
{ | ||
new GenericRow(new object[] {2.0D, true, "1", "foo"}), | ||
new GenericRow(new object[] {3.0D, false, "2", "bar"}) | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
}, | ||
new StructType(new List<StructField> | ||
{ | ||
new StructField("real", new DoubleType()), | ||
new StructField("bool", new BooleanType()), | ||
new StructField("stringNum", new StringType()), | ||
new StructField("string", new StringType()) | ||
})); | ||
|
||
FeatureHasher hasher = new FeatureHasher() | ||
.SetInputCols(new List<string>() {"real", "bool", "stringNum", "string"}) | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
.SetOutputCol("features") | ||
.SetCategoricalCols(new List<string>() {"real", "string"}) | ||
.SetNumFeatures(10); | ||
|
||
Assert.IsType<string>(hasher.GetOutputCol()); | ||
Assert.IsType <string[]>(hasher.GetInputCols()); | ||
Assert.IsType<List<string>>(hasher.GetCategoricalCols()); | ||
Assert.IsType<int>(hasher.GetNumFeatures()); | ||
Assert.IsType<StructType>(hasher.TransformSchema(dataFrame.Schema())); | ||
Assert.IsType<DataFrame>(hasher.Transform(dataFrame)); | ||
|
||
FeatureBaseTests<FeatureHasher>.TestBase(hasher, "numFeatures", 1000); | ||
} | ||
} | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System.Collections.Generic; | ||
using System.Linq; | ||
using Microsoft.Spark.Interop; | ||
using Microsoft.Spark.Interop.Ipc; | ||
using Microsoft.Spark.Sql; | ||
using Microsoft.Spark.Sql.Types; | ||
|
||
namespace Microsoft.Spark.ML.Feature | ||
{ | ||
public class FeatureHasher: FeatureBase<FeatureHasher>, IJvmObjectReferenceProvider | ||
{ | ||
private static readonly string s_featureHasherClassName = | ||
"org.apache.spark.ml.feature.FeatureHasher"; | ||
|
||
internal FeatureHasher() : base(s_featureHasherClassName) | ||
{ | ||
} | ||
|
||
internal FeatureHasher(string uid) : base(s_featureHasherClassName, uid) | ||
{ | ||
} | ||
|
||
internal FeatureHasher(JvmObjectReference jvmObject) : base(jvmObject) | ||
{ | ||
} | ||
|
||
JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; | ||
|
||
/// <summary> | ||
/// Loads the <see cref="FeatureHasher"/> that was previously saved using Save. | ||
/// </summary> | ||
/// <param name="path"> | ||
/// The path the previous <see cref="FeatureHasher"/> was saved to. | ||
/// </param> | ||
/// <returns>New <see cref="FeatureHasher"/> object</returns> | ||
public static FeatureHasher Load(string path) => | ||
WrapAsFeatureHasher( | ||
SparkEnvironment.JvmBridge.CallStaticJavaMethod( | ||
s_featureHasherClassName,"load", path)); | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
/// <summary> | ||
/// Gets a list of the columns which have been specified as categorical columns. | ||
/// </summary> | ||
/// <returns>List of categorical columns, set by SetCategoricalCols</returns> | ||
public IEnumerable<string> GetCategoricalCols() => | ||
((string[])_jvmObject.Invoke("getCategoricalCols")).ToList(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is |
||
|
||
/// <summary> | ||
/// Marks columns as categorical columns. | ||
/// </summary> | ||
/// <param name="value">List of column names to mark as categorical columns</param> | ||
/// <returns>New <see cref="FeatureHasher"/> object</returns> | ||
public FeatureHasher SetCategoricalCols(IEnumerable<string> value) => | ||
WrapAsFeatureHasher(_jvmObject.Invoke("setCategoricalCols", value)); | ||
|
||
/// <summary> | ||
/// Gets the columns that the <see cref="FeatureHasher"/> should read from and convert into | ||
/// hashes. This would have been set by SetInputCols. | ||
/// </summary> | ||
/// <returns>List of the input columns, set by SetInputCols</returns> | ||
public IEnumerable<string> GetInputCols() => (string[])_jvmObject.Invoke("getInputCols"); | ||
|
||
/// <summary> | ||
/// Sets the columns that the <see cref="FeatureHasher"/> should read from and convert into | ||
/// hashes. | ||
/// </summary> | ||
/// <param name="value">The names of the columns to use as the source of the hash</param> | ||
/// <returns>New <see cref="FeatureHasher"/> object</returns> | ||
public FeatureHasher SetInputCols(IEnumerable<string> value) => | ||
WrapAsFeatureHasher(_jvmObject.Invoke("setInputCols", value)); | ||
|
||
/// <summary> | ||
/// Gets the number of features that should be used. Since a simple modulo is used to | ||
/// transform the hash function to a column index, it is advisable to use a power of two | ||
/// as the numFeatures parameter; otherwise the features will not be mapped evenly to the | ||
/// columns. | ||
/// </summary> | ||
/// <returns>The number of features to be used</returns> | ||
public int GetNumFeatures() => (int)_jvmObject.Invoke("getNumFeatures"); | ||
|
||
/// <summary> | ||
/// Sets the number of features that should be used. Since a simple modulo is used to | ||
/// transform the hash function to a column index, it is advisable to use a power of two as | ||
/// the numFeatures parameter; otherwise the features will not be mapped evenly to the | ||
/// columns. | ||
/// </summary> | ||
/// <param name="value">int value of number of features</param> | ||
/// <returns>New <see cref="FeatureHasher"/> object</returns> | ||
public FeatureHasher SetNumFeatures(int value) => | ||
WrapAsFeatureHasher(_jvmObject.Invoke("setNumFeatures", value)); | ||
|
||
/// <summary> | ||
/// Gets the name of the column the output data will be written to. This is set by | ||
/// SetOutputCol. | ||
/// </summary> | ||
/// <returns>string, the output column</returns> | ||
public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol"); | ||
|
||
/// <summary> | ||
/// Sets the name of the new column in the <see cref="DataFrame"/> created by Transform. | ||
/// </summary> | ||
/// <param name="value">The name of the new column which will contain the hash</param> | ||
/// <returns>New <see cref="FeatureHasher"/> object</returns> | ||
public FeatureHasher SetOutputCol(string value) => | ||
WrapAsFeatureHasher(_jvmObject.Invoke("setOutputCol", value)); | ||
|
||
/// <summary> | ||
/// Transforms the input <see cref="DataFrame"/>. It is recommended that you validate that | ||
/// the transform will succeed by calling TransformSchema. | ||
/// </summary> | ||
/// <param name="value">Input <see cref="DataFrame"/> to transform</param> | ||
/// <returns>Transformed <see cref="DataFrame"/></returns> | ||
public DataFrame Transform(DataFrame value) => | ||
new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", value)); | ||
|
||
/// <summary> | ||
/// Check transform validity and derive the output schema from the input schema. | ||
/// | ||
/// This checks for validity of interactions between parameters during Transform and | ||
/// raises an exception if any parameter value is invalid. | ||
/// | ||
/// Typical implementation should first conduct verification on schema change and parameter | ||
/// validity, including complex parameter interaction checks. | ||
/// </summary> | ||
/// <param name="value"> | ||
/// The <see cref="StructType"/> of the <see cref="DataFrame"/> which will be transformed. | ||
/// </param> | ||
/// <returns> | ||
/// The <see cref="StructType"/> of the output schema that would have been derived from the | ||
/// input schema, if Transform had been called. | ||
/// </returns> | ||
public StructType TransformSchema(StructType value) => | ||
new StructType( | ||
(JvmObjectReference)_jvmObject.Invoke("transformSchema", | ||
DataType.FromJson(_jvmObject.Jvm, value.Json))); | ||
GoEddie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
private static FeatureHasher WrapAsFeatureHasher(object obj) => | ||
new FeatureHasher((JvmObjectReference)obj); | ||
} | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.