作者:亲眼耳闻目睹42 | 来源:互联网 | 2023-02-04 23:44
1> learner..:
希望这对你有用
创建 DataFrame(DF)
// Build two small two-column DataFrames from parallel arrays: every label is paired
// with the sentence found at the same array position.
// Assumes a spark-shell style session where `sc` and the `toDF` implicits are
// already in scope — TODO confirm.
val pre: Array[String] = Array(
  "CRC Industries",
  "Dixon value",
  "3M INdustries",
  "Dixon coupling valve"
)
val rea: Array[String] = Array("405048011-62815", "630-0746", "4444-444", "555-55")
// First frame: columns (label1, sentence1).
val df1 = sc.parallelize(rea.zip(pre)).toDF("label1", "sentence1")

val preasons2: Array[String] = Array("Tata", "WestSide", "Reliance", "V industries")
val reasonsI2: Array[String] = Array(
  "222-2222-5555",
  "7777-88886",
  "22222-22224",
  "33333-3333"
)
// Second frame: columns (label2, sentence2).
val df2 = sc.parallelize(reasonsI2.zip(preasons2)).toDF("label2", "sentence2")
字符串索引器(StringIndexer)
import org.apache.spark.ml.feature.StringIndexer
// Attach a numeric index column to each frame by fitting a StringIndexer on its
// label column; the two index columns are later used as the join key.
// NOTE(review): the index given to a label comes from StringIndexer's label ordering
// (frequency-based by default), not from the row's position in the frame — confirm
// that the resulting pairing between df1 and df2 rows is the one intended.
val indexer = new StringIndexer().setInputCol("label1").setOutputCol("label1Index")
val indexed = indexer.fit(df1).transform(df1)
indexed.show()

// Same treatment for the second frame, producing "label2Index".
val indexer1 = new StringIndexer().setInputCol("label2").setOutputCol("label2Index")
val indexed1 = indexer1.fit(df2).transform(df2)
indexed1.show()
连接(Join)
// Join the two indexed frames where their generated index values are equal, then
// remove both helper index columns so only the original four columns remain.
val joinCondition = indexed("label1Index") === indexed1("label2Index")
val rnd_reslt12 = indexed
  .join(indexed1, joinCondition)
  .drop(indexed("label1Index"))
  .drop(indexed1("label2Index"))
rnd_reslt12.show()
+---------------+--------------------+-------------+------------+
| label1| sentence1| label2| sentence2|
+---------------+--------------------+-------------+------------+
| 630-0746| Dixon value|222-2222-5555| Tata|
| 4444-444| 3M INdustries| 22222-22224| Reliance|
| 555-55|Dixon coupling valve| 33333-3333|V industries|
|405048011-62815| CRC Industries| 7777-88886| WestSide|
+---------------+--------------------+-------------+------------+