DistilBERT IBD Embeddings (ONNX)

Description

This model creates text embeddings tailored for clinical documents related to inflammatory bowel disease (IBD). These embeddings represent the meaning of medical text more effectively, which makes them well suited to downstream tasks such as classifying IBD-related documents, finding patient groups, or retrieving clinical information.

Copy S3 URI

How to use

from sparknlp.base import DocumentAssembler
from sparknlp_jsl.annotator import Tokenizer, DistilBertEmbeddings
from pyspark.ml import Pipeline

# Stage 1: reads the "text" column and writes the "document" column.
assembler = DocumentAssembler()
assembler.setInputCol("text")
assembler.setOutputCol("document")

# Stage 2: reads "document" and writes "token".
word_tokenizer = Tokenizer()
word_tokenizer.setInputCols(["document"])
word_tokenizer.setOutputCol("token")

# Stage 3: pretrained embeddings model; reads "document" and "token",
# writes "distilbert".
ibd_embeddings = (
    DistilBertEmbeddings.pretrained("distil_ibd_bert_onnx", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("distilbert")
)

# Stages run in the order listed.
stages = [assembler, word_tokenizer, ibd_embeddings]
pipeline = Pipeline(stages=stages)

# Single-row, single-column sample DataFrame (`spark` is the active session).
sample_text = "The patient reports intermittent abdominal pain and loose stools over the past three months."
data = spark.createDataFrame([[sample_text]]).toDF("text")

# Fit, then transform the same DataFrame.
model = pipeline.fit(data)
result = model.transform(data)

# Print the embeddings field of the "distilbert" output column.
embeddings_view = result.select("distilbert.embeddings")
embeddings_view.show()

from johnsnowlabs import nlp, medical

# Stage 1: reads the "text" column and writes the "document" column.
assembler = nlp.DocumentAssembler()
assembler.setInputCol("text")
assembler.setOutputCol("document")

# Stage 2: reads "document" and writes "token".
word_tokenizer = nlp.Tokenizer()
word_tokenizer.setInputCols(["document"])
word_tokenizer.setOutputCol("token")

# Stage 3: pretrained embeddings model; reads "document" and "token",
# writes "distilbert".
ibd_embeddings = (
    nlp.DistilBertEmbeddings.pretrained("distil_ibd_bert_onnx", "en", "clinical/models")
    .setInputCols(["document", "token"])
    .setOutputCol("distilbert")
)

# Stages run in the order listed.
stages = [assembler, word_tokenizer, ibd_embeddings]
pipeline = nlp.Pipeline(stages=stages)

# Single-row, single-column sample DataFrame (`spark` is the active session).
sample_text = "The patient reports intermittent abdominal pain and loose stools over the past three months."
data = spark.createDataFrame([[sample_text]]).toDF("text")

# Fit, then transform the same DataFrame.
model = pipeline.fit(data)
result = model.transform(data)

# Print the embeddings field of the "distilbert" output column.
embeddings_view = result.select("distilbert.embeddings")
embeddings_view.show()

import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotators._
// DistilBertEmbeddings is defined in the `embeddings` package, which neither
// wildcard import above brings into scope; without this line the snippet
// fails to compile at the `DistilBertEmbeddings.pretrained(...)` call.
import com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings
import org.apache.spark.ml.Pipeline
// Needed for `.toDF` on a local Seq (`spark` is the active SparkSession).
import spark.implicits._

// Stage 1: reads the "text" column and writes the "document" column.
val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Stage 2: reads "document" and writes "token".
val tokenizer = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")

// Stage 3: pretrained embeddings model; reads "document" and "token",
// writes "distilbert".
val distilBert = DistilBertEmbeddings.pretrained("distil_ibd_bert_onnx", "en", "clinical/models")
  .setInputCols(Array("document", "token"))
  .setOutputCol("distilbert")

// Stages run in the order listed.
val pipeline = new Pipeline().setStages(Array(
  documentAssembler,
  tokenizer,
  distilBert
))

// Single-row sample DataFrame with one "text" column.
val data = Seq(
  "The patient reports intermittent abdominal pain and loose stools over the past three months."
).toDF("text")

// Fit, then transform the same DataFrame.
val model = pipeline.fit(data)
val result = model.transform(data)

// `false` turns off truncation so the full embedding vectors are printed.
result.select("distilbert.embeddings").show(false)

Results


+--------------------+
|          embeddings|
+--------------------+
|[[0.23429918, 0.3...|
+--------------------+

Model Information

Model Name: distil_ibd_bert_onnx
Compatibility: Healthcare NLP 6.1.0+
License: Licensed
Edition: Official
Input Labels: [document, token]
Output Labels: [distilbert]
Language: en
Size: 247.2 MB
Case sensitive: true