from gliner import GLiNER
from faker import Faker
from faker.providers import job
import google.generativeai as genai
import re
import warnings
from rapidfuzz import process, utils

warnings.filterwarnings("ignore")

genai.configure(api_key="key")
model_llm = genai.GenerativeModel("gemini-1.5-flash-002")

fake = Faker()
fake.add_provider(job)

model_gliner = GLiNER.from_pretrained("urchade/gliner_small-v2.1")
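# Note: GLiNER is a zero-shot NER model, so the labels passed to it below are
# free-form strings rather than a fixed tag set. Faker supplies the fake
# replacement values (names, countries, job titles).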
# Let's say we have this prompt, along with context that we want to anonymize
# before sending it to the LLM.
prompt = """Given the context, answer the question. \n context: Hi, I am Mayank Laddha. I live in India. I love my country. But I would like to go to Singapore once. I am a software developer.\n question: Where does Mayank Laddha want to go?"""
# Perform entity prediction
labels = ["Person", "Country", "Profession"]
entities = model_gliner.predict_entities(prompt, labels, threshold=0.4)
print(entities)
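# Each predicted entity is a dict with (at least) "text", "label", and "score"
# keys, e.g. something like {"text": "Mayank Laddha", "label": "Person", ...}.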
# Create a replacement dictionary mapping each detected entity to a fake value.
replacement = {}
for entity in entities:
    if "Person" in entity["label"] and entity["text"] not in replacement:
        # Generate a few fake names and make sure we never pick the original one.
        fake_set = {fake.name() for _ in range(3)}
        fake_set.discard(entity["text"])
        new_name = fake_set.pop()
        replacement[entity["text"]] = new_name
    elif "Country" in entity["label"] and entity["text"] not in replacement:
        name_set = {fake.country() for _ in range(10)}
        print(name_set)
        name_set.discard(entity["text"])
        new_name = name_set.pop()
        replacement[entity["text"]] = new_name
    elif "Profession" in entity["label"] and entity["text"] not in replacement:
        # Keep only single-word job titles so word-level replacement stays simple.
        name_set = {fake.job() for _ in range(20)}
        name_set = {k for k in name_set if len(k.split()) == 1}
        print(name_set)
        name_set.discard(entity["text"])
        new_name = name_set.pop()
        replacement[entity["text"]] = new_name
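# Illustrative only: the fakes are random, but depending on what GLiNER detects,
# `replacement` might look like
# {"Mayank Laddha": "<fake name>", "India": "<fake country>", ...}.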
# Also create a reverse dictionary so the fake values can be mapped back later.
replacement_reversed = {v: k for k, v in replacement.items()}

# Perform the replacement in the prompt.
for k, v in replacement.items():
    # Split text into a list of words
    words = prompt.split()
    n = len(k.split())
    # Build n-gram candidates so the key appears fully in choices.
    choices = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
    matches = process.extract(k, choices, limit=1, processor=utils.default_process)
    for match in matches:
        if match[1] > 80:
            # Escape the matched text so it is treated literally, not as a regex.
            prompt = re.sub(re.escape(match[0]), v, prompt, flags=re.IGNORECASE)

# print(prompt)  # uncomment to inspect the anonymized prompt
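# At this point the prompt should read like the original text, but with each
# detected entity swapped for its fake counterpart, so the real details never
# reach the LLM.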
response = model_llm.generate_content(prompt)
content = response.text
print("llm response", content)
# Perform the replacement again, this time mapping fake values back to the originals.
for k, v in replacement_reversed.items():
    words = content.split()
    n = len(k.split())
    choices = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
    matches = process.extract(k, choices, limit=1, processor=utils.default_process)
    for match in matches:
        if match[1] > 80:
            content = re.sub(re.escape(match[0]), v, content, flags=re.IGNORECASE)

print("final result", content)