finishes classification scripts
parent 8f744a08be
commit 2535683cdc
@@ -1,12 +1,9 @@
-import re
-import string
 import numpy as np
 import pandas as pd
 from datetime import datetime
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from datasets import load_dataset
 from transformers.pipelines.pt_utils import KeyDataset
-from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
 
 
 #%%
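The dropped imports (re, string, and the funs.CleanTweets helpers) suggest the text cleaning now happens in the prep step rather than in this script, which only streams already-cleaned text through the classifier. A minimal sketch of that input path, assuming the prep CSV configured in the next hunk and the cleanContent column that appears in a later hunk header:

from datasets import load_dataset
from transformers.pipelines.pt_utils import KeyDataset

# Assumed prep file (see the config hunk below); the "csv" loader places it in a "train" split.
dataset = load_dataset("csv", data_files="data/OUT/Tweets-Classified-Fake-Prep.csv")
# KeyDataset exposes a single column so the pipeline can stream it in batches.
tweets = KeyDataset(dataset["train"], "cleanContent")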
@@ -26,11 +23,11 @@ di = "data/IN/"
 ud = "data/OUT/"
 
 # Name of file that all senator data will be written to
-senCSV = "SenatorsTweets-OnlyCov.csv"
+senCSV = "Tweets-Classified-Topic-Results.csv"
 
 # Name of Classify datafile
-senCSVClassifiedPrep = "Tweets-Classified-Topic-Prep.csv"
-senCSVClassifiedResult = "Tweets-Classified-Topic-Results.csv"
+senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"
+senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"
 
 # don't change this one
 senCSVPath = wd + ud + senCSV
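Put together, the configuration after this change reads roughly as below (wd is defined earlier in the script, outside these hunks); the role comments are an interpretation, not part of the diff:

ud = "data/OUT/"
senCSV = "Tweets-Classified-Topic-Results.csv"                 # input: output of the topic classifier
senCSVClassifiedPrep = "Tweets-Classified-Fake-Prep.csv"       # intermediate prep file for this run
senCSVClassifiedResult = "Tweets-Classified-Fake-Results.csv"  # final fake-classification results
senCSVPath = wd + ud + senCSV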
@@ -56,9 +53,9 @@ dfClassify['fake'] = False
 # HowTo:
 # https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification
 # https://stackoverflow.com/questions/75932605/getting-the-input-text-from-transformers-pipeline
-pipe = pipeline("text-classification", model="bvrau/covid-twitter-bert-v2-struth")
-model = AutoModelForSequenceClassification.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
-tokenizer = AutoTokenizer.from_pretrained("bvrau/covid-twitter-bert-v2-struth")
+pipe = pipeline("text-classification", model="/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
+model = AutoModelForSequenceClassification.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
+tokenizer = AutoTokenizer.from_pretrained("/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/")
 
 # Source https://www.kaggle.com/code/daotan/tweet-analysis-with-transformers-bert
 
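This hunk swaps the hub model (bvrau/covid-twitter-bert-v2-struth) for a locally fine-tuned checkpoint. A sketch of how such a checkpoint is typically loaded, assuming the directory contains the files written by save_pretrained (config, tokenizer files, weights):

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Path to the fine-tuned checkpoint directory (assumed to be a save_pretrained() output).
checkpoint = "/home/michael/Documents/PS/Data/collectTweets/models/FakeClass/2023-08-15_14-35-43/"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Passing the model and tokenizer objects avoids loading the checkpoint a second time,
# which happens when pipeline() is also given the raw path as in the diff above.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)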
@@ -100,8 +97,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
 # Exactly the same output as before, but the content are passed
 # as batches to the model
 # %%
-dfClassify['output_label'] = output_labels
-dfClassify['output_score'] = output_score
+dfClassify['output_label_fake'] = output_labels
+dfClassify['output_score_fake'] = output_score
 
 timeEnd = datetime.now()
 timeTotal = timeEnd - timeStart
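The hunk header shows the classification loop itself. A minimal sketch of that loop, under the assumptions that output_labels and output_score are plain lists and that the truncation argument (cut off in the header) is simply truncation=True:

output_labels = []
output_score = []

# Stream the prepared tweets through the classifier, 8 at a time.
# Each `out` is a dict like {'label': ..., 'score': ...} on current transformers
# versions (older versions yield a one-element list instead).
for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, truncation=True):
    output_labels.append(out['label'])
    output_score.append(out['score'])

# The _fake suffix keeps these columns distinct from the topic-classification output.
dfClassify['output_label_fake'] = output_labels
dfClassify['output_score_fake'] = output_score

The same import cleanup and loop appear in the second script below, apparently the topic-classification counterpart, which writes to the _topicCov columns instead.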
@@ -1,12 +1,9 @@
-import re
-import string
 import numpy as np
 import pandas as pd
 from datetime import datetime
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
 from datasets import load_dataset
 from transformers.pipelines.pt_utils import KeyDataset
-from funs.CleanTweets import remove_URL, remove_emoji, remove_html, remove_punct
 
 
 #%%
@@ -99,8 +96,8 @@ for out in pipe(KeyDataset(dataset['train'], "cleanContent"), batch_size=8, trun
 # Exactly the same output as before, but the content are passed
 # as batches to the model
 # %%
-dfClassify['output_label'] = output_labels
-dfClassify['output_score'] = output_score
+dfClassify['output_label_topicCov'] = output_labels
+dfClassify['output_score_topicCov'] = output_score
 
 timeEnd = datetime.now()
 timeTotal = timeEnd - timeStart
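As in the first script, a hypothetical final step would persist the classified frame to the configured results file; the write call itself is outside the hunks shown, so the line below is an assumption:

# Hypothetical: write the topic-classification results to the configured output path.
dfClassify.to_csv(wd + ud + senCSVClassifiedResult, index=False)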