Wednesday, 6 March 2019

Project 2: Finding negative sentiments in a text document

                                         Negative sentiments in texts

Task: Our task is to find negative-sentiment words in a text document. Many negative-sentiment words start with 'dis', such as 'dissatisfied' or 'disastrous'; other negative-sentiment words include 'shameful' and 'negative'. Here we use Python's regular-expression functions (re.search) to find them.

Start by following the steps given below:
Step 1: Finding a particular group of words

>>> f=(w for w in set(text1) if re.search('^a.*(ed)$', w))
>>> f
>>> sorted(f)
['abandoned', 'abased', 'abashed', 'abated', 'abed', 'abhorred', 'abided', 'abominated', 'abounded', 'abridged', 'absorbed', 'abstained', 'abstracted', 'accelerated', 'accommodated', 'accompanied', 'accomplished', 'accosted', 'accounted', 'accumulated', 'accursed', 'accustomed', 'ached', 'achieved', 'acquainted', 'acquiesced', 'acquired', 'acted', 'actuated', 'adapted', 'added', 'addressed', 'administered', 'admitted', 'admonished', 'adopted', 'adorned', 'advanced', 'advertised', 'advised', 'aerated', 'affected', 'affixed', 'afflicted', 'afforded', 'affrighted', 'affronted', 'aged', 'aggregated', 'aggrieved', 'agitated', 'agonized', 'agreed', 'aimed', 'alarmed', 'alleged', 'allotted', 'allowed', 'alluded', 'allured', 'altered', 'amounted', 'amplified', 'amputated', 'analysed', 'anchored', 'animated', 'annihilated', 'announced', 'anointed', 'answered', 'anticipated', 'antlered', 'appalled', 'apparelled', 'appeared', 'applied', 'appointed', 'apportioned', 'apprised', 'approached', 'appropriated', 'approved', 'arched', 'argued', 'armed', 'arranged', 'arrayed', 'arrested', 'arrived', 'articulated', 'ascended', 'ascertained', 'ascribed', 'asked', 'assailed', 'assembled', 'assented', 'asserted', 'assigned', 'assisted', 'associated', 'assumed', 'assured', 'astonished', 'attached', 'attacked', 'attained', 'attended', 'attenuated', 'attested', 'attracted', 'attuned', 'augmented', 'authenticated', 'authorized', 'averred', 'averted', 'awaited', 'awakened', 'awarded', 'awed']

>>> f=(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis.*', w) or re.search('^sh.*', w))
>>> sorted(f)
['dis', 'disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disbands', 'disbelief', 'discerned', 'discernible', 'discernment', 'discerns', 'discharge', 'discharged', 'discharges', 'discharging', 'disciple', 'disciples', 'discipline', 'disclosed', 'disclosures', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disdain', 'disdained', 'disease', 'disembowelled', 'disembowelments', 'disencumber', 'disengaged', 'disentangling', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dish', 'disheartening', 'dishes', 'dishonour', 'disincline', 'disinfecting', 'disintegrate', 'disinterested', 'disinterred', 'disjointedly', 'disks', 'dislike', 'dislocated', 'dislocation', 'dislodged', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'dismember', 'dismembered', 'dismemberer', 'dismembering', 'dismemberment', 'dismissal', 'dismissed', 'disobedience', 'disobey', 'disobeying', 'disorder', 'disordered', 'disorderliness', 'disorderly', 'disorders', 'disparagement', 'dispel', 'dispensed', 'dispenses', 'dispersed', 'dispirited', 'dispirits', 'displaced', 'display', 'displayed', 'displays', 'disport', 'disposed', 'disposing', 'disposition', 'disproved', 'dispute', 'disputes', 'disputing', 'disquietude', 'disrated', 'disreputable', 'dissatisfaction', 'dissect', 'dissemble', 'dissembling', 'dissent', 'dissertations', 'dissimilar', 'dissociated', 'dissolutions', 'dissolve', 'dissolved', 'distance', 'distances', 'distant', 'distantly', 'distended', 'distension', 'distilled', 'distinct', 'distinction', 'distinctions', 'distinctive', 'distinctly', 
'distinguish', 'distinguished', 'distinguishing', 'distortions', 'distracted', 'distraction', 'distress', ......']

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis.*', w) or re.search('^sh.*', w))
>>> f.most_common(20)
[('discernment', 1), ('shallows', 1), ('shingled', 1), ('shutters', 1), ('discrimination', 1), ('disbelief', 1), ('shivered', 1), ('shifting', 1), ('shocked', 1), ('shrouded', 1), ('disrated', 1), ('shadowy', 1), ('discourse', 1), ('disdain', 1), ('shady', 1), ('shining', 1), ('disparagement', 1), ('shared', 1), ('shambling', 1), ('shapes', 1)]

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis.*', w) or re.search('^sh.*', w))
>>> f.most_common(20)
[('discernment', 1), ('shallows', 1), ('shingled', 1), ('shutters', 1), ('discrimination', 1), ('disbelief', 1), ('shivered', 1), ('shifting', 1), ('shocked', 1), ('shrouded', 1), ('disrated', 1), ('shadowy', 1), ('discourse', 1), ('disdain', 1), ('shady', 1), ('shining', 1), ('disparagement', 1), ('shared', 1), ('shambling', 1), ('shapes', 1)]

Step 2: Refinement

>>> f=FreqDist(w for w in set(text1) if re.search('^dis[a-s].*', w))
>>> sorted(f)
['disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disbands', 'disbelief', 'discerned', 'discernible', 'discernment', 'discerns', 'discharge', 'discharged', 'discharges', 'discharging', 'disciple', 'disciples', 'discipline', 'disclosed', 'disclosures', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disdain', 'disdained', 'disease', 'disembowelled', 'disembowelments', 'disencumber', 'disengaged', 'disentangling', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dish', 'disheartening', 'dishes', 'dishonour', 'disincline', 'disinfecting', 'disintegrate', 'disinterested', 'disinterred',.....................]

>>> f=FreqDist(w for w in set(text1) if not 'displ' in w and re.search('^dis[a-s].*', w)) //Here we want all words that start with 'dis' and whose 4th letter is in the range a to s, excluding any word that contains 'displ'//
>>> sorted(f)
['disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disbands', 'disbelief', 'discerned', 'discernible', 'discernment', 'discerns', 'discharge', 'discharged', 'discharges', 'discharging', 'disciple', 'disciples', 'discipline', 'disclosed', 'disclosures', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disdain', 'disdained', 'disease', 'disembowelled', 'disembowelments', 'disencumber', 'disengaged',

Step 3: Further Refinement to look for specific words

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis.*', w) or re.search('^sh.*', w) or re.search('neg.*', w)) //The list is long, since it contains words starting with 'dis' or 'sh', words that start with 'ash' and end with 'ed', and words containing 'neg' anywhere (this pattern has no '^' anchor, which is why 'Abednego' appears)//
>>> sorted(f)
['Abednego', 'dis', 'disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disbands', 'disbelief', 'discerned', 'discernible', 'discernment', 'discerns', 'discharge', 'discharged', 'discharges', 'discharging', 'disciple', 'disciples', 'discipline', 'disclosed', 'disclosures', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disdain', 'disdained', 'disease', 'disembowelled', 'disembowelments', 'disencumber', 'disengaged', 'disentangling', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dish', 'disheartening', 'dishes', 'dishonour', 'disincline', 'disinfecting', 'disintegrate', 'disinterested', 'disinterred', 'disjointedly', 'disks', 'dislike', 'dislocated', 'dislocation', 'dislodged', 'dismal',....................]


>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis.*', w) or re.search('^sh[o|a].*', w) or re.search('^neg.*', w)) //Here, the result still includes non-negative or neutral words such as 'disclosures' and 'discharged'//
>>> sorted(f)
['dis', 'disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disbands', 'disbelief', 'discerned', 'discernible', 'discernment', 'discerns', 'discharge', 'discharged', 'discharges', 'discharging', 'disciple', 'disciples', 'discipline', 'disclosed', 'disclosures', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disdain', 'disdained', 'disease', 'disembowelled', 'disembowelments', 'disencumber', 'disengaged', 'disentangling', 'disgorge', 'disguise', 'disguisement', 'disguises',.......


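>>> f=FreqDist(w for w in set(text1) if not 'disti' in w or re.search('^ash.*(ed)$', w) or re.search('^dis.*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg.*', w)) //Here we wanted to exclude words containing 'disti'. However, because the "not 'disti' in w" test is joined to the patterns with 'or', every token that does not contain 'disti' is accepted, along with any token matching one of the patterns. Hence almost the whole vocabulary, including punctuation and numbers, is printed and the list is very long. To actually exclude such words, the test must be combined with 'and', as in Step 2//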
>>> sorted(f)
['!', '!"', '!"--', "!'", '!\'"', '!)', '!)"', '!*', '!--', '!--"', "!--'", '"', '"\'', '"--', '"...', '";', '$', '&', "'", "',", "',--", "'-", "'--", "';", '(', ')', '),', ')--', ').', ').--', '):', ');', ');--', '*', ',', ',"', ',"--', ",'", ",'--", ',)', ',*', ',--', ',--"', ",--'", '-', '--', '--"', "--'", '--\'"', '--(', '---"', '---,', '.', '."', '."*', '."--', ".'", '.\'"', '.)', '.*', '.*--', '.,', '.--', '.--"', '...', '....', '.]', '000', '1', '10', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '11', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '12', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '13', '130', '131', '132', '133', '134', '135', '14', '144', '1492', '15', '150', '15th', '16', '1652', '1668', '1671', '1690', '1695', '16th', '17', '1726', '1729', '1750', '1772', '1775', '1776', '1778', '1779', '1788', '1791', '1793', '18', '180', '1807', '1819', '1820', '1821', '1825', '1828', '1833', '1836', '1839', '1840', '1842', '1846', '1850', '1851', '19', '1ST', '1st', '2', '20', '2000', '200th', '21',........................(very long list)


>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis[a|c|h|g|l|o|m|t][g|s|p|r|o|u|e|b|a].*', w) or re.search('^dis[m|o][a|b].*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg[a|l].*', w))
>>> sorted(f)
['disable', 'disabled', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'discerned', 'discernible', 'discernment', 'discerns', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'disheartening', 'dishes', 'dishonour', 'dislocated', 'dislocation', 'dislodged', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'dismember', 'dismembered', 'dismemberer', 'dismembering', 'dismemberment', 'disobedience', 'disobey', 'disobeying', 'disorder', 'disordered', 'disorderliness', 'disorderly', 'disorders', 'distance', 'distances', 'distant', 'distantly', 'distended', 'distension', 'distortions', 'distracted', 'distraction', 'distress', 'distressed', 'distributed', 'district', 'districts', 'distrust', 'distrusted', 'distrustful', 'distrusting', 'disturb', 'disturbing', 'negations', 'negative', 'negatived', 'negatively', 'neglect', 'neglected', 'shake', 'shaken', 'shakes', 'shaking', 'shock', 'shocked', 'shocking', 'shocks']

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis[a|c|o|m][g|s|p|r|o|u|e|b|a].*', w) or re.search('^dist(r)[u|e].*', w) or re.search('^dis[g|h|m|o][a|b|o|u].*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg[a|l].*', w))
>>> sorted(f)
['disable', 'disabled', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'discerned', 'discernible', 'discernment', 'discerns', 'discolour', 'discoloured', 'discomforts', 'disconnected', 'discount', 'discourse', 'discourseth', 'discoursing', 'discover', 'discovered', 'discoverer', 'discoverers', 'discoveries', 'discovering', 'discovery', 'discreditably', 'discreet', 'discreetly', 'discretion', 'discriminating', 'discrimination', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dishonour', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'dismember', 'dismembered', 'dismemberer', 'dismembering', 'dismemberment', 'disobedience', 'disobey', 'disobeying', 'disorder', 'disordered', 'disorderliness', 'disorderly', 'disorders', 'distress', 'distressed', 'distrust', 'distrusted', 'distrustful', 'distrusting', 'negations', 'negative', 'negatived', 'negatively', 'neglect', 'neglected', 'shake', 'shaken', 'shakes', 'shaking', 'shock', 'shocked', 'shocking', 'shocks']

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis[a][g|p]*(?!o).*', w) or re.search('^dist(r)[u|e].*', w) or re.search('^dis[g|h|m|o][a|b|o|u].*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg[a|l].*(?!o).*', w))
>>> sorted(f)
['disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dishonour', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'disobedience', 'disobey', 'disobeying', 'distress', 'distressed', 'distrust', 'distrusted', 'distrustful', 'distrusting', 'negations', 'negative', 'negatived', 'negatively', 'neglect', 'neglected', 'shake', 'shaken', 'shakes', 'shaking', 'shock', 'shocked', 'shocking', 'shocks']

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis[a][g|p]*(?!pea)(?!o).*', w) or re.search('^dist(r)[u|e].*', w) or re.search('^dis[g|h|m|o][a|b|o|u].*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg[a|l].*(?!o).*', w))
>>> sorted(f)
['disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dishonour', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'disobedience', 'disobey', 'disobeying', 'distress', 'distressed', 'distrust', 'distrusted', 'distrustful', 'distrusting', 'negations', 'negative', 'negatived', 'negatively', 'neglect', 'neglected', 'shake', 'shaken', 'shakes', 'shaking', 'shock', 'shocked', 'shocking', 'shocks']

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis[a][g|p]*(?!(pea))(?!o).*', w) or re.search('^dist(r)[u|e].*', w) or re.search('^dis[g|h|m|o][a|b|o|u].*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg[a|l].*(?!o).*', w))
>>> sorted(f)
['disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disappearance', 'disappeared', 'disappearing', 'disappears', 'disappointed', 'disaster', 'disasters', 'disastrous', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dishonour', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'disobedience', 'disobey', 'disobeying', 'distress', 'distressed', 'distrust', 'distrusted', 'distrustful', 'distrusting', 'negations', 'negative', 'negatived', 'negatively', 'neglect', 'neglected', 'shake', 'shaken', 'shakes', 'shaking', 'shock', 'shocked', 'shocking', 'shocks']

Step 4: Final result

>>> f=FreqDist(w for w in set(text1) if re.search('^ash.*(ed)$', w) or re.search('^dis[a][g|p]*(?!p)(?!e)(?!o).*', w) or re.search('^dist(r)[u|e].*', w) or re.search('^dis[g|h|m|o][a|b|o|u].*', w) or re.search('^sh[o|a][c|k].*', w) or re.search('^neg[a|l].*(?!o).*', w))
>>> sorted(f)
['disable', 'disabled', 'disadvantage', 'disaffection', 'disagreeable', 'disaster', 'disasters', 'disastrous', 'disgorge', 'disguise', 'disguisement', 'disguises', 'disgust', 'disgusted', 'dishonour', 'dismal', 'dismally', 'dismantled', 'dismasted', 'dismasting', 'dismay', 'disobedience', 'disobey', 'disobeying', 'distress', 'distressed', 'distrust', 'distrusted', 'distrustful', 'distrusting', 'negations', 'negative', 'negatived', 'negatively', 'neglect', 'neglected', 'shake', 'shaken', 'shakes', 'shaking', 'shock', 'shocked', 'shocking', 'shocks']

No comments:

Post a Comment