# Download data
wget https://raw.githubusercontent.com/slanglab/IndiaPoliceEvents/main/data/final/docs.jsonl

# .jsonl files are .json files with a new line for every observation
# In this case, every line is a single news article

# For every command-line tool we use you can look at its manual pages and examples via
man head
man tail
man gshuf
#etc

# Look at the first 10 lines using head
head -n10 docs.jsonl

# Look at just the first line using head
head -n1 docs.jsonl

# Hmmm that's not easy to see what's here
# Let's use the jq tool which you can download here https://stedolan.github.io/jq/
head -n1 docs.jsonl | jq .
# On the command line the vertical bar "|" is a "pipe", meaning we take the output
# of the first part and send it through the second part

# We can also look at the last line(s) with tail
tail -n1 docs.jsonl | jq .

# Let's throw away everything but the full text
# We can do this with a regular expression
head -n1 docs.jsonl | jq . | grep 'doc_text'
# Using grep in the line above says return every line that matches
# the string 'doc_text'

# Now let's get a random line of text
# We use gshuf on Mac and shuf on Linux to shuffle the text
gshuf docs.jsonl | head -n1 | jq . | grep 'doc_text'
#shuf docs.jsonl | head -n1 | jq . | grep 'doc_text' #use this command on Linux

# Let's grep the date to confirm we're actually shuffling properly
gshuf docs.jsonl | head -n1 | jq . | grep 'doc_id'

# Now let's shuffle the dataset, grab five random documents,
# And write only their full text to disk
gshuf docs.jsonl | head -n5 | jq . | grep 'doc_text' > sampledocs.txt
# Above, the > is output redirection

# Let's examine the file to make sure we saved properly
cat sampledocs.txt

# We can use wc to count the number of lines in a file
# Recall, we should have one line per document
wc -l sampledocs.txt

# Let's get rid of this "doc_text" garbage that came from the .jsonl
# We'll use sed
sed 's/"doc_text":/ /' sampledocs.txt

# Let's write this to disk as well
sed 's/"doc_text":/ /' sampledocs.txt > sampledocs-clean.txt

# For fun let's translate this file to all uppercase using tr
tr "[:lower:]" "[:upper:]" < sampledocs-clean.txt
# The < above describes the input file

# Notice the command above did not actually change the file itself
cat sampledocs-clean.txt

# Now let's create a list of the words in file1, one per line,
# where a word is taken to be a maximal string of letters.
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt
# Above -c flag stands for complement (of our first string which in this)
# case is "[:alpha:]"
# The -s flag stands for squeeze so all '\n' get collapsed into one instance

# We can sort the output
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort

# We can also sort in reverse order
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort -r

# Now let's count the unique words
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort | uniq -c

# And sort again to look at the top
tr -cs "[:alpha:]" '\n' < sampledocs-clean.txt | sort | uniq -c | sort -nr > wordcounts.txt

# We can use less to start at the top and scroll through a file
less wordcounts.txt

# Let's put it all together to get the top word counts on the entire corpus
cat docs.jsonl | jq . | grep 'doc_text' | sed 's/"doc_text":/ /' > alldocs_clean.txt
cat alldocs_clean.txt | tr -cs "[:alpha:]" '\n'| sort | uniq -c | sort -nr > alldocs_wordcounts.txt

# Examine the top word counts
# Next lecture we'll talk about "stop words" (e.g. the, of, to)
# Compared to "content words" (e.g. government)
less alldocs_wordcounts.txt

# We can ignore words that we don't want
grep -v "the" alldocs_wordcounts.txt | head -n10
# Above the -v command only outputs lines where the pattern does NOT match

# Adding an -E to grep allows us to use regular expressions
grep -v -E "the|in" alldocs_wordcounts.txt | head -n10

#### REGULAR EXPRESSIONS ####
less alldocs_clean.txt

# regular expressions can help us easily search in a new corpus we're exploring
grep -E --color "new delhi" alldocs_clean.txt
grep -E --color "gujarat" alldocs_clean.txt
grep -E --color "riots" alldocs_clean.txt

# Let's use or
grep -E --color "new delhi|gujarat" alldocs_clean.txt