changeset 13:989cfda07d71

impl parse from pdf
author anatofuz <anatofuz@cr.ie.u-ryukyu.ac.jp>
date Wed, 01 Apr 2020 14:08:32 +0900
parents e00b5301263a
children bac5eb544d4d
files cmd_pdf.go pdf.go
diffstat 2 files changed, 218 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/cmd_pdf.go	Wed Apr 01 09:31:19 2020 +0900
+++ b/cmd_pdf.go	Wed Apr 01 14:08:32 2020 +0900
@@ -17,7 +17,27 @@
 }
 
+// run converts the PDF paths in argv to page text, parses every page with
+// str2lectureFPDF and prints each resulting lecture record to standard output.
+// The first conversion or parse error is returned before any record is printed.
 func (cd *cmdPDF) run(ctx context.Context, argv []string, outStream, errStream io.Writer) error {
-	h, _ := convertStringFromPDF(&argv)
-	fmt.Println(h)
+	ppsr, err := convertStringFromPDF(&argv)
+	if err != nil {
+		// Without this check a failed conversion would reach the *ppsr dereference below.
+		return err
+	}
+
+	var fpfs []*lectureFPDF
+	for _, pps := range *ppsr {
+		for _, pp := range pps {
+			fpdf, err := str2lectureFPDF(pp)
+			if err != nil {
+				return err
+			}
+			fpfs = append(fpfs, fpdf...)
+		}
+	}
+
+	for _, h := range fpfs {
+		fmt.Println(*h)
+	}
 	return nil
 }
--- a/pdf.go	Wed Apr 01 09:31:19 2020 +0900
+++ b/pdf.go	Wed Apr 01 14:08:32 2020 +0900
@@ -2,8 +2,11 @@
 
 import (
 	"bytes"
+	"fmt"
 	"regexp"
+	"strconv"
 	"strings"
+	"unicode"
 
 	"github.com/ledongthuc/pdf"
 	"github.com/pkg/errors"
@@ -16,10 +19,9 @@
 	isSelect bool   `json:isSelect`
 	day      string `json:day`
 	place    string `json:place`
+	grades   []int  `json:grades`
 }
 
-var idRgex = regexp.MustCompile(`\d{9}`)
-
 type pdfPagesStr []string
 
 func convertStringFromPDF(pdfPaths *[]string) (*[]pdfPagesStr, error) {
@@ -46,3 +48,202 @@
 	}
 	return &ppsr, nil
 }
+
+// isIE reports whether the lecture text *s passes the keyword rules below;
+// str2lectureFPDF drops every record for which it returns false.
+// The rules are checked in order and the first one that matches decides.
+func isIE(s *string) bool {
+	body := *s
+	// has reports whether body contains at least one of words.
+	has := func(words ...string) bool {
+		for _, w := range words {
+			if strings.Contains(body, w) {
+				return true
+			}
+		}
+		return false
+	}
+
+	// Text starting with "情" (e.g. "情01") is accepted unless it contains
+	// "情報科学演習" (rejected) or "日本語表現" (decided by the general
+	// rules further down instead).
+	if strings.HasPrefix(body, "情") {
+		switch {
+		case has("情報科学演習"):
+			return false
+		case !has("日本語表現"):
+			return true
+		}
+	}
+
+	switch {
+	case strings.HasPrefix(body, "知能"): // e.g. "知能02"
+		return true
+	case has("情報工学", "知能情報"):
+		return true
+	case has("電子情報通信", "エネルギー環境", "機械工学", "社会基盤デザイン", "電気システム"):
+		return false
+	}
+
+	// Remaining keywords that are accepted on their own.
+	return has(
+		"GEプログラム",
+		"キャリアデザイン入門",
+		"情報科教育法",
+		"産業社会学",
+	)
+}
+
+var days = []string{"月", "火", "水", "木", "金"} // weekday kanji searched for in lecture text, Monday through Friday
+var kanjiDayLen = len("月")                   // byte length (not rune count) of "月"; used as the width of any entry in days
+
+// getPlace extracts the place text from a lecture body s: the part between
+// the weekday/period column and the first "半年" (or, failing that, "第").
+// It also returns the byte offset of the weekday kanji, or -1 when none precedes the marker.
+func getPlace(s string) (string, int, error) {
+	lastIndex := strings.Index(s, "半年")
+	if lastIndex == -1 {
+		if lastIndex = strings.Index(s, "第"); lastIndex == -1 {
+			return "", 0, errors.New("failed found place")
+		}
+	}
+
+	startDay := -1
+	for _, day := range days {
+		idx := strings.Index(s, day)
+		// Trust the first hit only when a period digit follows it; otherwise use the last one.
+		if next := idx + kanjiDayLen; idx != -1 && (next >= len(s) || s[next] < '0' || s[next] > '9') {
+			idx = strings.LastIndex(s, day)
+		}
+		if idx != -1 && idx < lastIndex {
+			startDay = idx
+			break
+		}
+	}
+	if startDay == -1 {
+		return "", -1, nil // no weekday before the marker; the caller skips the record on -1
+	}
+
+	place := s[startDay+kanjiDayLen : lastIndex] // skip the weekday kanji itself
+	if i := strings.IndexFunc(place, func(r rune) bool { return unicode.Is(unicode.Han, r) }); i > 0 {
+		place = place[i:] // drop whatever precedes the first Han character
+	}
+	return place, startDay, nil
+}
+
+var commaLen = len("・") // byte length of the "・" separator that getGrades looks for between two grade digits
+
+// getGrades parses the one-digit grade ending at byte offset dayKanjiIndex, plus an optional "<digit>・" before it (e.g. "1・2").
+func getGrades(s string, dayKanjiIndex int) ([]int, error) {
+	if dayKanjiIndex < 1 || dayKanjiIndex > len(s) {
+		return nil, errors.New("failed found grade")
+	}
+	ends := []int{dayKanjiIndex} // end offsets of the digits to parse, in reading order
+	if sep := dayKanjiIndex - 1 - commaLen; sep >= 1 && s[sep:dayKanjiIndex-1] == "・" {
+		ends = []int{sep, dayKanjiIndex}
+	}
+	grade := make([]int, len(ends))
+	for i, end := range ends {
+		var err error
+		if grade[i], err = strconv.Atoi(s[end-1 : end]); err != nil {
+			return nil, err
+		}
+	}
+	return grade, nil
+}
+
+var idRgex = regexp.MustCompile(`\d{9}`) // nine consecutive digits; str2lectureFPDF treats each match as a lecture ID
+
+// str2lectureFPDF splits page text s into lecture records and parses them.
+// A record starts at a nine-digit ID (idRgex) and its body runs up to the
+// next ID or the end of s. Records rejected by the keyword filters below are
+// skipped. It returns nil, nil when s holds no ID, and the first error from
+// parseUniversityEnglish, getPlace or getGrades otherwise.
+func str2lectureFPDF(s string) ([]*lectureFPDF, error) {
+	matches := idRgex.FindAllStringIndex(s, -1)
+	if matches == nil {
+		return nil, nil
+	}
+
+	var parsed []*lectureFPDF
+	for n, loc := range matches {
+		// The body is the text after this ID, up to the next ID or, for the
+		// last match, the end of s.
+		end := len(s)
+		if n+1 < len(matches) {
+			end = matches[n+1][0]
+		}
+		body := s[loc[1]:end]
+
+		// Keyword filters, checked in this order; any hit skips the record
+		// without reporting an error.
+		skip := strings.Contains(body, "集中") || strings.Contains(body, "通年") ||
+			strings.Contains(body, "セミナー") || strings.Contains(body, "卒業研究") ||
+			!isIE(&body)
+		if skip {
+			continue
+		}
+
+		// "大学英語" rows: the parsed result is discarded; only its error is used.
+		if strings.Contains(body, "大学英語") && !strings.Contains(body, "大学英語を") {
+			if _, err := parseUniversityEnglish(body); err != nil {
+				return nil, errors.Wrap(err, "failed parse univ english")
+			}
+			continue
+		}
+
+		place, dayIndex, err := getPlace(body)
+		if err != nil {
+			return nil, err
+		}
+		if dayIndex == -1 {
+			fmt.Printf("wan!!! %s not parsed ", body)
+			continue
+		}
+
+		grades, err := getGrades(body, dayIndex)
+		if err != nil {
+			return nil, err
+		}
+
+		parsed = append(parsed, &lectureFPDF{
+			id:       s[loc[0]:loc[1]],
+			body:     body,
+			isSelect: strings.Contains(body, "選"),
+			place:    place,
+			grades:   grades,
+		})
+	}
+	return parsed, nil
+}
+
+// parseUniversityEnglish checks that a digit follows the second entry of days (in 月..金 order) found in s; it returns no records yet.
+func parseUniversityEnglish(s string) ([]*lectureFPDF, error) {
+	/*
+		外101大学英語42―2非富里 明美8421~4月木2共3-205(月)共3-403(木)半年知能情報(42)
+		外101大学英語42―2非宮城 和文9401~4月木2共3-403(月)共3-201(木)半年知能情報(20)
+	*/
+
+	type uestruct struct {
+		index int
+		kanji string
+	}
+
+	var uess []uestruct
+	for _, day := range days {
+		if startDay := strings.Index(s, day); startDay != -1 {
+			uess = append(uess, uestruct{index: startDay, kanji: day})
+		}
+	}
+	if len(uess) < 2 {
+		return nil, errors.New("failed found two days at univ english")
+	}
+	period := uess[1].index + kanjiDayLen // offset of the byte right after the second matched day
+	if period >= len(s) {
+		return nil, errors.New("failed found period at univ english")
+	}
+	if _, err := strconv.Atoi(s[period : period+1]); err != nil {
+		return nil, errors.Wrap(err, "failed parse int at univ english")
+	}
+	return nil, nil
+}